and
# anything else!) $1 = $doc_name & $2 = Filename
# The filename is the first file describing the condition of the
# URL ($doc_name)
$doc_url = $_;
# Ignore lines that start with # or space #
# and blank lines. Actually if it doens't start with an http:
# then ignore it.
if(!($doc_url =~ /^http/i)) {
next;
} else {
@dc = split("/", $doc_url);
$t = pop(@dc);
print STDERR "Checking: $t ($doc_url)\n" if($dbgme);
$output .= qq(Checking: $t
$doc_url\n);
}
# Zero out every thing
$Total = $L200 = $L400 = $L500 = $Lother = $NotChecked = $LEmpty = 0;
%L4 = ("", "");
%L5 = ("", ""); # Empty the Hashes
# Get the page whose links we want to check:
my $response = $ua->get($doc_url);
if($response->is_success) {
# Parsing of undecoded UTF-8 will give garbage when decoding entities at
# /usr/lib/perl5/vendor_perl/5.8.8/LWP/Protocol.pm line 114,
line 20.
#
if ($response->header('Content-Type') &&
$response->header('Content-Type') =~ m/charset=(\S+),/xms) {
my $encoding = "$1";
print "Encoding: $encoding\n" if ($dbgme);
$document = $response->content(Encode::decode($encoding, $response->content));
#$document = $response->content;
} else {
$document = $response->content; # get the Web page contents
}
$doc_url = $response->base();
#$ua = LWP::UserAgent->new;
# Remove all the line breaks and extra spaces. This makes it easier to
# display the links. Especially if they've been broken over 2 lines
# the order is important do not reverse these 2 lines
$document =~ s/\n/ /g; # replace newline with space
$document =~ s/\s{2,}/ /g; # replace multiple spaces with 1
# Parse the document
my $parser = HTML::TokeParser->new(\$document);
# Grab the title for reference
if($document =~ m{(.*?)}i) {
$Title = $1;
} else {
$Title =" *** Unknown ***";
}
# Search for href (URL) and description text (let the parser handle it)
# ARGH!
while(my $token = $parser->get_tag("a")){
# This handles any of the has a since we're
# not looking for it anyway.
# We're ignoring name= anchors, is name= & href= legal? that will hose
# this up
if(!($token->[1]{name})) {
my $text = $parser->get_trimmed_text("/a");
if($text ne "") {
my $url = $token->[1]{href} || "-";
print STDERR "URL: $url\n" if($dbgme);
#print "Text is: $text\n";
if($url ne "-") {
$Total++;
my $absolute_url = absolutize($url, $doc_url);
check_url($absolute_url, $text);
} else {
$LEmpty++;
$output .= sprintf "(XXX) %s A missing href\n", $text;
}
} else {
if(($n =$token->[1]{name})) {
$output .= " *** NAME = $n (Oops)\n";
} else {
$LEmpty++;
$output .= sprintf "(XXX) A missing href and text.\n";
}
}
}
}
$output .= "/==============================================================================\n";
$output .= "* Title: $Title\n";
$output .= "* Link: $doc_url\n\n";
$output .= "* ============ =======\n";
$output .= "* Total: $Total\n";
$output .= "* 200's: $L200\n";
$output .= "* 400's: $L400\n";
$output .= "* 500's: $L500\n";
$output .= "* Other: $Lother\n";
$output .= "* Not checked: $NotChecked\n";
$output .= "* Empty: $LEmpty\n";
$output .= "* ============ =======\n\n";
if($L400 > 0) {
$output .= "* -> 4xx list\n";
while(($lnk, $code) = each(%L4)) {
$output .= qq(* -> ( $code ) $lnk\n);
}
$output .= "*\n";
}
if($L500 > 0) {
$output .= "* -> 5xx list\n";
while(($lnk, $code) = each(%L5)) {
if($lnk) {
$output .= qq(* -> ( $code ) $lnk\n);
}
}
$output .= "*\n";
}
$output .= "
";
} else {
$output .= "Couldn't get $doc_url: " . $response->status_line . "\n";
}
}
$done = `date`;
chomp($done);
$output .= qq(Started:\t${start}\nCompleted:\t${done}
\n);
$output .= qq(--
Author of Linkcheck: ${Author}
Message Generated by Linkcheck v${Version}
);
if($base64) {
$output = encode_base64($output);
}
print "$output\n";