#!/usr/bin/perl -w $Version = "0.2.2c"; $Author = "Neil Cherry (ncherry\@linuxha.com)"; $Support_page = "http://www.linuxha.com/other/linkcheck/index.html"; # Simple enough program, it goes out and checks my web page for the # return code of the links (200/404/500 etc.) and reports back that # info. I should make this accept URL's from the command line. And # I should read links from a file (URL's of pages to check the links # on. Lastly I need to work on outputing the data to a web page so # it can be used within MisterHouse. # Want better (much better actually) try: http://validator.w3.org/checklink #[ linkcheck.pl ]############################################################# # 05/28/00 I've posted this to my web pages, doubt many will use it but # one never knows. (njc) V 0.1 # # 05/28/00 I've changed the code to read from the file ~/.linkcheckrc # It has a simple format. Put the link at the start of the line # starting with http://blah/blah/blah and it will check that file # anything else will be ignored # 09/05/04 Yeah, I'm still using the program. Seems to work pretty good but I # need to handle strings like this: # http://www.wolfgang.uucp/~njc/Personal/athome/common/Coffee/Coffee-1.html%22%3E%3CIMG%20SRC=%22next.gif%22%20ALT=%22Next # 09/06/04 Well I've added the lines to convert n and %22 to their ASCII # equivalents and I've also fixed the problem where the URL text # contained valid html commands such as Next # I don't know if I have taken care of thing such as # # 09/12/04 Damn! Broke something. Linkchecker no longer outputs the list of # links that weren't 2xx. # 12/29/04 Well I must have fixed the above since it works properly now. :-) # When I fixed it I don't know. I'm now using an Perl module to # accomplish the same things I did before (TokenParser). I've also # cleaned up the logic, code and comments. # 02/07/05 Added ANSI codes to turn 400's red. # 01/08/06 Turned off the ANSI codes. # 01/28/07 Added color back but used css and html. Debug message now have to # sent to STDERR and newlines must be preceded by
This will then # be sent to sendmail using the -t -n options (-t = Read message for # recipients. To:, Cc:, and Bcc: -n = Don't do aliasing (???)). This # means that the mail header has to be made in the program. # 02/09/07 Added check for empty links (
) and correct a couple # of errors that were popping up. I just discovered that the token # parser can't handle it expects the but I've # add code to correct that. # # 02/10/07 Added a mail signature at the end of the output that contains the # link to my support page and a link to my email address. require HTML::TokeParser; use LWP; use HTTP::Request; use HTTP::Response; use Getopt::Long; use Env qw( HOME ); # File .linkcheckrc format is real simple. If it doesn't start with # http then it's ignored. Later I may add support for ftp. my $start; my $done; my $dbgme; # For debugging (not used now) $start = `date`; chomp($start); my $to = "root"; my $from = "LHA Webmaster "; my $subject = "Linkcheck - " . `date "+%b %d,%Y"`; # Jan 01,2007 chomp($subject); my $linkrc = "$HOME/.linkcheckrc"; my $file = ""; my $doc_url; my $document; my $browser; my $ua; my ($Total, $L200, $L400, $L500, $Lother, $LEmpty, $NotChecked); sub check_rc { } sub help { print qq(linkcheck - checks links on a given web page --linkrc - alternate linkrc file --to - alternate recipient --from - alternate sender --subject - alternate mail subject --debug - turn on the debug messages --file - check links contained in this file --skip - (What is this for?) --text - Send a plain text message instead of a html formatted message --help - this message ); exit; } sub absolutize { my($url, $base) = @_; use URI; return URI->new_abs($url, $base)->canonical; } sub init_browser { $browser = LWP::UserAgent->new; # ...And any other initialization we might need to do... return $browser; } sub convert { $arg = $_[0]; # Convert %22 to ascii value " (double quote) $arg =~ s/%([A-Fa-f0-9]{2})/pack("c",hex($1))/ge; # Convert n to ascii value n $arg =~ s/&\#([0-9]{3})\;/pack("c",$1)/ge; return $arg; } sub check_url { my ($req, $resp, $arg, $str, $code); $arg = $_[0]; $str = $_[1]; # Check for empty arguments if($arg) { # Only check http for now, I've got mailto & ftp on my web page # but I have written anything to check them yet. Really I # don't care about mailto and I need to figure out how to # handle ftp. I don't use news or gopher so they're not supported # either if( $arg =~ /http/i ) { $arg = convert($arg); # Convert r & %22 to ASCII $req = HTTP::Request->new(GET, $arg); $resp = $ua->request($req); # we should check the return code, I've noticed that # I get a lot of 500's (47 out of 364) but the link # works. Maybe I need to setup the browser description # better. $code = $resp->code(); if($code =~ /2../) { $L200++; printf "(%s) %s\n", $code, $arg, $str; } elsif($code =~ /4../) { # Append link to the end of the @L4 list $L4{$arg} = $code; $L400++; # Turn code (i.e 404) red #printf "(%s) %s [ %s ]\n", $code, $arg, $str; #printf "(%s) %s [ %s ]\n", $code, $arg, $str; printf "(%s) %s [ %s ]\n", $code, $arg, $arg, $str; } elsif($code =~ /5../) { # Append link to the end of the @L5 list $L5{$arg} = $code; $L500++; # Turn code (i.e 404) red #printf "(%s) %s [ %s ]\n", $code, $arg, $str; #printf "(%s) %s [ %s ]\n", $code, $arg, $str; printf "(%s) %s [ %s ]\n", $code, $arg, $arg, $str; } else { # Append link to the end of the @Lo list $Lother++; #printf "(%s) %s [ %s ]\n", $code, $arg, $str; printf "(%s) %s [ %s ]\n", $code, $arg, $arg, $str; } } else { $NotChecked++; printf "(000) %s [ %s ]\n", $arg, convert($str); $i = 0; while($token->[$i]) { print "$i = $token->[$i]\n"; $i++; } } } } # $| = 1; # Turn on Autoflush (doesn't turn off buffering) # Lot of changes need to occur here! # # 1) Read initial doc_url from a file (can be more than 1, foreach loop # Format: http//URL/ Description # 2) Output results to a directory in a web page format, a summary page # should be the initial page. # my @argv = @ARGV; # Save a copy of the original arguements ############################################################################## # Here it starts # I should put a help in here, maybe later GetOptions('linkrc=s' => \$linkrc, # Alternate linkrc file 'to|t=s' => \$to, # who to sed mail to 'from=s' => \$from, # who mail is from 'subject|s=s' => \$subject, # Subject of the message 'debug' => \$dbgme, # Turn on the debug messages 'file=s' => \$file, # Check the links in this file # 'skip' => \$skip, # Skip ??? # 'text' => \$text, # Flag to send plain text instead of html formatted 'help|?' => \$help ); print "To: ($to)\nFrom: ($from)\nSubject: ($subject)\n" if($dbgme); # We need to error check this stuff my $i = @ARGV; # If anything is left over we have too many arguments if($i) { print STDERR "Too many arguement: (@argv)\n\n"; help(); exit(1); } help() if(defined($help)); $ua = init_browser( ); check_rc($file); print qq(To: $to From:$from Subject:$subject Content-Type: text/html; charset=ISO-8859-1 Content-Transfer-Encoding: 7bit MIME-Version: 1.0 Test email ); # This is an html message print "RC: $linkrc
\n" if($dbgme); print "
\n"; open IniFile, "$linkrc" or die "Can't find file: $linkrc ($!)\n"; # Read the rc file and get the links I want checked. Those links are the web # pages with the URL's I need checked. Understand that we won't leave the # page we are checking. So if you need to check the links on another page # put it in the rc file to be checked and we'll go through the list one by # one. while( ) { # Remove the chop; # I'll be making a change here, the new file format is # http://blah/ Filename.ext (no space between and # anything else!) $1 = $doc_name & $2 = Filename # The filename is the first file describing the condition of the # URL ($doc_name) $doc_url = $_; # Ignore lines that start with # or space # # and blank lines. Actually if it doens't start with an http: # then ignore it. if(!($doc_url =~ /^http/i)) { next; } else { @dc = split("/", $doc_url); $t = pop(@dc); print qq(

Checking: $t

$doc_url\n

); } # Zero out every thing $Total = $L200 = $L400 = $L500 = $Lother = $NotChecked = $LEmpty = 0; %L4 = ("", ""); %L5 = ("", ""); # Empty the Hashes # Get the page whose links we want to check: #my $response = $browser->get($doc_url); my $response = $ua->get($doc_url); if($response->is_success) { $document = $response->content; # get the Web page contents $doc_url = $response->base(); #$ua = LWP::UserAgent->new; # Remove all the line breaks and extra spaces. This makes it easier to # display the links. Especially if they've been broken over 2 lines # the order is important do not reverse these 2 lines $document =~ s/\n/ /g; # replace newline with space $document =~ s/\s{2,}/ /g; # replace multiple spaces with 1 # Parse the document my $parser = HTML::TokeParser->new(\$document); # Grab the title for reference if($document =~ m{(.*?)}i) { $Title = $1; } else { $Title =" *** Unknown ***"; } # Search for href (URL) and description text (let the parser handle it) # ARGH! while(my $token = $parser->get_tag("a")){ # This handles any of the has a since we're # not looking for it anyway. # We're ignoring name= anchors, is name= & href= legal? that will hose # this up if(!($token->[1]{name})) { my $text = $parser->get_trimmed_text("/a"); #print "\nText is: $text\n"; if($text ne "") { my $url = $token->[1]{href} || "-"; #print "URL: $url\n"; #print "Text is: $text\n"; if($url ne "-") { $Total++; my $absolute_url = absolutize($url, $doc_url); check_url($absolute_url, $text); } else { $LEmpty++; printf "(XXX) %s A missing href\n", $text; } } else { if(($n =$token->[1]{name})) { print " *** NAME = $n (Oops)\n"; } else { $LEmpty++; printf "(XXX) A missing href and text.\n"; } } } } print "/==============================================================================\n"; print "* Title: $Title\n"; print "* Link: $doc_url\n\n"; print "* ============ =======\n"; print "* Total: $Total\n"; print "* 200's: $L200\n"; print "* 400's: $L400\n"; print "* 500's: $L500\n"; print "* Other: $Lother\n"; print "* Not checked: $NotChecked\n"; print "* Empty: $LEmpty\n"; print "* ============ =======\n\n"; if($L400 > 0) { print "* -> 4xx list\n"; while(($lnk, $code) = each(%L4)) { print qq(* -> ( $code ) $lnk\n); } print "*\n"; } if($L500 > 0) { print "* -> 5xx list\n"; while(($lnk, $code) = each(%L5)) { if($lnk) { print qq(* -> ( $code ) $lnk\n); } } print "*\n"; } print "
"; } else { print "Couldn't get $doc_url: ", $response->status_line, "\n"; } } $done = `date`; chomp($done); print qq(
Started:\t${start}\nCompleted:\t${done}

\n); print qq(--
Author of Linkcheck: ${Author}
Message Generated by Linkcheck v${Version}
);