#!/usr/bin/perl -w # Simple enough program, it goes out and checks my web page for the # return code of the links (200/404/500 etc.) and reports back that # info. I should make this accept URL's from the command line. And # I should read links from a file (URL's of pages to check the links # on. Lastly I need to work on outputing the data to a web page so # it can be used within MisterHouse. # Want better (much better actually) try: http://validator.w3.org/checklink #[ linkcheck.pl ]############################################################# # 05/28/00 I've posted this to my web pages, doubt many will use it but # one never knows. (njc) V 0.1 # # 05/28/00 I've changed the code to read from the file ~/.linkcheckrc # It has a simple format. Put the link at the start of the line # starting with http://blah/blah/blah and it will check that file # anything else will be ignored # 09/05/04 Yeah, I'm still using the program. Seems to work pretty good but I # need to handle strings like this: # http://www.wolfgang.uucp/~njc/Personal/athome/common/Coffee/Coffee-1.html%22%3E%3CIMG%20SRC=%22next.gif%22%20ALT=%22Next # 09/06/04 Well I've added the lines to convert n and %22 to their ASCII # equivalents and I've also fixed the problem where the URL text # contained valid html commands such as Next # I don't know if I have taken care of thing such as # # 09/12/04 Damn! Broke something. Linkchecker no longer outputs the list of # links that weren't 2xx. # 12/29/04 Well I must have fixed the above since it works properly now. :-) # When I fixed it I don't know. I'm now using an Perl module to # accomplish the same things I did before (TokenParser). I've also # cleaned up the logic, code and comments. # 02/07/05 Added ANSI codes to turn 400's red. # 01/08/06 Turned off the ANSI codes. # 01/28/07 Added color back but used css and html. Debug message now have to # sent to STDERR and newlines must be preceded by
This will then # be sent to sendmail using the -t -n options (-t = Read message for # recipients. To:, Cc:, and Bcc: -n = Don't do aliasing (???)). This # means that the mail header has to be made in the program. # 02/09/07 Added check for empty links (
) and correct a couple # of errors that were popping up. I just discovered that the token # parser can't handle it expects the but I've # add code to correct that. # # 02/10/07 Added a mail signature at the end of the output that contains the # link to my support page and a link to my email address. # 04/15/07 Comcast is going out of it's way to make it difficult to send # email! I can no longer send html emails that have more than # ~190 links in it. Seems that base64 encoding gets around this # (for now). This is going to require a major rewite of the code! # 09/07/08 Added a random user agent selector, I'm hoping that this will # alleviate some of the 403 errors I get on valid sites (such as # Mike's HA web site that I know is there and shows up fine with # Firefox. # # Odd that after 8 years I'm still using this program. Not only that # but I've put it under RCS control. And I've merge all the versions # in the RCS release 1.0. # # ---------------------------------------------------------------------------- # # $Id: linkcheck.pl,v 1.2 2008/09/07 21:28:27 njc Stable njc $ # # $Log: linkcheck.pl,v $ # Revision 1.2 2008/09/07 21:28:27 njc # I've added support for the To:, From: and Subject: to be added to the # .linkcheckrc file. The Subject: also supports `cmd` similar to typing # the -s "Linkcheck - `date`" from the command line. What it doesn't do # is to check what type of command is being run. This program should # never be run as root as this allows for the ever dangerous `rm -rf /` # command to be run. I guess in the next release I should check to see # that we are not running as root. # . # # Revision 1.1 2008/09/07 19:43:30 njc # This was version 0.3.0c it should be now 1.1. Hopefully this will # help keep the version straight. # # Added a random user agent selector, I'm hoping that this will # alleviate some of the 403 errors I get on valid sites (such as Mike's # HA web site that I know is there and shows up fine with Firefox). # # Also I added some code to eliminate the '...will give garbage...' # warning. I added the Encode::decode to the ->get() routine. # # Revision 1.0 2008/09/07 19:17:04 njc # Like V0.2 it ouputs html but I've added an option to encode the html # output to base64. Many other small changes. # # Revision 0.2 2008/09/07 19:10:11 njc # A lot of channges have been made to this version. I've added support # for color and HTML. It's in a form that allows it to be output # directly to sendmail. # # Revision 0.1 2008/09/07 19:02:38 njc # This is a working version but it's output is text only. # # # ---------------------------------------------------------------------------- my $Version = '$Revision: 1.2 $ $State: Stable $'; my $Author = "Neil Cherry (ncherry\@linuxha.com)"; my $Support_page = "http://www.linuxha.com/other/linkcheck/index.html"; require HTML::TokeParser; use LWP; use HTTP::Request; use HTTP::Response; use Getopt::Long; use MIME::Base64; use Encode; use Env qw( HOME ); # File .linkcheckrc format is real simple. If it doesn't start with # http then it's ignored. Later I may add support for ftp. my $start; my $done; my $dbgme; # For debugging (not used now) $start = `date`; chomp($start); # Defined later my $to = ""; my $from = ""; my $subject = ""; my $base64 = 0; my $linkrc = "$HOME/.linkcheckrc"; my $file = ""; my $doc_url; my $document; my $browser; my $ua; my ($Total, $L200, $L400, $L500, $Lother, $LEmpty, $NotChecked); # ---------------------------------------------------------------------------- sub help { print qq(linkcheck - checks links on a given web page --base64 - encode html in base64 --linkrc - alternate linkrc file --to - alternate recipient --from - alternate sender --subject - alternate mail subject --debug - turn on the debug messages --file - check links contained in this file --help - this message --version - show the version of this program ); exit; } # --skip - (What is this for?) # --text - Send a plain text message instead of a html formatted message sub absolutize { my($url, $base) = @_; use URI; return URI->new_abs($url, $base)->canonical; } # Return a random user agent string sub get_agent(@) { my (@agent) = @_; my $cnt = @agent; return @agent[rand($cnt)]; } # http://www.useragentstring.com/pages/useragentstring.php?name=All # UserAgent attributes my @User_Agent =( "Opera/9.51 (X11; Linux i686; U; en)", "Opera/9.24 (X11; SunOS i86pc; U; en)", "Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 OpenSSL/0.9.8b", "Lynx/2.8.7dev.4 libwww-FM/2.14 SSL-MM/1.4.1 OpenSSL/0.9.8d", "Mozilla/5.0 (X11; U; Linux; i686; en-US; rv:1.6) Gecko Debian/1.6-7", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071719 Firefox/3.0.1", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.6) Gecko/20060728 Firefox/1.5.0.6", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9pre) Gecko/2008061501 SeaMonkey/2.0a1pre", "Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.9b3pre) Gecko/2008010602 SeaMonkey/2.0a1pre", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.16) Gecko/20080724 Thunderbird/2.0.0.16", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.12) Gecko/20080208 Firefox/2.0b2", "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.8.1b2) Gecko/20060821 Firefox/2.0b2", "Mozilla/4.0 (compatible; MSIE 6.1; Windows XP)", "Mozilla/4.0 (compatible; MSIE 6.0b; Windows NT 5.1)", "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)", "Mozilla/4.0 (compatible; MSIE 5.23; Mac_PowerPC)", "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT; .NET CLR 1.0.3705)", "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows CE; PPC; 240x320) Opera [de]", "Mozilla/4.0 (compatible; MSIE 6.0; X11; Linux i686; en) Opera 9.51", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.51", "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_4; fr-fr) AppleWebKit/525.18 (KHTML, like Gecko) Version/3.1.2 Safari/525.20.1", "Mozilla/4.0 PPC (compatible; MSIE 4.01; Windows CE; PPC; 240x320; Sprint:PPC-6700; PPC; 240x320)", ); # sub init_browser { my $UsrAgent = get_agent(@User_Agent); $browser = LWP::UserAgent->new(agent => $UsrAgent); # ...And any other initialization we might need to do... return $browser; } sub convert { $arg = $_[0]; # Convert %22 to ascii value " (double quote) $arg =~ s/%([A-Fa-f0-9]{2})/pack("c",hex($1))/ge; # Convert n to ascii value n $arg =~ s/&\#([0-9]{3})\;/pack("c",$1)/ge; return $arg; } sub check_url { my ($req, $resp, $arg, $str, $code); $arg = $_[0]; $str = $_[1]; # Check for empty arguments if($arg) { # Only check http for now, I've got mailto & ftp on my web page # but I have written anything to check them yet. Really I # don't care about mailto and I need to figure out how to # handle ftp. I don't use news or gopher so they're not supported # either if( $arg =~ /http/i ) { $arg = convert($arg); # Convert r & %22 to ASCII $req = HTTP::Request->new(GET, $arg); $resp = $ua->request($req); # we should check the return code, I've noticed that # I get a lot of 500's (47 out of 364) but the link # works. Maybe I need to setup the browser description # better. see init_browser (above) $code = $resp->code(); # http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html # # 1xx - Informational # 2xx - Successful # 3xx - Redirection # 4xx - Client Error # 5xx - Server Error # if($code =~ /2../) { $L200++; $output .= sprintf "(%s) %s\n", $code, $arg, $str; } elsif($code =~ /4../) { # Append link to the end of the @L4 list $L4{$arg} = $code; $L400++; # Turn code (i.e 404) red $output .= sprintf "(%s) %s [ %s ]\n", $code, $arg, $arg, $str; } elsif($code =~ /5../) { # Append link to the end of the @L5 list $L5{$arg} = $code; $L500++; # Turn code (i.e 404) red $output .= sprintf "(%s) %s [ %s ]\n", $code, $arg, $arg, $str; } else { # Append link to the end of the @Lo list $Lother++; $output .= sprintf "(%s) %s [ %s ]\n", $code, $arg, $arg, $str; } } else { $NotChecked++; $output .= sprintf "(000) %s [ %s ]\n", $arg, convert($str); $i = 0; while($token->[$i]) { $output .= "$i = $token->[$i]\n"; $i++; } } } } # # tr 'a-zA-Z' 'n-za-mN-ZA-M' # # rot13: # # perl -pe "tr/A-Za-z/N-ZA-Mn-za-m/" FileToRot # # rot13 reversal: # # perl -pe "tr/N-ZA-Mn-za-m/A-Za-z/" Rotten_File # # rot13: # # $ARGV[0] =~ tr/A-Za-z/N-ZA-Mn-za-m/; # print "encrypted: $ARGV[0]\n"; # #sub rot13 { # my @p, $q; # # print "enter the string you want to convert: \n"; # foreach ('a'..'m', 'A'..'M') { # $q = chr(ord($_)+13); # @p{($_,$q)} = ($q,$_); # } # while (<>) { # s#([a-zA-Z])#$p{$1}#g; print; &MAIN; # } #} # ---------------------------------------------------------------------------- # $| = 1; # Turn on Autoflush (doesn't turn off buffering) # Lot of changes need to occur here! # # 1) Read initial doc_url from a file (can be more than 1, foreach loop # Format: http//URL/ Description # 2) Output results to a directory in a web page format, a summary page # should be the initial page. # my @argv = @ARGV; # Save a copy of the original arguements my $v = 0; ############################################################################## # Here it starts # I should put a help in here, maybe later GetOptions('linkrc=s' => \$linkrc, # Alternate linkrc file 'to|t=s' => \$to, # who to sed mail to 'from=s' => \$from, # who mail is from 'subject|s=s' => \$subject, # Subject of the message 'debug' => \$dbgme, # Turn on the debug messages 'file=s' => \$file, # Check the links in this file 'base64' => \$base64, # encode html in base64 # 'skip' => \$skip, # Skip ??? # 'text' => \$text, # Flag to send plain text instead of html formatted 'version' => \$v, 'help|?|h' => \$help ); if($v != 0) { print "$Version\n"; exit 0; } print "To: ($to)\nFrom: ($from)\nSubject: ($subject)\n" if($dbgme); # We need to error check this stuff my $i = @ARGV; # If anything is left over we have too many arguments if($i) { print STDERR "Too many arguement: (@argv)\n\n"; help(); exit(1); } help() if(defined($help)); # ---------------------------------------------------------------------------- # File .linkcheckrc format is real simple. If it doesn't start with # http then it's ignored. Later I may add support for ftp. open RCFile, "$linkrc" or die "Can't find file: $linkrc ($!)\n"; # Read the rc file and get the links I want checked. Those links are the web # pages with the URL's I need checked. Understand that we won't leave the # page we are checking. So if you need to check the links on another page # put it in the rc file to be checked and we'll go through the list one by # one. my $tto = ""; my $tfrom = ""; my $tsubject = ""; while( ) { # Remove the chomp; #$line = $_; if(/^To:(.*)/) { $tto = "$1"; } elsif(/^From:(.*)/) { $tfrom = "$1"; } else { if(/^Subject:(.*)/) { # Need to support `cli_cmd_string` $tsubject = "$1"; my $a; # This ugly regex is so we can support shell commands filling variables if(($a) = $tsubject =~ /(\`.+\`)/) { ### ### DANGER Will Robinson, DANGER! ### ### Never run this as root, there is no command checking so if ### Someone decides to run `rm -rf /` you are frelled!!! ### my $rside = eval "$a"; # `shell commands` # command input error, abort # I don't know if this being inconsistent or not as I just return # FAIL/SUCCESS for everything else. if( $? ) { # Be careful of using die as no internal error has really occurred print "Error command input: $a (errno $?)\n"; exit $?; } # We'll only accept the first line of a return string and drop everything else $rside =~ s/\r//g; # \r\n becomes just \n ($rside) = split(/\n/, $rside, 2); chomp($rside); # I only clean up the very last newline # Now we need to subsitute the $rside for the $a in $tsubject $tsubject =~ s/(\`.+\`)/$rside/; } } } } if("$to" eq "") { if("$tto" eq "") { # RFC2606 - use example.com for your setup. warn "Change me! root\@localhost"; $to = "root\@localhost"; } else { $to = "$tto"; # from the rc file } } if("$from" eq "") { if("$tfrom" eq "") { # RFC2606 - use example.com for your setup. warn "Change me! Example Webmaster "; $from = "Example Webmaster "; } else { $from = "$tfrom"; } } if("$subject" eq "") { if("$tsubject" eq "") { $subject = "Linkcheck - " . `date "+%b %d,%Y"`; # Jan 01,2007 } else { $subject = "$tsubject"; } chomp($subject); } close(RCFile); # ---------------------------------------------------------------------------- $ua = init_browser( ); if(!$base64) { print qq(To: $to From:$from Subject:$subject Content-Type: text/html; charset=ISO-8859-1 Content-Transfer-Encoding: 7bit MIME-Version: 1.0 ); } else { print qq(To: $to From:$from Subject:$subject Content-Type: text/html; charset=ISO-8859-1 Content-Transfer-Encoding: base64 Content-ID: <00Perl_ColorLinkChecker3> X-Attachment-Id: 0.1 Content-Disposition: inline ); } $output .= qq( Test email ); # This is an html message $output .= "
\n"; open IniFile, "$linkrc" or die "Can't find file: $linkrc ($!)\n"; # Read the rc file and get the links I want checked. Those links are the web # pages with the URL's I need checked. Understand that we won't leave the # page we are checking. So if you need to check the links on another page # put it in the rc file to be checked and we'll go through the list one by # one. while( ) { # Remove the chop; # I'll be making a change here, the new file format is # http://blah/ Filename.ext (no space between and # anything else!) $1 = $doc_name & $2 = Filename # The filename is the first file describing the condition of the # URL ($doc_name) $doc_url = $_; # Ignore lines that start with # or space # # and blank lines. Actually if it doens't start with an http: # then ignore it. if(!($doc_url =~ /^http/i)) { next; } else { @dc = split("/", $doc_url); $t = pop(@dc); print STDERR "Checking: $t ($doc_url)\n" if($dbgme); $output .= qq(

Checking: $t

$doc_url\n

); } # Zero out every thing $Total = $L200 = $L400 = $L500 = $Lother = $NotChecked = $LEmpty = 0; %L4 = ("", ""); %L5 = ("", ""); # Empty the Hashes # Get the page whose links we want to check: my $response = $ua->get($doc_url); if($response->is_success) { # Parsing of undecoded UTF-8 will give garbage when decoding entities at # /usr/lib/perl5/vendor_perl/5.8.8/LWP/Protocol.pm line 114, line 20. # if ($response->header('Content-Type') && $response->header('Content-Type') =~ m/charset=(\S+),/xms) { my $encoding = "$1"; print "Encoding: $encoding\n" if ($dbgme); $document = $response->content(Encode::decode($encoding, $response->content)); #$document = $response->content; } else { $document = $response->content; # get the Web page contents } $doc_url = $response->base(); #$ua = LWP::UserAgent->new; # Remove all the line breaks and extra spaces. This makes it easier to # display the links. Especially if they've been broken over 2 lines # the order is important do not reverse these 2 lines $document =~ s/\n/ /g; # replace newline with space $document =~ s/\s{2,}/ /g; # replace multiple spaces with 1 # Parse the document my $parser = HTML::TokeParser->new(\$document); # Grab the title for reference if($document =~ m{(.*?)}i) { $Title = $1; } else { $Title =" *** Unknown ***"; } # Search for href (URL) and description text (let the parser handle it) # ARGH! while(my $token = $parser->get_tag("a")){ # This handles any of the has a since we're # not looking for it anyway. # We're ignoring name= anchors, is name= & href= legal? that will hose # this up if(!($token->[1]{name})) { my $text = $parser->get_trimmed_text("/a"); if($text ne "") { my $url = $token->[1]{href} || "-"; print STDERR "URL: $url\n" if($dbgme); #print "Text is: $text\n"; if($url ne "-") { $Total++; my $absolute_url = absolutize($url, $doc_url); check_url($absolute_url, $text); } else { $LEmpty++; $output .= sprintf "(XXX) %s A missing href\n", $text; } } else { if(($n =$token->[1]{name})) { $output .= " *** NAME = $n (Oops)\n"; } else { $LEmpty++; $output .= sprintf "(XXX) A missing href and text.\n"; } } } } $output .= "/==============================================================================\n"; $output .= "* Title: $Title\n"; $output .= "* Link: $doc_url\n\n"; $output .= "* ============ =======\n"; $output .= "* Total: $Total\n"; $output .= "* 200's: $L200\n"; $output .= "* 400's: $L400\n"; $output .= "* 500's: $L500\n"; $output .= "* Other: $Lother\n"; $output .= "* Not checked: $NotChecked\n"; $output .= "* Empty: $LEmpty\n"; $output .= "* ============ =======\n\n"; if($L400 > 0) { $output .= "* -> 4xx list\n"; while(($lnk, $code) = each(%L4)) { $output .= qq(* -> ( $code ) $lnk\n); } $output .= "*\n"; } if($L500 > 0) { $output .= "* -> 5xx list\n"; while(($lnk, $code) = each(%L5)) { if($lnk) { $output .= qq(* -> ( $code ) $lnk\n); } } $output .= "*\n"; } $output .= "
"; } else { $output .= "Couldn't get $doc_url: " . $response->status_line . "\n"; } } $done = `date`; chomp($done); $output .= qq(
Started:\t${start}\nCompleted:\t${done}

\n); $output .= qq(--
Author of Linkcheck: ${Author}
Message Generated by Linkcheck v${Version}
); if($base64) { $output = encode_base64($output); } print "$output\n";