#!/usr/bin/perl -w # Simple enough program, it goes out and checks my web page for the # return code of the links (200/404/500 etc.) and reports back that # info. I should make this accept URL's from the command line. And # I should read links from a file (URL's of pages to check the links # on. Lastly I need to work on outputing the data to a web page so # it can be used within MisterHouse. #[ linkcheck.pl ]############################################################# # 05/28/00 I've posted this to my web pages, doubt many will use it but # one never knows. (njc) V 0.1 # 05/28/00 I've changed the code to read from the file ~/.linkcheckrc # It has a simple format. Put the link at the start of the line # starting with http://blah/blah/blah and it will check that file # anything else will be ignored # 09/05/04 Yeah, I'm still using the program. Seems to work pretty good but I # need to handle strings like this: # http://www.wolfgang.uucp/~njc/Personal/athome/common/Coffee/Coffee-1.html%22%3E%3CIMG%20SRC=%22next.gif%22%20ALT=%22Next # 09/06/04 Well I've added the lines to convert n and %22 to their ASCII # equivalents and I've also fixed the problem where the URL text # contained valid html commands such as Next # I don't know if I have taken care of thing such as # use LWP; use HTTP::Request; use HTTP::Response; use Env qw( HOME ); # File .linkcheckrc format is real simple. If it doesn't start with # http then it's ignored. Later I may add support for ftp. my $start; my $done; $start = `date`; my $doc_file = "$HOME/.linkcheckrc"; my $doc_url; my $document; my $browser; my $ua; my ($Total, $L200, $L400, $L500, $Lother, $NotChecked); sub absolutize { my($url, $base) = @_; use URI; return URI->new_abs($url, $base)->canonical; } sub init_browser { $browser = LWP::UserAgent->new; # ...And any other initialization we might need to do... return $browser; } sub convert { $arg = $_[0]; # Convert %22 to ascii value " (double quote) $arg =~ s/%([A-Fa-f0-9]{2})/pack("c",hex($1))/ge; # Convert n to ascii value n $arg =~ s/&\#([0-9]{3})\;/pack("c",$1)/ge; return $arg; } sub check_url { my ($req, $resp, $arg, $str, $code); $arg = $_[0]; $str = $_[1]; # Only check http for now, I've got mailto & ftp on my web page # but I have written anything to check them yet. Really I # don't care about mailto and I need to figure out how to # handle ftp. I don't use news or gopher if( $arg =~ /http/i ) { $arg = convert($arg); # Convert r & %22 to ASCII $req = HTTP::Request->new(GET,$arg); $resp = $ua->request($req); # we should check the return code, I've noticed that # I get a lot of 500's (47 out of 364) but the link # works. Maybe I need to setup the browser description # better. $code = $resp->code(); if ($code =~ /2../) { $L200++; } elsif($code =~ /4../) { # Append link to the end of the @L4 list $L4{$arg} = $code; $L400++; } elsif($code =~ /5../) { # Append link to the end of the @L5 list $L5{$arg} = $code; $L500++; } else { $Lother++; # Append link to the end of the @Lo list } printf "(%s) %s [ %s ]\n", $code, $arg, $str; } else { $NotChecked++; printf "(000) %s [ %s ]\n", $arg, convert($str); } } # # Lot of changes need to occur here! # # 1) Read initial doc_url from a file (can be more than 1, foreach loop) # Format: http//URL/ Description # 2) Output results to a directory in a web page format, a summary page # should be the initial page. # init_browser( ); ############################################################################## # Here it starts print "\n"; open IniFile, "$doc_file" or die "Can't find file: $doc_file ($!)\n"; while( ) { # Remove the chop; # I'll be making a change here, the new file format is # http://blah/ Filename.ext (no space between and # anything else!) $1 = $doc_name & $2 = Filename # The filename is the first file describing the condition of the # URL ($doc_name) $doc_url = $_; # Ignore lines that start with # or space # # and blank lines if(!($doc_url =~ /^http/i)) { next; } else { print "Checking: $doc_url\n\n"; } # Get the page whose links we want to check: my $response = $browser->get($doc_url); warn "Couldn't get $doc_url: ", $response->status_line unless $response->is_success; $document = $response->content; $doc_url = $response->base(); # In case we need to resolve relative URLs later $| = 1; # Turn on Autoflush (doesn't turn off buffering) $ua = LWP::UserAgent->new; $Total = $L200 = $L400 = $L500 = $Lother = $NotChecked = 0; # the order is important do not reverse these 2 lines $document =~ s/\n/ /g; # replace newline with space $document =~ s/\s{2,}/ /g; # replace multiple spaces with 1 # Grab the title for reference if($document =~ m{(.*?)}i) { $Title = $1; } else { $Title =" *** Unknown ***"; } # Search for href (URL) and description text >blah blah # hmm, href = "http:// ..." isn't caught by this regex (???) # note the spaces (???) # Also this gets hosed: # Next # into this: # http://www.dmc.uucp/~njc/Personal/athome/common/Coffee/Coffee-1.html%22%3E%3CIMG%20SRC=%22next.gif%22%20ALT=%22Next # http://www.wolfgang.uucp/~njc/Personal/athome/common/Coffee/Coffee-1.html">Next # ARGH! # New problem: is valid. So what do I do? while ($document =~ m||gi) { #}"){ # This is good enough $Total++; # $1 will be the URL # $2 is text between ... # # URL">Text # /----------- URL # V /--- Text if( $1 =~ m|(.*?)">(.*)|s) { #" split into URL & text if(defined($2)) { $str = $2; } else { $str = "?????"; # something went wrong } } my $absolute_url = absolutize($1, $doc_url); check_url($absolute_url, $str); } print "\n"; print "Title: $Title\n"; print "Link: $doc_url\n\n"; print "============ =======\n"; print "Total: $Total\n"; print "200's: $L200\n"; print "400's: $L400\n"; print "500's: $L500\n"; print "Other: $Lother\n"; print "Not checked: $NotChecked\n"; print "============ =======\n\n"; } if($L400 > 0) { print "4xx list\n"; while(($lnk, $code) = each(%L4)) { print "( $code ) $lnk\n"; } print "\n"; } if($L500 > 0) { print "5xx list\n"; while(($lnk, $code) = each(%L5)) { print "( $code ) $lnk\n"; } print "\n"; } $done = `date`; print "Started:\t${start}Completed:\t${done}";