#!/usr/bin/perl -w # Simple enough program, it goes out and checks my web page for the # return code of the links (200/404/500 etc.) and reports back that # info. I should make this accept URL's from the command line. And # I should read links from a file (URL's of pages to check the links # on. Lastly I need to work on outputing the data to a web page so # it can be used within MisterHouse. ############################################################################## # 05/28/00 I've posted this to my web pages, doubt many wil use it but # one never knows. (njc) V 0.1 # 05/28/00 I've changed the code to read from the file ~/.linkcheckrc # It has a simple format. Put the link at the start of the line # starting with http://blah/blah/blah and it will check that file # anything else will be ignored use LWP; use HTTP::Request; use HTTP::Response; use Env qw( HOME ); # I should make this an arguement passed from the CLI # BTW, don't bother trying to use this link, replace it with your own. my $doc_url = "http://www.dmc.uucp/~njc/Personal/athome/index.html"; my $doc_file = "$HOME/.linkcheckrc"; my $document; my $browser; my $ua; my ($Total, $L200, $L400, $L500, $Lother, $NotChecked); sub absolutize { my($url, $base) = @_; use URI; return URI->new_abs($url, $base)->canonical; } sub init_browser { $browser = LWP::UserAgent->new; # ...And any other initialization we might need to do... return $browser; } sub check_url { my ($req, $resp, $arg, $str, $code); $arg = $_[0]; $str = $_[1]; # Only check http for now, I've got mailto & ftp on my web page # but I have written anything to check them yet. Really I # don't care about mailto and I need to figure out how to # handle ftp. I don't use news or gopher if( $arg =~ /http/i ) { $req = HTTP::Request->new(GET,$arg); $resp = $ua->request($req); # we should check the return code, I've noticed that # I get a lot of 500's (47 out of 364) but the link # works. Maybe I need to setup the browser description # better. $code = $resp->code(); printf "(%s) %s [ %s ]\n", $code, $arg, $str; if ($code =~ /2../) { $L200++; } elsif($code =~ /4../) { $L400++; } elsif($code =~ /5../) { $L500++; } else { $Lother++; } } else { $NotChecked++; printf "(000) %s [ %s ]\n", $arg, $str; } } # # Lot of changes need to occur here! # # 1) Read initial doc_url from a file (can be more than 1, foreach loop) # Format: http//URL/ Description # 2) Output results to a directory in a web page format, a summary page # should be the initial page. # init_browser( ); ############################################################################## # Here it starts print "\n"; open IniFile, "$doc_file" or die "Can't find file: $doc_file ($!)\n"; while( ) { # Remove the chop; $doc_url = $_; # Ignore lines that start with # or space # # and blank lines if(!($doc_url =~ /^http/i)) { next; } else { print "Checking: $doc_url\n\n"; } # Get the page whose links we want to check: my $response = $browser->get($doc_url); warn "Couldn't get $doc_url: ", $response->status_line unless $response->is_success; $document = $response->content; $doc_url = $response->base(); # In case we need to resolve relative URLs later $| = 1; # Turn on Autoflush (doesn't turn off buffering) $ua = LWP::UserAgent->new; $Total = $L200 = $L400 = $L500 = $Lother = $NotChecked = 0; # the order is important do not reverse these 2 lines $document =~ s/\n/ /g; # replace newline with space $document =~ s/\s{2,}/ /g; # replace multiple spaces with 1 # Grab the title for reference if($document =~ m{(.*?)}i) { $Title = $1; } else { $Title =" *** Unknown ***"; } # Search for href (URL) and description text >blah blah # hmm, href = "http:// ..." isn't caught by this regex (???) # note the spaces (???) # Also this gets hosed: # Next # into this: # http://www.dmc.uucp/~njc/Personal/athome/common/Coffee/Coffee-1.html%22%3E%3CIMG%20SRC=%22next.gif%22%20ALT=%22Next # ARGH! while ($document =~ m||gi) { #}"){ $Total++; # $1 will be the URL # $2 is text between ... if($1 =~ m{(.*)">(.*)}){ #}"}) { # split into URL & text if(defined($2)) { $str = $2; } else { $str = "?????"; # something went wrong } } my $absolute_url = absolutize($1, $doc_url); check_url($absolute_url, $str); } print "\n"; print "Title: $Title\n"; print "Link: $doc_url\n\n"; print "============ =======\n"; print "Total: $Total\n"; print "200's: $L200\n"; print "400's: $L400\n"; print "500's: $L500\n"; print "Other: $Lother\n"; print "Not checked: $NotChecked\n"; print "============ =======\n\n"; }