#!/usr/bin/perl -w
# Simple enough program, it goes out and checks my web page for the
# return code of the links (200/404/500 etc.) and reports back that
# info. I should make this accept URL's from the command line. And
# I should read links from a file (URL's of pages to check the links
# on. Lastly I need to work on outputing the data to a web page so
# it can be used within MisterHouse.
#[ linkcheck.pl ]#############################################################
# 05/28/00 I've posted this to my web pages, doubt many will use it but
# one never knows. (njc) V 0.1
# 05/28/00 I've changed the code to read from the file ~/.linkcheckrc
# It has a simple format. Put the link at the start of the line
# starting with http://blah/blah/blah and it will check that file
# anything else will be ignored
# 09/05/04 Yeah, I'm still using the program. Seems to work pretty good but I
# need to handle strings like this:
# http://www.wolfgang.uucp/~njc/Personal/athome/common/Coffee/Coffee-1.html%22%3E%3CIMG%20SRC=%22next.gif%22%20ALT=%22Next
# 09/06/04 Well I've added the lines to convert n and %22 to their ASCII
# equivalents and I've also fixed the problem where the URL text
# contained valid html commands such as
# I don't know if I have taken care of thing such as
#
use LWP;
use HTTP::Request;
use HTTP::Response;
use Env qw( HOME );
# File .linkcheckrc format is real simple. If it doesn't start with
# http then it's ignored. Later I may add support for ftp.
my $start;
my $done;
$start = `date`;
my $doc_file = "$HOME/.linkcheckrc";
my $doc_url;
my $document;
my $browser;
my $ua;
my ($Total, $L200, $L400, $L500, $Lother, $NotChecked);
sub absolutize {
my($url, $base) = @_;
use URI;
return URI->new_abs($url, $base)->canonical;
}
sub init_browser {
$browser = LWP::UserAgent->new;
# ...And any other initialization we might need to do...
return $browser;
}
sub convert {
$arg = $_[0];
# Convert %22 to ascii value " (double quote)
$arg =~ s/%([A-Fa-f0-9]{2})/pack("c",hex($1))/ge;
# Convert n to ascii value n
$arg =~ s/&\#([0-9]{3})\;/pack("c",$1)/ge;
return $arg;
}
sub check_url {
my ($req, $resp, $arg, $str, $code);
$arg = $_[0];
$str = $_[1];
# Only check http for now, I've got mailto & ftp on my web page
# but I have written anything to check them yet. Really I
# don't care about mailto and I need to figure out how to
# handle ftp. I don't use news or gopher
if( $arg =~ /http/i ) {
$arg = convert($arg); # Convert r & %22 to ASCII
$req = HTTP::Request->new(GET,$arg);
$resp = $ua->request($req);
# we should check the return code, I've noticed that
# I get a lot of 500's (47 out of 364) but the link
# works. Maybe I need to setup the browser description
# better.
$code = $resp->code();
if ($code =~ /2../) {
$L200++;
} elsif($code =~ /4../) {
# Append link to the end of the @L4 list
$L4{$arg} = $code;
$L400++;
} elsif($code =~ /5../) {
# Append link to the end of the @L5 list
$L5{$arg} = $code;
$L500++;
} else {
$Lother++;
# Append link to the end of the @Lo list
}
printf "(%s) %s [ %s ]\n", $code, $arg, $str;
} else {
$NotChecked++;
printf "(000) %s [ %s ]\n", $arg, convert($str);
}
}
#
# Lot of changes need to occur here!
#
# 1) Read initial doc_url from a file (can be more than 1, foreach loop)
# Format: http//URL/ Description
# 2) Output results to a directory in a web page format, a summary page
# should be the initial page.
#
init_browser( );
##############################################################################
# Here it starts
print "\n";
open IniFile, "$doc_file" or die "Can't find file: $doc_file ($!)\n";
while( ) {
# Remove the
chop;
# I'll be making a change here, the new file format is
# http://blah/ Filename.ext (no space between and
# anything else!) $1 = $doc_name & $2 = Filename
# The filename is the first file describing the condition of the
# URL ($doc_name)
$doc_url = $_;
# Ignore lines that start with # or space #
# and blank lines
if(!($doc_url =~ /^http/i)) {
next;
} else {
print "Checking: $doc_url\n\n";
}
# Get the page whose links we want to check:
my $response = $browser->get($doc_url);
warn "Couldn't get $doc_url: ", $response->status_line
unless $response->is_success;
$document = $response->content;
$doc_url = $response->base();
# In case we need to resolve relative URLs later
$| = 1; # Turn on Autoflush (doesn't turn off buffering)
$ua = LWP::UserAgent->new;
$Total = $L200 = $L400 = $L500 = $Lother = $NotChecked = 0;
# the order is important do not reverse these 2 lines
$document =~ s/\n/ /g; # replace newline with space
$document =~ s/\s{2,}/ /g; # replace multiple spaces with 1
# Grab the title for reference
if($document =~ m{(.*?)}i) {
$Title = $1;
} else {
$Title =" *** Unknown ***";
}
# Search for href (URL) and description text >blah blah
# hmm, href = "http:// ..." isn't caught by this regex (???)
# note the spaces (???)
# Also this gets hosed:
#
# into this:
# http://www.dmc.uucp/~njc/Personal/athome/common/Coffee/Coffee-1.html%22%3E%3CIMG%20SRC=%22next.gif%22%20ALT=%22Next
# http://www.wolfgang.uucp/~njc/Personal/athome/common/Coffee/Coffee-1.html">
# ARGH!
# New problem: is valid. So what do I do?
while ($document =~ m||gi) { #}"){ # This is good enough
$Total++;
# $1 will be the URL
# $2 is text between ...
#
# URL">Text
# /----------- URL
# V /--- Text
if( $1 =~ m|(.*?)">(.*)|s) { #" split into URL & text
if(defined($2)) {
$str = $2;
} else {
$str = "?????"; # something went wrong
}
}
my $absolute_url = absolutize($1, $doc_url);
check_url($absolute_url, $str);
}
print "\n";
print "Title: $Title\n";
print "Link: $doc_url\n\n";
print "============ =======\n";
print "Total: $Total\n";
print "200's: $L200\n";
print "400's: $L400\n";
print "500's: $L500\n";
print "Other: $Lother\n";
print "Not checked: $NotChecked\n";
print "============ =======\n\n";
}
if($L400 > 0) {
print "4xx list\n";
while(($lnk, $code) = each(%L4)) {
print "( $code ) $lnk\n";
}
print "\n";
}
if($L500 > 0) {
print "5xx list\n";
while(($lnk, $code) = each(%L5)) {
print "( $code ) $lnk\n";
}
print "\n";
}
$done = `date`;
print "Started:\t${start}Completed:\t${done}";