(.*?)

#!/usr/bin/perl -w

# Simple enough program, it goes out and checks my web page for the
# return code of the links (200/404/500 etc.) and reports back that
# info. I should make this accept URL's from the command line. And
# I should read links from a file (URL's of pages to check the links
# on. Lastly I need to work on outputing the data to a web page so
# it can be used within MisterHouse.

#[ linkcheck.pl ]#############################################################

# mm/dd/yy Comment
# 05/28/00 I've posted this to my web pages, doubt many will use it but
#          one never knows. (njc) V 0.1

# 05/28/00 I've changed the code to read from the file ~/.linkcheckrc
#          It has a simple format. Put the link at the start of the line
#          starting with http://blah/blah/blah and it will check that file
#          anything else will be ignored
# 09/05/04 Yeah, I'm still using the program. Seems to work pretty good but I
#          need to handle strings like this:
#          http://www.wolfgang.uucp/~njc/Personal/athome/common/Coffee/Coffee-1.html%22%3E%3CIMG%20SRC=%22next.gif%22%20ALT=%22Next
# 09/06/04 Well I've added the lines to convert &#110; and %22 to their ASCII
#          equivalents and I've also fixed the problem where the URL text
#          contained valid html commands such as <IMG SRC="next.gif" ALT="Next">
#          I don't know if I have taken care of thing such as
#          <A TITLE="blah blah" href="url" xxx="">
# 10/25/10 Dave Alger has suggested a change to line 201 (from (.*?)">(.*)|s)
#          to (.*?)".*?>(.*)
use LWP;
use HTTP::Request;
use HTTP::Response;

use Env qw( HOME );

# File .linkcheckrc format is real simple. If it doesn't start with
# http then it's ignored. Later I may add support for ftp.

my $start;
my $done;

$start = `date`;

my $doc_file = "$HOME/.linkcheckrc";
my $doc_url;
my $document;
my $browser;
my $ua;

my ($Total, $L200, $L400, $L500, $Lother, $NotChecked);

sub absolutize {
  my($url, $base) = @_;
  use URI;
  return URI->new_abs($url, $base)->canonical;
}

sub init_browser {
    $browser = LWP::UserAgent->new;
    # ...And any other initialization we might need to do...
    return $browser;
}

sub convert {
    $arg = $_[0];

    # Convert %22 to ascii value " (double quote)
    $arg =~ s/%([A-Fa-f0-9]{2})/pack("c",hex($1))/ge;
    # Convert &#110; to ascii value n
    $arg =~ s/&\#([0-9]{3})\;/pack("c",$1)/ge;

    return $arg;
}

sub check_url {
    my ($req, $resp, $arg, $str, $code);

    $arg = $_[0];
    $str = $_[1];

    # Only check http for now, I've got mailto & ftp on my web page
    # but I have written anything to check them yet. Really I
    # don't care about mailto and I need to figure out how to
    # handle ftp. I don't use news or gopher
    if( $arg =~ /http/i ) {

	$arg = convert($arg); # Convert &#114; & %22 to ASCII

	$req = HTTP::Request->new(GET,$arg);
	$resp = $ua->request($req);

	# we should check the return code, I've noticed that
	# I get a lot of 500's (47 out of 364) but the link
	# works. Maybe I need to setup the browser description
	# better.

	$code = $resp->code();

	if     ($code =~ /2../) {
	    $L200++;
	} elsif($code =~ /4../) {
	    # Append link to the end of the @L4 list
	    $L4{$arg} = $code;
	    $L400++;
	} elsif($code =~ /5../) {
	    # Append link to the end of the @L5 list
	    $L5{$arg} = $code;
	    $L500++;
	} else {
	    $Lother++;
	    # Append link to the end of the @Lo list
	}

	printf "(%s) %s [ %s ]\n", $code, $arg, $str;
    } else {
	$NotChecked++;
	printf "(000) %s [ %s ]\n", $arg, convert($str);
    }
}

#
# Lot of changes need to occur here!
#
# 1) Read initial doc_url from a file (can be more than 1, foreach loop)
#    Format: http//URL/ <TAB> Description <NL>
# 2) Output results to a directory in a web page format, a summary page
#    should be the initial page.
#

init_browser( );

##############################################################################
# Here it starts

print "\n";

open IniFile, "$doc_file" or die "Can't find file: $doc_file ($!)\n";

while( <IniFile> ) {
    # Remove the <NL>
    chop;

    # I'll be making a change here, the new file format is
    # http://blah/ <TAB> Filename.ext<NL> (no space between <TAB> and
    # anything else!) $1 = $doc_name & $2 = Filename
    # The filename is the first file describing the condition of the
    # URL ($doc_name)

    $doc_url = $_;

    # Ignore lines that start with # or space #
    # and blank lines
    if(!($doc_url =~ /^http/i)) {
	next;
    } else {
	print "Checking: $doc_url\n\n";
    }

    # Get the page whose links we want to check:
    my $response = $browser->get($doc_url);
    warn "Couldn't get $doc_url: ", $response->status_line
	unless $response->is_success;

    $document = $response->content;
    $doc_url  = $response->base();
    # In case we need to resolve relative URLs later

    $| = 1; # Turn on Autoflush (doesn't turn off buffering)
    $ua = LWP::UserAgent->new;

    $Total = $L200 = $L400 = $L500 = $Lother = $NotChecked = 0;

    # the order is important do not reverse these 2 lines
    $document =~ s/\n/ /g;     # replace newline with space
    $document =~ s/\s{2,}/ /g; # replace multiple spaces with 1

    # Grab the title for reference
    if($document =~ m{<title>(.*?)</title>}i) {
	$Title = $1;
    } else {
	$Title =" *** Unknown ***";
    }

    # Search for href (URL) and  description text >blah blah</A>
    # hmm, href = "http:// ..." isn't caught by this regex (???)
    # note the spaces (???)
    # Also this gets hosed:
    # <A HREF="Coffee-1.html"><IMG SRC="next.gif" ALT="Next"></A>
    # into this:
    # http://www.dmc.uucp/~njc/Personal/athome/common/Coffee/Coffee-1.html%22%3E%3CIMG%20SRC=%22next.gif%22%20ALT=%22Next
    # http://www.wolfgang.uucp/~njc/Personal/athome/common/Coffee/Coffee-1.html"><IMG SRC="next.gif" ALT="Next">
    # ARGH!
    # New problem: <A TITLE="blah blah" href="url" xxx=""> is valid. So what do I do?
    while ($document =~ m|<a\s*href\s*=\s*"(.*?)</a>|gi) {	#}"){ # This is good enough
	$Total++;

	# $1 will be the URL
	# $2 is text between <A> ... </A>
	#
	#              URL">Text
	#            /----------- URL
	#            V       /--- Text
#	if( $1 =~ m|(.*?)">(.*)|s) { #" split into URL & text 
	#              URL">Text
	#            /----------- URL
	#            V          /--- Text
	if( $1 =~ m|(.*?)".*?>(.*) { #" split into URL & text 
	    if(defined($2)) {
		$str = $2;
	    } else {
		$str = "?????"; # something went wrong
	    }
	}

	my $absolute_url = absolutize($1, $doc_url);
	check_url($absolute_url, $str);
    }


    print "\n";
    print "Title: $Title\n";
    print "Link:  $doc_url\n\n";

    print "============ =======\n";
    print "Total:       $Total\n";
    print "200's:       $L200\n";
    print "400's:       $L400\n";
    print "500's:       $L500\n";
    print "Other:       $Lother\n";
    print "Not checked: $NotChecked\n";
    print "============ =======\n\n";

}

if($L400 > 0) {
    print "4xx list\n";
    while(($lnk, $code) = each(%L4)) {
	print "( $code )  $lnk\n";
    }

    print "\n";
}

if($L500 > 0) {
    print "5xx list\n";
    while(($lnk, $code) = each(%L5)) {
	print "( $code )  $lnk\n";
    }

    print "\n";
}

$done = `date`;
print "Started:\t${start}Completed:\t${done}";