LJ 40: A Web Crawler in Perl

#! /usr/bin/perl
#
# spider.pl   Set tabstops to 3.
#

# Unbuffer STDOUT so matches and progress lines appear as soon as they are
# printed.
$| = 1;
# 0=no debug, 1=display progress, 2=complete dump
$DEBUG = 0;
# Check hyperlinks to other hosts?  "on" queues them as well; any other
# value keeps the crawl on the host of the page being scanned.
$SPANHOSTS = "off";

if(scalar(@ARGV) < 2){
	print "Usage: $0 <fully-qualified-URL> <search-phrase>\n";
	exit 1;
}

# Initialize.
# %URLqueue maps each URL seen so far to its hit count; a count of 0 marks
# a URL that is queued but has not been visited yet.
%URLqueue = ();
# chomp rather than chop: only a trailing newline is removed, so the host
# name is not truncated if the command's output does not end in one.
chomp($client_host=`hostname`);
$been = 0;
$search_phrase = $ARGV[1];

# Load the queue with the first URL to hit.
$URLqueue{$ARGV[0]} = 0;
$thisURL = &find_new(%URLqueue);

# Main crawl loop: keep going while there is a URL in the queue which we
# haven't looked at.  find_new() returns "" once every queued URL has a
# non-zero hit count, which ends the crawl.
while($thisURL ne ""){

	# Progress report.  Every URL seen so far is a key of %URLqueue, so
	# the number still to visit is the key count minus the visited count.
	if($DEBUG>=1){
		my $count = scalar(keys(%URLqueue));
		print "-----------------------------------------\n";
		printf("Been: %d  To Go: %d\n", $been, $count-$been);
		print "Current URL: $thisURL\n";
	}
	&dump_stack() if($DEBUG>=2);

	# Split the protocol from the URL.  A URL without a "scheme:" prefix
	# leaves the protocol empty and is reported as ignored below.
	my($protocol, $rest) = $thisURL =~ m|^([^:/]*):(.*)$|;
	$protocol = "" if(!defined($protocol));

	# If the protocol is http, fetch the page and process it.
	if($protocol eq "http"){

		# Split out the hostname, port and document.
		my($server_host, $port, $document) =
			$rest =~ m|^//([^:/]*):*([0-9]*)/*([^:]*)$|;

		# Get the page of text and remove CR/LF characters and HTML
		# comments from it.  The page is a single line once CR/LF are
		# gone, so the non-greedy ".*?" also removes comments that
		# contain '>' (e.g. commented-out tags and their links).
		my $page_text = &get_http($client_host, $server_host, $port,
			$document);
		$page_text =~ tr/\r\n//d;
		$page_text =~ s|<!--.*?-->||g;

		# Report if our search string is found here.  The phrase is
		# interpolated into the pattern, so it is matched as a
		# case-insensitive Perl regular expression.
		if($page_text =~ m|$search_phrase|i){
			print "$thisURL\n";
		}

		# Find anchors in the HTML and update our list of URLs.
		# Only double-quoted HREF values are recognised.
		my(@anchors) = $page_text =~ m|<A[^>]*HREF\s*=\s*"([^">]*)"|gi;
		foreach my $anchor (@anchors){
			my $newURL = &fqURL($thisURL, $anchor);
			if($URLqueue{$newURL} > 0){

				# Increment the count for URLs we've already
				# checked out.
				$URLqueue{$newURL}++;

			}else{

				# Add a zero record for URLs we haven't
				# encountered.
				# Optionally, ignore URLs which point to other
				# hosts.
				my($new_host) =
					$newURL =~ m|^[^:/]*:/*([^/:]*):*[0-9]*/*[^:]*$|;
				if($SPANHOSTS eq "on" || $new_host eq $server_host){
					$URLqueue{$newURL}=0;
				}
			}
		}
	}else{
		print "Protocol '$protocol' ignored.\n" if($DEBUG>=1);
	}

	# Record the fact that we've been here, and get a new URL to process.
	$URLqueue{$thisURL} ++;
	$been ++;
	$thisURL = &find_new(%URLqueue);

}
exit;

#--------------------------------------------------------------
# Build a fully specified URL.
#
# Parameters:
#   $thisURL  fully qualified URL of the page the link was found in.
#   $anchor   the HREF value as written in that page.
#
# Returns the fully qualified URL the anchor refers to.  "." and ".."
# path segments are not collapsed.
#--------------------------------------------------------------
sub fqURL
{
	my($thisURL, $anchor) = @_;
	my($newURL);

	# Strip anything following a number sign '#', because it's
	# just a reference to a position within a page.
	$anchor =~ s|#.*||s;

	if($anchor eq ""){

		# Nothing left (e.g. HREF="#top"): the link points back into the
		# current document.
		$newURL = $thisURL;

	}
	elsif($anchor =~ m|^[^/:]+:|){

		# If protocol specified, assume anchor is fully qualified.
		$newURL = $anchor;

	}
	elsif($anchor =~ m|^//|){

		# "//host/path" names its own host and only inherits the protocol.
		my($currprot) = $thisURL =~ m|^([^:/]*):|;
		$newURL = $currprot . ":" . $anchor;

	}
	elsif($anchor =~ m|^/|){

		# If document has a leading slash, it just needs protocol and host.
		# The host part is captured up to the next slash so that an
		# explicit ":port" is carried over.
		my($currprot, $currhost) = $thisURL =~ m|^([^:/]*):/+([^/]*)|;
		$newURL = $currprot . "://" . $currhost . $anchor;

	}
	else{

		# Anchor must be just relative pathname, so it replaces the last
		# path segment of the current URL.  A URL with no path at all
		# ("http://host") first gets the root path, otherwise the slash
		# removed below would be one of the slashes in "://".
		my $base = $thisURL;
		$base .= "/" if($base =~ m|^[^:/]*:/+[^/]*$|);
		($newURL) = $base =~ m|^(.*)/[^/]*$|;
		$newURL .= "/" if (! ($newURL =~ m|/$|));
		$newURL .= $anchor;

	}
	if($DEBUG >=2){
		print "Link Found\n   In:$thisURL\n   Anchor:$anchor\n   Result: $newURL\n"
	}
	return $newURL;
}

#---------------------------------------------------------------
# Scan the URL queue for an entry that has not been visited yet.
#
# The queue arrives flattened into a key/value list (URL, hit count,
# URL, hit count, ...).  Returns one URL whose hit count is 0, or ""
# when there is none.
#---------------------------------------------------------------
sub find_new
{
	# Rebuild a private copy of the queue from the flattened list.
	my(%queue) = @_;

	foreach my $url (keys(%queue)){
		if($queue{$url} == 0){
			return $url;
		}
	}

	# Nothing left to visit.
	return "";
}

#-------------------------------------------------------------------
# Debugging utility: list the visited and the still-pending URLs held
# in the global %URLqueue, then wait for a key.  'Q' or 'q' ends the
# program with exit status 1; anything else lets the caller continue.
#-------------------------------------------------------------------
sub dump_stack
{
	my($visited, $pending) = ("", "");
	my($reply);

	# A hit count of 0 marks a URL that has not been fetched yet.
	foreach my $url (keys(%URLqueue)){
		my $hits = $URLqueue{$url};
		if($hits != 0){
			$visited .= "  " . $url . " (hitcount = " . $hits . ")\n";
		}else{
			$pending .= "  " . $url . "\n";
		}
	}

	print "Been There:\n" . $visited;
	print "To Go:\n" . $pending;
	print "------- Hit Q to Quit, Enter to Continue -------\n";

	# Only a single character is consumed from STDIN.
	read(STDIN, $reply, 1);
	if($reply eq 'Q' || $reply eq 'q'){
		exit(1);
	}
}

#-------------------------------------------------------------------------
# Get the page indicated by the $server_host and $document parameters.
#
# Parameters:
#   $client_host  name of the local host.  Still accepted so existing
#                 callers keep working, but not used: the socket is never
#                 bound to a particular local address.
#   $server_host  name of the web server to contact.
#   $port         TCP port; 0 or "" selects the default http port, 80.
#   $document     document path without its leading slash.
#
# Returns the response body as one string.  Returns "" when the host
# cannot be resolved, the socket cannot be created or connected, or the
# response's Content-type is neither text/html nor text/plain.
#-------------------------------------------------------------------------
sub get_http
{
	my($client_host, $server_host, $port, $document) = @_;
	my($proto, $thataddr, $that, $sock, $old_fh, $host_header);
	my($page, $header, $header_text, $content) = ("", 1, "", "");

	# The core Socket module supplies the address family / socket type
	# constants and packs the socket address in the layout of the local
	# platform, instead of relying on hard-coded numbers and a fixed
	# pack() template.
	require Socket;

	# Use default http port if none specified.
	$port = 80 if(!$port || $port == 0);

	# Get the protocol number for TCP.
	$proto = (getprotobyname("tcp"))[2];

	# Get the IP address of the server and check we could resolve it.
	$thataddr = (gethostbyname($server_host))[4]
		if(defined($server_host) && $server_host ne "");
	if(!defined($thataddr) || length($thataddr) != 4){
		print "ERROR: Unknown host $server_host.\n";
		return "";
	}
	print "Server: $server_host (" . join(".", unpack('C4', $thataddr))
		. ")\n" if($DEBUG>=2);

	# Pack the port and the (already packed) IP address into a socket
	# address.
	$that = Socket::pack_sockaddr_in($port, $thataddr);

	# Create the socket and connect.
	if(!socket($sock, Socket::PF_INET(), Socket::SOCK_STREAM(), $proto)){
		print "ERROR: Cannot create socket.\n";
		return "";
	}
	print "Socket OK\n" if($DEBUG>=2);
	if(!connect($sock, $that)){
		print "ERROR: Cannot connect to server $server_host, port $port.\n";
		close($sock);
		return "";
	}
	print "Connect OK\n" if($DEBUG>=2);

	# Turn buffering in the socket off (restoring whichever handle was
	# selected before), and send the request to the server.  Request
	# lines end in CRLF, and the Host header lets servers that host
	# several sites on one address pick the right one.
	$old_fh = select($sock); $| = 1; select($old_fh);
	$host_header = $server_host;
	$host_header .= ":$port" if($port != 80);
	print $sock "GET /$document HTTP/1.0\015\012",
		"Host: $host_header\015\012",
		"\015\012";

	# Receive the response. Check to ensure the response is of MIME
	# type text/html or text/plain.
	while(defined(my $line = <$sock>)){

		if($header == 1){

			# Check if we've hit the end of the HTTP header (an empty
			# line).  If we have, check for a content-type header line,
			# and ensure it is valid.  Only the first empty line counts;
			# later ones belong to the page.
			if($line =~ m|^[\n\r]*$|){
				$header = 0;

				# Stop at ';' so that a parameter such as
				# "; charset=..." does not become part of the type.
				($content) =
					$header_text =~ m|Content-type:\s*([^\s;]+)|i;
				$content = "" if(!defined($content));
				if(lc($content) ne "text/html"
					&& lc($content) ne "text/plain"){
					print "Content type '$content' ignored.\n"
						if($DEBUG>=1);
					last;
				}
			}
			# Save to a header string if we're still working on the
			# HTTP header.
			else{
				$header_text .= "   " . $line;
			}
		}
		# Otherwise, save to the html page string.
		else{
			$page .= $line;
		}
	}
	close($sock);

	print "HTTP header: \n $header_text" if($DEBUG>=2);
	return $page;
}