#! /usr/bin/perl # # spider.pl Set tabstops to 3. # $| = 1; # 0=no debug, 1=display progress, 2=complete dump $DEBUG = 0; # Check hyperlinks to other hosts? $SPANHOSTS = "off"; if(scalar(@ARGV) < 2){ print "Usage: $0 <fully-qualified-URL> <search-phrase>\n"; exit 1; } # Initialize. %URLqueue = (); chop($client_host=`hostname`); $been = 0; $search_phrase = $ARGV[1]; # Load the queue with the first URL to hit. $URLqueue{$ARGV[0]} = 0; $thisURL = &find_new(%URLqueue); # While there's a URL in our queue which we haven't looked at ... while($thisURL ne ""){ # Progress report. $count = 0; while(($key,$value) = each(%URLqueue)){ $count ++; } print "-----------------------------------------\n" if($DEBUG>=1); printf("Been: %d To Go: %d\n", $been, $count-$been) if($DEBUG>=1); print "Current URL: $thisURL\n" if($DEBUG>=1); &dump_stack() if($DEBUG>=2); # Split the protocol from the URL. ($protocol, $rest) = $thisURL =~ m|^([^:/]*):(.*)$|; # If the protocol is http, fetch the page and process it. if($protocol eq "http"){ # Split out the hostname, port and document. ($server_host, $port, $document) = $rest =~ m|^//([^:/]*):*([0-9]*)/*([^:]*)$|; # Get the page of text and remove CR/LF characters and HTML # comments from it. $page_text = &get_http($client_host, $server_host, $port, $document); $page_text =~ tr/\r\n//d; $page_text =~ s|<!--[^>]*-->||g; # Report if our search string is found here. if($page_text =~ m|$search_phrase|i){ print "$thisURL\n" } # Find anchors in the HTML and update our list of URLs.. (@anchors) = $page_text =~ m|<A[^>]*HREF\s*=\s*"([^ ">]*)"|gi; foreach $anchor (@anchors){ $newURL = &fqURL($thisURL, $anchor); if($URLqueue{$newURL} > 0){ # Increment the count for URLs we've already # checked out. $URLqueue{$newURL}++; }else{ # Add a zero record for URLs we haven't # encountered. # Optionally, ignore URL's which point to other # hosts. ($new_host) = $newURL =~ m|^[^:/]*:/*([^/:]*):*[0-9]*/*[^:]*$|; if($SPANHOSTS eq "on" || $new_host eq $server_host){ $URLqueue{$newURL}=0; } } } }else{ print "Protocol '$protocol' ignored.\n" if($DEBUG>=1); } # Record the fact that we've been here, and get a new URL to process. $URLqueue{$thisURL} ++; $been ++; $thisURL = &find_new(%URLqueue); } exit; #-------------------------------------------------------------- # Build a fully specified URL. #-------------------------------------------------------------- sub fqURL { local($thisURL, $anchor) = @_; local($has_proto, $has_lead_slash, $currprot, $currhost, $newURL); # Strip anything following a number sign '#', because its # just a reference to a position within a page. $anchor =~ s|^.*#[^#]*$|$1|; # Examine anchor to see what parts of the URL are specified. $has_proto = 0; $has_lead_slash=0; $has_proto = 1 if($anchor =~ m|^[^/:]+:|); $has_lead_slash = 1 if ($anchor =~ m|^/|); if($has_proto == 1){ # If protocol specified, assume anchor is fully qualified. $newURL = $anchor; } elsif($has_lead_slash == 1){ # If document has a leading slash, it just needs protocol and host. ($currprot, $currhost) = $thisURL =~ m|^([^:/]*):/+([^:/]*)|; $newURL = $currprot . "://" . $currhost . $anchor; } else{ # Anchor must be just relative pathname, so append it to current URL. ($newURL) = $thisURL =~ m|^(.*)/[^/]*$|; $newURL .= "/" if (! ($newURL =~ m|/$|)); $newURL .= $anchor; } if($DEBUG >=2){ print "Link Found\n In:$thisURL\n Anchor:$anchor\n Result: $newURL\n" } return $newURL; } #--------------------------------------------------------------- # Do a linear search of the URL stack to find a URL with a data # value of 0 (i.e. one we haven't checked out yet). #--------------------------------------------------------------- sub find_new { local(%URLqueue) = @_; local($key, $value); while(($key, $value) = each(%URLqueue)){ return $key if($value == 0); } return ""; } #------------------------------------------------------------------- # Debugging utility. #------------------------------------------------------------------- sub dump_stack { local($key, $x); local($done, $togo) = ("", ""); foreach $key (keys(%URLqueue)){ if($URLqueue{$key} == 0){ $togo .= " " . $key . "\n"; }else{ $done .= " " . $key . " (hitcount = " . $URLqueue{$key} . ")\n"; } } print "Been There:\n" . $done; print "To Go:\n" . $togo; print "------- Hit Q to Quit, Enter to Continue -------\n"; read(STDIN, $key, 1); exit(1) if($key eq 'Q' || $key eq 'q'); } #------------------------------------------------------------------------- # Get the page indicated by the $server_host and $document parameters. #------------------------------------------------------------------------- sub get_http { local($client_host, $server_host, $port, $document) = @_; local($name,$aliases,$type,$len); local($this,$thisaddr,$that,$thataddr); local($client_host, $sockaddr, $a,$b,$c,$d); local($page, $header, $header_text, $content); # Some constants used to access the TCP network. $AF_INET=2; $SOCK_STREAM=1; # Use default http port if none specified. $port = 80 if($port == 0); # Get the protocol number for TCP. ($name,$aliases,$proto)=getprotobyname("tcp"); # Get the IP addresses for the two hosts. ($name,$aliases,$type,$len,$thisaddr) = gethostbyname($client_host); ($name,$aliases,$type,$len,$thataddr) = gethostbyname($server_host); # Check we could resolve the server host name. ($a,$b,$c,$d) = unpack('C4', $thataddr); if($a eq "" && $b eq "" && $c eq "" && $d eq ""){ print "ERROR: Unknown host $server_host.\n"; return ""; } print "Server: $server_host ($a.$b.$c.$d)\n" if($DEBUG>=2); # Pack the AF_INET magic number, the port, and the (already packed) IP # addresses into the same format as the C structure would use. Note # this is architecture dependent: this pack format works for 32 bit # architectures. $sockaddr="S n a4 x8"; $this=pack($sockaddr, $AF_INET, 0, $thisaddr); $that=pack($sockaddr, $AF_INET, $port, $thataddr); # Create the socket and connect. if(socket(S, $AF_INET, $SOCK_STREAM, $proto) == false){ print "ERROR: Cannot create socket.\n"; return ""; } print "Socket OK\n" if($DEBUG>=2); if(connect(S, $that) == false){ print "ERROR: Cannot connect to server $server_host, port $port.\n"; return ""; } print "Connect OK\n" if($DEBUG>>>>>>>>=2); # Turn buffering in the socket off, and send request to the server. select(S); $| = 1; select(STDOUT); print S "GET /$document HTTP/1.0\n\n"; # Receive the response. Check to ensure the response is of MIME # type text/html or text/plain. $page = ""; $header = 1; $header_text = ""; while(<S>){ # Check if we've hit the end of the HTTP header (an empty line). # If we have, check for a content-type header line, and ensure # it is valid. if( m|^[\n\r]*$| ){ $header = 0; ($content) = $header_text =~ m|Content-type: (\S+)|i; if($content ne "text/html" && $content ne "text/plain"){ print "Content type '$content' ignored.\n" if($DEBUG>=1); last; } } # Save to a header string if we're still working on the HTTP # header. elsif($header == 1){ $header_text .= " " . $_; } # Otherwise, save to the html page string. else{ $page .= $_; } print "HTTP header: \n $header_text" if($DEBUG>=2); return $page; }