#!/usr/bin/perl -w use strict; use diagnostics; use LWP::RobotUA; use HTML::LinkExtor; use URI::URL; # Where do we want to start? my $origin = "http://www.lerner.co.il/"; # Create two hashes, one for URLs we have seen and # another for those we haven't my %already_retrieved = (); my %to_be_retrieved = ($origin => 1); # Create a hash in which we will deposit URLs we # wish to use my %interesting_urls = (); # Create a new user agent my $ua = new LWP::RobotUA "ATF/1.0", 'reuven@lerner.co.il'; $ua->delay(0); while (my $url = (keys %to_be_retrieved)[0]) { # Create a new HTTP request my $request = new HTTP::Request('GET', $url); # Move this URL from %to_be_retrieved to # %already_retrieved delete $to_be_retrieved{$url}; $already_retrieved{$url} = 1; # Indicate what we are retrieving print "Retrieving $url..."; # Hand $request to $ua, and get an HTTP # response back in return my $response = $ua->request($request); # Complete printout print ".\n"; # If there was a problem, send a report to # STDERR if (!$response->is_success) { print STDERR qq{Error retrieving "$url":}, $response->status_line , "\n"; } # Create an instance of HTML::LinkExtor, # making links relative to $url my $parser = HTML::LinkExtor->new(\&callback, $url); # Parse the output $parser->parse($response->content); } #--------------------------------------------- # Define our callback, which is passed scalar and # a hash sub callback { # Get the tag and its associated attributes my ($tag, %attributes) = @_; # We only care about anchor tags return unless ($tag eq "a"); # Iterate through the attributes foreach my $name (sort keys %attributes) { # If this is a link, then put the URL in the # queue if ($name eq "href") { # Get the URL for this anchor my $url = $attributes{$name}; # Ignore non-HTTP URLs next unless ($url =~ m/^http:/); # Indicate that we should retrieve this # URL in the future $to_be_retrieved{$url} = 1 unless $already_retrieved{$url}; } } }