#!/usr/bin/perl
# grab - takes a url, finds all the files of a
#        certain type linked to from
#        that url, then downloads all of those.
#        then it makes an index of them.
# steev hise, steev AT datamassage.com
# been tweaking this since at least early 1999...
# this code is licensed to all via the GNU GPL.
# see http://www.fsf.org/licenses/licenses.html#GPL
# for more information.
#
# $Id: grab,v 1.4 2001/12/12 17:24:14 steev Exp $
#
# $Log: grab,v $
# Revision 1.4  2001/12/12 17:24:14  steev
# made option handling better. added auto-directory creation.
#
# Revision 1.3  2001/10/30 07:12:03  steev
# added non-indexing option.
#
# Revision 1.2  2001/08/27 05:47:11  steev
# modified to use LWP.  Also fakes referer.  works beautifully.
#
#
###########################################################
#
# usage: grab [-e extension] [-r min-max] [-i] url [localdir]
#
#   -e extension  file extension to look for (default: jpg)
#   -r min-max    do not fetch a page; instead build the list of files
#                 as url . N . ".extension" for every N in the range
#   -i            do not build index.html afterwards
#   url           page to scan (or url prefix when -r is given)
#   localdir      where downloads go (default: current directory;
#                 created if it does not exist)

use warnings;
use Getopt::Std;
use File::Path;

# Package variables on purpose: getopts() sets the $opt_* flags, and the
# index-building code later in this file reads $opt_i, $extension,
# $origurl and $localdir without declaring them.  That sharing is also
# why `use strict` is not switched on file-wide here.
our ($opt_e, $opt_r, $opt_i);
our ($extension, $origurl, $localdir);

# A download of this many bytes or fewer is counted as a failure.
# NOTE(review): presumably such a short body is an error page rather
# than a real file -- confirm the threshold is still what is wanted.
my $MIN_BYTES = 500;

# Unbuffer STDOUT so the per-file progress lines appear immediately.
# The old `require "flush.pl"` (a Perl 4 library that current perls no
# longer ship) and its flush(STDOUT) calls are gone: with $| set they
# did nothing extra.
$| = 1;

#$lynx = "/usr/bin/lynx -useragent=Mozilla/4.7 ";

my $usage = "usage: $0 [-e extension] [-r min-max] [-i] url [localdir]\n";

getopts('e:r:i') or die $usage;

$extension = $opt_e ? $opt_e : "jpg";

$origurl  = shift @ARGV;
$localdir = shift @ARGV;

die $usage unless (defined $origurl) && (length $origurl);

# several things possible to do with destination directory.
if (!$localdir) {
    $localdir = '.';
}
elsif (!-e $localdir) {
    mkpath($localdir) || die "error creating $localdir: $!";
}
elsif (!-d $localdir) {
    die "$localdir is not a directory";
}

my @urls;

if ($opt_r) {
    # given a range of numbers, tack each one onto the end of the
    # original url.
    my ($min, $max) = split /-/, $opt_r;
    die "bad range '$opt_r', expected min-max\n"
        unless (defined $min) && (defined $max)
            && (length $min) && (length $max);

    # $min and $max stay strings, so a range such as 01-12 keeps its
    # zero padding (perl's string range), exactly as before.
    push @urls, map { $origurl . $_ . ".$extension" } $min .. $max;
}
else {
    # otherwise fetch the page and pull the links out of it.
    print "getting page " , $origurl , "....\n";
    my $page = webget($origurl);

    # webget() hands back a false value when the request failed; there
    # is nothing to scan in that case.
    die "couldn't fetch $origurl\n" unless $page;

    # Keep a copy of the raw page around for debugging, as the script
    # always has.  Failure to write it is not fatal.
    # NOTE(review): fixed, predictable name in /tmp -- consider
    # File::Temp if this copy is still wanted at all.
    if (open(my $dump, '>', '/tmp/tmp.html')) {
        print {$dump} $page;
        close $dump;
    }
    else {
        warn "couldn't write /tmp/tmp.html: $!\n";
    }
    # `$lynx -source '$origurl' > /tmp/tmp.html`;
    print STDOUT "grabbed html\n";

    # Split the url into "scheme://host" and the path on that host.
    # (https is accepted as well as http.)
    $origurl =~ m#(https?://[^/]*)/*([^ ]*)#i
        or die "can't parse url $origurl\n";
    my $site = $1;
    my $file = "/" . $2;

    # who cares about the original file name.
    # however, if url is just a directory, keep whole thing.
    my @dirs = split('/', $file);
    pop @dirs if $file !~ /\/$/;
    my $path = join('/', @dirs);

    print STDOUT "Scanning page $origurl for files ending in .$extension.\n";

    # NOTE(review): the original link-matching patterns did not survive
    # in the source (their markup was stripped; only the use of two
    # captures, url stem and extension, is still visible).  This is a
    # reconstruction that looks at the href of every anchor tag --
    # confirm it matches what the script used to pick up.  The pattern
    # tolerates line breaks inside a tag, so the page is no longer
    # flattened onto one line first.
    my $link_re = qr{
        <a\b [^>]*? \bhref \s* = \s*            # href attribute of an anchor
        ["']?                                   # optional opening quote
        ( [^"'\s>]+ ) \. ( \Q$extension\E )     # url stem, then the extension
        (?= ["'\s>] )                           # url ends right after it
    }xi;

    while ($page =~ /$link_re/g) {
        my $url = "$1.$2";

        # its either full, absolute, or relative
        if ($url =~ /^http/i) {
            push @urls, $url;                   # full: use unchanged
        }
        elsif ($url =~ /^\//) {
            push @urls, "$site$url";            # absolute path on same host
        }
        else {
            push @urls, "$site$path/$url";      # relative to the page
        }
    }
}

# one way or the other we have a list of urls to get.
my $count = scalar @urls;
print STDOUT "\nDone scanning. Found $count files\n";

#unlink "/tmp/tmp.html";

die "Didn't find any files.\n" unless @urls;

my $totalsize = 0;
my @errors;
my $n = 0;

foreach my $image (@urls) {
    $image =~ s/\s//g;                  # remove spaces in url
    $n++;

    my $local = (split '/', $image)[-1];    # get just the filename

    # if local file already exists, add time to it.
    $local = time . $local if -e "$localdir/$local";

    print STDOUT " $n: $image -- ";

    # The page we started from is sent as the referer.
    my $content = webget($image, $origurl);
    my $size = defined $content ? length $content : 0;

    # Measure the fetched data itself.  The old code asked the
    # filesystem for the size while the output handle was still open
    # and unflushed, so the number could be short.  Too-small results
    # are now simply never written instead of written and deleted.
    if ($size > $MIN_BYTES) {
        open(my $out, '>', "$localdir/$local")
            or die "can't write to $localdir/$local:$!\n";
        binmode $out;
        print {$out} $content;
        close $out
            or die "can't write to $localdir/$local:$!\n";
        #`$lynx -source '$image' > $localdir/$local`;

        print STDOUT "saving to $localdir/$local", " size: $size.\n";
        $totalsize += $size;
    }
    else {
        print STDOUT "ERROR.\n\n";
        push @errors, $image;
    }
}

print "total bytes downloaded: $totalsize.\n";

if (@errors) {
    print "the following files couldnt be downloaded:\n";
    print join("\n ", @errors);
    print "\n\n";
}

# now count what we have and make an index
# but not if the -i switch was given.
if ($opt_i) {
    print "\ndone.\n";
    exit;
}

print "Building index.html.\n";

# Collect every file in the download directory whose name ends in
# ".<extension>".  The extension is matched literally and the dot is
# required, so "-e jpg" no longer also picks up a file named "notajpg".
opendir(my $dir, $localdir)
    or die "can't read directory $localdir: $!\n";
my @images = sort grep { /\.\Q$extension\E\z/i } readdir($dir);
closedir($dir);

open(my $index, '>', "$localdir/index.html")
    || die "no open $localdir/index.html: $!";

# NOTE(review): the markup of the index page did not survive in the
# source; only the text "original url: <url>" and a trailing "up" link
# remained, and the listing of @images was lost altogether.  What
# follows is a reconstruction: a title, a link back to the original
# url, one link per downloaded file, and an "up" link to the parent
# directory.  Confirm the layout (and the target of "up") against a
# previously generated index if one is available.
my $safe_url = _html_escape($origurl);

print {$index} <<"EndOT";
<html>
<head><title>$safe_url</title></head>
<body>
original url: <a href="$safe_url">$safe_url</a>
<br>
EndOT

for my $image (@images) {
    my $safe_name = _html_escape($image);
    print {$index} qq{<a href="$safe_name">$safe_name</a><br>\n};
}

print {$index} qq{<a href="../">up</a>\n</body></html>\n};

close($index)
    or die "error writing $localdir/index.html: $!\n";

print "done.\n\n";

# subroutines
########

# Escape the characters that are special in HTML text and in
# double-quoted attribute values.
# Takes one string, returns the escaped copy.
sub _html_escape {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;       # must run first so later entities survive
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# this just grabs a url, instead of using Lynx.
# give it a url and optionally, a referrer.
#
#   $url      - address to GET
#   $referer  - value for the Referer header; 'http://disney.com' is
#               sent when the caller gives none
#
# Returns the response body on success.  On an HTTP error it warns with
# the status line and returns 0; callers rely on that false value, so
# it is kept as is.
sub webget {
    use strict;
    use warnings;
    use LWP::UserAgent;
    use HTTP::Request;
    use HTTP::Response;

    my ($url, $referer) = @_;
    $referer = 'http://disney.com' unless defined $referer;

    my $ua = LWP::UserAgent->new();
    $ua->agent("Mozilla/4.7");      # pretend, just in case

    my $req = HTTP::Request->new(GET => $url);
    $req->referer($referer);

    my $response = $ua->request($req);

    if ($response->is_error()) {
        # warn() only joins its arguments -- it does not do printf
        # formatting -- so the old call printed a literal "%s".
        warn sprintf(" %s\n", $response->status_line);
        return 0;
    }

    return $response->content();
}