#!/usr/bin/perl
#
# pixpirate v0.25 (c) ajax@mobis.com, October 1998
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
#
#############
#
# Go to a source URL, download all other sub-URLs from an index
# of URLs, then go to each of those sub-URLs, downloading every jpg
# file that it comes into contact with.
#
# Ultimately this script would be run from crontab every 24 hours,
# looking for new URLs.  It will compare all sub-URLs with an index of
# retrieved sub-URLs, and each image has its md5 signature saved and compared
# with an index of other md5 signatures, making redundancy minimal.
# Duplicate filenames are renamed by appending a random number to the
# base name.
#
# todo: replace md5 logging with pack() and unpack() to reduce size of md5 log
#
# Here are some sample web sites you can use to plug into $site_url
#
# http://www.purextc.com/amateur/amat.htm
# http://www.purextc.com/gay/lesbian.htm
# http://www.blastsite.com/main.shtml
# http://www.adultbuffet.com/tgp/tgp.htm
# http://www.purextc.com/
# http://www.ebonyporn.com/main.html
# http://www.pornno.com/habit/gallerypost.html
# http://www.youngnympho.com/main.html
# http://www.book-mark.net/mark.html
# http://ww3.voyeurweb.com/main/Picturep.html

use strict;
use warnings;

# CONFIGURATION OPTIONS (change these)
my $site_url     = "http://www.purextc.com/";
my $dwnld_dir    = "/home/ajax/public_html/pics";
my $dupindexfile = "/home/ajax/bin/pixpirate.done";
my $debug        = 0;    # debug mode, 1=on, 0=off

# MD5 CONFIG OPTIONS (change these)
my $md5_checking      = 1;                    # 0=off, 1=on
my $md5sum            = "/usr/bin/md5sum";    # location of md5sum binary
my $md5_checksum_list = "/home/ajax/bin/pixpirate.md5";

umask 022;    # new files/directories get no group/other write permission
$| = 1;       # unbuffered STDOUT so progress lines appear in order

my $version         = "v0.25";
my $timeout_seconds = 30;      # upper bound for a single lynx fetch
my $min_image_bytes = 5000;    # anything smaller is treated as junk

# Name of today's download sub-directory, e.g. "981023".
my @now    = localtime;
my $yymmdd = sprintf("%02d%02d%02d", $now[5] % 100, $now[4] + 1, $now[3]);

#############
## M A I N ##
#############

display_banner();    # show startup banner

# An optional first argument overrides the configured start URL.
if (@ARGV) {
    $site_url = $ARGV[0];
    print "Using (param) URL: $site_url\n";
}
else {
    print "Using URL: $site_url\n";
}

setup_working_dir();    # setup working directory

# Both indexes are read once and kept in memory; new entries are appended to
# the files as they are discovered.
my %seen_url = load_index($dupindexfile);
my %seen_md5 = $md5_checking ? load_index($md5_checksum_list) : ();

print "Loading URL: $site_url\n";
my $index_page      = fetch_url($site_url);
my @new_parse_sites = defined $index_page
    ? grep { /http/i && /href/i } split(/\r?\n/, $index_page)
    : ();

##############################################################################
##
## FIRST LOOP: Parse each line of the index page and pull out the http
##             address only.  Every address that is not yet listed in the
##             index of retrieved sub-urls is handed to
##             download_images_from() and then recorded in that index.
##
##############################################################################
foreach my $link_line (@new_parse_sites) {
    my $full_http_address = extract_http_url($link_line);
    next unless defined $full_http_address;

    # Check for duplicate http address
    if ($seen_url{$full_http_address}) {
        print " Already Downloaded: $full_http_address\n";
        next;
    }

    print "Loading Sub-Url: $full_http_address\n";
    download_images_from($full_http_address);

    $seen_url{$full_http_address} = 1;
    append_line($dupindexfile, $full_http_address);
}    #end foreach @new_parse_sites

##
## S U B R O U T I N E S
##

# Print the startup banner.
sub display_banner {
    my $rule = '+' . ('-' x 69) . '+';
    print "$rule\n";
    printf "| %-67s |\n", "pixpirate $version (c) ajax\@mobis.com October 1998";
    print "$rule\n";
    return;
}

# Make sure today's download directory ($dwnld_dir/$yymmdd) exists.
# Dies when it cannot be created, because nothing could be saved afterwards.
sub setup_working_dir {
    my $dir = "$dwnld_dir/$yymmdd";
    if (-e $dir) {
        print "Using Directory: $dir\n";
        return;
    }
    print "Creating Directory: $dir\n";
    mkdir($dir, 0777) or die "Cannot create $dir: $!\n";
    return;
}    # end setup_working_dir subroutine

# Read an index file (retrieved sub-urls or md5 signatures) into a hash.
#   $file - path of the index file; a missing/unreadable file yields an
#           empty index (first run).
# Returns a list of key => 1 pairs.  Only the first whitespace-delimited
# token of each line is used, so checksum lines of the form "hash  filename"
# are accepted, and the "\&" / "\?" escapes found in older url indexes are
# undone.
sub load_index {
    my ($file) = @_;
    my %seen;
    open(my $in, '<', $file) or return %seen;
    while (my $entry = <$in>) {
        my ($key) = split ' ', $entry;
        next unless defined $key;
        $key =~ s/\\([&?])/$1/g;
        $seen{$key} = 1;
    }
    close $in;
    return %seen;
}

# Append one line of text to $file.  Returns 1 on success, 0 on failure
# (a warning is printed; the run continues).
sub append_line {
    my ($file, $text) = @_;
    my $out;
    unless (open($out, '>>', $file)) {
        warn "Cannot append to $file: $!\n";
        return 0;
    }
    print {$out} "$text\n";
    unless (close $out) {
        warn "Cannot close $file: $!\n";
        return 0;
    }
    return 1;
}

# Pull the http address out of one line of HTML.
#   $line - a source line that mentions both "http" and "href".
# Returns the address, or undef when the line does not yield something that
# starts with "http".
sub extract_http_url {
    my ($line) = @_;
    my $url = $line;
    $url =~ s/^.+href="(http:.+)"[>\s].*$/$1/i;    # pull out the url
    $url =~ s/"[\s>].+$//;       # remove ending data after '">'
    $url =~ s/href="//ig;        # remove 'href="' throughout line
    $url =~ s/"$//;              # remove '"' at end of line
    $url =~ s/[<>]//g;           # remove all '>' and '<' characters
    $url =~ s/^.+(http.*$)/$1/;  # remove any crap at the beginning
    $url =~ s/[\s"'].*$//;       # an address ends at the first blank or quote
    return unless $url =~ /^http/i;
    return $url;
}

# Compute the address prefix that relative image names are appended to.
#   $page_url - address of the page the image links were found on.
# Returns $page_url without query string/fragment and without its last path
# segment (or trailing slash); an address consisting of scheme and host only
# is returned as is.
sub http_stem_for {
    my ($page_url) = @_;
    (my $stem = $page_url) =~ s/[?#].*$//;
    return $stem if $stem =~ m{^\w+://[^/]+$};
    $stem =~ s{/[^/]*$}{};
    return $stem;
}

# Run "lynx -source $url" and return everything it prints.
#   $url - address to fetch.
# Returns the raw content, or undef when lynx could not be started or did not
# finish within $timeout_seconds (lynx is terminated in that case).
# lynx is started without a shell, so characters such as '&', '?' or ';' in
# an address taken from a web page cannot be interpreted as shell syntax.
sub fetch_url {
    my ($url) = @_;
    my $pid = open(my $lynx, '-|', 'lynx', '-source', $url);
    unless ($pid) {
        warn "Cannot run lynx for $url: $!\n";
        return;
    }
    binmode $lynx;

    my $content;
    my $finished = eval {
        local $SIG{ALRM} = sub { die "timeout\n" };
        alarm($timeout_seconds);
        local $/;    # slurp the whole output in one read
        $content = <$lynx>;
        alarm(0);
        1;
    };
    my $failure = $@;
    alarm(0);

    unless ($finished) {
        kill 'TERM', $pid;    # stop lynx, otherwise close() would wait for it
        close $lynx;
        die $failure unless $failure eq "timeout\n";
        timed_out($url);
        return;
    }

    close $lynx;
    return defined $content ? $content : '';
}

# Report a fetch that exceeded $timeout_seconds.
#   $what - description of the operation (the address being fetched).
sub timed_out {
    my ($what) = @_;
    print "Operation Timed Out: $what\n";
    return;
}

##########################################################################
##
## SECOND LOOP: Download the page behind a sub-url, find every
##              href="name.jpg" link in it and save each of those
##              images into today's download directory.
##
##########################################################################
# $page_url - address of the sub-url page.
sub download_images_from {
    my ($page_url) = @_;

    my $page = fetch_url($page_url);
    return unless defined $page;

    my $http_stem = http_stem_for($page_url);
    if ($debug) { print "DEBUG: (230) \$http_stem = $http_stem\n"; }

    # Only bare file names between the href="...." double-quotes are
    # followed.  Every match on the page is used, not just the first one on
    # each line.
    my %queued;
    foreach my $jpeg_url ($page =~ /href="([\w-]+\.jpg)"/gi) {
        next if $queued{$jpeg_url}++;    # same link repeated on the page
        if ($debug) { print "DEBUG: (110) \$jpeg_url = $jpeg_url\n"; }

        my $image_url = "$http_stem/$jpeg_url";
        my $jpgfile   = "$dwnld_dir/$yymmdd/$jpeg_url";
        if ($debug) { print "DEBUG: (240) \$jpgfile = $jpgfile\n"; }

        print " Downloading: $image_url\n";
        if (-e $jpgfile) {
            # Name already taken today: append a random number to the base
            # name until an unused name is found.
            (my $base = $jpeg_url) =~ s/\.jpg$//i;
            do {
                $jpgfile = "$dwnld_dir/$yymmdd/$base" . int(rand 1000000) . ".jpg";
            } while -e $jpgfile;
            print " File Exists, Saving as: $jpgfile\n";
        }

        next unless save_image($image_url, $jpgfile);
        md5_check($jpgfile) if $md5_checking;
    }    #end foreach $jpeg_url
    return;
}

# Fetch one image and store it.
#   $image_url - address of the image.
#   $path      - file to create.
# Returns 1 when the file was written, 0 otherwise.  Content smaller than
# $min_image_bytes is discarded without being written; the log line keeps
# its historical "REMOVED" wording.
sub save_image {
    my ($image_url, $path) = @_;

    my $content = fetch_url($image_url);
    return 0 unless defined $content;

    if (length($content) < $min_image_bytes) {
        print " File Size less than $min_image_bytes bytes. $path REMOVED.\n";
        return 0;
    }

    my $out;
    unless (open($out, '>', $path)) {
        warn "Cannot write $path: $!\n";
        return 0;
    }
    binmode $out;
    print {$out} $content;
    unless (close $out) {
        warn "Cannot close $path: $!\n";
        unlink $path;    # do not keep a truncated image
        return 0;
    }
    return 1;
}

#
# COMPARE MD5 CHECKSUM
#
# $path - image file that was just saved.
# Removes the file when its md5 signature is already known, otherwise
# records the signature in $md5_checksum_list.  Does nothing when the file
# does not exist or md5sum produced no usable output.
sub md5_check {
    my ($path) = @_;
    return unless -e $path;

    my $md5;
    unless (open($md5, '-|', $md5sum, $path)) {
        warn "Cannot run $md5sum: $!\n";
        return;
    }
    my $md5_line = <$md5>;
    close $md5;

    # md5sum prints "<32 hex digits>  <file name>"; only the digest is kept.
    unless (defined $md5_line && $md5_line =~ /^\\?([0-9a-f]{32})\s/i) {
        warn "No md5 signature for $path\n";
        return;
    }
    my $md5_fingerprint = lc $1;

    if ($seen_md5{$md5_fingerprint}) {
        print " Duplicate file: $path REMOVED.\n";
        unlink($path) or warn "Cannot remove $path: $!\n";
        return;
    }

    $seen_md5{$md5_fingerprint} = 1;
    append_line($md5_checksum_list, $md5_fingerprint);
    return;
}    # end md5_check subroutine