#!/usr/bin/perl
#
# pixpirate v0.25 (c) ajax@mobis.com, October 1998
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
#
#############
#
# Go to a source URL, collect the sub-URLs linked from that index page,
# then visit each of those sub-URLs and download every jpg file that it
# links to.
#
# Ultimately this script would be run from cron every 24 hours,
# looking for new URLs. It compares every sub-URL against an index of
# already-retrieved sub-URLs, and each image has its md5 signature saved and
# compared with an index of other md5 signatures, keeping redundancy minimal.
# Duplicate filenames are renamed by appending a random number to the base name.
#
# todo: replace md5 logging with pack() and unpack() to reduce size of md5 log
#
# Here are some sample web sites you can use to plug into $site_url
#
# http://www.purextc.com/amateur/amat.htm
# http://www.purextc.com/gay/lesbian.htm
# http://www.blastsite.com/main.shtml
# http://www.adultbuffet.com/tgp/tgp.htm
# http://www.purextc.com/
# http://www.ebonyporn.com/main.html
# http://www.pornno.com/habit/gallerypost.html
# http://www.youngnympho.com/main.html
# http://www.book-mark.net/mark.html
# http://ww3.voyeurweb.com/main/Picturep.html
# CONFIGURATION OPTIONS (change these)
$site_url     = "http://www.purextc.com/";        # start page used when no URL is given on the command line
$dwnld_dir    = "/home/ajax/public_html/pics";    # images are stored in a per-day directory below this one
$dupindexfile = "/home/ajax/bin/pixpirate.done";  # log of sub-URLs that have already been processed
$debug        = 0;                                # debug mode, 1=on, 0=off

# MD5 CONFIG OPTIONS (change these)
$md5_checking      = 1;                               # 0=off, 1=on
$md5sum            = "/usr/bin/md5sum";               # location of md5sum binary
$md5_checksum_list = "/home/ajax/bin/pixpirate.md5";  # log of image checksums already seen

# Clear group/other write permission on everything this script creates:
# directories end up 755, plain files 644.
umask 022;

$version         = "v0.25";
$SIG{ALRM}       = \&timed_out;    # handler invoked when an alarm() set by this script expires
$timeout_seconds = 30;             # alarm() interval used around each image download

# Date stamp (two-digit year, month, day) naming the per-day download
# directory.  Built from localtime() so that no external `date` process is
# needed and nothing has to be chopped off its output.
{
    my ($mday, $mon, $year) = (localtime)[3, 4, 5];
    $yymmdd = sprintf("%02d%02d%02d", $year % 100, $mon + 1, $mday);
}

srand;            # initialize rand function
$jpg = ".jpg";    # extension appended to renamed (random-numbered) files
#############
## M A I N ##
#############
display_banner();    # show startup banner

# The first command-line argument, when present, overrides the configured
# start URL; with no arguments the default already held in $site_url is kept.
if (@ARGV) {
    $site_url = $ARGV[0];
    print "Using (param) URL: $site_url\n";
}
else {
    print "Using URL: $site_url\n";
}
#&get_options;
setup_working_dir();    # setup working directory

# Fetch the index page and keep only the lines that mention both "http" and
# "href" (case-insensitively).  lynx is started directly from an argument
# list, so characters such as '&', '?' or ';' in the URL never reach a shell.
@new_parse_sites = ();
if (open(my $index_page, '-|', 'lynx', '-source', $site_url)) {
    @new_parse_sites = grep { /http/i && /href/i } <$index_page>;
    close($index_page);
}
else {
    warn "Cannot run lynx for $site_url: $!\n";
}
#@new_parse_sites = `cat www.purextc.com |egrep -i http|egrep -i href |egrep '\'`;
print "Loading URL: $site_url\n";
##############################################################################
##
## FIRST LOOP: Parse each $full_http_address line and pull out the http
## address only. Then download each http web link into an
## array @jpg_index.
##
##############################################################################
# Sub-URLs recorded by earlier runs.  The index file is read once here and
# the in-memory copy is extended as new sub-URLs are processed, instead of
# scanning the file again for every candidate address.
my @done_urls;
if (open(my $done_fh, '<', $dupindexfile)) {
    chomp(@done_urls = <$done_fh>);
    close($done_fh);
}

foreach my $href_line (@new_parse_sites) {
    $full_http_address = $href_line;
    chomp $full_http_address;
    #$full_http_address =~ s/.+href=\"([\w\d\/\W]+)\"\>.*$/$1/i;
    $full_http_address =~ s/^.+href=\"(http:.+)\"[\>\s].*$/$1/i;    # pull out the url
    $full_http_address =~ s/\"[\s\>].+$//ig;     # remove ending data after '">'
    $full_http_address =~ s/href=\"//ig;         # remove 'href="' throughout line
    $full_http_address =~ s/\"$//g;              # remove '"' at end of lines
    $full_http_address =~ s/[\>\<]//g;           # remove all '>' and '<' characters
    $full_http_address =~ s/^.+(http.*$)/$1/;    # remove any crap at the beginning

    # The address is handed to lynx as a plain argument (no shell), so '&' and
    # '?' need no escaping.  Anything that does not look like a URL is skipped
    # so that scraped text can never be taken for a lynx option or a local
    # file name.
    unless ($full_http_address =~ /^http/i) {
        if ($debug) { print "DEBUG: (100) skipping non-URL: $full_http_address\n"; }
        next;
    }

    # Check for duplicate http address line.  The address is compared as
    # literal text (substring of any recorded line), never as a pattern.
    if (grep { index($_, $full_http_address) >= 0 } @done_urls) {
        print " Already Downloaded: $full_http_address\n";
        next;
    }

    print "Loading Sub-Url: $full_http_address\n";
    @jpg_index = grep { /jpg/i && /href/i } _lynx_source_lines($full_http_address);

    ##########################################################################
    ##
    ## SECOND LOOP: For every line of the sub-page that links to a jpeg,
    ##              work out the image URL and the local file name,
    ##              download the image, and run the size and md5 checks.
    ##
    ##########################################################################
    foreach my $jpeg_line (@jpg_index) {
        # Pull out a bare file name between the href="...." double-quotes.
        # Lines without such a link are skipped, so $1 is never read after
        # a failed match.
        next unless $jpeg_line =~ /href=\"([\w\d_-]+\.jpg)\"/i;
        $jpeg_url = $1;
        if ($debug) { print "DEBUG: (110) \$jpeg_url = $jpeg_url\n"; }
        $using_rand = 0;    # are we using a random filename now? NO.

        # Derive the directory part of the sub-URL that the image name is
        # relative to.  When no pattern applies, the address itself is used.
        $http_stem = $full_http_address;
        if ($full_http_address =~ /\.s?htm/i) {
            # Address names a page: drop the last path component.
            if ($full_http_address =~ /(http.+)\/.+$/i) { $http_stem = $1; }
            if ($debug) { print "DEBUG: (210) \$full_http_address = $full_http_address\n"; }
        }
        elsif ($full_http_address =~ /.\/$/) {
            # Address already ends in '/': use it unchanged.
            #$full_http_address =~ /(http.+)\/.+\/$/i;
            if ($debug) { print "DEBUG: (220) \$full_http_address = $full_http_address\n"; }
        }
        else {
            if ($full_http_address =~ /(http.+)\/.+\.jpg.+$/i) { $http_stem = $1; }
            if ($debug) { print "DEBUG: (225) \$full_http_address = $full_http_address\n"; }
        }
        $jpgfile = $jpeg_url;
        if ($debug) { print "DEBUG: (230) \$http_stem = $http_stem\n"; }
        if ($debug) { print "DEBUG: (240) \$jpgfile = $jpgfile\n"; }

        # Choose the local file.  If the name is already taken, a random
        # number is inserted before the extension.  md5_check() rebuilds the
        # same path from $jpgfile, $randnum, $jpg and $using_rand, so those
        # globals must stay in step with $save_as.
        my $save_as = "$dwnld_dir/$yymmdd/$jpgfile";
        print " Downloading: $http_stem/$jpeg_url\n";
        if (-e $save_as) {
            $randnum = int(rand 1000000);
            $jpgfile =~ s/\.jpg//i;
            $save_as = "$dwnld_dir/$yymmdd/$jpgfile$randnum$jpg";
            print " File Exists, Saving as: $save_as\n";
            $using_rand = 1;    # We are using random numbered filenames.
        }

        # $working_filename tells the SIGALRM handler which file to clean up.
        $working_filename = $save_as;
        alarm($timeout_seconds);
        _lynx_source_to_file("$http_stem/$jpeg_url", $save_as);
        alarm(0);

        # Anything under 5000 bytes is discarded; a missing file counts as
        # size zero.
        my $size = -s $save_as;
        $size = 0 unless defined $size;
        if ($size < 5000) {
            print " File Size less than 5000 bytes. $save_as REMOVED.\n";
            unlink($save_as);
        }

        if ($md5_checking eq 1) { md5_check(); }
    }    #end foreach $jpeg_line

    # Record the sub-URL so later runs skip it.
    if (open(my $index_fh, '>>', $dupindexfile)) {
        print {$index_fh} "$full_http_address\n";
        close($index_fh) or warn "Cannot finish writing $dupindexfile: $!\n";
    }
    else {
        warn "Cannot append to $dupindexfile: $!\n";
    }
    push @done_urls, $full_http_address;
}    #end foreach @new_parse_sites

# Run "lynx -source URL" from an argument list (no shell involved) and
# return the output as a list of lines; returns an empty list if lynx
# cannot be started.
sub _lynx_source_lines {
    my ($url) = @_;
    my $page;
    unless (open($page, '-|', 'lynx', '-source', $url)) {
        warn "Cannot run lynx for $url: $!\n";
        return;
    }
    my @lines = <$page>;
    close($page);
    return @lines;
}

# Run "lynx -source URL" from an argument list (no shell involved) and copy
# its raw output into PATH.  Returns 1 on success, 0 if lynx could not be
# started or the file could not be written.
sub _lynx_source_to_file {
    my ($url, $path) = @_;
    my ($source, $target);
    unless (open($source, '-|', 'lynx', '-source', $url)) {
        warn "Cannot run lynx for $url: $!\n";
        return 0;
    }
    unless (open($target, '>', $path)) {
        warn "Cannot write $path: $!\n";
        close($source);
        return 0;
    }
    # Image data is binary: copy it byte for byte.
    binmode($source);
    binmode($target);
    my $chunk;
    while (read($source, $chunk, 65536)) {
        print {$target} $chunk;
    }
    close($source);
    unless (close($target)) {
        warn "Cannot finish writing $path: $!\n";
        return 0;
    }
    return 1;
}
##
## S U B R O U T I N E S
##
# Print the three-line startup banner (program name, version, author and
# date inside an ASCII box) to standard output.
sub display_banner {
    my $inner_width = 69;                                     # characters between the two '+' corners
    my $border      = '+' . ('-' x $inner_width) . "+\n";
    my $credit      = "pixpirate $version (c) ajax\@mobis.com";
    my $date        = "October 1998";

    # Pad between the credit and the date so the right-hand '|' lines up
    # with the border whatever the length of $version.
    my $gap = $inner_width - 2 - length($credit) - length($date);
    $gap = 1 if $gap < 1;

    print $border;
    print "| $credit" . (' ' x $gap) . "$date |\n";
    print $border;
}
# Make sure the per-day download directory "$dwnld_dir/$yymmdd" exists,
# creating it if necessary, and report which of the two happened.
# Reads the globals $dwnld_dir and $yymmdd; takes no arguments.
sub setup_working_dir {
    my $working_dir = "$dwnld_dir/$yymmdd";

    if (-e $working_dir) {
        print "Using Directory: $working_dir\n";
        return;
    }

    print "Creating Directory: $working_dir\n";
    # Built-in mkdir: no shell is involved, and a failure is reported with
    # the system error instead of passing unnoticed.  The effective mode is
    # 0777 filtered through the process umask.
    mkdir($working_dir, 0777)
        or warn "Cannot create $working_dir: $!\n";
    return;
}    # end setup_working_dir subroutine
# Compare the MD5 checksum of the image that was just downloaded with the
# checksums logged in $md5_checksum_list.  A file whose checksum is already
# logged is deleted as a duplicate; otherwise its checksum is appended to
# the log.
#
# Takes no arguments.  The file to check is derived from globals set by the
# download loop: "$dwnld_dir/$yymmdd/$jpgfile$randnum$jpg" when $using_rand
# is 1, "$dwnld_dir/$yymmdd/$jpgfile" otherwise.  Does nothing if that file
# does not exist.  Sets $md5_fingerprint, and resets $using_rand to 0 after
# handling a renamed file.
sub md5_check {
    #
    # COMPARE MD5 CHECKSUM
    #
    my $renamed    = ($using_rand eq 1);
    my $image_path = $renamed
        ? "$dwnld_dir/$yymmdd/$jpgfile$randnum$jpg"
        : "$dwnld_dir/$yymmdd/$jpgfile";

    return unless -e $image_path;
    $using_rand = 0 if $renamed;    # Reset this value back to Zero

    # Run the md5sum binary from an argument list (no shell) and read its
    # first output line.
    my $md5_output = '';
    alarm(10);
    if (open(my $md5_pipe, '-|', $md5sum, $image_path)) {
        my $first_line = <$md5_pipe>;
        $md5_output = $first_line if defined $first_line;
        close($md5_pipe);
    }
    alarm(0);

    # Keep only the 32-digit hex digest.  md5sum prints "digest  filename"
    # and may put a backslash in front when the file name needs escaping.
    # Both kinds of file get the same treatment, so the log only ever
    # receives bare digests.
    unless ($md5_output =~ /^\\?([0-9a-f]{32})\s/i) {
        warn "Could not compute MD5 for $image_path\n";
        return;
    }
    $md5_fingerprint = $1;

    # Look for the digest anywhere in a logged line, so that entries of the
    # form "digest  filename" are recognised as well as bare digests.
    my $already_logged = 0;
    if (open(my $log_in, '<', $md5_checksum_list)) {
        while (my $entry = <$log_in>) {
            if (index($entry, $md5_fingerprint) >= 0) {
                $already_logged = 1;
                last;
            }
        }
        close($log_in);
    }

    if ($already_logged) {
        print " Duplicate file: $image_path REMOVED.\n";
        unlink($image_path);
    }
    elsif (open(my $log_out, '>>', $md5_checksum_list)) {
        print {$log_out} "$md5_fingerprint\n";
        close($log_out) or warn "Cannot finish writing $md5_checksum_list: $!\n";
    }
    else {
        warn "Cannot append to $md5_checksum_list: $!\n";
    }
    return;
}    # end md5_check subroutine
# SIGALRM handler: report the timeout and delete the file named by the
# global $working_filename.
#
# NOTE(review): this handler only reports and cleans up; nothing here stops
# whatever operation was running when the alarm fired -- confirm whether
# callers expect the operation itself to be aborted.
sub timed_out {
    # $error is not assigned anywhere in the visible script, so fall back to
    # the name of the file being worked on to keep the message informative.
    my $detail = defined $error            ? $error
               : defined $working_filename ? $working_filename
               :                             '';
    print "Operation Timed Out: $detail\n";

    # Built-in unlink: no shell is involved, and nothing is attempted when
    # no working file has been recorded yet.
    if (defined $working_filename && length $working_filename) {
        unlink($working_filename);
    }
    return;
}