#!/usr/bin/perl
# Please refer to the Plain Old Documentation (POD) at the end of this Perl Script for further information

use strict;

# SOAP::Lite version 0.52 or newer is recommended by http://code.google.com/apis/soapsearch/api_faq.html#tech20
use SOAP::Lite;
use Getopt::Long;
use Data::Dumper;

# Script version; may be required to upload the script to CPAN
# i.e. http://www.cpan.org/scripts/submitting.html
my $VERSION = 0.1;

# Start-up banner: title, copyright and license, printed before any
# command line processing takes place.
my @banner = (
    "\n\"Download Indexed Cache\" Proof of Concept (PoC) v0.1 (Released at RUXCON 2K8)\n",
    "\n",
    "Copyright 2008 Christian Heinrich\n",
    "Licensed under the Apache License, Version 2.0\n\n",
);
print @banner;

# Command line arguments (see the USAGE section of the POD at the end of this file)
my $google_api_key;    # -key   : Google SOAP Search API Key
my $query;             # -query : Google Search Query
my $start;             # -start : number of the first Google Search Result to retrieve (counted from 1)

my $usage
    = "Usage: dic.pl -key [key] -query [Google Search Query] "
    . "-start [Starting Google Search Result Number]\n";

# GetOptions() returns false when it meets an unknown or malformed option,
# in which case the usage is displayed instead of continuing with bad input.
GetOptions(
    "key=s"   => \$google_api_key,
    "query=s" => \$query,
    "start=s" => \$start
) or die $usage;

# -key and -query are mandatory: without a query the output directory below
# would be derived from an empty string.
if (   !defined $google_api_key
    || !length $google_api_key
    || !defined $query
    || $query !~ /\S/ )
{
    die $usage;
}

# -start defaults to the first search result and must be a positive whole
# number, otherwise the subtraction below would silently produce -1.
$start = 1 if !defined $start;
if ( $start !~ /\A[1-9][0-9]*\z/ ) {
    die "-start must be a positive whole number\n" . $usage;
}

# Process command line arguments
# -start is counted from 1 on the command line; the value handed to
# do_Google_Search() is the zero-based offset.
$start = $start - 1;
chomp($query);

# For demonstrations without exposing the Google SOAP Search API insert your Google SOAP Search API Key below to use dic.pl -key "demo"
# "eq" (string comparison) is required here: "==" compares numerically and
# every non-numeric key is numerically equal to "demo" (both evaluate to 0).
if ( $google_api_key eq "demo" ) {

    # Replace "insert_google_api_key" with your Google SOAP Search API Key
    # $google_api_key = "insert_google_api_key";
}

# strip ":" from Google Search Operator for Filename
# TODO Expand this to strip illegal filename chars e.g. \/:*?<>|
my $stripped_query = $query;
$stripped_query =~ s/://g;

# All output of dic is written below ./<stripped query>/dic
my $dir = "$stripped_query/dic";

# The directory which holds the output of dic
if ( -e $dir ) {
    print "Appending ./$dir\n\n";
}
else {
    print("Creating ./$dir\n\n");

    # Perl's built-in mkdir() is used for both levels instead of
    # system("mkdir ..."): the query comes from the command line and
    # usually contains spaces (and possibly shell metacharacters), which
    # the shell would split into several directories or execute.
    for my $new_dir ( "./$stripped_query", "./$dir" ) {
        next if -d $new_dir;
        mkdir($new_dir) or die "Unable to create $new_dir: $!\n";
    }
}

# Run the Google Search; $start is the zero-based offset of the first result
my $google_search_results
    = do_Google_Search( "$google_api_key", "$query", "$start" );

# TODO Display a warning if <estimatedTotalResultsCount> and <estimateIsExact> exceeds 1000

# Append the raw search response to datadumper.txt.
# Three-argument open() with a lexical filehandle keeps the file name from
# being interpreted as part of the open mode, and every open/close is checked
# so that a missing directory or full disk is reported instead of ignored.
my $datadumper_file = "./$dir/datadumper.txt";
open( my $data_dumper_fh, '>>', $datadumper_file )
    or die "Unable to open $datadumper_file: $!\n";
print {$data_dumper_fh} Data::Dumper::Dumper($google_search_results);
close($data_dumper_fh)
    or die "Unable to close $datadumper_file: $!\n";

# The URL corresponding to the Search Result .html file is listed in this .CSV file
my $csv_file = "./$dir/$stripped_query.csv";
open( my $url_fh, '>>', $csv_file )
    or die "Unable to open $csv_file: $!\n";

# Incremented before each result is handled, so the first file written is
# numbered $start + 1.
my $google_search_result_number = $start;

# Loop through the results.
foreach
    my $google_search_result ( @{ $google_search_results->{resultElements} } )
{

    # Set the results as variables
    ++$google_search_result_number;
    my $URL        = $google_search_result->{URL};
    my $cachedSize = $google_search_result->{cachedSize};
    print(    "Downloading "
            . $URL
            . " from Google Cache ["
            . $cachedSize . "] as "
            . $google_search_result_number
            . ".html\n" );

    # Each cached page is saved as <result number>.html, overwriting any
    # file of the same name from a previous run.
    my $google_cached_page = doGetCachedPage( "$google_api_key", "$URL" );
    my $cached_page_file   = "./$dir/$google_search_result_number.html";
    open( my $cached_page_fh, '>', $cached_page_file )
        or die "Unable to open $cached_page_file: $!\n";
    print {$cached_page_fh}
        ( defined $google_cached_page ? $google_cached_page : '' );
    close($cached_page_fh)
        or die "Unable to close $cached_page_file: $!\n";

    # TODO Include the date and time the page was indexed i.e. to quote the cache page "It is a snapshot of the page as it appeared on [Date] [Time]"
    print {$url_fh} ( "$google_search_result_number" . "," . "$URL\n" );
}

# Buffered write errors only surface when the handle is closed.
close($url_fh) or die "Unable to close $csv_file: $!\n";

# do_Google_Search( $key, $q, $start )
#
# Sends a doGoogleSearch request through SOAP::Lite and returns whatever
# the call returns, unmodified.
#
#   $key   - Google SOAP Search API Key
#   $q     - Google Search Query
#   $start - offset of the first search result (the -start command line
#            argument after the caller has adjusted it)
#
# Every other doGoogleSearch parameter is fixed inside this subroutine.
sub do_Google_Search {

# Variable Naming Convention is as per Google SOAP Search API Reference Documentation
    # TODO Check length of Google Search Query is 2048 bytes
    # TODO Check Google Search Query is a maximum of 10 Words
    # TODO Check only one site: term is in the Google Search Query
    my ( $key, $q, $start ) = @_;

    # Fixed request parameters, listed in the order doGoogleSearch takes
    # them after $start.
    my @fixed_parameters = (

        # maxResults
        # TODO Must add a test to ensure that $maxResults is between 1 to 1000
        "10",

        # filter is boolean i.e. either "true" or "false"
        "false",

        # restricts
        # TODO Check Country of Restrict
        # TODO Check Topic of Restrict
        "",

        # safeSearch
        "false",

        # lr
        # TODO Check Language Restrict
        "",

        # ie is Input Encoding and this has been deprecated in the Google SOAP Search API
        "UTF-8",

        # oe is Output Encoding and this has been deprecated in the Google SOAP Search API
        "UTF-8",
    );

    # Create a new SOAP::Lite instance, feeding it the location of the
    # GoogleSearch WSDL file
    my $wsdl_location = "http://api.google.com/GoogleSearch.wsdl";
    my $soap_client   = SOAP::Lite->service($wsdl_location);

    # TODO Confirm that connection with api.google.com can be established
    my $response
        = $soap_client->doGoogleSearch( $key, $q, $start, @fixed_parameters );

    # TODO Confirm that doGoogleSearchResponse SOAP Message is not empty due to exceeding 10K SOAP Messages with Google SOAP Search API Key
    return $response;
}

# doGetCachedPage( $key, $URL )
#
# Sends a doGetCachedPage request through SOAP::Lite and returns whatever
# the call returns, unmodified.
#
#   $key - Google SOAP Search API Key
#   $URL - URL of the page whose cached copy is requested
sub doGetCachedPage {

# Variable Naming Convention is as per Google SOAP Search API Reference Documentation

    my ( $key, $URL ) = @_;

    # Location of the GoogleSearch WSDL file
    my $google_wsdl = "http://api.google.com/GoogleSearch.wsdl";

    my $google_cache = SOAP::Lite->service("$google_wsdl");

    # The key passed in by the caller is used for the request; the
    # subroutine must not reach out to the file-level $google_api_key,
    # which made the $key parameter dead and tied this routine to one
    # particular script-level variable.
    my $doGetCachedPageResponse
        = $google_cache->doGetCachedPage( $key, $URL );

    # TODO Confirm that doGetCachedPageResponse SOAP Message is not empty due to exceeding 10K SOAP Messages with Google SOAP Search API Key
    return $doGetCachedPageResponse;
}

=head1 NAME

dic.pl - "Download Indexed Cache"

=head1 VERSION

This documentation refers to dic PoC v0.1. Released at RUXCON 2K8 (AU)

=head1 USAGE

dic.pl -key [key] -query [Google Search Query] -start [Starting Google Search Result Number]

=head1 REQUIRED ARGUMENTS

 -key           Google SOAP Search API Key
 -query         Google Search Query
 -start         Starting Google Search Result Number

=head1 DESCRIPTION

"Download Indexed Cache" implements the Google SOAP Search API to retrieve
content indexed within the Google Cache and supports the "Search Engine
Reconnaissance" section of the recently released OWASP Testing Guide v3.

=head1 DEPENDENCIES

=head1 PREREQUISITES

SOAP::Lite v0.52 CPAN Module
Data::Dumper CPAN Module

=head1 COREQUISITES

=head1 OSNAMES

cygwin

=head1 SCRIPT CATEGORIES

Web

=head1 INCOMPATIBILITIES

=head1 BUGS AND LIMITATIONS

Please refer to the comments beginning with "TODO" in the Perl Code.

=head1 AUTHOR

Christian Heinrich

=head1 CONTACT INFORMATION

christian.heinrich@owasp.org
christian.heinrich@cmlh.id.au
cmlh@cpan.org

http://www.linkedin.com/in/ChristianHeinrich

=head1 MAILING LIST

https://lists.owasp.org/mailman/listinfo/owasp-google-hacking
http://groups.google.com/group/download-indexed-cache

=head1 SUBVERSION REPOSITORY

# TODO svn propset svn:keywords

http://code.google.com/p/dic

=head1 FURTHER INFORMATION AND UPDATES

http://del.icio.us/cmlh/dic
https://lists.owasp.org/mailman/listinfo/owasp-google-hacking
http://groups.google.com/group/download-indexed-cache
http://code.google.com/p/dic

=head1 LICENSE AND COPYRIGHT

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Copyright 2008 Christian Heinrich