#!/usr/local/bin/perl
#
#   CGIProxy 1.4.1
#
#   nph-proxy.cgi-- CGIProxy 1.4.1: an proxy in the form of a CGI script.
#     Retrieves the resource at any HTTP or FTP URL, updating embedded URLs
#     in HTML resources to point back through this script.  By default, no
#     user info is sent to the server.  Options include text-only proxying
#     to save bandwidth, cookie support, ad-filtering, script removal,
#     user-defined encoding of the target URL, and more.  Runs in Perl 4 or 5.
#
#   Copyright (C) 1996, 1998-2001 by James Marshall, james@jmarshall.com
#   All rights reserved.
#
#   For the latest, see http://www.jmarshall.com/tools/cgiproxy/
#
#
#   IMPORTANT NOTE ABOUT ANONYMOUS BROWSING:
#     CGIProxy was originally made for indirect browsing more than
#       anonymity, but since people are using it for anonymity, I've tried
#       to make it as anonymous as possible.  Suggestions welcome.  For best
#       anonymity, browse with JavaScript turned off.
#     Anonymity is pretty good, but may not be bulletproof.  For example,
#       if even a single JavaScript statement can be run, your anonymity can
#       be compromised.  I've tried to remove JS from every place it can
#       exist, but please tell me if I missed any.  Also, browser plugins or
#       other executable extensions may be able to reveal you to a server.
#
#
#   CONFIGURATION:
#
#     None required in most situations.  On some servers, these might be
#       required (all in the "user configuration" section):
#       . If this is running on an SSL server, set $RUNNING_ON_SSL_SERVER=1.
#       . If this is running on Windows, set $RUNNING_ON_WINDOWS=1 to work
#           around a couple of Windows and IIS bugs.
#       . If you're using another HTTP proxy, set $ENV{'http_proxy'} and
#           $ENV{'no_proxy'}.  If that proxy uses authentication, set
#           $PROXY_AUTH. 
#       . To make this run in Perl 4, follow the instructions around the 
#           "use Socket" line.
#
#     Options include:
#       . Set $TEXT_ONLY, $REMOVE_COOKIES, $REMOVE_SCRIPTS, $FILTER_ADS, and
#           $INSERT_ENTRY_FORM as desired.  Set $REMOVE_SCRIPTS if anonymity
#           is important.
#       . To allow the user to choose all of those settings except $TEXT_ONLY,
#           set $ALLOW_USER_CONFIG=1.
#       . To change the encoding format of the URL, modify the
#           &proxy_encode() and &proxy_decode() routines.  The default
#           routines are suitable for simple PATH_INFO compliance.
#       . You can restrict which servers this proxy will access, with
#           @ALLOWED_SERVERS and @BANNED_SERVERS.
#       . Similarly, you can specify allowed and denied server lists for
#           both cookies and scripts.
#       . If filtering ads, you can customize this with a few settings.
#       . To insert your own block of HTML into each page, set $INSERT_HTML
#           or $INSERT_FILE.
#       . As a last resort, if you really can't run this script as NPH,
#           you can try to run it as non-NPH by setting $NOT_RUNNING_AS_NPH=1.
#           BUT, read the notes and warnings above that line.  Caveat surfor.
#       . For crude load-balancing among a set of proxies, set @PROXY_GROUP.
#       . Other minor config is possible; see the user configuration section.
#       . If heavy use of this proxy puts a load on your server, see the
#           "NOTES ON PERFORMANCE" section below.
#
#     For more info, read the comments regarding any config options you set.
#
#     This script MUST be installed as a non-parsed header (NPH) script.
#       In Apache and many other servers, this is done by simply starting the
#       filename with "nph-".  It MAY be possible to fake it as a non-NPH
#       script, MOST of the time, by using the $NOT_RUNNING_AS_NPH feature.
#       This is not advised.
#
#
#   TO USE:
#     Call this script's URL to start a session, or browse directly to a page
#       by putting the encoded target URL in PATH_INFO.  You can bookmark
#       pages you browse to through the proxy.
#
#
#   NOTES ON PERFORMANCE:
#     Unfortunately, this has gotten slower through the versions, mostly
#       because of optional new features.  Configured equally, version 1.3
#       takes 25% longer to run than 1.0 or 1.1 (based on *cough* highly
#       abbreviated testing).  Compiling takes about 50% longer.
#     Leaving $REMOVE_SCRIPTS=1 adds 25-50% to the running time.
#     Leaving $FASTER_HTML_LESS_PRIVACY=0 increases the running time by 10%.
#       (Previous versions had a privacy hole which this plugs.)
#     For a gain in compile speed, follow the instructions near the
#       "use Socket" line-- that module takes quite a while to load.
#     Remember that we're talking about tenths of a second here.  Most of
#       the delay experienced by the user is from waiting on two network
#       connections.  These performance issues only matter if your server
#       CPU is getting overloaded.  Also, these only matter when retrieving
#       HTML, because it's the HTML modification that takes all the time.
#     If you can, use mod_perl.  Starting with version 1.3.1, this should
#       work under mod_perl, but in this case you need at least Perl 5.004.
#       If you use mod_perl, be careful to install this as an NPH script,
#       i.e. set the "PerlSendHeader Off" configuration directive.  For more
#       info, see the mod_perl documentation.
#     If you use mod_perl and modify this script, see the note near the
#       "reset 'a-z'" line below, regarding UPPER_CASE and lower_case
#       variables.
#
#
#   TO DO:
#     What I want to hear about:
#       . Any HTML tags not being converted here.
#       . Any method of introducing JavaScript or other script, that's not
#           being filtered out here.
#       . Any script MIME types other than application/x-javascript and
#           text/javascript.
#       . Any HTML-like MIME types other than text/html.
#
#     plug any other script holes (e.g. MSIE-proprietary, other MIME types?)
#     This could use cleaner URL-encoding all over ($base_url, etc.)
#     more error checking?
#     find a simple encryption technique for proxy_encode()
#     support more protocols, like SSL or gopher
#     For ad filtering, add option to disable images from servers other than
#       that of the containing HTML page?  Is it worth it?
#
#
#   BUGS:
#     Anonymity is not perfect.  In particular, there may be some remaining
#       JavaScript holes.
#     URLs generated by JavaScript or similar mechanisms won't be re-proxy'ed
#       correctly.  JavaScript in general may not work as expected.
#     Since ALL of your cookies are sent to this script (which then chooses
#       the relevant ones), some cookies could conceivably be dropped if
#       you accumulate a whole lot.  I haven't seen this happen yet.
#     The regex that converts URLs is not perfect-- it always matches what's
#       needed, but sometimes it matches more.  A perfect regex would be
#       much slower.  See the notes above that block for more details.
#     When converted, URLs are not HTML-unescaped then re-escaped as they
#       should be.  To fix this would cost a lot of CPU time and would seldom
#       matter, but it's easy enough to do if there's demand.
#     Comments and script contents aren't parsed correctly, so sometimes
#       URLs within those will be mistakenly converted.  In reality, this
#       sometimes approximates what you want anyway.
#
#
#   I first wrote this in 1996 as an experiment to allow indirect browsing.
#     The original seed was a program I wrote for Rich Morin's article
#     in the June 1996 issue of Unix Review, online at
#     http://www.cfcl.com/tin/P/199606.shtml.
#
#   Confession: I didn't originally write this with the spec for HTTP
#     proxies in mind, and there are probably some violations of the protocol
#     (at least for proxies).  This whole thing is one big violation of the
#     proxy model anyway, so I hereby rationalize that the spec can be widely
#     interpreted here.  If there is demand, I can make it more conformant.
#
#--------------------------------------------------------------------------
#
# Here are all global variables used in this script.  Some could be
#   rewritten as local variables, possibly with a performance loss.  With
#   these declared and all local() statements converted to my(), you can
#   (almost) compile with "use strict", which is always a good idea when
#   programming in Perl 5.
#
# my($TEXT_ONLY,
#    $REMOVE_COOKIES, $REMOVE_SCRIPTS, $FILTER_ADS, $INSERT_ENTRY_FORM,
#    $ALLOW_USER_CONFIG,
#    $NO_COOKIE_WITH_IMAGE, @ALLOWED_COOKIE_SERVERS, @BANNED_COOKIE_SERVERS,
#    @ALLOWED_SCRIPT_SERVERS, @BANNED_SCRIPT_SERVERS,
#    @ALLOWED_SERVERS, @BANNED_SERVERS,
#    @BANNED_IMAGE_URL_PATTERNS, $RETURN_EMPTY_GIF,
#    $INSERT_HTML, $INSERT_FILE, $ANONYMIZE_INSERTION, $FORM_AFTER_INSERTION,
#    $DOING_INSERT,
#    $USER_AGENT, $SHOW_FTP_WELCOME, $MAX_REQUEST_SIZE, $PROXY_AUTH,
#    $RUNNING_ON_WINDOWS, $FASTER_HTML_LESS_PRIVACY, $RUNNING_ON_SSL_SERVER,
#    $NOT_RUNNING_AS_NPH, $NO_BROWSE_THROUGH_SELF, $HTTP_1_0,
#    $USE_POST_ON_START, $NON_TEXT_EXTENSIONS, @PROXY_GROUP,
#    @SCRIPT_MIME_TYPES, $SCRIPT_TYPE_REGEX, %IS_SCRIPT_MIME_TYPE,
#    $AF_INET, $SOCK_STREAM,
#    $PROXY_VERSION, $FLAGS, @FLAGS, $env_accept,
#    $images_are_banned_here, $scripts_are_banned_here, $cookies_are_banned_here,
#    $URL, $scheme, $authority, $path, $host, $port, $username, $password,
#    $cookie_to_server, %auth,
#    $this_url, $this_url_inframe, $this_url_noframe, $is_in_frame,
#    $base_url, $base_scheme, $base_host, $base_path,
#    $status, $headers, $body, @body, @welcome, $is_html, $response_sent,
#    $tag_name, $att_init) ;
#
#--------------------------------------------------------------------------

#--------------------------------------------------------------------------
#    user configuration
#--------------------------------------------------------------------------

# If set, then proxy traffic will be restricted to text data only, to save 
#   bandwidth (though it can still be circumvented with uuencode, etc.).

$TEXT_ONLY= 0 ;      # set to 1 to allow only text data, 0 to allow all


# If set, then prevent all cookies from passing through the proxy.  To allow
#   cookies from some servers, set this to 0 and see @ALLOWED_COOKIE_SERVERS
#   and @BANNED_COOKIE_SERVERS below.  You can also prevent cookies with
#   images by setting $NO_COOKIE_WITH_IMAGE below.
# Note that this only affects cookies from the target server.   The proxy
#   script sends its own cookies for other reasons too, like to support
#   authentication.  This flag does not stop these cookies from being sent.

$REMOVE_COOKIES= 0 ;


# If set, then remove as much scripting as possible.  On the HTTP level:
#   . prevent transmission of script MIME types (which only works if the server
#       marks them as such, so a malicious server could get around this).
#   . remove Content-Style-Type: headers that are set to a script MIME type.
#   . remove Link: headers entirely, since there may be an obscure script hole.
# Within HTML resources:
#   . remove <script>...</script>
#   . remove HTML tag attributes whose names begin with "on".
#   . remove <style>...</style> where "type" attribute is a script MIME type.
#   . remove various HTML tags that appear to link to a script MIME type.
#   . remove <meta> tags that define the default Content-Style-Type to be a
#       script MIME type.
#   . remove Netscape-specific "JavaScript entities", i.e. any attributes
#       containing the string "&{".
#   . remove "JavaScript conditional comments"
# This removes most, but not all, JavaScript and other scripts.
# To allow scripts from some sites but not from others, set this to 0 and 
#   see @ALLOWED_SCRIPT_SERVERS and @BANNED_SCRIPT_SERVERS below.
# The list of script MIME types is surely incomplete.  See @SCRIPT_MIME_TYPES
#   below.
# I do NOT know for certain that this removes all JavaScript!  It removes all
#   that I know of, but I don't have a definitive list of places JavaScript
#   can exist.  If you do, please send it to me.  EVEN RUNNING A SINGLE
#   JAVASCRIPT STATEMENT CAN COMPROMISE YOUR ANONYMITY!  Just so you know.
# Richard Smith has a good test site for anonymizing proxies, at
#   http://users.rcn.com/rms2000/anon/test.htm
# Note that turning this on removes most popup ads!  :)

$REMOVE_SCRIPTS= 1 ;


# If set, then filter out images that match one of @BANNED_IMAGE_URL_PATTERNS,
#   below.  Also removes cookies attached to images, by setting
#   $NO_COOKIE_WITH_IMAGE.
# To remove most popup advertisements, also set $REMOVE_SCRIPTS=1 above.

$FILTER_ADS= 0 ;


# If set, insert a compact version of the URL entry form at the top of each
#   page.  This will also display the URL currently being viewed.
# It will not be inserted in frames, because that would look really ugly.
# If you want to customize the appearance of the form, modify the routine
#   mini_start_form() near the end of the script.
# If you want to insert something other than this form, see $INSERT_HTML and
#   $INSERT_FILE below.
# Users should realize that options changed via the form only take affect when
#   the form is submitted by entering a new URL or pressing the "Go" button.
#   Selecting an option, then following a link on the page, will not cause
#   the option to take effect.
# Users should also realize that anything inserted into a page may throw
#   off any precise layout.  The insertion will also be subject to
#   background colors and images, and any other page-wide settings.

$INSERT_ENTRY_FORM= 0 ;


# If set, then allow the user to control $REMOVE_COOKIES, $REMOVE_SCRIPTS,
#   $FILTER_ADS, and $INSERT_ENTRY_FORM, via a flag string at start of
#   PATH_INFO.  Note that they can't fine-tune any related options, such
#   as the various @ALLOWED... and @BANNED... lists.

$ALLOW_USER_CONFIG= 1 ;


# Create your own proxy_encode() and proxy_decode() to tranform the target
#   URL to and from the format that will be stored in PATH_INFO.  The encoded
#   form should only contain characters that are legal in PATH_INFO.  This
#   varies by server, but using only printable chars, no "?" or "#", and no 
#   two adjacent slashes ("//") works on most servers.  Don't let PATH_INFO
#   contain the strings "./", "/.", "../", or "/..", or else it may get
#   compressed like a pathname somewhere.  Try not to make the resulting
#   string too long, either.
# Of course, proxy_decode() must exactly undo whatever proxy_encode() does.
# Make proxy_encode() as fast as possible-- it's a major bottleneck for the 
#   whole program.
# Because of the simplified absolute URL resolution in full_url(), there may
#   be ".." segments in the default encoding here, notably in the first path
#   segment.  Normally, that's just an HTML mistake, but please tell me if
#   you see any privacy exploit with it.

sub proxy_encode {
    local($URL)= @_ ;
    $URL=~ s#^([\w+.-]+)://#$1/# ;                 # http://xxx -> http/xxx
#    $URL=~ s/(.)/ sprintf('%02x',ord($1)) /ge ;   # each char -> 2-hex
#    $URL=~ tr/a-zA-Z/n-za-mN-ZA-M/ ;              # rot-13
    return $URL ;
}

sub proxy_decode {
    local($PATH_INFO)= @_ ;
#    $PATH_INFO=~ tr/a-zA-Z/n-za-mN-ZA-M/ ;        # rot-13
#    $PATH_INFO=~ s/([0-9A-Fa-f]{2})/ sprintf("%c",hex($1)) /ge ;
    $PATH_INFO=~ s#^([\w+.-]+)/#$1://# ;           # http/xxx -> http://xxx
    return $PATH_INFO ;
}



# Use these to restrict which servers a user can visit through this proxy.
# They work like @ALLOWED_COOKIE_SERVERS and @BANNED_COOKIE_SERVERS below
#   (see those notes for more details):  They're a list of patterns; if
#   @ALLOWED_SERVERS is set, then only allow access to those servers; always
#   prevent access to any servers in @BANNED_SERVERS.
# Again, these are patterns, not literal strings-- if you want to specify
#   one server like "www.example.com", use "^www\.example\.com$", not
#   the pattern "www.example.com"; otherwise, you may match more than that
#   one server.
@ALLOWED_SERVERS= () ;
@BANNED_SERVERS= () ;



# Settings to fine-tune cookie behavior, if $REMOVE_COOKIES is not set above.
# Note that @ALLOWED_COOKIE_SERVERS and @BANNED_COOKIE_SERVERS use Perl
#   regular expressions (aka patterns or regexes), not literal host names.
#   To turn a hostname into a pattern, replace every "." with "\.", add "^"
#   to the beginning, and add "$" to the end.  For example, "www.example.com"
#   becomes "^www\.example\.com$".  To match *every* host ending in
#   something, leave out the "^".  For example, "\.example\.com$" matches
#   every host ending in ".example.com".

# Set this to reject cookies attached to images
$NO_COOKIE_WITH_IMAGE= 1 ;

# If non-empty, only allow cookies from servers matching one of these patterns.
# Comment this out to allow all cookies (subject to @BANNED_COOKIE_SERVERS).
#@ALLOWED_COOKIE_SERVERS= ('\bslashdot\.org$') ;

# Reject cookies from servers matching these patterns.
@BANNED_COOKIE_SERVERS= (
    '\.doubleclick\.net$', 
    '\.preferences\.com$',
    '\.imgis\.com$',
    '\.adforce\.com$',
    '\.focalink\.com$',
    '\.flycast\.com$',
    '\.go\.com$',
    '\.avenuea\.com$',
    '\.linkexchange\.com$',
    '\.pathfinder\.com$',
    '\.burstnet\.com$',
    '\btripod\.com$',
    '\bgeocities\.yahoo\.com$',
    '\.mediaplex\.com$',
    ) ;


# If $REMOVE_SCRIPTS is set, then all scripts will be removed.  Otherwise,
#   set @ALLOWED_SCRIPT_SERVERS and @BANNED_SCRIPT_SERVERS to select which
#   sites you'll allow scripts from.  They work like @ALLOWED_COOKIE_SERVERS
#   and @BANNED_COOKIE_SERVERS above:  They're a list of patterns; always
#   reject scripts from @BANNED_SCRIPT_SERVERS, then if @ALLOWED_SCRIPT_SERVERS
#   is set, restrict scripts to those servers.  Empty lists will allow all
#   scripts to pass (unless $REMOVE_SCRIPTS is set above).
@ALLOWED_SCRIPT_SERVERS= () ;
@BANNED_SCRIPT_SERVERS= () ;



# Various options to help filter ads and stop cookie-based privacy invasion.
# These are only effective if $FILTER_ADS is set above.
# The default settings are OK, but the sample pattern lists are very simple
#   and could be expanded.  Improvements welcome!
# The champs of ad-filtering are JunkBusters, at http://www.junkbusters.com.
#   If ad-filtering is your primary motive, go there, dey da best.
# @BANNED_IMAGE_URL_PATTERNS uses Perl patterns.  If an image's URL
#   matches one of the patterns, it will not be downloaded (typically for
#   ad-filtering).  For more information on Perl regular expressions, see
#   the Perl documentation.
# Note that most popup ads can be removed by setting $REMOVE_SCRIPTS=1, above.

# If set, replace banned images with 1x1 transparent GIF
$RETURN_EMPTY_GIF= 1 ;

# Reject images whose URL matches any of these patterns.  This is just a
#   sample list; add more depending on which sites you visit.
@BANNED_IMAGE_URL_PATTERNS= (
    '\b[a-z]\.doubleclick\.net(:\d*)?/',
    '\.imgis\.com\b',
    '\.adforce\.com\b',
    '\.avenuea\.com\b',
    '\.go\.com(:\d*)?/ad/',
    '\.eimg\.com\b',
    '\bexcite\.netscape\.com(:\d*)?/.*/promo/',
    '/excitenetscapepromos/',
    '\.yimg\.com(:\d*)?.*/promo/',
    '\bus\.yimg\.com/[a-z]/(\w\w)/\1',
    '\bpromotions\.yahoo\.com(:\d*)?/promotions/',
    '\bcnn\.com(:\d*)?/ads/',
    'ads\.msn\.com\b',
    '\blinkexchange\.com\b',
    '\badknowledge\.com\b',
    '/SmartBanner/',
    '\bdeja\.com/ads/',
    '\bimage\.pathfinder\.com/sponsors',
    'ads\.tripod\.com',
    ) ;



# If either $INSERT_HTML or $INSERT_FILE is set, then that HTML text or the
#   contents of that named file (respectively) will be inserted into any
#   downloaded HTML page.  $INSERT_HTML takes precedence over $INSERT_FILE.
# Nothing will be inserted in frames, because then it would look really ugly.
# The frame avoidance means that it's fairly easy for a surfer to bypass
#   this, by pretending in effect to be in a frame.  There's not much we
#   can do about that, since a page is retrieved the same way regardless of
#   whether it's in a frame.  This script uses a parameter in the URL to
#   communicate to itself between calls, but the user can merely change
#   that URL to make the script think it's retrieving a page for a frame.
# If you're using $INSERT_FILE and mod_perl, do NOT set $INSERT_HTML here,
#   not even to ''.  The way it's set up, the file in $INSERT_FILE will
#   only have to be read once*, and it's stored in $INSERT_HTML.  If you
#   clear the value of $INSERT_HTML here, the program will have to read the
#   file every time it's run-- very inefficient.
# [*Well, once for every new mod_perl context, which may be more than once
#   but is less than every time the script is run.]
# As with $INSERT_ENTRY_FORM, note that any insertion may throw off any
#   precise layout, and the insertion is subject to background colors and
#   other page-wide settings.

#$INSERT_HTML= "<h1>This is an inserted header</h1><hr>" ;
#$INSERT_FILE= 'insert_file_name' ;


# If your insertion has links that you want anonymized along with the rest
#   of the downloaded HTML, then set this to 1.  Otherwise leave it at 0.
$ANONYMIZE_INSERTION= 0 ;

# If there's both a URL entry form and an insertion via $INSERT_HTML or
#   $INSERT_FILE on the same page, the entry form normally goes at the top.
#   Set this to put it after the other insertion.
$FORM_AFTER_INSERTION= 0 ;



# Depending on whether you're using Perl 5 or Perl 4 (or want a slight speed
#   advantage), use ONE of the following two blocks.

# Use this for Perl 5 (slows it down a little).
# You can also remove the "local($1,$2)" line in full_url() for slight speedup.
use Socket qw(AF_INET SOCK_STREAM) ;
$AF_INET= AF_INET ; $SOCK_STREAM= SOCK_STREAM ;

# Use this line for Perl 4, or to get a little more speed.  The values of
#   $AF_INET and $SOCK_STREAM must be set correctly for your system; the
#   correct values are usually in either /usr/include/sys/socket.h
#   or /usr/include/linux/socket.h .
#$AF_INET= 2 ; $SOCK_STREAM= 1 ;  # these are GUESSES; set correct values



# Set this if the script is running on an SSL server, i.e. it is accessed
#   through a URL starting with "https:".  This is needed to know how to
#   route URLs back through the proxy.  Regrettably, standard CGI does not
#   yet provide a way for scripts to determine this without help.
# This has nothing to do with retrieving pages that are on SSL servers.
$RUNNING_ON_SSL_SERVER= 0 ;


# If you're running this on a Windows machine, set this.  Windows lacks some
#   features that exist on Unix, and there are bugs in the IIS server.  If
#   this variable is set, the script tries to work around these limitations.
$RUNNING_ON_WINDOWS= 0 ;


# If your server doesn't support NPH scripts, then set this variable to true
#   and try running the script as a normal non-NPH script.  HOWEVER, this
#   won't work as well as running it as NPH; there may be bugs, maybe some
#   privacy holes, and results may not be consistent.  It's a hack.
# Try to install the script as NPH before you use this option, because
#   this may not work.  NPH is supported on almost all servers, and it's
#   usually very easy to install a script as NPH (on Apache, for example,
#   you just need to name the script something starting with "nph-").
# One example of a problem is that Location: headers may get messed up,
#   because they mean different things in an NPH and a non-NPH script.
#   You have been warned.
# For this to work, your server MUST support the "Status:" CGI response
#   header.
$NOT_RUNNING_AS_NPH= 0 ;


# Here's an experimental feature that may or may not be useful.  It's trivial
#   to add, so I added it.  It was inspired in part by Mike Reiter's and Avi
#   Rubin's "Crowds", at http://www.research.att.com/projects/crowds/ .
#   Let me know if you find a use for it.
# The idea is that you have a number of mutually-trusting, cooperating
#   proxies that you list in @PROXY_GROUP().  If that is set, then instead
#   of rerouting all URLs back through this proxy, the script will choose
#   one of these proxies at random to reroute all URLs through, for each
#   run.  This could be used to balance the load among several proxies, for
#   example.  Under certain conditions it could conceivably help privacy by
#   making it harder to track a user's session, but under certain other
#   conditions it could make it easier, depending on how many people,
#   proxies, and proxy servers are involved.  For each page, both its
#   included images and followed links will go through the same proxy, so a
#   clever target server could determine which proxy servers are in each
#   group.
# proxy_encode() and proxy_decode() must be the same for all proxies in
#   the group.
# Cookies and Basic authentication can't be supported with this, sorry, since
#   cookies can only be sent back to the proxy that created them.
# Set this to a list of absolute URLs of proxies, ending with "nph-proxy.cgi/"
#   (or whatever you named the script).  Be sure to include the URL of this
#   proxy, or it will never redirect back through here.  Each proxy in the
#   group should have the same @PROXY_GROUP.
# Alternately, you could set each proxy's @PROXY_GROUP differently for more
#   creative configuration, such as to balance the load unevenly, or to send
#   users through a "round-robin" cycle of proxies.

#@PROXY_GROUP= ('http://www.example.com/~grommit/proxy/nph-proxy.cgi/',
#	        'http://www.fnord.mil/langley/bavaria/atlantis/nph-proxy.cgi/',
#	        'http://www.nothinghere.gov/No/Such/Agency/nph-proxy.cgi/',
#	        ) ;


# Set HTTP proxy if needed
#$ENV{'http_proxy'}= '' ;
#$ENV{'no_proxy'}= '' ;

# If your HTTP proxy requires authentication, this script supports it in a
#   limited way: you can have a single username/password pair to authenticate
#   with, regardless of realm.  In other words, multiple realms aren't
#   supported for proxy authentication (though they are for normal server
#   authentication, elsewhere).
# Set $PROXY_AUTH either in the form of "username:password", or to the actual
#   base64 string that gets sent in the Proxy-Authorization: header.
#$PROXY_AUTH= 'Aladdin:open sesame' ;


# There's a would-be privacy hole:  If an HTML tag has more than one of the
#   same URL-containing attribute, this proxy only converts the last of that
#   attribute to have its URL point back through this script.  For example,
#   with <img src="evil.cgi" src="decoy.gif">, only decoy.gif will be
#   converted, and evil.cgi will most likely be called directly from your
#   browser.  It's not valid HTML, but it's exploitable.
# So, this program removes duplicate attributes in each tag.  Unfortunately,
#   that action slows down the main URL-converting block by about 10%, and I
#   can't find any faster way.  But if you're not worried about this privacy
#   hole, and performance is a major issue for you, you can set this flag to
#   disable the protection.
$FASTER_HTML_LESS_PRIVACY= 0 ;


# Set $USER_AGENT to something generic like this if you want to be extra
#   careful.  Conceivably, revealing which browser you're using may be a
#   slight privacy or security risk.
# However, note that some URLs serve different pages depending on which
#   browser you're using, so some pages will change if you set this.
# This defaults to the user's HTTP_USER_AGENT.
#$USER_AGENT= 'Mozilla/4.05 [en] (X11; I; Linux 2.0.34 i586)' ;


# Unlike a normal browser which can keep an FTP session open between requests,
#   this script must make a new connection with each request.  Thus, the
#   FTP welcome message (e.g. the README file) will be received every time;
#   there's no way for this script to know if you've been here before.  Set
#   $SHOW_FTP_WELCOME to true to always show the welcome message, or false
#   to never show it.
$SHOW_FTP_WELCOME= 1 ;


# For the obscure case when a POST must be repeated because of user
#   authentication, this is the max size of the request body that this
#   script will store locally.  If CONTENT_LENGTH is bigger than this,
#   the body's not saved at all-- the first POST will be correct, but
#   the second will not happen at all (since a partial POST is worse than
#   nothing).
$MAX_REQUEST_SIZE= 4194304 ;  # that's 4 Meg to you and me


# Apparently, some censoring filters search outgoing request URIs, but not
#   POST request bodies.  Set this to make the initial input form submit
#   using POST instead of GET.
$USE_POST_ON_START= 1 ;


# If set, this option prevents a user from calling the proxy through the
#   proxy itself, i.e. looping.  It's normally a mistake on the user's part,
#   and a waste of resources.  A malicious user can probably still find a
#   way to make it call itself, or can always use two proxies to call each
#   other in a loop.  Also, this doesn't account for IP addresses or
#   multiple hostnames for the same server.
$NO_BROWSE_THROUGH_SELF= 0 ;


# All MIME types that could identify a script, and which will be filtered out
#   as well as possible if removing scripts.  HTTP responses with
#   Content-type: set to one of these will be nixed, and style sheets with
#   a type here will be disabled.
# Obviously, this is an incomplete list-- what about ActiveX, other script
#   types, or future script types?
@SCRIPT_MIME_TYPES= ( 'application/x-javascript', 'text/javascript',
		      'text/scriptlet', 'text/x-scriptlet',
		      'application/x-vbscript', 'application/hta' ) ;


# This is a list of all file extensions that will be disallowed if
#   $TEXT_ONLY is set.  It's an inexact science.  If you want to ban
#   other file extensions, you can add more to this list.  Note that
#   removing extensions from this list won't necessarily allow those
#   files through, since there are other ways $TEXT_ONLY is implemented,
#   such as only allowing MIME types of text/* .
# The format of this list is one long string, with the extensions
#   separated by "|".  This is because the string is actually used as
#   a regular expression.  Don't worry if you don't know what that means.
# Extensions are roughly taken from Netscape's "Helper Preferences" screen
#   (but that was in 1996).
# Boy, I really need that list of binary file extensions.  Write me if
#   you have one (August 1998).
$NON_TEXT_EXTENSIONS= 
	  'gif|jpeg|jpe|jpg|tiff|tif|png|bmp|xbm'   # images
	. '|mp2|mp3|wav|aif|aiff|au|snd'            # audios
	. '|avi|qt|mov|mpeg|mpg|mpe'                # videos
	. '|gz|Z|exe|gtar|tar|zip|sit|hqx|pdf'      # applications
	. '|ram|rm|ra' ;                            # others



#--------------------------------------------------------------------------
#    end of (useful) user configuration
#--------------------------------------------------------------------------

# This is now set directly in footer(), the only place it's used.
# $PROXY_VERSION= '1.4.1' ;


# This is needed to run an NPH script under mod_perl.
# Other stuff needed for mod_perl:
#   must use at least Perl 5.004, or STDIN and STDOUT won't behave correctly;
#   cannot use exit();
#   must initialize or reset all vars;
#   regex's with /o option retain state between calls, so be careful;
#   typeglobbing of *STDIN doesn't work, so must pass filehandles as strings.
local($|)= 1 ;

# For mod_perl, global variables are retained between calls, so they must
#   be initialized correctly.  Here, we assume all UPPER_CASE variables are
#   initialized correctly elsewhere, and no lower_case variables are set
#   before here.  It's a little hacky, deprecated, and possibly error-prone
#   if the assumption is broken elsewhere, but it's fast and simple.
reset 'a-z' ;


# Fix submitted by Alex Freed:  Under some unidentified conditions,
#   instances of nph-proxy.cgi can hang around for many hours and drag the
#   system.  So until we figure out why that is, here's a 10-minute timeout.
#   Please write me with any insight into this, since I can't reproduce the
#   problem.  Under what conditions, on what systems, does it happen?
# 9-9-1999: One theory is that it's a bug in older Apaches, and is fixed by
#   upgrading to Apache 1.3.6 or better.  Julian Haight reports seeing the
#   same problem with other scripts on Apache 1.3.3, and it cleared up when
#   he upgraded to Apache 1.3.6.  Let me know if you can confirm this.
# Windows doesn't seem to support alarms, so don't do this on Windows.

unless ($RUNNING_ON_WINDOWS) {
    $SIG{'ALRM'} = 'timeexit' ;
    alarm(600);
}

# Exit upon timeout.  If you wish, add code to clean up and log an error.
sub timeexit { $ENV{'MOD_PERL'}  ? goto EXIT  : exit 1 }


# SCRIPT_NAME should have a leading slash, but the old CGI "standard" from
#   NCSA was unclear on that, so some servers didn't give it a leading
#   slash.  Here we ensure it has a leading slash.
$ENV{'SCRIPT_NAME'}=~ s#^/?#/# ;


# The IIS server doesn't set PATH_INFO correctly-- it sets it to the entire
#   request URI, rather than just the part after the script name.  So fix it
#   here if we're running on IIS.  Thanks to Dave Moscovitz for the info!
if ( $RUNNING_ON_WINDOWS && ($ENV{'SERVER_SOFTWARE'}=~ /IIS/) ) {
    $ENV{'PATH_INFO'} =~ s/^$ENV{'SCRIPT_NAME'}// ;
}


# PATH_INFO may or may not be URL-encoded when we get it; it seems to vary
#   by server.  This script assumes it's still encoded.  Thus, if it's not,
#   we need to re-encode it.
# The only time this seems to come up is when spaces are in URLs, correctly
#   represented in the URL as %20 but decoded to " " in PATH_INFO.  Thus,
#   this hack only focuses on space characters.  It's a hack that I'm not at
#   all comfortable with.  :P
# Very yucky business, this encoding thing.
if ($ENV{'PATH_INFO'}=~ / /) {
    $ENV{'PATH_INFO'} =~ s/%/%25/g ;
    $ENV{'PATH_INFO'} =~ s/ /%20/g ;
}


# PATH_INFO consists of a path segment of flags, followed by the encoded
#   target URL.  For example, PATH_INFO might be something like
#   "/01010/http/www.example.com".
# Now we strip flag segment off the front of PATH_INFO, and leave PATH_INFO
#   with only the encoded target URL.
# Thanks to Mike Harding for the idea of using another flag for the
#   $is_in_frame parameter, instead of using two parallel scripts.

# Set globals according to the flags at the start of PATH_INFO, and strip
#   those flags from PATH_INFO.  A little hacky, but works OK.
($FLAGS, $ENV{'PATH_INFO'})= $ENV{'PATH_INFO'}=~ m#/([^/]*)(/.*)?# ;
$FLAGS= '_' if $FLAGS eq '' ;   # avoid '//' in PATH_INFO

# Split $FLAGS into @FLAGS and assign to variables as needed.
@FLAGS= split(//, $FLAGS) ;

# Only set these options if allowed, and if flag segment was in PATH_INFO.
if ( $ALLOW_USER_CONFIG && ($FLAGS ne '') && ($FLAGS ne '_') ) {
    ($REMOVE_COOKIES, $REMOVE_SCRIPTS, $FILTER_ADS, $INSERT_ENTRY_FORM)=
	@FLAGS[0..3] ;
    # Force all flags to valid values (currently all are 1 or 0).
    $REMOVE_COOKIES=    $REMOVE_COOKIES     ? 1  : 0  ;
    $REMOVE_SCRIPTS=    $REMOVE_SCRIPTS     ? 1  : 0  ;
    $FILTER_ADS=        $FILTER_ADS         ? 1  : 0  ;
    $INSERT_ENTRY_FORM= $INSERT_ENTRY_FORM  ? 1  : 0  ;
}

# $is_in_frame is set in any case.
$is_in_frame= $FLAGS[4]  ? 1  : 0  ;


# Flags are now set, and PATH_INFO now contains only the encoded target URL.



# Create a one-flag test for whether we're inserting anything.
# This must happen after user flags are read, just above.
$DOING_INSERT= ($INSERT_FILE ne '') || ($INSERT_HTML ne '')
	     || $INSERT_ENTRY_FORM ;


# Hack to support non-NPH installation-- luckily, the format of a
#   non-NPH response is almost exactly the same as an NPH response.
#   The main difference is the first word in the status line-- something
#   like "HTTP/1.0 200 OK" can be simulated with "Status: 200 OK", as
#   long as the server supports the Status: CGI response header.  So,
#   we set that first word to either "HTTP/1.0" or "Status:", and use
#   it for all responses throughout the script.
# NOTE:  This is not the only difference between an NPH and a non-NPH
#   response.  For example, the Location: header has different semantics
#   between the two types of responses.  This hack is only an approximation
#   that we hope works most of the time.  It's better to install the script
#   as an NPH script if possible (which it almost always is).
$HTTP_1_0=  $NOT_RUNNING_AS_NPH   ? 'Status:'   : 'HTTP/1.0' ;


# Base64-encode $PROXY_AUTH if it's not done so already.
$PROXY_AUTH= &base64($PROXY_AUTH)  if $PROXY_AUTH=~ /:/ ;


# Default to $ENV{'HTTP_USER_AGENT'}
$USER_AGENT= $ENV{'HTTP_USER_AGENT'} if $USER_AGENT eq '' ;


# Copy often-used environment vars into scalars, for efficiency
$env_accept= $ENV{'HTTP_ACCEPT'} || '*/*' ;     # may be modified later


# One user reported problems with binary files on certain other OS's, and
#   this seemed to fix it.  Supposedly, either this or the "binmode S"
#   statements below the newsocketto() calls work, or all; I'm putting all in.
#   Tell me anything new you figure out about this.
binmode STDOUT ;


# %urlsin was an associative array that listed tags and attributes that 
#   may contain URLs, declared like:
# %urlsin= ('a', 'href',
#           'applet', 'codebase',
#           'fig', 'src|imagemap',
#           'form', 'action|script',  ...
#          ) ;


# OK, let's time this thing
#$starttime= time ;
#($sutime,$sstime)= (times)[0,1] ;

#--------------------------------------------------------------------------
#    parse URL, make checks, and set various globals
#--------------------------------------------------------------------------

# Calculate $this_url for use later in &full_url() and elsewhere.  It's an
#   integral part of &full_url(), placed here for speed, similar to the
#   variables set in &fix_base_vars.
# $this_url is an absolute URL pointing to this proxy script, complete with
#   trailing slash.  It has to be absolute to serve those HTML attributes
#   that require absolute URLs, like <base href>.
# Included in $this_url is the initial flag segment of PATH_INFO.  Thus, a
#   complete $this_url might be "http://www.example.com/nph-proxy.cgi/01010/" .
# Another way to describe $this_url is that it's everything in a complete
#   proxified URL except the encoded target URL at the end.  It's almost
#   always used by appending it with &proxy_encode(something).

# Set $this_url to a random element of @PROXY_GROUP, if that is set.
if (@PROXY_GROUP) {
    srand( $$ ^ unpack('%32L*', $ENV{'PATH_INFO'}) ) ;  # seed with $$, URL
    $this_url= $PROXY_GROUP[ rand(scalar @PROXY_GROUP) ] ;
    $this_url=~ s#/?$#/# ;    # guarantee trailing slash

} else {
    # Build the local $this_url from environment variables.  Only include
    #   SERVER_PORT if it's not 80 (or 443 for SSL).
    $this_url= $RUNNING_ON_SSL_SERVER
	?  join('', 'https://', $ENV{'SERVER_NAME'}, 
		($ENV{'SERVER_PORT'}==443  ? ''  : ':'.$ENV{'SERVER_PORT'}),
		$ENV{'SCRIPT_NAME'}, '/')
	:  join('', 'http://', $ENV{'SERVER_NAME'}, 
		($ENV{'SERVER_PORT'}==80  ? ''  : ':'.$ENV{'SERVER_PORT'}),
		$ENV{'SCRIPT_NAME'}, '/') ;
}

# Append any flags that are common to all variants of $this_url (e.g.
#   not the frame flag, because the $this_url variants below vary by
#   that frame flag).
$this_url.= $REMOVE_COOKIES . $REMOVE_SCRIPTS . $FILTER_ADS
	  . $INSERT_ENTRY_FORM ;

# Create any needed $this_url variants from $this_url so far.
if ($DOING_INSERT) {
    $this_url_inframe= $this_url . '1/' ;
    $this_url_noframe= $this_url . '0/' ;
}

# Finally, complete $this_url with any flags not added above, plus final "/".
$this_url.= $is_in_frame . '/' ;



# If there's no PATH_INFO, then start a browsing session.
&show_start_form if $ENV{'PATH_INFO'}=~ m#^/?$# ;


# Read the URL from PATH_INFO, stripping the leading slash
$URL= &proxy_decode(substr($ENV{'PATH_INFO'},1)) ;


# Set the query string correctly, from either $ENV{QUERY_STRING} or what's
#   already in $URL.
# The query string may exist in either the encoded URL or in the containing
#   URL, as $ENV{QUERY_STRING}.  If the former, then the query string was
#   (definitely?) in a referenced URL, while the latter most likely implies a
#   GET form input.  Either query string is valid, but form input takes
#   precedence-- if $ENV{QUERY_STRING} exists, it should be used over any
#   query string in the encoded URL.
# Note that Netscape does not pass any query string data that is part of the
#   URL in the <form action> attribute, which is probably correct behaviour.
#   For this program to act exactly the same, it would need to strip the
#   query string when updating all <form action> URLs, way below.
# Question:  Is there ever a valid case when both QUERY_STRINGs exist??

$URL=~ s/(\?.*)?$/?$ENV{'QUERY_STRING'}/   if $ENV{'QUERY_STRING'} ne '' ;


# Parse the URL, using a regex modelled from the one in RFC 2396 (URI syntax),
#   appendix B.
# This assumes a hierarchical scheme; it won't work for e.g. mailto:
# "authority" is the combination of host, port, and possibly other info.
# Note that $path here will also contain any query component; it's more like
#   the request URI.
# Note that $URL is guaranteed to be an absolute URL with no "#" fragment, 
#   though this does little error-checking.  Note also that the old ";" 
#   parameters are now included in the path component.

($scheme, $authority, $path)= ($URL=~ m#^([\w+.-]+)://([^/?]*)(.*)$#i) ;
$scheme=~ tr/A-Z/a-z/ ;
$path= "/$path" if $path!~ m#^/# ;   # if path is '' or contains only query


# Magic here-- if $URL uses special scheme "x-proxy", immediately call the
#   general-purpose xproxy() routine.
&xproxy($URL) if $scheme eq 'x-proxy' ;


# Set $is_html if $path (minus query) ends in .htm or .html .
# MSIE has a bug (and privacy hole) whereby URLs with QUERY_STRING ending
#   in .htm or .html are mistakenly treated as HTML, and thus could have 
#   untranslated links, <script> blocks, etc.  So for those cases, set
#   $is_html=true to make sure we later transform it as necessary.
if ($ENV{'HTTP_USER_AGENT'}=~ /MSIE/) {
    $is_html= 1  if $path=~ /\.html?(\?|$)/i ;
} else {
    $is_html= 1  if $path=~ /^[^?]*\.html?(\?|$)/i ;
}


# Alert the user to unsupported URL, with an intermediate page
&unsupported_warning($URL) unless ($scheme=~ /^(http|ftp)$/) ;

# Require a host to be present (for $base_url safety later)
# jsm-- actually, should restrict this to valid hostname characters.
&HTMLdie('The target URL cannot contain an empty host name.')
    unless $authority=~ /^\w/ ;


# Parse $authority into $host, $port, and possibly others, depending on
#   which URL scheme is used.
# Since most URL schemes use the simple host:port, make that the default.
#   This may avoid oversight later when other URL schemes are added.
# Note that this does not set $port to a default.  In the interest of
#   encapsulation, the default $port should be set in the routine that
#   implements the protocol (i.e. http_get(), ftp_get(), etc.)

if ($scheme eq 'ftp') {
    # FTP authority can be username:password@host:port, with username,
    #   password, and port all optional.
    # Embedding your username/password in a URL is NOT RECOMMENDED!  Here,
    #   the second clause should almost always be used.
    if ($authority=~ /@/) {
	($username, $password, $host, $port)=
	    $authority=~ /([^:@]*):?([^@]*)@([^:]*):?(.*)/ ;
    } else {
	($username, $password)= ('anonymous', 'not@available.com') ;
	($host, $port)= $authority=~ /^([^:]*):?(.*)$/ ;
    }

} else {
    ($host, $port)= $authority=~ /^([^:]*):?(.*)$/ ;   # covers HTTP, etc.
}

$host=~ tr/A-Z/a-z/ ;   # hostnames are case-insensitive
$host=~ s/\.*$//g ;     # removes trailing dots to close a potential exploit


# If so configured, disallow browsing back through the script itself (looping).
# This assumes the script can only be called by an http:// or https:// URL.
if ($NO_BROWSE_THROUGH_SELF) {
    # Default $port's not set yet, so hack up an ad hoc version.
    local($port2)=  $port || ( $scheme eq 'https'  ? 443  : 80 ) ;
    &loop_disallowed_die($URL)
	if     ($scheme=~ /^https?/)
	    && ($host=~ /^$ENV{'SERVER_NAME'}$/i)
	    && ($port2 == $ENV{'SERVER_PORT'})
	    && ($path=~ /^$ENV{'SCRIPT_NAME'}\b/) ;
}


# Die if the target server is not allowed.
if (@ALLOWED_SERVERS) {
    local($server_is_allowed) ;
    foreach (@ALLOWED_SERVERS) {
	$server_is_allowed= 1, last   if $host=~ /$_/ ;
    }
    &banned_server_die unless $server_is_allowed ;
}
foreach (@BANNED_SERVERS) {
    &banned_server_die if $host=~ /$_/ ;
}


# If we're filtering ads, force $NO_COOKIE_WITH_IMAGE=1, and set
#   $images_are_banned_here appropriately.
if ($FILTER_ADS) {
    $NO_COOKIE_WITH_IMAGE= 1 ;
    foreach (@BANNED_IMAGE_URL_PATTERNS) {
	$images_are_banned_here= 1, last if $URL=~ /$_/ ;
    }
}


# Set $scripts_are_banned_here appropriately
$scripts_are_banned_here= $REMOVE_SCRIPTS ;
unless ($scripts_are_banned_here) {
    if (@ALLOWED_SCRIPT_SERVERS) {
	$scripts_are_banned_here= 1 ;
	foreach (@ALLOWED_SCRIPT_SERVERS) {
	    $scripts_are_banned_here= 0, last   if $host=~ /$_/ ;
	}
    }
    unless ($scripts_are_banned_here) {
	foreach (@BANNED_SCRIPT_SERVERS) {
	    $scripts_are_banned_here= 1, last   if $host=~ /$_/ ;
	}
    }
}


# Set $cookies_are_banned_here appropriately
$cookies_are_banned_here= $REMOVE_COOKIES ;
unless ($cookies_are_banned_here) {
    if (@ALLOWED_COOKIE_SERVERS) {
	$cookies_are_banned_here= 1 ;
	foreach (@ALLOWED_COOKIE_SERVERS) {
	    $cookies_are_banned_here= 0, last   if $host=~ /$_/ ;
	}
    }
    unless ($cookies_are_banned_here) {
	foreach (@BANNED_COOKIE_SERVERS) {
	    $cookies_are_banned_here= 1, last   if $host=~ /$_/ ;
	}
    }
}


# Regex that matches a script MIME type, and analogous hash.
if ($scripts_are_banned_here) {
    $SCRIPT_TYPE_REGEX= '(' . join("|", @SCRIPT_MIME_TYPES) . ')' ;
    @IS_SCRIPT_MIME_TYPE{@SCRIPT_MIME_TYPES}= (1) x @SCRIPT_MIME_TYPES ;
}


# Exclude non-text if it's not allowed.  Err on the side of allowing too much.
if ($TEXT_ONLY) {
    # First, forbid requests for filenames with non-text-type extensions
    &non_text_die if ($path=~ /\.($NON_TEXT_EXTENSIONS)(;|\?|$)/i) ;

    # Then, filter the "Accept:" header to accept only text
    $env_accept=~ s#\*/\*#text/*#g ;    # not strictly perfect
    $env_accept= join(', ', grep(m#^text/#i, split(/\s*,\s*/, $env_accept)) ) ;
    &non_text_die unless $env_accept ne '' ;
}


# For a potential banner ad, intercept request if it looks like an image is
#   requested, i.e. unless the Accept: header allows either text/... or */... .
if ($images_are_banned_here) {
    &skip_image unless grep(m#^(text|\*)/#i, split(/\s*,\s*/, $env_accept) ) ;
}


$*= 1 ;     # allow multi-line matching


# $base_url must be set correctly at any time &full_url() may be called.
#   &fix_base_vars() must be called as well, to set $base_scheme, $base_host 
#   and $base_path.
# Unfortunately, the base URL may change over the course of this program.  We
#   will keep it set based on whatever info we have so far, i.e. request URI,
#   then e.g. HTTP response headers, then e.g. <base> tag (which happens to
#   be in the reverse order of the ultimate precedence).
$base_url= $URL ;
&fix_base_vars ;   # must be called whenever $base_url is set


# Parse the cookie for real cookies and authentication information.
($cookie_to_server, %auth)= &parse_cookie($ENV{'HTTP_COOKIE'}, $path, $host) ;


#--------------------------------------------------------------------------
#    Retrieve the resource into $body or @body using the correct scheme, 
#      also setting $status, $headers, and $is_html (all globals).
#      $is_html indicates whether the original resource is HTML, not
#      if a generated response is in HTML (e.g. an error message).
#    $response_sent might be set, indicating the response was sent by the
#      subroutine.  This is appropriate for streaming media, for example.
#--------------------------------------------------------------------------

if ($scheme eq 'http') {
    &http_get ;
} elsif ($scheme eq 'ftp') {
    &ftp_get ;
}

#--------------------------------------------------------------------------
#    Modify entire response to point back through this script
#--------------------------------------------------------------------------

# NOTE: IT IS IMPORTANT TO DO THIS AS COMPLETELY AS POSSIBLE!  IF A
# USER UNKNOWINGLY GOES TO PAGE DIRECTLY AND NOT THROUGH THIS PROXY,
# HE/SHE MAY REVEAL HIM/HERSELF IN AN UNINTENDED WAY.

#--------------------------------------------------------------------------
# These were notes to myself from testing the speed of different methods.
#   Names like "nph-proxy2" refer to different modifications.  This version,
#   the fastest, was called nph-proxy2b.
#
# If YOU figure out a faster method, please tell me about it!
#
# It would certainly be much faster if rewritten in C, because you could
#   very quickly read each character from the input and write it to the
#   output, maintaining state and altering the data stream as needed.
#
#--------------------------------------------------------------------------
#
# [This version is nph-proxy2b:  Break into @body array, do not use 
#   %urlsin to test if tags should be updated, replacement strings 
#   use (a|b|c) syntax.]
#
# This is by far the most time-consuming part of the script, the updating
#   of all URLs in an HTML file.
#
# Results of informal speed testing (not what I expected):
#   Breaking into @body array instead of one big string definitely saves 
#       significant time-- compare nph-proxy2 to nph-proxy1.  One test
#       showed a time saving of 1/3 to 1/2.
#   Using a %urlsin array does NOT seem to save time, even when only used
#       as boolean test to see if tag might contain URL-- compare nph-proxy2
#       to nph-proxy3, nph-proxy4, and (most similar) nph-proxy5.  Oh well.
#   It seems that reading one tag at a time, converting, and sending it
#       through does NOT save elapsed time over reading all tags before
#       converting, like I thought it would.  Both the CPU and elapsed
#       time are longer for one-tag-at-a-time approach-- compare
#       nph-proxy2 and nph-piper2.
#
#   Results of nph-proxy2 (blocks for each tag) to nph-proxy2b 
#   (single "(att1|att2|att3)" style regex) testing:
#       Mixed results, but overall, using a single regex (nph-proxy2b)
#       takes less CPU "user time", about the same "system time", and
#       slightly more elapsed time than nph-proxy2; I don't know why.
#       The elapsed time is slightly more both within the script and at
#       the HTTP client's end.  All differences are less than 10% on
#       average, and nph-proxy2b occasionally shows LESS elapsed time
#       than nph-proxy2.  CPU time is always less for nph-proxy2b than
#       for nph-proxy2.
#   Since the bottleneck of CPU time is tighter than for elapsed time,
#       and since the elapsed-time loss is less than the CPU time gain,
#       let's go with nph-proxy2b (not that it really makes much difference).
#       Besides, the code is easier to read that way.
#
# So basically, breaking into @body array helps, but not much else does.
#
#--------------------------------------------------------------------------
# 8-4-98 JSM: Found a bug in the regex, so changed it.  It more resembles
#   nph-proxy2.cgi now.
#--------------------------------------------------------------------------


# If the resource is HTML (and not empty), update all URLs in all tags that 
#   refer to URLs.  Plus a bunch of other stuff.

if ( $is_html  && ($body[0] ne '') && !$response_sent ) {

    # Set $base_url if there is an HTML <base> tag.
    foreach (@body) {
	if (/<\s*base\b/i) {
	    $base_url= &HTMLunescape($1), &fix_base_vars, last
		if m#<\s*base\b[^>]*[^\w.>\/?&-]href\s*=\s*"?([\w+.-]+://[^\s">]+)#i ;
	}
	last if m#<\s*(/head|body)\b#i ; # stop looking after </head> or <body>
    }


    # If so configured, remove script elements as much as possible.
    if ($scripts_are_banned_here) {
	local($s) ;  # sequence number in .. tests
	foreach (@body) {

	    # Strip everything between <script> and </script>.
	    # Note the use of the seldom-seen ".." scalar operator.
	    if ( $s=(/<\s*script\b/i .. /<\s*\/script\b/i) ) {
		# Don't remove the part before the initial "<script>".
		if ($s==1) {
		    s/<\s*script\b(.|\n)*>//i ;
		} else {
		    $_= '' ;
		}
	    }

	    # If style sheet uses a script MIME type, remove it.
	    # Note the use of the seldom-seen ".." scalar operator.
	    if ( $s= (/<\s*style\b(.|\n)*[^\w.>\/?&-]type\s*=\s*["']?$SCRIPT_TYPE_REGEX\b/io
		   .. /<\s*\/style\b/i) ) {
		# Don't remove the part before the initial "<style>".
		if ($s==1) {
		    s/<\s*style\b(.|\n)*>//i ;
		} else {
		    $_= '' ;
		}
	    }

	    # Remove JavaScript "conditional comments", which begin with
	    #   "<!--&{".  They evaluate the initial expression, and depending
	    #   on that, include or exclude the rest of the comment.
	    # Now, I'm pretty sure that comments don't have to end with "-->";
	    #   there can be stuff in between the "--" and the final ">" (but
	    #   I can't find the spec-- enlighten me if you can!).  However,
	    #   it seems that most browsers only close comments on exactly
	    #   "-->", so let's err on the side of removing too much: remove
	    #   the whole comment.  It shouldn't be a big deal either way,
	    #   since embedded <script>'s will still be stripped out, above.
	    #   We just don't want the initial expression to be evaluated.
	    # Note the use of the seldom-seen ".." scalar operator.
	    if ( $s= (/<!--\s*&{/ .. /-->/) ) {
		# Don't remove the part before the initial "<!--&{".
		if ($s==1) {
		    s/<!--(.|\n)*//i ;
		} else {
		    $_= '' ;
		}
	    }

	    # Remove (Java-)script attributes.  For now, assume all (and only)
	    #   attributes that begin with "on" are script attributes; this
	    #   seems to be the case with HTML 4.0.
	    # Note that this regex suffers from the same imperfections as
	    #   the similar regex used below.  It may be more susceptible
	    #   to errors, since it catches every word beginning with "on"
	    #   followed by an "=", possibly with intervening blanks.
	    1 while s/(<\s*\w+[^>]*)[^\w.>\/?&-]on\w+\s*=\s*("[^">]*"|'[^'>]*'|[^'"][^\s>]*)\s*/$1/i ;

	    # Remove the Netscape-specific "JavaScript entities", which
	    #   allow JavaScript to be invoked within *any* HTML attribute.
	    #   Such attributes contain "&{" (which unfortunately means that
	    #   attributes that innocently contain that string will be
	    #   removed too).  Do this for all browsers, in case others
	    #   emulate Netscape.  Remove the entire attribute.
	    1 while s/(<\s*\w+[^>]*)[^\w.>\/?&-]\w+\s*=\s*("[^">]*&{[^">]*"|'[^'>]*&{[^'>]*'|[^\s>]*&{[^\s>]*)\s*/$1/ ;

	    # Remove various tags that might link to a resource with a
	    #   script MIME type:  <object> with "type" or "codetype"
	    #   attribute; <a>, <link>, or <param> with "type" attribute.
	    #   <script> and <style> are already handled above.  With <link>,
	    #   this handles externally-defined style sheets correctly.
	    # I took this list from the attribute list in the HTML 4.0 spec,
	    #   at http://www.w3.org/TR/REC-html40/index/attributes.html ,
	    #   handling all (appropriate) attributes with type "%ContentType".
	    s/<\s*object\b(.|\n)*[^\w.>\/?&-](type|codetype)\s*=\s*["']?$SCRIPT_TYPE_REGEX\b[^>]*>//io ;
	    s/<\s*(a|link|param)\b(.|\n)*[^\w.>\/?&-]type\s*=\s*["']?$SCRIPT_TYPE_REGEX\b[^>]*>//io ;

	    # Remove <meta http-equiv="Content-Style-Type" ...> tags if the
	    #   content attribute identifies a script MIME type.
	    if (/<\s*meta\b(.|\n)*[^\w.>\/?&-]http-equiv\s*=\s*["']?Content-Style-Type\b/i) {
		s/<[^>]*[^\w.>\/?&-]content\s*=\s*["']?$SCRIPT_TYPE_REGEX\b[^>]*>//io ;
	    }

	}


	# Remove the MSIE-specific "dynamic properties" in style attributes.
	#   I can't find a definitive reference for their syntax (tell me if
	#   you know of one), so err on the safe side and remove all style
	#   attributes that contain the string "expression(".  I think I've
	#   seen "function()" used too, so remove those.  But I don't know
	#   either JS or style sheets.
	if ($ENV{'HTTP_USER_AGENT'}=~ /MSIE/) {
	    foreach (@body) {
		1 while s/(<\s*\w+[^>]*)[^\w.>\/?&-]style\s*=\s*("[^">]*expression\s*\([^">]*"|'[^'>]*expression\s*\([^'>]*'|[^\s>]*expression\s*\([^\s>]*)\s*/$1/i ;
		1 while s/(<\s*\w+[^>]*)[^\w.>\/?&-]style\s*=\s*("[^">]*function\s*\([^">]*"|'[^'>]*function\s*\([^'>]*'|[^\s>]*function\s*\([^\s>]*)\s*/$1/i ;
	    }
	}


    }


    # Remove duplicate attributes-- they're not valid HTML and open a rare
    #   privacy hole in this script, because the conversion regex's below
    #   only transform the last of any attribute in a tag.
    unless ($FASTER_HTML_LESS_PRIVACY) {
	foreach (@body) {

	    # This regex isn't perfect, but will catch all offending HTML.
	    # Unfortunately, this one statement slows down the main block of
	    #   the program by about 10%-- yuck.  A few alternate methods
	    #   are below, but they all cost 9-11%.  They are: a) disable
	    #   the attribute by appending its name with "_bad"; b) remove
	    #   the whole tag (which messes up sites which mistakenly use
	    #   duplicate attributes, like Slashdot); and c) remove the part
	    #   of the tag after a duplicate attribute.
	    # If you find a faster way to do this, PLEASE TELL ME!

	    1 while s/(<[^>]*[^\w.>\/?&-]([\w.-]+)\s*=[^>]*[^\w.>\/?&-])\2\s*=\s*("[^">]*"|'[^'>]*'|[^'"][^\s>]*)/$1/i ;   # the "right" way
#	    1 while s/(<[^>]*\b(\w+)\s*=[^>]*[\s"']\2)\s*=/${1}_bad=/i ;   # a)
#	    next if s/(<[^>]*\b(\w+)\s*=[^>]*[\s"']\2\s*=[^>]*>?)//i ;     # b)
#	    next if s/(<[^>]*\b(\w+)\s*=[^>]*)[\s"']\2\s*=[^>]*(>?)/$1$3/i;# c)

	}
    }



    # If we're doing an insertion, then find the position in @body where
    #   it should be inserted.  In order of precedence, that's:  after the
    #   <body> tag, or after the <html> tag, or at the start of the HTML.
    # Do not insert it into frame documents, indicated by the <frameset> tag.
    # The variables $insert_pos and $is_frameset are set for later use.
    # Of course, this whole operation is skipped if we think this HTML
    #   resource is destined for a frame, as indicated by $is_in_frame.

    local($insert_pos, $is_frameset) ;
    if ($DOING_INSERT && !$is_in_frame) {
	local($body_pos, $html_pos, $i) ;
	for $i (0..$#body) {
	    $body_pos= $i+1,  last  if $body[$i]=~ /<\s*body\b/i ;
	    $is_frameset= 1,  last  if $body[$i]=~ /<\s*frameset\b/i ;
	    $html_pos= $i+1         if $body[$i]=~ /<\s*html\b/i ;
	}
	$insert_pos= $body_pos || $html_pos ;  # results in 0 if neither is set

	# This block adds the insertion when it SHOULD BE anonymized, which
	#   is why it comes before the URL conversion below.  Note that the
	#   insertion has to be correctly split into tags for the URL
	#   conversion to work.
	# $INSERT_HTML takes precedence over $INSERT_FILE.  If $INSERT_HTML
	#   is empty, we set it to the file contents of $INSERT_FILE and
	#   use $INSERT_HTML hereafter.  This means the inserted file is only
	#   opened and read once, even under mod_perl.  :)
	if ( $ANONYMIZE_INSERTION && !$is_frameset &&
	     (($INSERT_HTML ne '') || ($INSERT_FILE ne '')) ) {
	    local(@insertion) ;
	    $INSERT_HTML= &readfile($INSERT_FILE) if $INSERT_HTML eq '' ;
	    @insertion= $INSERT_HTML=~ /([^>]*>?)/g ;  # split on ">"
	    splice(@body, $insert_pos, 0, @insertion) ;
	    $insert_pos+= @insertion  if $FORM_AFTER_INSERTION ;
	}
    }



    # This is a complete list of HTML tags/attributes that may include a URL,
    #   compiled from the list at Sandia (read on 9-17-96, but last updated on
    #   12-7-95), and the latest HTML DTD as of 9-17-96.  The Sandia list
    #   includes non-standard tags found to be used by Netscape or Microsoft.
    # List updated 7-31-98 by JSM to include all URL-type attributes defined
    #   in HTML 4.0.
    # If anyone knows of a well-maintained list of standard and non-standard
    #   tags/attributes with URLs in them, please let me know!!
    # Note that most of these are very rarely used, if ever.  They're included
    #   for safety, since we don't want an anonymous user accidentally
    #   revealing themselves because of a non-anonymized URL.
    # This regex favors speed over readability.
    # This could probably be more efficient, but the solution eludes me.
    # Note that this is not a perfect regex.  The substitutions can be fooled
    #   by, e.g., <input value="<a href=bad"> or <input value="src=bad">.
    #   It could be fixed with a slower regex.
    # The regex terminates URLs at spaces; URLs with spaces are disallowed,
    #   but fragments sometimes contain them anyway.
    # Denoting which of these are for images/binaries might be helpful, if
    #   we need more elaborate text-only support.
    # Err on the side of matching too much-- we'd rather a little mangled HTML
    #   than an anonymity hole.
    # 8-4-98 JSM:  Changed the regex to fix a bug: would only change the 
    #   first URL in a tag.  New regex is probably slower.
    # 11-18-00 JSM:  Changed to handle attributes with leading spaces.
    # 1-13-01 JSM: Changed regex to match attribute boundaries better, because
    #   a simple \b is matching too much.  SGML attributes match \w[\w.-]*
    #   (except no underscores), so a boundary excludes those.  Add ">" too.
    #   We don't want to match words inside of URLs, so add "/" because of
    #   paths, and "?" and "&" because of query strings.  Thus the boundary
    #   match is [^\w.>\/?&-]... yeesh.  Regex now makes line longer than 80
    #   chars, bummer.  Time to use wider editor.

    foreach (@body) {

	# Extract tag name first-- saves ~40% time over old way, for this loop.
	# Also extract first non-blank after tag name-- if it's ">", then we
	#   can skip this too.
	($tag_name, $att_init)= /<\s*(\w+)\s*(.)/ ; # assumes tag names are \w+

	next if $tag_name eq '' ;    # e.g. for comments and end tags
	next if $att_init eq '>' ;   # when a tag has no attributes

	$tag_name=~ tr/A-Z/a-z/ ;


	# First handle the specific cases where full_url_by_frame() should
	#   be called instead of full_url.  These include "going into
	#   frame mode" for <frame src="..."> tags, and "leaving frame
	#   mode" for <a target="_top"> or <a target="_blank">.
	# Everything else leaves the "frame mode" unchanged.
	# Hmmm, inefficient to do this check for every tag, when most
	#   installations aren't using the feature anyway.
	if ($DOING_INSERT) {
	    s/(<[^>]*[^\w.>\/?&-]href\s*=\s*["']?)\s*([^\s"'>]*)/     $1 . &full_url_by_frame($2,0) /ie,
		next if /<\s*a\b[^>]*[^\w.>\/?&-]target\s*=\s*["']?(_top|_blank)\b/i ;

	    s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url_by_frame($2,1) /ie,
	    s/(<[^>]*[^\w.>\/?&-]longdesc\s*=\s*["']?)\s*([^\s"'>]*)/ $1 . &full_url($2) /ie,
		next if $tag_name eq 'frame' ;
	}


	# Put the most common cases first

	# Only check <a> tags AFTER above check for <a target="_top/_blank">
	s/(<[^>]*[^\w.>\/?&-]href\s*=\s*["']?)\s*([^\s"'>]*)/     $1 . &full_url($2) /ie,
	    next if $tag_name eq 'a' ;

	s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]lowsrc\s*=\s*["']?)\s*([^\s"'>]*)/   $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]longdesc\s*=\s*["']?)\s*([^\s"'>]*)/ $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]usemap\s*=\s*["']?)\s*([^\s"'>]*)/   $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]dynsrc\s*=\s*["']?)\s*([^\s"'>]*)/   $1 . &full_url($2) /ie,
	    next if $tag_name eq 'img' ;

	s/(<[^>]*[^\w.>\/?&-]background\s*=\s*["']?)\s*([^\s"'>]*)/ $1.&full_url($2) /ie,
	    next if $tag_name eq 'body' ;

	s/(<[^>]*[^\w.>\/?&-]href\s*=\s*["']?)\s*([^\s"'>]*)/     $1 . &full_url($2) /ie,
	    next if $tag_name eq 'base' ;     # has special significance

	s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]longdesc\s*=\s*["']?)\s*([^\s"'>]*)/ $1 . &full_url($2) /ie,
	    next if $tag_name eq 'frame' ;

	s/(<[^>]*[^\w.>\/?&-]action\s*=\s*["']?)\s*([^\s"'>]*)/   $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]script\s*=\s*["']?)\s*([^\s"'>]*)/
		$scripts_are_banned_here ? $1  : $1 . &full_url($2) /ie,
	    next if $tag_name eq 'form' ;     # needs special attention

	s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]usemap\s*=\s*["']?)\s*([^\s"'>]*)/   $1 . &full_url($2) /ie,
	    next if $tag_name eq 'input' ;

	s/(<[^>]*[^\w.>\/?&-]href\s*=\s*["']?)\s*([^\s"'>]*)/     $1 . &full_url($2) /ie,
	    next if $tag_name eq 'area' ;

	s/(<[^>]*[^\w.>\/?&-]codebase\s*=\s*["']?)\s*([^\s"'>]*)/ $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]code\s*=\s*["']?)\s*([^\s"'>]*)/     $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]object\s*=\s*["']?)\s*([^\s"'>]*)/   $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]archive\s*=\s*["']?)\s*([^\s"'>]*)/  $1 . &full_url($2) /ie,
	    next if $tag_name eq 'applet' ;


	# These are seldom-used tags, or tags that seldom have URLs in them

	s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	    next if $tag_name eq 'bgsound' ;  # Microsoft only

	s/(<[^>]*[^\w.>\/?&-]cite\s*=\s*["']?)\s*([^\s"'>]*)/     $1 . &full_url($2) /ie,
	    next if $tag_name eq 'blockquote' ;

	s/(<[^>]*[^\w.>\/?&-]cite\s*=\s*["']?)\s*([^\s"'>]*)/     $1 . &full_url($2) /ie,
	    next if $tag_name eq 'del' ;

	s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	    next if $tag_name eq 'embed' ;    # Netscape only

	s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]imagemap\s*=\s*["']?)\s*([^\s"'>]*)/ $1 . &full_url($2) /ie,
	    next if $tag_name eq 'fig' ;      # HTML 3.0

	s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	    next if $tag_name=~ /^h[1-6]$/ ;  # HTML 3.0

	s/(<[^>]*[^\w.>\/?&-]profile\s*=\s*["']?)\s*([^\s"'>]*)/  $1 . &full_url($2) /ie,
	    next if $tag_name eq 'head' ;

	s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	    next if $tag_name eq 'hr' ;       # HTML 3.0

	s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]longdesc\s*=\s*["']?)\s*([^\s"'>]*)/ $1 . &full_url($2) /ie,
	    next if $tag_name eq 'iframe' ;

	s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	    next if $tag_name eq 'ilayer' ;

	s/(<[^>]*[^\w.>\/?&-]cite\s*=\s*["']?)\s*([^\s"'>]*)/     $1 . &full_url($2) /ie,
	    next if $tag_name eq 'ins' ;

	s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	    next if $tag_name eq 'layer' ;

	s/(<[^>]*[^\w.>\/?&-]href\s*=\s*["']?)\s*([^\s"'>]*)/     $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]urn\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	    next if $tag_name eq 'link' ;


	# <meta http-equiv> is a special case-- these must be handled like real
	#   HTTP headers.
	# Remove http-equiv attribute if content is empty, else may generate
	#   empty cookie.
	# The Netscape-only "url" attribute can be handled normally.
	# Bug in Perl 4: using $new_value in "content" regex below makes it
	#   become defined.  Thus, we need to use its definedness before that.
	# Need ";" covered when checking for "url=", because it could be in
	#   e.g. <meta http-equiv="refresh" content="5;URL=...">.
	# Therefore, ugh, we need to handle if it's instead 'content=5; URL=',
	#   i.e. has a space before "URL".  Hack here is to remove those
	#   those spaces.
	if ($tag_name eq 'meta') {
	    s/(<[^>]*;)\s*url\s*=/ $1 . 'url=' /ie ;    # hack here
	    s/(<[^>]*[^\w.>\/?&;-]url\s*=\s*["']?)\s*([^\s"'>]*)/  $1 . &full_url($2) /ie ;
	    local($name, $value, $new_value) ;
	    next unless ( ($name)=  /<[^>]*[^\w.>\/?&-]http-equiv\s*=\s*["']?\s*([\w.-]*)/i)
		     && ( ($value)= /<[^>]*[^\w.>\/?&-]content\s*=\s*("[^">]*"|'[^'>]*'|[^'"][^\s>]*)/i ) ;
	    $value=~ s/^["']|["']$//g ;   # strip quotes
	    $new_value= &new_header_value($name, $value) ;
	    s/(<[^>]*[^\w.>\/?&-]http-equiv\s*=\s*["']?)\s*([^\s"'>]*)/ $1 /ie
		unless defined($new_value) ;
	    s/(<[^>]*[^\w.>\/?&-]content\s*=)\s*("[^">]*"|'[^'>]*'|[^'"][^\s>]*)/
	      qq($1"$new_value") /ie ;

	    next ;
	}


	s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	    next if $tag_name eq 'note' ;     # HTML 3.0

	s/(<[^>]*[^\w.>\/?&-]usemap\s*=\s*["']?)\s*([^\s"'>]*)/   $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]codebase\s*=\s*["']?)\s*([^\s"'>]*)/ $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]data\s*=\s*["']?)\s*([^\s"'>]*)/     $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]archive\s*=\s*["']?)\s*([^\s"'>]*)/  $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]classid\s*=\s*["']?)\s*([^\s"'>]*)/  $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]name\s*=\s*["']?)\s*([^\s"'>]*)/     $1 . &full_url($2) /ie,
	    next if $tag_name eq 'object' ;

	s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]imagemap\s*=\s*["']?)\s*([^\s"'>]*)/ $1 . &full_url($2) /ie,
	    next if $tag_name eq 'overlay' ;  # HTML 3.0

	s/(<[^>]*[^\w.>\/?&-]cite\s*=\s*["']?)\s*([^\s"'>]*)/     $1 . &full_url($2) /ie,
	    next if $tag_name eq 'q' ;

	s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	s/(<[^>]*[^\w.>\/?&-]for\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	    next if $tag_name eq 'script' ;

	s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	    next if $tag_name eq 'select' ;   # HTML 3.0

	s/(<[^>]*[^\w.>\/?&-]src\s*=\s*["']?)\s*([^\s"'>]*)/      $1 . &full_url($2) /ie,
	    next if $tag_name eq 'ul' ;       # HTML 3.0

	s/(<[^>]*[^\w.>\/?&-]background\s*=\s*["']?)\s*([^\s"'>]*)/ $1.&full_url($2) /ie,
	    next if $tag_name eq 'td' ;       # Netscape extension?

	s/(<[^>]*[^\w.>\/?&-]background\s*=\s*["']?)\s*([^\s"'>]*)/ $1.&full_url($2) /ie,
	    next if $tag_name eq 'th' ;       # Netscape extension?

	s/(<[^>]*[^\w.>\/?&-]background\s*=\s*["']?)\s*([^\s"'>]*)/ $1.&full_url($2) /ie,
	    next if $tag_name eq 'tr' ;       # Netscape extension?

	s/(<[^>]*[^\w.>\/?&-]background\s*=\s*["']?)\s*([^\s"'>]*)/ $1.&full_url($2) /ie,
	    next if $tag_name eq 'table' ;    # Netscape extension?

    }   # foreach (@body)



    # Insert anything needed-- entry form and/or custom insertion.
    # This relies on the variables $insert_pos and $is_frameset, which were
    #   set in the block above the foreach(@body) block directly above
    #   here.  The comments in that block apply here too.
    if (!$is_in_frame && !$is_frameset) {

	# Insert the custom insertion if needed.
	# This block adds it when it SHOULD NOT BE anonymized, which is why
	#   it comes after the URL conversion above.
	if ( !$ANONYMIZE_INSERTION &&
	     (($INSERT_HTML ne '') || ($INSERT_FILE ne '')) ) {
	    $INSERT_HTML= &readfile($INSERT_FILE) if $INSERT_HTML eq '' ;
	    splice(@body, $insert_pos, 0, $INSERT_HTML) ;
	    $insert_pos++  if $FORM_AFTER_INSERTION ;
	}

	# Insert the entry form if needed-- never anonymized.
	splice(@body, $insert_pos, 0, &mini_start_form) if $INSERT_ENTRY_FORM ;
    }



    # Does this ever cause a problem, putting a comment before the entire body?
    #   It may affect the initial <!DOCTYPE ...> tag, etc.
    unshift(@body, "<!-- resource has been modified by proxy -->\n" ) ;

    # Change Content-Length header, since we changed the content
    $headers=~ s/^Content-Length:.*\012/ 
		 'Content-Length: ' . length(join('',@body)) . "\015\012"/ie ;

}


#--------------------------------------------------------------------------
#    Send response back to user
#--------------------------------------------------------------------------

# If the response has not already been sent, print the status line, headers,
#   and the entire (possibly modified) resource.
# The $response_sent flag was added purely to support streaming media and
#   large files.
if (!$response_sent) {
    if ($ENV{'REQUEST_METHOD'} eq 'HEAD') {
	print $status, $headers ;
    } elsif ($is_html) {
	print $status, $headers, @body ;
	# print $debug ;   # handy for sprinkling checks throughout the code
    } else {
	print $status, $headers, $body ;
    }
}


# Put this back in to run speed trials
#if ($is_html) {
#    # OK, let's time this thing
#    ($eutime,$estime)= (times)[0,1] ;
#    open(LOG,">>proxy.log") ;
#    print LOG "full times: ", $eutime-$sutime, " ", $estime-$sstime, 
#        " ", time-$starttime, "  URL: $URL\n" ;
#    close(LOG) ;
#}



# mod_perl scripts must not exit.
EXIT:
exit unless $ENV{'MOD_PERL'} ;

#--------------------------------------------------------------------------
#   DONE!!
#--------------------------------------------------------------------------


# Returns the full absolute URL to query our script for the given URI
#   reference.  PATH_INFO will include the encoded absolute URL of the target,
#   but the fragment will be appended unencoded so browsers will resolve it
#   correctly.
# This is a major bottleneck for the whole program, so speed is important here.
# Note that the calculations of $this_url, $base_scheme, $base_host, and
#   $base_path throughout the program are an integral part of this routine,
#   placed elsewhere for speed.
# This was rewritten 2-18-99 to be much more flexible, e.g. to allow encoding
#   and avoid illegal PATH_INFO sequences.  Unfortunately, it is a little 
#   slower.
# For HTTP, The URL to be encoded should include everything that is sent in
#   the request, including any query, but not any fragment.
# This only returns absolute URLs, though relative URLs would usually suffice.
#   If it matters, we could have a fullrelurl() and fullabsurl(), the latter
#   used for those HTML attributes that require an absolute URL (like <base>).
#
# The ?:?:?: statement resolves relative URLs to absolute URLs, given the
#   $base_{url,scheme,host,path} variables figured earlier.  It does it
#   simply and efficiently, and accurately enough; the full procedure is
#   described in RFC 2396 (URI syntax), section 5.2.
# RFC 2396, section 5 states that there are three types of relative URIs:
#   net_path (beginning with //, rarely used), abs_path (beginning with /),
#   and rel_path, any of which may be followed by a "?query"; the query must
#   be included in the result.  Thus, we only need to examine the start of
#   the relative URL.
# This ?:?:?: statement passes all test cases in RFC 2396 appendix C, except
#   for the following:  It does not reduce . and .. path segments (to do
#   so would take a lot more time), and it assumes $uri_ref has something
#   other than an empty fragment in it, i.e. that the URI is non-empty.
# This only works for hierarchical schemes, like HTTP or FTP.  Conceivably,
#   there's a problem if the base URL uses a non-hierarchical scheme, and
#   the document contains relative URLs.  Absolute URLs will be OK.
# Any HTML-escaping/unescaping should be done outside of this routine, since
#   it is used for any relative->absolute URL conversion, not just HTML.

sub full_url {
    local($uri_ref)= @_ ;
    local($1,$2) ;   # Required for Perl 4-- else, caller's $1,$2 get clobbered

    # For now, prevent redirecting into x-proxy URLs.
    # This slows down the main tag-converting loop by 0-1%.
    return undef if $uri_ref=~ m#^x-proxy://#i ;

    # Separate fragment from URI
    local($uri,$frag)= $uri_ref=~ /^([^#]*)(#.*)?/ ;
    return $uri_ref if $uri eq '' ;  # allow bare fragments to pass unchanged

    # calculate absolute URL based on three possible cases
    local($absurl)= 
	    $uri=~ m#^[\w+.-]*:#i   ?  $uri                 # absolute URL
	  : $uri=~ m#^//#           ?  $base_scheme . $uri  # net_path (rare)
	  : $uri=~ m#^/#            ?  $base_host . $uri    # abs_path, rel URL
	  :                            $base_path . $uri ;  # relative path

    return join('', $this_url, &proxy_encode($absurl), $frag ) ;
}


# Identical to full_url(), except second parameter explicitly determines
#   whether we use $this_url_inframe or $this_url_noframe.
# This could be wrapped into the full_url() routine, but I'm guessing it
#   is more efficient to do it this way.  This won't be called often and
#   full_url() is called a lot.
# This uses a little trick with local() that lets us use full_url(), which
#   keeps the routines synchronized and reduces code size.  We set a local
#   version of $this_url, which is used by full_url() because it remains
#   in scope there, but when we exit this routine the scope closes and
#   the old $this_url is restored.
sub full_url_by_frame {
    local($uri_ref, $is_frame)= @_ ;
    local($this_url)= $is_frame   ? $this_url_inframe  : $this_url_noframe ;
    return &full_url($uri_ref) ;
}


# Set globals $base_url, $base_scheme, $base_host, and $base_path, based on 
#   value of $base_url.  This must be called whenever $base_url is set, which
#   unfortunately may vary over the course of the program.
# These are an integral part of &full_url(), placed outside of that for speed.
# To specify:
#   $base_scheme is the scheme of the base URL, ending in ":", like "http:".
#   $base_host is the scheme/host/port of the base URL, with no final slash.
#   $base_path is the scheme/host/port/path, through final slash.
# These are only relevant (and accurate) for hierarchical "/"-using schemes,
#   like HTTP or FTP.
# Any HTML-escaping/unescaping should be done outside of this routine.
sub fix_base_vars {
    # Guarantee that $base_url has at least a path of '/', inserting before 
    #   ?query if needed.
    $base_url=~ s#^([\w+.-]+://[^/?]+)/?#$1/# ;

    ($base_scheme)= $base_url=~ m#^([\w+.-]+:)//# ;
    ($base_host)=   $base_url=~ m#^([\w+.-]+://[^/?]+)# ; # no ending slash
    ($base_path)=   $base_url=~ m#^([^?]*/)# ;            # use greedy matching
}



#--------------------------------------------------------------------------
#    Scheme-specific routines
#--------------------------------------------------------------------------

#
# <scheme>_get: get resource at URL and set globals $status, $headers, 
#   $is_html, and either $body or @body depending on $is_html.  Optionally,
#   set $response_sent to signal that the response has already been sent.
#   These are all globals for speed, to prevent unneeded copying of huge
#   strings.
#

# http_get: actually supports both GET and POST

sub http_get {
    local($portst, $realhost, $realport, $request_uri,
	  $realm, $tried_realm, $auth, $proxy_auth_header,
	  $lefttoget, $postblock, @postbody, $body_too_big, $rin, $dummy,
	  $first_block) ;
    local($/)= "\012" ;

    $port= 80 if $port eq '' ;

    # Some servers don't like default port in a Host: header, so use $portst.
    $portst= ($port==80)  ? ''  : ":$port" ;

    $realhost= $host ;
    $realport= $port ;
    $request_uri= $path ;

    # there must be a smoother way to handle proxies....
    if ($ENV{'http_proxy'}) {
	local($dont_proxy) ;
	foreach (split(/\s*,\s*/, $ENV{'no_proxy'})) {
	    $dont_proxy= 1, last if $host=~ /$_$/ ;
	}
	unless ($dont_proxy) {
	    # could be slightly more efficient in Perl 5
	    ($dummy,$realhost,$realport)=
		$ENV{'http_proxy'}=~ m#^(http://)?([^/?:]*):?([^/?]*)#i ;
	    $realport= 80 if $realport eq '' ;
	    $request_uri= $URL ;
	    $proxy_auth_header= "Proxy-Authorization: Basic $PROXY_AUTH\015\012"
	       if $PROXY_AUTH ne '' ;
	}
    }


    #------ Connect socket to host; send request; wait with select() ------

    # To be able to retry on a 401 Unauthorized response, put the whole thing
    #   in a labeled block.  Note that vars have to be reinitialized.
    HTTP_GET: {

	# Open a socket and send the request.
	# Accept: and User-Agent: headers are here because they affect results.
	# We're not fully supporting HTTP 1.1, but at least send Host: header.
	# We're anonymously browsing, so don't include From: or Referer:.
	#   The User-Agent: header is a very teensy privacy risk, but some
	#   pages load differently with different browsers.

	&newsocketto(*S, $realhost, $realport) ;
	binmode S ;   # see note with "binmode STDOUT", above

	print S $ENV{'REQUEST_METHOD'}, ' ', $request_uri, " HTTP/1.0\015\012",
		'Host: ', $host, $portst, "\015\012",    # being a good netizen
		'Accept: ', $env_accept, "\015\012",     # possibly modified
		'User-Agent: ', $USER_AGENT, "\015\012",
		$proxy_auth_header ;                     # empty if not needed

	# Add the cookie if it exists and cookies aren't banned here.
	if (!$cookies_are_banned_here && ($ENV{'HTTP_COOKIE'} ne '')) {
	    print S 'Cookie: ', $cookie_to_server, "\015\012"
		if $cookie_to_server ne '' ;
	}


	# Add Authorization: header if we've had a challenge.
	if ($realm ne '') {
	    # If we get here, we know $realm has a defined $auth and has not
	    #   been tried.
	    print S "Authorization: Basic $auth{$realm}\015\012" ;
	    $tried_realm= $realm ;

	} else {
	    # If we have auth information for this server, what the hey, let's
	    #   try one, it may save us a request/response cycle.
	    if ( ($tried_realm,$auth)= each %auth ) {
		print S "Authorization: Basic $auth\015\012" ;
	    }
	}


	# A little problem with authorization and POST requests: If auth
	#   is required, we won't know which realm until after we make the
	#   request and get part of the response.  But to make the request,
	#   we have to send the entire POST body, because some servers
	#   mistakenly require that before returning even an error response.
	#   So this means we have to send the entire POST body, and be
	#   prepared to send it a second time, thus we have to store it
	#   locally.  Either that, or fail to send the POST body a second
	#   time.  Here, we let the owner of this proxy set $MAX_REQUEST_SIZE:
	#   store and post a second time if a request is smaller, or else
	#   die with 413 the second time through.

	# If request method is POST, copy content headers and body to request.
	# The first time through here, save body to @postbody, if the body's
	#   not too big.
	if ($ENV{'REQUEST_METHOD'} eq 'POST') {

	    if ($body_too_big) {
		# Quick 'n' dirty response for an unlikely occurrence.
		# 413 is not actually an HTTP/1.0 response...
		&HTMLdie("Sorry, this proxy can't handle a request larger "
		       . "than $MAX_REQUEST_SIZE bytes at a password-protected"
		       . " URL.  Try reducing your submission size, or submit "
		       . "it to an unprotected URL.", 'Submission too large',
			 '413 Request Entity Too Large') ;
	    }

	    # Otherwise...
	    $lefttoget= $ENV{'CONTENT_LENGTH'} ;
	    print S 'Content-type: ', $ENV{'CONTENT_TYPE'}, "\015\012", 
		    'Content-length: ', $lefttoget, "\015\012\015\012" ;

	    if (@postbody) {
		print S @postbody ;
	    } else {
		$body_too_big= ($lefttoget > $MAX_REQUEST_SIZE) ;
		# Loop to guarantee all is read from STDIN.  
		do {
		    $lefttoget-= read(STDIN, $postblock, $lefttoget) ;
		    print S $postblock ;
		    # efficient-- only doing test when input is slow anyway.
		    push(@postbody, $postblock) unless $body_too_big ;
		} while $lefttoget && ($postblock ne '') ;
	    }

	# For GET requests, just add extra blank line
	} else {
	    print S "\015\012" ;
	}


	# Wait a minute for the response to start
	vec($rin= '', fileno(S), 1)= 1 ;
	select($rin, undef, undef, 60) 
	    || &HTMLdie("No response from $realhost:$realport") ;


	#------ Read full response into $status, $headers, and $body/@body ----

	# Support both HTTP 1.x and HTTP 0.9
	$status= <S> ;  # first line, which is the status line in HTTP 1.x


	# HTTP 0.9
	# Ignore possibility of HEAD, since it's not defined in HTTP 0.9.
	# Do any HTTP 0.9 servers really exist anymore?
	unless ($status=~ m#^HTTP/#) {
	    $is_html= 1 ;   # HTTP 0.9 by definition implies an HTML response
	    undef $/ ;
	    $body= $status . <S> ;
	    $status= '' ;

	    # split $body into @body; each element ends in ">" or EOstring
	    @body= $body=~ /([^>]*>?)/g ;

	    close(S) ;
	    return ;
	}


	# After here, we know we're using HTTP 1.x

	$headers= '' ;   # could have been set by first attempt
	do {
	    $headers.= $_= <S> ;    # $headers includes last blank line
	} until (/^(\015\012|\012)$/) ;   # lines may end with LF or CRLF

	# Unfold long header lines, a la RFC 822 section 3.1.1
	$headers=~ s/(\015\012|\012)[ \t]+/ /g ;

	# Check for 401 Unauthorized response
	if ($status=~ m#^HTTP/\d+\.\d+\s+401\b#) {
	    ($realm)=
		$headers=~ /^WWW-Authenticate:\s*Basic\s+realm="([^"\n]*)/i ;
	    &HTMLdie("Error by target server: no WWW-Authenticate header.")
		unless $realm ne '' ;

	    if ($auth{$realm} eq '') {
		&get_auth_from_user($host, $realm, $URL) ;
	    } elsif ($realm eq $tried_realm) {
		&get_auth_from_user($host, $realm, $URL, 1) ;
	    }

	    # so now $realm exists, has defined $auth, and has not been tried
	    close(S) ;
	    redo HTTP_GET ;
	}


	# Extract $content_type, used in several places
	local($content_type)= $headers=~ m#^Content-type:\s*([\w/.-]*)#i ;
	$content_type=~ tr/A-Z/a-z/ ;

	# If we're text only, then cut off non-text responses (but allow 
	#   unspecified types).
	if ($TEXT_ONLY) {
	    if ( ($content_type ne '') && ($content_type!~ m#^text/#i) ) {
		close(S) ;
		&non_text_die ;
	    }
	}

	# If we're removing scripts, then disallow script MIME types.
	if ($scripts_are_banned_here) {
	    if (defined( $IS_SCRIPT_MIME_TYPE{$content_type} )) {
		close(S) ;
		&script_content_die ;
	    }

	    # Remove any style sheet type headers that identify a script type.
	    $headers=~ s/^Content-Style-Type:\s*$SCRIPT_TYPE_REGEX\b.*\012//ig;

	    # Note that the non-standard Link: header, which may link to a
	    #   style sheet, is handled in http_fix().
	}


	# If URL matches one of @BANNED_IMAGE_URL_PATTERNS, then skip the
	#   resource unless it's clearly a text type.
	if ($images_are_banned_here) {
	    close(S), &skip_image  unless $content_type=~ m#^text/#i ;
	}

	# Keeping $base_url, $base_host, and $base_path up-to-date is an 
	#   ongoing job.  Here, we look in appropriate headers.  Note that if
	#   Content-Base: doesn't exist, Content-Location: is an absolute URL.
	if        ($headers=~ m#^Content-Base:\s*([\w+.-]+://\S+)#i) {
	    $base_url= $1, &fix_base_vars ;
	} elsif   ($headers=~ m#^Content-Location:\s*([\w+.-]+://\S+)#i) {
	    $base_url= $1, &fix_base_vars ;
	} elsif   ($headers=~ m#^Location:\s*([\w+.-]+://\S+)#i) {
	    $base_url= $1, &fix_base_vars ;
	}

	# Now, fix the headers with &http_fix().  It uses &full_url(), and
	#   may modify the headers we just extracted the base URL from.
	# This also includes cookie support.
	&http_fix ;


	# Set $is_html if headers indicate HTML response.
	# Question: are there any other HTML-like MIME types, including x-... ?
	$is_html= 1  if $content_type eq 'text/html' ;


	# Some servers return HTML content without the Content-Type: header.
	#   These MUST be caught, because Netscape displays them as HTML, and
	#   a user could lose their anonymity on these pages.

	# If Content-Type: is empty, then we still must test for HTML
	#   content:  read the first block of characters, test for HTML-like
	#   content, and prepend to remaining data.  Unfortunately, should
	#   err on the side of too much being called HTML.
	# Perl 4 has trouble combining <S> and read(S), so use <S> for it.
	# The test here is a little sketchy-- anyone have a better idea?
	if ( !$is_html && ($content_type eq '') ) {
	    if ($]>=5) {                                 # Perl>=5
		$first_block= &read_socket('S', 1024) ;
		return unless defined($first_block) ;    # panic on error
	    } else {
		$/= '>' ;
		$first_block.= <S> until (length($first_block)>=1024 || eof) ;
	    }

	    # Let's call it HTML if it has one of these tags, or <a href>.
	    $is_html= ($first_block=~ /<\s*(html|!doctype|head|body|title|img)/i)
		   || ($first_block=~ /<\s*a\b[^>]*\bhref\b/i) ;
	}


	# To support non-NPH hack, replace first part of $status with
	#   "Status:" if needed.
	$status=~ s#^\S+#Status:#  if $NOT_RUNNING_AS_NPH ;


	# To support streaming media and large files, read the data from
	#   the server and send it immediately to the client.  The exception
	#   is HTML content, which still must be read fully to be converted
	#   in the main block.  HTML content is not normally streaming or
	#   very large.
	# This requires $status and $headers to be returned now, which is
	#   OK since headers have been completely cleaned up by now.  This
	#   also means that changes after this point to $body/@body won't
	#   have any effect, which in fact is fine in the case of non-HTML
	#   resources.  Set $response_sent to prevent the main block from
	#   sending a response.
	# Unfortunately, Perl 4 seems to have buffer problems when using
	#   read() and misses the first block-- conflict with <S>, maybe?
	#   Anyway, it has to be handled differently.  :P
	# Be sure to prepend $first_block to socket input, if it's set above.

	# Only read body if the request method is not HEAD
	if ($ENV{'REQUEST_METHOD'} ne 'HEAD') {
	    if ($is_html) {
		$/= '>' ;
		@body= () ;
		if ($first_block ne '') {
		    $first_block.= <S> ;  # read up through the next ">"
		    # split $first_block into @body; each item ends in > or EOS
		    @body= $first_block=~ /([^>]*>?)/g ;
		}
		push(@body, <S>) ;

	    } else {

		# This is the primary change to support streaming media.
		local($buf) ;
		print $status, $headers, $first_block ;

		if ($]>=5) { print $buf while read(S, $buf, 16384) }  # Perl>=5
		else       { print $buf while $buf= <S> }             # Perl 4

		$response_sent= 1 ;

		# Was:
		#undef $/ ;
		#$body= <S> ;
	    }

	} else {
	    $body= ''; @body= () ;
	}

	close(S) ;

    }  # HTTP_GET:

}  # sub http_get()



# ftp_get:

sub ftp_get {
    local($is_dir, $rcode, @r, @cwdmsg, $dataport, $remote_addr,
	  $ext, $content_type, %content_type, $content_length,
	  $enc_URL) ;
    local($/)= "\012" ;

    $port= 21 if $port eq '' ;

    # List of file extensions and associated MIME types, or at least the ones
    #   a typical browser distinguishes from a nondescript file.
    # I'm open to suggestions for improving this.
    %content_type=
	  ('txt',  'text/plain',
	   'text', 'text/plain',
	   'htm',  'text/html',
	   'html', 'text/html',
	   'png',  'image/png',
	   'jpg',  'image/jpeg',
	   'jpeg', 'image/jpeg',
	   'jpe',  'image/jpeg',
	   'gif',  'image/gif',
	   'xbm',  'image/x-bitmap',
	   'mpg',  'video/mpeg',
	   'mpeg', 'video/mpeg',
	   'mpe',  'video/mpeg',
	   'qt',   'video/quicktime',
	   'mov',  'video/quicktime',
	   'aiff', 'audio/aiff',
	   'aif',  'audio/aiff',
	   'au',   'audio/basic',
	   'snd',  'audio/basic',
	   'wav',  'audio/x-wav',
	   'mp2',  'audio/x-mpeg',
	   'mp3',  'audio/mpeg',
	   'ram',  'audio/x-pn-realaudio',
	   'rm',   'audio/x-pn-realaudio',
	   'ra',   'audio/x-pn-realaudio',
	   'zip',  'application/x-gzip-compressed',
	   ) ;


    $is_dir= $path=~ m#/$# ;
    $is_html= 0 if $is_dir ;   # for our purposes, do not treat dirs as HTML

    # Set $content_type based on file extension.
    # Hmm, still unsure how best to handle unknown file types.  This labels
    #   them as text/plain, so that README's, etc. will display right.
    ($ext)= $path=~ /\.(\w+)$/ ;  # works for FTP, not for URLs with query etc.
    $ext=~ tr/A-Z/a-z/ ;
    $content_type= ($is_html || $is_dir)  ? 'text/html'
					  : $content_type{$ext} 
					    || 'text/plain' ;

    # Create $status and $headers, and leave $body/@body and $is_html as is.
    # Directories use an HTML response, though $is_html is false when $is_dir.
    $status= "$HTTP_1_0 200 OK\015\012" ;
    $headers= $content_type  ? "Content-type: $content_type\015\012\015\012"
			     : "\015\012" ;


    # Open the control connection to the FTP server
    &newsocketto(*S, $host, $port) ;
    binmode S ;   # see note with "binmode STDOUT", above

    # Luckily, RFC 959 (FTP) has a really good list of all possible response
    #   codes to all possible commands, on pages 50-53.

    # Connection establishment
    ($rcode)= &ftp_command('', '120|220') ;
    &ftp_command('', '220') if $rcode==120 ;

    # Login
    ($rcode, @welcome)= &ftp_command("USER $username\015\012", '230|331') ;
    ($rcode, @welcome)= &ftp_command("PASS $password\015\012", '230|202') 
	if $rcode==331 ;

    # Set transfer parameters
    &ftp_command("TYPE I\015\012", '200') ;

    # Create and listen on data socket
    socket(DATA_LISTEN, $AF_INET, $SOCK_STREAM, (getprotobyname('tcp'))[2])
	|| &HTMLdie("Couldn't create FTP data socket: $!") ;
    bind(DATA_LISTEN, pack('S n a4 x8', $AF_INET, 0, "\0\0\0\0") )
	|| &HTMLdie("Couldn't bind FTP data socket: $!") ;
    $dataport= (unpack('S n a4 x8', getsockname(DATA_LISTEN)))[1] ;
    listen(DATA_LISTEN,1)
	|| &HTMLdie("Couldn't listen on FTP data socket: $!") ;
    select((select(DATA_LISTEN), $|=1)[0]) ;    # unbuffer the socket
    select((select(DATA_XFER), $|=1)[0]) ;      # unbuffer the socket

    # Tell FTP server which port to connect to
    &ftp_command( sprintf("PORT %d,%d,%d,%d,%d,%d\015\012",
			  unpack('C4', substr(getsockname(S),4,4)),
			  $dataport>>8, $dataport & 255),
		  '200') ;


    # Do LIST for directories, RETR for files
    if ($is_dir) {
	# If we don't CWD first, then symbolic links won't be followed.
	($rcode, @cwdmsg)= &ftp_command("CWD $path\015\012", '250') ;
	($rcode, @r)= &ftp_command("LIST\015\012", '125|150') ;
# was:  ($rcode, @r)= &ftp_command("LIST $path\015\012", '125|150') ;

    } else {
	($rcode, @r)= &ftp_command("RETR $path\015\012", '125|150|550') ;

	# If 550 response, it may be a symlink to a directory.
	# Try to CWD to it; if successful, do a redirect, else die with the
	#   original error response.  Note that CWD isn't required to be
	#   be supported, but any FTP server these days supports it.
	if ($rcode==550) {
	    ($rcode)= &ftp_command("CWD $path\015\012", '') ;
	    &ftp_error(550,@r) unless $rcode==250 ;

	    ($enc_URL= $URL)=~ s/ /%20/g ;  # URL-encode any spaces

	    # Redirect the browser to the same URL with a trailing slash
	    print "$HTTP_1_0 301 Moved Permanently\015\012",
		  "Location: ", $this_url, &proxy_encode($enc_URL . '/'), 
		  "\015\012\015\012" ;
	    goto EXIT ;
	}
    }


    # Streaming support added in 1.3.  For notes about streaming, look near
    #   the end of the http_get() routine.  Basically, as long as a resource
    #   isn't HTML (or a directory listing, in the case of FTP), we can pass
    #   the data immediately to the client, since it won't be modified.  Be
    #   sure to set $response_sent here.

    # Accept the connection and read the data into @body or $body
    ($remote_addr= accept(DATA_XFER, DATA_LISTEN))
	|| &HTMLdie("Error accepting FTP data socket: $!") ;
#    &HTMLdie("Intruder Alert!  Someone other than the server is trying to "
#	   . "send you data.")
#	unless (substr($remote_addr,4,4) eq substr(getpeername(S),4,4)) ;

    if ($is_html) {
	$/= '>' ;
	@body= <DATA_XFER> ;
    } elsif ($is_dir) {
	undef $/ ;            # This was used for all non-HTML before streaming
	$body= <DATA_XFER> ;  #   was supported.
    } else {

	# Stick a Content-Length: header into the headers if appropriate (often
	#   there's a "(xxx bytes)" string in a 125 or 150 response line).
	# Be careful about respecting previous value of $headers, which may
	#   already end in a blank line.
	foreach (grep(/^(125|150)/, @r)) {
	    if ( ($content_length)= /\((\d+)[ \t]+bytes\)/ ) {
		$headers= "Content-Length: $content_length\015\012" .$headers ;
		last ;
	    }
	}

	# This is the primary change to support streaming media.
	local($buf) ;
	print $status, $headers ;
	print $buf while read(DATA_XFER, $buf, 16384) ;
	$response_sent= 1 ;

    }


    close(DATA_XFER) ;
    close(DATA_LISTEN) ;

    # Get the final completion response
    &ftp_command('', '226|250') ;

    &ftp_command("QUIT\015\012") ;   # don't care how they answer

    close(S) ;

    # Make a user-friendly directory listing.  Add Content-Length: header.
    if ($is_dir) {
	&ftp_dirfix ;
	$headers= "Content-Length: " . length($body) . "\015\012" . $headers ;
    }

}


# Send $cmd and return response code followed by full lines of  FTP response.
# Die if response doesn't match the regex $ok_response.
sub ftp_command {
    local($cmd,$ok_response)= @_ ;
    local(@r,$rcode) ;
    local($/)= "\012" ;

    print S $cmd ;

    $_= $r[0]= <S> ;
    $rcode= substr($r[0],0,3) ;
    until (/^$rcode /) {      # this catches single- and multi-line responses
	push(@r, $_=<S>) ;
    }

    &ftp_error($rcode,@r) if $ok_response ne '' && $rcode!~ /$ok_response/ ;
    return $rcode, @r ;
}


# Convert a directory listing to user-friendly HTML.
# A couple of tangles here to handle spaces in filenames.  We should probably
#   handle spaces in other protocols too, but URLs normally prohibit spaces--
#   it's only relative paths within a scheme (like FTP) that would have them.
sub ftp_dirfix {
    local($newbody, $parent_link, $max_namelen,
	  @f, $is_dir, $is_link, $link, $name, $size, $size_type, $file_type,
	  $welcome, $cwdmsg, $enc_path) ;

    # Set minimum name column width; longer names will widen the column
    $max_namelen= 16 ;

    # each file should have name/, size, date
    @body= split(/\015?\012/, $body) ;
    foreach (@body) {
	@f= split(" ", $_, 9) ;   # Note special use of " " pattern.
	next unless $#f>=8 ;
	next if $f[8]=~ /^\.\.?$/ ;
	$file_type= '' ;
	$is_dir=  $f[0]=~ /^d/i ;
	$is_link= $f[0]=~ /^l/i ;
	$file_type= $is_dir     ? 'Directory'
		  : $is_link    ? 'Symbolic link'
		  :               '' ;
	$name= $f[8] ;
	$name=~ s/^(.*) ->.*$/$1/ if $is_link ;   # remove symlink's " -> xxx"
	$name.= '/' if $is_dir ;
	$max_namelen= length($name) if length($name)>$max_namelen ;
	if ($is_dir || $is_link) {
	    ($size, $size_type)= () ;
	} else {
	    ($size, $size_type)= ($f[4], 'bytes') ;
	    ($size, $size_type)= ($size>>10, 'Kb') if $size > 10240 ;
	}

	# Easy absolute URL calculation, because we know it's a relative path.
	($enc_path= $base_path . $name)=~ s/ /%20/g ;  # URL-encode any spaces
	$link=  &HTMLescape( $this_url . &proxy_encode($enc_path) ) ;

	$newbody.= 
	    sprintf("  <a href=\"%s\">%s</a>%s %5s %-5s %3s %2s %5s  %s\012",
			   $link, $name, "\0".length($name),
			   $size, $size_type,
			   @f[5..7],
			   $file_type) ;
    }

    # A little hack to get filenames to line up right-- replace embedded 
    #  "\0"-plus-length with correct number of spaces.
    $newbody=~ s/\0(\d+)/ ' ' x ($max_namelen-$1) /ge ;

    if ($path eq '/') {
	$parent_link= '' ;
    } else {
	($enc_path= $base_path)=~ s#[^/]*/$## ;
	$enc_path=~ s/ /%20/g ;  # URL-encode any spaces
	$link=  &HTMLescape( $this_url . &proxy_encode($enc_path) ) ;
	$parent_link= "<a href=\"$link\">Up to higher level directory</a>" ;
    }

    if ($SHOW_FTP_WELCOME) {
	$welcome= join('', grep(s/^230-//, @welcome)) ;
	# Make links of any URLs in $welcome.  Imperfect regex, but does OK.
	# "\"abc" doesn't work in Perl 4 substitutions; must use '"' . "abc"
	$welcome=~ s#\b([\w+.-]+://[^\s"']+[\w/])(\W)#
	    '<a href="' . &full_url($1) . '"' . ">$1</a>$2" #ge ;
	$welcome.= "<hr>" if $welcome ne '' ;
    } else {
	$welcome= '' ;
    }

    # If CWD returned a message about this directory, display it.  Make links
    #   a la $welcome, above.
    $cwdmsg= join('', grep(s/^250-//, @cwdmsg)) ;
    $cwdmsg=~ s#\b([\w+.-]+://[^\s"']+[\w/])(\W)#
	'<a href="' . &full_url($1) . '"' . ">$1</a>$2" #ge ;
    $cwdmsg.= "<hr>" if $cwdmsg ne '' ;


    $body= <<EOS ;
<html>
<title>FTP directory of $URL</title>
<body>
<h1>FTP server at $host</h1>
<h2>Current directory is $path</h2>
<hr>
<pre>
$welcome$cwdmsg
$parent_link
$newbody
</pre>
<hr>
</body>
</html>
EOS

}


# Return a generalized FTP error page.
# For now, respond with 200.  In the future, give more appropriate codes.
sub ftp_error {
    local($rcode,@r)= @_ ;

    close(S) ; close(DATA_LISTEN) ; close(DATA_XFER) ;

    print <<EOH ;
$HTTP_1_0 200 OK
Content-type: text/html

<html>
<head><title>FTP Error</title></head>
<body>
<h1>FTP Error</h1>
<h3>The FTP server at $host returned the following error response:</h3>
<pre>
EOH
    print @r, "</pre>\n" ;

    &footer ;
    goto EXIT ;
}


#--------------------------------------------------------------------------

#
# <scheme>_fix: modify response as appropriate for given protocol (scheme).
#

# http_fix: modify headers as needed, including cookie support.
# Note that headers have already been unfolded, when they were read in.
sub http_fix {
    local($name, $value, $new_value) ;
    local(@headers)= $headers=~ /^([^\012]*\012?)/g ;  # split into lines

    foreach (@headers) {
	next unless ($name, $value)= /^([\w.-]*):\s*([^\015\012]*)/ ;
	$new_value= &new_header_value($name, $value) ;
	$_= defined($new_value)  ? "$name: $new_value\015\012"   : '' ;
    }

    $headers= join('', @headers) ;
}


# Returns the value of an updated header, e.g. with URLs transformed to point
#   back through this proxy.  Returns undef if the header should be removed.
# This is used to translate both real headers and <meta http-equiv> headers.
sub new_header_value {
    local($name, $value)= @_ ;
    $name=~ tr/A-Z/a-z/ ;

    # sanity check
    return undef if $name eq '' ;

    # First, modify any appropriate HTTP headers to point back through the
    #   proxy.  Location: is most important.
    # Note that all these are absolute URIs, except possibly Content-Location:
    #   or URI:, which may be relative to Content-Base or the request URI--
    #   notably, NOT relative to anything in the content, like a <base> tag.
    return &full_url($value)
	if    $name eq 'content-base'
	   || $name eq 'content-location'
	   || $name eq 'location' ;
#	   || $name eq 'uri' ;   # I think this was a bug in previous versions


    # And the non-standard Refresh: header... any others?
    $value=~ s/(;\s*URL=)(\S*)/ $1 . &full_url($2) /ie,   return $value
	if $name eq 'refresh' ;

    # The deprecated URI: header may contain several URI's, inside <> brackets.
    $value=~ s/<(\s*[^>\015\012]*)>/ '<'.&full_url($1).'>' /gie, return $value
	if $name eq 'uri' ;

    # The non-standard Link: header is a little problematic.  It's described
    #   in the HTTP 1.1 spec, section 19.6.2.4, but it is not standard.  It
    #   can be used to link to style sheets, but the mechanism for indicating
    #   the style sheet type (=language, which could be a script MIME type)
    #   is not defined.  However, that could be added as an extension.  So,
    #   we remove the header entirely if $scripts_are_banned_here is set;
    #   otherwise, we convert the URLs within it, which are contained within
    #   <> brackets.
    if ($name eq 'link') {
	return undef if $scripts_are_banned_here ;
	$value=~ s/<(\s*[^>\015\012]*)>/ '<' . &full_url($1) . '>' /gie ;
	return $value ;
    }

    # Modify cookies to point back through the script, or they won't work.
    # If they're banned from this server, or they're not allowed with the
    #   current non-text resource, then filter them all out.  This actually
    #   covers all non-text resources, not just images.
    if ($name eq 'set-cookie') {
	return undef
	    if $cookies_are_banned_here ||
	      ($NO_COOKIE_WITH_IMAGE && $headers!~ m#^Content-Type:\s*text/#i);

	return &cookie_to_client($value, $path, $host) ;
    }


    # For all non-special headers, return $value
    return $value ;

}


#--------------------------------------------------------------------------
#    Special admin routines, when called via the scheme type "x-proxy://"
#--------------------------------------------------------------------------

#--------------------------------------------------------------------------
#
#   I took the liberty of creating a general mechanism to let this proxy do
#   whatever tricks it needs to do, via the magic URL scheme "x-proxy://".
#   It was required to support HTTP Basic Authentication, and it's useful
#   for other things too.  The mechanism uses a heirarchical URL space: a
#   function family is in the normal "hostname" location, then the functions
#   and subfunctions are where the path segments would be.  A query string
#   is allowed on the end.
#
#   Don't add functions to this that may compromise security, since anyone
#   can request a URL beginning with x-proxy://.
#
#   Which URLs map to which functions should really be documented here.  So,
#
#     //auth/make_auth_cookie
#         receives the authorization form data, sends a formatted auth
#         cookie to the user, and redirects the user to the desired URL.
#
#     //start
#         initiates a browsing session.
#
#--------------------------------------------------------------------------

# A general-purpose routine to handle all x-proxy requests.
sub xproxy {
    local($URL)= @_ ;
    $URL=~ s/^x-proxy://i ;

    # $qs will contain the query string in $URL, whether it was encoded with
    #   the URL or came from QUERY_STRING.
    local($family, $function, $qs)=  $URL=~ m#//(\w+)(/?[^?]*)\??(.*)#i ;

    if ($family eq 'auth') {

	# For //auth/make_auth_cookie, return an auth cookie and redirect user
	#   to the desired URL.  The URL is already 
	if ($function eq '/make_auth_cookie') {
	    local(%in)= &getformvars() ; # must use () or will pass current @_!
	    local($location)= $this_url . $in{'l'} ;  # was already encoded
	    local($cookie)= &auth_cookie(@in{'u', 'p', 'r', 's'}) ;

	    print "$HTTP_1_0 302 Moved\015\012",
		  "Set-Cookie: $cookie\015\012",
		  "Location: $location\015\012\015\012" ;

	    goto EXIT ;


	} else {
	    &HTMLdie("Sorry, no such function as //$family$function.", '',
		     '404 Not Found') ;
	}


    } elsif ($family eq 'start') {
	&startproxy ;


    } else {
	&HTMLdie("Sorry, no such function as //$family$function.", '',
		 '404 Not Found') ;
    }

}


#--------------------------------------------------------------------------
#    Support routines for x-proxy
#--------------------------------------------------------------------------

# Initiate a browsing session. Formerly in the separate program startproxy.cgi.
sub startproxy {
    local(%in)= &getformvars() ;  # must use () or will pass current @_!
    $in{'URL'}=~ s/^\s+|\s+$//g ;    # strip leading or trailing spaces

    # scheme defaults to FTP if host begins with "ftp.", otherwise HTTP.
    if ($in{'URL'}!~ m#^[\w+.-]+://#) {
	$in{'URL'}=  ($in{'URL'}=~ /^ftp\./i)
	    ? "ftp://$in{'URL'}"
	    : "http://$in{'URL'}" ;
    }

    # scheme is case-insensitive, path must start with "/".
    local($scheme, $authority, $path)= 
	($in{'URL'}=~ m#^([\w+.-]+)://([^/?]*)(.*)$#i) ;
    $scheme=~ tr/A-Z/a-z/ ;
    $path= "/$path" if $path!~ m#^/# ;

    # Allow various shorthand forms of the authority (which is usually
    #   host+port), such as adding "www"+"com" or "ftp"+"com" if the host
    #   doesn't exist on the LAN.

    if ($scheme eq 'http') {
	local($host,$portst)= $authority=~ /^([^:]*)(:?.*)$/ ;
	$host=~ tr/A-Z/a-z/ ;

	# Convert integer hostnames like 3467251275 to a.b.c.d format.
	# This is for big-endian; reverse the list for little-endian.
	$host= join('.', $host>>24 & 255, $host>>16 & 255, $host>>8 & 255, 
			 $host & 255)
	    if $host=~ /^\d+$/ ;

	$host= "www.$host.com"  unless ($host=~ /\./) || gethostbyname($host) ;
	$authority= "$host$portst" ;

    } elsif ($scheme eq 'ftp') {
	# Don't mess with it if there's username/password embedded (which you
	#   REALLY shouldn't do).
	unless ($authority=~ /@/) {
	    local($host, $portst)= $authority=~ /^([^:]*)(:?.*)$/ ;
	    $host=~ tr/A-Z/a-z/ ;
	    $host= "ftp.$host.com"
		unless ($host=~ /\./) || gethostbyname($host) ;
	    $authority= "$host$portst" ;
	}

    } else {
	&HTMLdie("Sorry, only HTTP and FTP are currently supported.") ;
    }


    # Prepend flag segment of PATH_INFO
    # This "erroneously" sets flags to "00000" when user config is not allowed,
    #   but it doesn't really affect anything.
    $this_url=~ s#[^/]*/$## ;   # remove old flags from $this_url
    $this_url.= $in{'rc'}     ? '1'  : '0' ;   # $REMOVE_COOKIES
    $this_url.= $in{'rs'}     ? '1'  : '0' ;   # $REMOVE_SCRIPTS
    $this_url.= $in{'fa'}     ? '1'  : '0' ;   # $FILTER_ADS
    $this_url.= $in{'if'}     ? '1'  : '0' ;   # $INSERT_ENTRY_FORM
    $this_url.= $is_in_frame  ? '1'  : '0' ;
    $this_url.= '/' ;


    print "$HTTP_1_0 302 Moved\015\012",
	  "Location: ", $this_url, &proxy_encode("$scheme://$authority$path"),
	  "\015\012\015\012" ;

    goto EXIT ;
}


#--------------------------------------------------------------------------
#    Cookie routines
#--------------------------------------------------------------------------

# As of version 1.3, cookies are now a general mechanism for sending various
#   data to the proxy.  So far that's only authentication info and actual
#   cookies, but more functions could be added.  The new scheme essentially
#   divides up the cookie name space to accommodate many categories.
# Explanation: Normally, a cookie is uniquely identified ("keyed") by the
#   domain, path, and name, but for us the domain and path will always be
#   that of the proxy script, so we need to embed all "key" information into
#   the cookie's name.  Here, the general format for a cookie's name is
#   several fields, joined by ";".  The first field is always a cookie type
#   identifier, like "AUTH" or "COOKIE", and the remaining fields vary
#   according to cookie type.  This compound string is then URL-encoded as
#   necessary (cookie names and values can't contain semicolons, commas, or
#   white space).  The cookie's value contains whatever you need to store,
#   also URL-encoded as necessary.

# A general bug in cookie routines-- ports are not considered, which may
#   matter for both AUTH and COOKIE cookies.  It only matters when two ports
#   on the same server are being used.


# Returns all info we need from cookies.  Right now, that means one composite
#   cookie with all cookies that match the domain and path (and no others!),
#   and an %auth hash to look up auth info by server and realm.  Essentially,
#   this undoes the transformation done by the cookie creation routines.
# @auth is used instead of %auth for slight speedup.
# See notes where the various cookies are created for descriptions of their
#   format; currently, that's in cookie_to_client() and auth_cookie().
sub parse_cookie {
    local($cookie, $target_path, $target_server)= @_ ;
    local($name, $value, $type, @f,
	  $cname, $path, $domain, @matches, %pathlen,
	  $realm, $server, @auth) ;

    foreach ( split(/\s*;\s*/, $cookie) ) {
	($name, $value)= split(/=/, $_, 2) ;     # $value may contain "="
	$name=~ s/%([\da-fA-F]{2})/ pack('c', hex($1)) /ge ;
#        $name=~ s/%3d/=/g ;
#        $name=~ s/%2c/,/g ;
#        $name=~ s/%3b/;/g ;
#	 $name=~ s/%25/%/g ;
	($type, @f)= split(/;/, $name) ;
	if ($type eq 'COOKIE') {
	    ($cname, $path, $domain)= @f ;
	    if ($target_server=~ /$domain$/i  && $target_path=~ /^$path/) {
		push(@matches, $cname.'='.$value) ;
		$pathlen{$matches[$#matches]}= length($path) ;
	    }
	} elsif ($type eq 'AUTH') {
	    # format of auth cookie's name is AUTH;$enc_realm;$enc_server
	    ($realm, $server)= @f ;
	    $realm=~  s/%([\da-fA-F]{2})/ pack('c', hex($1)) /ge ;
	    $server=~ s/%([\da-fA-F]{2})/ pack('c', hex($1)) /ge ;
	    push(@auth, $realm, $value)  if  $server eq $target_server ;
	}
    }

    # More specific path mappings (i.e. longer paths) should be sent first.
    # If %pathlen is declared with my(), this generates a warning in Perl 5.
    sub reversepathlength { length($pathlen{$b}) <=> length($pathlen{$a}) }
    $cookie= join('; ', sort reversepathlength @matches) ;

    return $cookie, @auth ;
}


# Old notes:
#
# Cookie support:  The trick is how to send a cookie back to the client that
#   it will return for appropriate hosts.  Given that the target URL may be
#   encoded, and the client can't always tell where the target URL is, the 
#   only way to do that is to get *all* the cookies from the client and send
#   along the matching ones.  If the client has a lot of cookies through the
#   proxy, this could conceivably be a problem.  Oh well, it works for the
#   limited amount I've tested.
# Here, we transform the cookie from the server into something the client
#   will always send back to us, and embed the real server/path info in the
#   name of the name-value pair, since the cookie is uniquely identified by
#   the domain, path, and name.  Upon return from the client, we split the
#   name back into its original fields.
# One way to get around *some* of the all-cookies-all-the-time problem,
#   *sometimes*, may be possible to program with the following approach:
#   First, the target URL must be "encoded" (in proxy_encode()) in a way
#   that it resembles a path.  For example, the default "://" --> "/"
#   encoding does this.  Then, let the cookies go back to the client with
#   the target paths still intact.  This would only work when the cookie
#   domain is the default, i.e. the source host.  Check other possibilities
#   carefully, too, but I think you could get it to work somehow.
# Question-- is the port supposed to be used in the domain field?
#   Everything here assumes not, which is conceivably a security risk.

# Transform one cookie into something the client will send back through
#   the script, but still has all the needed info.  Returns a transformed
#   Set-Cookie: header, or undef if the cookie is invalid (e.g. comes from
#   the wrong host).  This lets us eliminate an invalid cookie header,
#   instead of just reducing it to an empty Set-Cookie: header.
# A cookie is uniquely identified by the domain, path, and name, so this 
#   transformation embeds the path and domain info into the "name".
# This doesn't handle multiple comma-separated cookies-- possible, but 
#   which seems a slight contradiction between the HTTP spec (section 4.2
#   of both HTTP 1.0 and 1.1 specs) and the cookie spec at
#   http://www.netscape.com/newsref/std/cookie_spec.html.
sub cookie_to_client {
    local($cookie, $source_path, $source_server)= @_ ;
    local($name, $value, $expires_clause, $path, $domain, $secure_clause) ;
    local($new_name, $new_cookie) ;

    ($name, $value)=   $cookie=~ /^\s*([^=;,\s]*)=?([^;,\s]*)/ ;
    ($expires_clause)= $cookie=~ /;\s*(expires\s*=[^;]*)/i ;
    ($path)=     $cookie=~ /;\s*path\s*=\s*([^;,\s]*)/i ;  # clash w/ ;-params?
    ($domain)=         $cookie=~ /;\s*domain\s*=\s*([^;,\s]*)/i ;
    ($secure_clause)=  $cookie=~ /;(\s*secure\b)/i ;

    # Path defaults to path of URL that sent the cookie
    $path= $source_path if $path eq '' ;

    # Domain must be checked for validity: defaults to the server that sent
    #   the cookie; otherwise, must match end of that server name, and must
    #   contain at least two dots if in one of these seven top-level domains, 
    #   three dots otherwise.
    # As it turns out, hostnames ending in extraneous dots, like
    #   "slashdot.org.." resolve to the hostname without the dots.  So we
    #   need to guard against malicious cookie servers getting around the
    #   two/three-dot requirement this way.
    if ($domain eq '') {
	$domain= $source_server ;
    } else {
	$domain=~ s/\.*$//g ;  # removes trailing dots!
	$domain=~ tr/././s ;   # ... and double dots for good measure.
	return(undef) if $source_server!~ /$domain$/ ;
	return(undef) unless 
	    ( ( ($domain=~ tr/././) >= 3 ) ||
	      ( ($domain=~ tr/././) >= 2 && 
		$domain=~ /\.(com|edu|net|org|gov|mil|int)$/ )
	    ) ;
    }

    # This is hereby the transformed format: name is COOKIE;$name;$path;$domain
    #   with %, ;, and = URL-encoded.  (The three values won't already
    #   have semicolons in them.)  $value is unchanged, since it's already
    #   cookie-legal, so to speak.
    $new_name= join(';', 'COOKIE', $name, $path, $domain) ;
    $new_name=~ s/%/%25/g ;    # must be first
    $new_name=~ s/;/%3b/g ;
    $new_name=~ s/,/%2c/g ;
    $new_name=~ s/=/%3d/g ;

    # Create the new cookie from its components, removing the empty ones.
    # The new domain is this proxy server, which is the default if it is not 
    #   specified.
    $new_cookie= join('; ', grep(length,
				 $new_name . '=' . $value, 
				 $expires_clause, 
				 'path=' . $ENV{'SCRIPT_NAME'} . '/',
				 $secure_clause 
		     )) ;
    return $new_cookie ;

}



# Returns a cookie that contains authentication information for a particular
#   realm and server.  The format of the cookie is:  The name is
#   AUTH;$encoded_realm;$encoded_server, and the value is the base64-encoded
#   "$username:$password" needed for the Authorization: header.  This means
#   some double-URL-encoding here.  :P
# Leave the "expires" clause out, which means the cookie lasts as long as
#   the session, which is what we want.
sub auth_cookie {
    local($username, $password, $realm, $server)= @_ ;
    local($name, $value) ;

    $realm=~ s/(\W)/ '%' . sprintf('%02x',ord($1)) /ge ;
    $server=~ s/(\W)/ '%' . sprintf('%02x',ord($1)) /ge ;

    $name= join(';', 'AUTH', $realm, $server) ;
    $name=~ s/%/%25/g ;    # must be first
    $name=~ s/;/%3b/g ;
    $name=~ s/,/%2c/g ;
    $name=~ s/=/%3d/g ;

    # base64 result never needs to be URL-encoded.
    return join('', $name, '=', &base64($username.':'.$password), 
		    '; path=' . $ENV{'SCRIPT_NAME'} . '/' ) ;

}


#--------------------------------------------------------------------------
#    Utility routines
#--------------------------------------------------------------------------

# The following subroutine looks messy, but can be used to open any
#   TCP/IP socket in any Perl program.  Except for the &HTMLdie() part.
# Note that $AF_INET and $SOCK_STREAM are set near the start of the program,
#   near the "use Socket" statement.
sub newsocketto {
    local(*S, $host, $port)= @_ ;
    local($hostaddr, $remotehost) ;

    # If $host is long integer like 3467251275, break it into a.b.c.d format.
    # This is for big-endian; reverse the list for little-endian.
    $host= join('.', $host>>24 & 255, $host>>16 & 255, $host>>8 & 255, 
		     $host & 255)
	if $host=~ /^\d+$/ ;

    # Create the remote host data structure, from host name or IP address
    $hostaddr= ($host=~ /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/)
		  ?  pack('C4', $1, $2, $3, $4)     # for IP address
		  :  ( (gethostbyname($host))[4]    # for alpha host name
		       || &HTMLdie("Couldn't find address for $host: $!") ) ;
    $remotehost= pack('S n a4 x8', $AF_INET, $port, $hostaddr) ;

    # Create the socket and connect to the remote host
    socket(S, $AF_INET, $SOCK_STREAM, (getprotobyname('tcp'))[2])
	|| &HTMLdie("Couldn't create socket: $!") ;
    connect(S, $remotehost) 
	|| &HTMLdie("Couldn't connect to $host:$port: $!") ;
    select((select(S), $|=1)[0]) ;      # unbuffer the socket
}


# Read a specific number of bytes from a socket, looping if necessary.
# Returns all bytes read (possibly less than $length), or undef on error.
# Typeglobbing *STDIN into *S doesn't seem to work with mod_perl 1.21,
#   so pass socket handle as a string instead (e.g. 'STDIN').
# Using *S, the read() below immediately fails under mod_perl.
sub read_socket {
#    local(*S, $length)= @_ ;
    local($S, $length)= @_ ;
    local($ret, $numread, $thisread) ;

    #$numread= 0 ;
    while (    ($numread<$length)
#	    && ($thisread= read(S, $ret, $length-$numread, $numread) ) )
	    && ($thisread= read($S, $ret, $length-$numread, $numread) ) )
    {
	$numread+= $thisread ;
    }

    return undef unless defined($thisread) ;
    return $ret ;
}



# This is a minimal routine that reads URL-encoded variables from a string,
#   presumably from something like QUERY_STRING.  If no string is passed,
#   it will read from either QUERY_STRING or STDIN, depending on
#   REQUEST_METHOD.  STDIN can't be read more than once for POST requests.
# It returns a hash.  In the event of multiple variables with the same name,
#   it concatenates the values into one hash element, delimiting with "\0".
# Returns undef on error.
sub getformvars {
    local($in)= @_ ;
    local(%in, $name, $value) ;

    # If no string is passed, read it from the usual channels.
    unless (defined($in)) {
	if ( ($ENV{'REQUEST_METHOD'} eq 'GET') ||
	     ($ENV{'REQUEST_METHOD'} eq 'HEAD') ) {
	    $in= $ENV{'QUERY_STRING'} ;
	} elsif ($ENV{'REQUEST_METHOD'} eq 'POST') {
	    return undef unless
	      ($ENV{'CONTENT_TYPE'}=~ m#^application/x-www-form-urlencoded$#i);
	    return undef unless defined($ENV{'CONTENT_LENGTH'}) ;
	    $in= &read_socket('STDIN', $ENV{'CONTENT_LENGTH'}) ;
	    # should we return undef if not all bytes were read?
	} else {
	    return undef ;   # unsupported REQUEST_METHOD
	}
    }

    foreach (split('&', $in)) {
	s/\+/ /g ;
	($name, $value)= split('=', $_, 2) ;
	$name=~ s/%([\da-fA-F]{2})/ pack('c', hex($1)) /ge ;
	$value=~ s/%([\da-fA-F]{2})/ pack('c', hex($1)) /ge ;
	$in{$name}.= "\0" if defined($in{$name}) ;  # concatenate multiple vars
	$in{$name}.= $value ;
    }
    return %in ;
}


# Escape any &"<> chars to &xxx; and return resulting string.
sub HTMLescape {
    local($s)= @_ ;
    $s=~ s/&/&amp;/g ;      # must be before all others
    $s=~ s/"/&quot;/g ;
    $s=~ s/</&lt;/g ;
    $s=~ s/>/&gt;/g ;
    return $s ;
}


# Unescape any &xxx; codes back to &"<> and return resulting string.
# Simplified version here; only includes &"<>.
sub HTMLunescape {
    local($s)= @_ ;
    $s=~ s/&quot;/"/g ;
    $s=~ s/&lt;/</g ;
    $s=~ s/&gt;/>/g ;
    $s=~ s/&amp;/&/g ;      # must be after all others
    return $s ;
}
    

# Base64-encode a string, except not inserting line breaks
sub base64 {
    local($s)= @_ ;
    local($ret, @c) ;

    # If this routine gets called several times per run, then make this global.
    local(@char)= ('A'..'Z', 'a'..'z', '0'..'9', '+', '/') ;

    while (@c= split(//,substr($s,0,3))) {
	substr($s,0,3)= '' ;
	foreach (@c) { $_= ord }
	$ret.= $char[$c[0]>>2] ;
	$ret.= $char[(($c[0]%4) <<4) + ($c[1]>>4)] ;
	$ret.= (@c>1)  ? $char[(($c[1]%16)<<2) + ($c[2]>>6)]  : '=' ;
	$ret.= (@c>2)  ? $char[$c[2]%64]  : '=' ;
    }

    return $ret ;
}


# Read an entire file into a string and return it; return undef on error.
# Does NOT check for any security holes in $fname!
sub readfile {
    local($fname)= @_ ;
    local(*F, $/, $ret) ;

    open(F, "<$fname") || return undef ;
    undef $/ ;
    $ret= <F> ;
    close(F) ;

    return $ret ;
}

#--------------------------------------------------------------------------
#    Output routines
#--------------------------------------------------------------------------

# Print the footer common to most error responses
sub footer {
    print <<EOF ;
<p>
<hr>
<a href="http://www.jmarshall.com/tools/cgiproxy/"><i>CGIProxy 1.4.1</i></a>
<p>
</body>
</html>
EOF
}



# Present the initial entry form
sub show_start_form {
    local($method, $action, $flags) ;

    $method= $USE_POST_ON_START   ? 'post'   : 'get' ;

    $action= $this_url . &proxy_encode('x-proxy://start') ;
    $action= &HTMLescape($action) ;   # probably never necessary

    # Include checkboxes if user config is allowed.
    if ($ALLOW_USER_CONFIG) {
	local($rc_on)= $REMOVE_COOKIES     ? ' checked'  : '' ;
	local($rs_on)= $REMOVE_SCRIPTS     ? ' checked'  : '' ;
	local($fa_on)= $FILTER_ADS         ? ' checked'  : '' ;
	local($if_on)= $INSERT_ENTRY_FORM  ? ' checked'  : '' ;
	$flags= <<EOF ;
<br><input type=checkbox name="rc"$rc_on> Remove all cookies (except certain proxy cookies)
<br><input type=checkbox name="rs"$rs_on> Remove all scripts (recommended for anonymity)
<br><input type=checkbox name="fa"$fa_on> Remove ads
<br><input type=checkbox name="if"$if_on> Show URL entry form
EOF
    }

    print <<EOF ;
$HTTP_1_0 200 OK
Content-type: text/html

<html>
<head>
<title>Start Using CGI Proxy</title>
</head>
<body>
<h1>CGI Proxy</h1>
<p>Start browsing through this CGI-based proxy by entering a URL below.
Only HTTP and FTP URLs are supported.  Not all functions will work 
(e.g. some JavaScript), but most pages will be fine.

<form action="$action" method=$method>
<input name="URL" size=50>
$flags
<p><input type=submit value="   Begin browsing   ">
</form>
EOF

    &footer ;
    goto EXIT ;
}


# Returns a mini version of the start form, as a string.  It requires
#   $this_url and $URL to be already set.
# In case we ever support this form in a frame, point it to target="_top" .
sub mini_start_form {
    local($method, $action, $flags, $safe_URL) ;

    $method= $USE_POST_ON_START   ? 'post'   : 'get' ;
    $action= $this_url . &proxy_encode('x-proxy://start') ;
    $action= &HTMLescape($action) ;   # probably never necessary
    $safe_URL= &HTMLescape($URL) ;

    # Include checkboxes if user config is allowed.
    if ($ALLOW_USER_CONFIG) {
	local($rc_on)= $REMOVE_COOKIES     ? ' checked'  : '' ;
	local($rs_on)= $REMOVE_SCRIPTS     ? ' checked'  : '' ;
	local($fa_on)= $FILTER_ADS         ? ' checked'  : '' ;
	local($if_on)= $INSERT_ENTRY_FORM  ? ' checked'  : '' ;
	$flags= <<EOF ;
<br><font size="-1"><input type=checkbox name="rc"$rc_on> No cookies
&nbsp;&nbsp;<input type=checkbox name="rs"$rs_on> No scripts
&nbsp;&nbsp;<input type=checkbox name="fa"$fa_on> No ads
&nbsp;&nbsp;<input type=checkbox name="if"$if_on> Show this form
</font>
EOF
    }

    return <<EOF ;
<form action="$action" method=$method target="_top">
<center>
Location&nbsp;via&nbsp;proxy:<input name="URL" size=66 value="$safe_URL"><input type=submit value="Go">
$flags
<hr>
</center>
</form>
EOF
}



# Present the user with a special form that lets them enter authentication.
# The target URL is proxy_encoded in the form, for obscurity.
# Uses POST, because a GET request would show auth info in a logged URL.
sub get_auth_from_user {
    local($server, $realm, $URL, $tried)= @_ ;
    local($action, $msg) ;

    $server= &HTMLescape($server) ;
    $realm=  &HTMLescape($realm) ;
    $URL=    &HTMLescape(&proxy_encode($URL)) ;

    $action= $this_url . &proxy_encode('x-proxy://auth/make_auth_cookie') ;
    $action= &HTMLescape($action) ;   # probably never necessary

    $msg= "<h3><font color=red>Authorization failed.  Try again.</font></h3>"
	if $tried ;

    print <<EOF ;
$HTTP_1_0 200 OK
Cache-Control: no-cache
Pragma: no-cache
Content-type: text/html

<html>
<head><title>Enter username and password for $realm at server</title></head>
<body>
<h1>Authorization Required</h1>
$msg

<form action="$action" method=post>
<input type=hidden name="s" value="$server">
<input type=hidden name="r" value="$realm">
<input type=hidden name="l" value="$URL">

<table border=1 cellpadding=5>
<tr><th bgcolor="#ff6666">
    Enter username and password for <nobr>$realm</nobr> at $server:</th></tr>
<tr><td bgcolor="#b0b0b0">
    <table cellpadding=0 cellspacing=0>
    <tr><td>Username:</td><td><input name="u" size=20></td>
	<td>&nbsp;&nbsp;&nbsp;<input type=submit value="OK"></tr>
    <tr><td>Password:</td><td><input type=password name="p" size=20></td></tr>
    </table>
</table>
</form>
<p>This requires cookie support turned on in your browser.
<p><i><b>Note:</b> Anytime you use a proxy, you're trusting the owner of that
proxy with all information you enter, including your name and password here.
This is true for <b>any</b> proxy, not just this one.
EOF

    &footer ;
    goto EXIT ;
}



# Return "403 Forbidden" message if the target server is forbidden.
sub banned_server_die {
    print <<EOF ;
$HTTP_1_0 403 Forbidden
Content-type: text/html

<html>
<head><title>The proxy can't access that server, sorry.</title></head>
<body>
<h1>The proxy can't access that server, sorry.</h1>
<p>The owner of this proxy has restricted which servers it can access,
presumably for security or bandwidth reasons.  The server you just tried
to access is not on the list of allowed servers.
EOF

    &footer ;
    goto EXIT ;
}



# If so configured, disallow browsing back through this same script.
sub loop_disallowed_die {
    local($URL)= @_ ;
    print <<EOF ;
$HTTP_1_0 403 Forbidden
Content-type: text/html

<html>
<head><title>Proxy cannot loop back through itself</title></head>
<body>
<h1>Proxy cannot loop back through itself</h1>
<p>The URL you tried to access would cause this proxy to access itself,
which is redundant and probably a waste of resources.  The owner of this
proxy has configured it to disallow such looping.
<p>Rather than telling the proxy to access the proxy to access the desired
resource, try telling the proxy to access the resource directly.  The link
below <i>may</i> do this.
<blockquote><tt><a href="$URL">$URL</a></tt></blockquote>
EOF

    &footer ;
    goto EXIT ;
}



# Alert the user to an unsupported URL, with this intermediate page.
sub unsupported_warning {
    local($URL)= @_ ;
    print <<EOF ;
$HTTP_1_0 200 OK
Content-type: text/html

<html>
<head><title>WARNING: Entering non-anonymous area!</title></head>
<body>
<h1>WARNING: Entering non-anonymous area!</h1>
<h3>This proxy only supports HTTP and FTP.  Any browsing to another URL will
be directly from your browser, and no longer anonymous.</h3>
<h3>Follow the link below to continue to the URL, non-anonymously.</h3>
<blockquote><tt><a href="$URL">$URL</a></tt></blockquote>
EOF

    &footer ;
    goto EXIT ;
}



# Return "406 Not Acceptable" response for script content-type.
sub script_content_die {
    print <<EOF ;
$HTTP_1_0 406 Not Acceptable
Content-type: text/html

<html>
<head><title>Script content blocked</title>
<body>
<h1>Script content blocked</h1>
<p>The resource you requested (or were redirected to without your knowledge)
is apparently an executable script.  Such resources have been blocked by this
proxy, presumably for your own protection.
<p>Even if you're sure you want the script, you can't get it through this
proxy the way it's configured.  If permitted, try browsing through this proxy
without removing scripts.  Otherwise, you'll need to reconfigure the proxy or
find another way to get the resource.
EOF

    &footer ;
    goto EXIT ;
}



# Return "403 Forbidden" message if images are forbidden.
sub non_text_die {
    print <<EOF ;
$HTTP_1_0 403 Forbidden
Content-type: text/html

<html>
<head><title>Proxy cannot forward non-text files</title></head>
<body>
<h1>Proxy cannot forward non-text files</h1>
<p>Due to bandwidth limitations, the owner of this particular proxy is
forwarding only text files.  For best results, turn off automatic image 
loading if your browser lets you.
<p>If you need access to images or other binary data, route your browser 
through another proxy (or install one yourself--
<a href="http://www.jmarshall.com/tools/cgiproxy/">it's easy</a>).
EOF

    &footer ;
    goto EXIT ;
}



# When an image should be blanked, returns either a transparent 1x1 GIF or
#   a 406 result ("Not Acceptable").  Yes, that's an inlined 43-byte GIF.
sub skip_image {
    if ($RETURN_EMPTY_GIF) {
	print <<EOF ;
$HTTP_1_0 200 OK
Content-Type: image/gif
Content-Length: 43

GIF89a\x01\0\x01\0\x80\0\0\0\0\0\xff\xff\xff\x21\xf9\x04\x01\0\0\0\0\x2c\0\0\0\0\x01\0\x01\0\x40\x02\x02\x44\x01\0\x3b
EOF
    } else {
	print "$HTTP_1_0 406 Not Acceptable\015\012\015\012" ;
    }

    goto EXIT ;
}



# Die, outputting HTML error page, with optional response code and title.
sub HTMLdie {
    local($msg, $title, $status)= @_ ;
    $title= 'CGIProxy Error' if $title eq '' ;
    $status= '200 OK' if $status eq '' ;
    print <<EOF ;
$HTTP_1_0 $status
Content-Type: text/html

<html>
<head><title>$title</title></head>
<body>
<h1>$title</h1>
<h3>$msg</h3>
EOF

    &footer ;
    goto EXIT ;
}
