#!/usr/bin/env python
# generates a PDF containing its own MD5 hash using text and XObject tricks
#
# Copyright (c) 2017 Mako
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

try:
    from io import BytesIO
except ImportError:
    from cStringIO import StringIO as BytesIO
import sys, re, zlib
from struct import pack, unpack
from hashlib import sha1, md5
import subprocess, random
from ctypes import *

# Native helper library loaded at import time.  The functions called on it in
# this file are MD5Transform, MD5CollideBlock0 and MD5CollideBlock1.  The path
# is relative, so the shared object is looked up from the current working
# directory.
libcoll = CDLL("./libcoll-pdf.so")

def _new_iv():
    """Return a new 16-byte ctypes buffer holding the four MD5 initial state words.

    The words are packed in native byte order; a fresh buffer is allocated on
    every call so callers can update it in place.
    """
    initial_words = (0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476)
    return create_string_buffer(pack("IIII", *initial_words), 16)

def othername(s):
    """Return the 60-byte string *s* with the collision differential applied.

    *s* is read as fifteen little-endian 32-bit words; 2**31 is added to words
    4 and 14 and 2**15 to word 11 (all modulo 2**32).
    """
    assert len(s) == 60
    words = list(unpack("<15I", s))
    for index, delta in ((4, 1 << 31), (11, 1 << 15), (14, 1 << 31)):
        words[index] = (words[index] + delta) & 0xffffffff
    return pack("<15I", *words)

def otherside(s):
    """Return the 128-byte string *s* with the two-block collision differential applied.

    *s* is read as thirty-two little-endian 32-bit words (two 64-byte blocks).
    In the first block 2**31 is added to words 4 and 14 and 2**15 to word 11;
    in the second block the same amounts are subtracted from the same word
    positions.  All arithmetic is modulo 2**32.
    """
    assert len(s) == 128
    words = list(unpack("<32I", s))
    for index, delta in ((4, 1 << 31), (11, 1 << 15), (14, 1 << 31)):
        words[index] = (words[index] + delta) & 0xffffffff
        words[16 + index] = (words[16 + index] - delta) & 0xffffffff
    return pack("<32I", *words)

def dofixup(f, pos):
    """Replace the 128 bytes at offset *pos* of *f* with otherside() of them, in place."""
    f.seek(pos)
    flipped = otherside(f.read(128))
    f.seek(pos)
    f.write(flipped)

def prepad(prefix, n, suffix):
    """Return *n* bytes of whitespace padding to place between *prefix* and *suffix*.

    The padding is mangled at random until the MD5 state after hashing
    prefix + padding + suffix meets the recommended conditions: bits 24 and 25
    of the third state word differ and bits 24 and 25 of the fourth are equal.

    len(prefix) + n + len(suffix) must be a multiple of 64 and *suffix* must be
    shorter than 56 bytes.  When the padding and suffix do not fit in a single
    64-byte block, the surplus is emitted as plain spaces ahead of the
    randomised part.
    """
    assert (len(prefix) + n + len(suffix)) % 64 == 0
    assert len(suffix) < 56

    surplus = n + len(suffix) - 64
    if surplus > 0:
        # Fill whole blocks with spaces; only the last block is randomised.
        fixed_pad = b" " * surplus
        hashed = prefix + fixed_pad
        partial = b""
        n = 64 - len(suffix)
    else:
        # The final block starts inside the prefix: split at the block boundary.
        fixed_pad = b""
        boundary = len(prefix) & ~63
        hashed, partial = prefix[:boundary], prefix[boundary:]
    assert len(hashed) % 64 == 0

    # MD5 state after every complete block that precedes the final one.
    state = _new_iv()
    for offset in range(0, len(hashed), 64):
        libcoll.MD5Transform(state, hashed[offset:offset + 64])

    whitespace = b" \t\n\r"
    while True:
        pad = bytes(bytearray([random.choice(whitespace) for _ in range(n)]))
        # Work on a copy so every attempt starts from the same state.
        trial = create_string_buffer(state.raw, 16)
        libcoll.MD5Transform(trial, partial + pad + suffix)
        words = unpack("IIII", trial.raw)
        third, fourth = words[2], words[3]
        third_differs = ((third >> 25) & 1) != ((third >> 24) & 1)
        fourth_matches = ((fourth >> 25) & 1) == ((fourth >> 24) & 1)
        if third_differs and fourth_matches:
            return fixed_pad + pad

# Bytes that must not appear in the name part of a collision.
# PDFium treats \x80 and \xff as spaces, other implementations don't.
# Only pdfminer (a Python library) seems to treat \x0b as whitespace.
_char_blacklist = b"\x00\n\x0c\x0b\r\t /<>[]%)({}#\x80\xff"


def _flag_table(flagged):
    """Return a 256-byte table holding 1 at every byte value in *flagged*, else 0."""
    table = bytearray(256)
    for value in bytearray(flagged):
        table[value] = 1
    return bytes(table)


# Lookup tables handed to the collision search in makecoll: the full blacklist
# for the first block, only ")" for the second.
_badchars = _flag_table(_char_blacklist)
_badchars2 = _flag_table(b")")

def makecoll(prefix):
    """Return a pair (a, b) of 128-byte strings with md5(prefix+a) == md5(prefix+b).

    *prefix* must be a whole number of 64-byte MD5 blocks.  The two blocks of
    *a* are produced by libcoll; *b* is derived from *a* with otherside().
    The equality of the two digests is asserted before returning.
    """
    assert len(prefix) % 64 == 0
    state = _new_iv()
    for offset in range(0, len(prefix), 64):
        libcoll.MD5Transform(state, prefix[offset:offset + 64])

    first = create_string_buffer(64)
    libcoll.MD5CollideBlock0(state, first, _badchars)
    # Advance the state over the first block before searching for the second.
    libcoll.MD5Transform(state, first)

    second = create_string_buffer(64)
    # don't push the badchars filter on block 1 too hard, it's a little dodgy.
    libcoll.MD5CollideBlock1(state, second, _badchars2)

    # The buffers hold native-order words; re-pack them little-endian.
    native_words = unpack("32I", first.raw + second.raw)
    a = pack("<32I", *native_words)
    b = otherside(a)
    assert md5(prefix + a).hexdigest() == md5(prefix + b).hexdigest()
    return (a, b)

def _is_clean(s):
    """Return True when *s* contains none of the bytes listed in _char_blacklist."""
    return all(s.find(banned) == -1 for banned in _char_blacklist)

def nameify(s):
    """Return the part of *s* before its first space, or all of *s* if it has none."""
    head, _sep, _rest = s.partition(b" ")
    return head

# Upper bound on the open-parenthesis count of an accepted collision, as
# computed in cleancoll and closeparens (number of "(" in bytes 64.. plus one).
# closeparens always returns exactly this many bytes.
MAX_PARENS = 8

# Matches a backslash followed by zero to two digits and then "(".
# cleancoll rejects any collision whose second block matches this pattern.
_escaped_paren_re = re.compile(br"\\(\d){0,2}\(")

def cleancoll(prefix):
    """Return a collision pair (a, b) from makecoll() that is safe to embed.

    Retries until a suitable collision is produced; this usually works the
    first time.  A pair is accepted when, looking at bytes 64 onward, both
    sides contain the same number of "(" with at most MAX_PARENS - 1 of them,
    neither side matches _escaped_paren_re, and neither side has a backslash
    in its last four bytes.
    """
    while True:
        a, b = makecoll(prefix)
        tail_a, tail_b = a[64:], b[64:]

        # Invariants the collision search is expected to guarantee.
        assert a[60:64] == b" Do("
        assert b[60:64] == b" Do("
        assert _is_clean(a[:60]) and _is_clean(b[:60])
        assert tail_a.find(b")") == -1 and tail_b.find(b")") == -1

        # avoid escaped parens altogether because we aren't handling them yet
        # (What happens if only *one* side of the collision has a backslash?)
        open_count = len(tail_a.split(b"("))
        if open_count != len(tail_b.split(b"(")):
            continue
        if open_count > MAX_PARENS:
            continue
        if _escaped_paren_re.search(tail_a) or _escaped_paren_re.search(tail_b):
            continue
        if a[124:].find(b"\\") != -1 or b[124:].find(b"\\") != -1:
            continue
        return (a, b)

def closeparens(a):
    """Return MAX_PARENS bytes: one ")" per open string in *a*, then space padding.

    The count is the number of "(" in a[64:] plus one.
    """
    # have to match parens to close our strings and keep Adobe Reader happy
    depth = a[64:].count(b"(") + 1
    assert depth <= MAX_PARENS
    return (b")" * depth).ljust(MAX_PARENS, b" ")

# Number of hexadecimal digits in an MD5 hexdigest; makepdf builds one
# selectable character slot per digit.
MD5LEN = 32

class PDFMaker:
    """Writes numbered PDF objects to a binary stream and records their xref entries.

    *f* must support ``write`` and ``tell``.  ``makecollision`` (and therefore
    ``makechoice`` with more than one choice) also calls ``f.getvalue()``, as
    provided by BytesIO.
    """

    def __init__(self, f):
        self.f = f
        # One (type, field2, field3) tuple per object; object N is at index N-1.
        # beginobj records (1, byte offset, 0); compressedobj additionally
        # records (2, object-stream number, 0) for the object inside the stream.
        self.xrefs = []
        # (name, object number) pairs collected by makechoice.
        self.xobjnames = []

    def beginobj(self):
        """Start a new object at the current position and return its number.

        Writes the "N 0 obj" header; the caller writes the body and "endobj".
        """
        self.xrefs.append((1, self.f.tell(), 0))
        self.f.write(b"%i 0 obj\n" % len(self.xrefs))
        return len(self.xrefs)

    def compressedobj(self, data):
        """Write *data* as the only object of a Flate-compressed object stream.

        Two object numbers are consumed: one for the object stream itself and
        one for the object stored inside it.  Returns the latter.
        """
        objstm_n = self.beginobj()
        self.xrefs.append((2, objstm_n, 0))
        # Stream index: "<object number> <offset>" for the single object.
        index = b"%i 0\n" % len(self.xrefs)
        first = len(index)
        payload = zlib.compress(index + data, 9)
        self.f.write(b"<< /Type /ObjStm /N 1 /First %i /Length %i /Filter /FlateDecode >>\nstream\n" % (first, len(payload)))
        self.f.write(payload)
        self.f.write(b'\nendstream\nendobj\n')
        return len(self.xrefs)

    def writetrailer(self, root_n, pdf15=True):
        """Write the cross-reference data, startxref pointer and EOF marker.

        root_n -- object number of the document catalog.
        pdf15  -- when true, emit a compressed cross-reference stream object
                  (which takes the next object number); otherwise emit a
                  classic xref table and trailer, which requires every
                  recorded entry to be of type 1.
        """
        self.f.write(b'\n\n')
        startxref = self.f.tell()
        if pdf15:
            # The xref stream is itself an object and lists its own entry.
            self.beginobj()
            size = len(self.xrefs) + 1

            # Entry for object 0 first, then one fixed-width record per object,
            # laid out as 1 + 4 + 1 bytes to match /W [1 4 1].
            records = [pack(">BIB", 0, size, 0)]
            records.extend(pack(">BIB", *xref) for xref in self.xrefs)
            t = zlib.compress(b"".join(records), 9)

            self.f.write(b'<</Type/XRef/Root %i 0 R/Size %i/W [1 4 1]/Length %i/Filter/FlateDecode>>\nstream\n' % (root_n, size, len(t)))
            self.f.write(t)
            self.f.write(b'\nendstream\nendobj\n\n')
        else:
            self.f.write(b'xref\n0 %i \n0000000000 65535 f \n' % (len(self.xrefs)+1))
            for xref in self.xrefs:
                # Objects inside object streams cannot be listed in this table.
                assert(xref[0] == 1)
                self.f.write(b'%010i 00000 n \n' % xref[1])
            self.f.write(b'\ntrailer <</Root %i 0 R/Size %i>>\n' % (root_n, len(self.xrefs)+1))
        self.f.write(b"\nstartxref\n%i\n" % startxref)
        self.f.write(b'%%EOF\n')

    def makecollision(self, width, height):
        """Write a Form XObject whose content stream holds an MD5 collision.

        The stream is "/" followed by the 128-byte collision and the closing
        parentheses from closeparens().  Returns
        (object number, name on side a, name on side b, file offset of the
        collision); the offset is what dofixup() needs to switch sides.
        """
        # it's not worth being too aggressive about size here, we end up
        # having to pad to the next MD5 block regardless of what we do
        TARGET_LEN = 129 + MAX_PARENS
        text_n = self.beginobj()
        self.f.write(b"<</Type/XObject /Subtype/Form")
        self.f.write(b"/Length %i/BBox[0 0 %i %i]" % (TARGET_LEN, width, height))
        # pad so the / is at end of MD5 block
        padlen = 53 - (self.f.tell() % 64)
        # always leave at least four bytes for prepad to mangle
        if padlen < 4:
            padlen += 64
        self.f.write(prepad(self.f.getvalue(), padlen, b">>\nstream\n/"))
        self.f.write(b'>>\nstream\n')
        textstart = self.f.tell()
        self.f.write(b"/")
        assert((self.f.tell() % 64) == 0)

        collstart = self.f.tell()
        a, b = cleancoll(self.f.getvalue())
        self.f.write(a)
        namea = nameify(a)
        nameb = nameify(b)
        self.f.write(closeparens(a))

        # The /Length written above must match what was actually emitted.
        assert(self.f.tell() == textstart + TARGET_LEN)
        self.f.write(b'\nendstream\nendobj\n\n')
        return (text_n, namea, nameb, collstart)

    def makechoice(self, choices, width, height):
        """Build a tree of collisions selecting one of *choices*.

        builds a multicollision that lets us pick which XObject to
        display without affecting the MD5 of the document

        choices -- non-empty sequence of XObject object numbers.
        Returns (object number of the tree root, paths) where paths[i] is the
        tuple of collision offsets that must be switched to their other side
        for choices[i] to be the one referenced.

        Raises ValueError if *choices* is empty.
        """
        if not choices:
            # An empty sequence would otherwise recurse without bound.
            raise ValueError("choices must not be empty")
        if len(choices) == 1:
            return (choices[0], ((),) )
        middle = len(choices)//2
        left_n, left_paths = self.makechoice(choices[:middle], width, height)
        right_n, right_paths = self.makechoice(choices[middle:], width, height)
        coll_n, namea, nameb, collstart = self.makecollision(width, height)
        self.xobjnames.append((namea, left_n))
        self.xobjnames.append((nameb, right_n))
        # Side a (as written) points left; reaching the right half means
        # switching this collision as well.
        paths = left_paths + tuple( path+(collstart,) for path in right_paths )
        return coll_n, paths

def makebase(numcolls):
    """Build a base PDF containing *numcolls* collision objects and return its bytes.

    The collisions are meant to be reusable later.  Besides them the document
    holds an empty content stream, a catalog, a page tree and a single page
    with empty resources, followed by the trailer.
    """
    out = BytesIO()
    maker = PDFMaker(out)
    width, height = 9, 16
    out.write(b"%PDF-1.5\n%\xe2\xe3\xcf\xd3\n\n\n")

    for done in range(numcolls):
        maker.makecollision(width, height)
        print("[%i/%i]" % (done, numcolls))

    contents_n = maker.beginobj()
    out.write(b'<</Length 0>>\nstream\n\nendstream\nendobj\n\n')

    # The catalog and the page tree each refer forward to the object that is
    # written immediately after them.
    root_n = maker.beginobj()
    next_n = len(maker.xrefs) + 1
    out.write(b'<<\n  /Type /Catalog\n  /Pages %i 0 R\n>>\nendobj\n\n\n' % next_n)

    pages_n = maker.beginobj()
    next_n = len(maker.xrefs) + 1
    out.write(b'<<\n  /Type /Pages\n  /Count 1\n  /Kids [%i 0 R]\n>>\nendobj\n\n' % next_n)

    maker.beginobj()
    page_box = (width*10, height*10)
    out.write(b"".join([
        b'<<\n  /Type /Page\n  /Parent %i 0 R\n' % (pages_n),
        b"  /MediaBox [0 0 %i %i]\n" % page_box,
        b"  /CropBox [0 0 %i %i]\n" % page_box,
        b'  /Contents %i 0 R\n' % contents_n,
        b'  /Resources\n  << >>\n',
        b">>\nendobj\n\n",
    ]))

    maker.writetrailer(root_n)
    return out.getvalue()
    
def makepdf():
    """Build and return the bytes of a PDF that displays its own MD5 hexdigest.

    One Form XObject is written per hexadecimal digit character, then for each
    of the MD5LEN digit positions a tree of collisions that can select any of
    those forms.  After the file is complete its MD5 is computed and the
    collisions on each position's path are switched so the matching digit is
    shown; the digest is asserted to be unchanged afterwards.
    """
    out = BytesIO()
    maker = PDFMaker(out)
    width, height = 9, 16
    font_size = 16
    vshift = 3
    out.write(b"%PDF-1.5\n%\xe2\xe3\xcf\xd3\n\n\n")

    # One form per hex digit, in value order so index == digit value.
    forms_n = []
    for code in bytearray(b"0123456789ABCDEF"):
        forms_n.append(maker.beginobj())
        glyph = bytes(bytearray((code,)))
        formtext = b"BT\n/F1 %i Tf\n0 %i Td\n(%s) Tj\nET\n" % (font_size, vshift, glyph)
        out.write(b"<</Type/XObject /Subtype/Form")
        out.write(b"/Length %i/BBox [0 0 %i %i]>>\n" % (len(formtext), width, height))
        out.write(b"stream\n")
        out.write(formtext)
        out.write(b"\nendstream\nendobj\n\n")

    # One selection tree per digit position.
    choices = []
    collns = []
    for slot in range(MD5LEN):
        print("\n%i/%i choices calculated" % (slot, MD5LEN))
        coll_n, paths = maker.makechoice(forms_n, width, height)
        choices.append(paths)
        collns.append(coll_n)

    # Page content: draw each position's root form, shifted right by its index.
    pdftext = b"".join(
        b"q\n  1 0 0 1 %i 0 cm\n  /Fm%i Do\nQ\n" % (width*slot, slot)
        for slot in range(len(collns))
    )
    text_n = maker.beginobj()
    out.write(b'<</Length %i>>\nstream\n' % len(pdftext))
    out.write(pdftext)
    out.write(b'\nendstream\nendobj\n\n')

    font_n = maker.beginobj()
    out.write(b"<<\n/Type /Font\n/Subtype /Type1\n/Name /F1\n/BaseFont /%s\n>>\nendobj\n" % (b"Courier"))

    # compressing the resources object saves a few KB
    pieces = [
        b"<<\n",
        b"    /Font << /F1 %i 0 R >>\n" % font_n,
        b"    /XObject <<",
    ]
    for slot, root in enumerate(collns):
        pieces.append(b"/Fm%i %i 0 R" % (slot, root))
    for name, target in maker.xobjnames:
        pieces.append(b"/%s %i 0 R" % (name, target))
    pieces.append(b">>\n")
    pieces.append(b">>\n")
    res_n = maker.compressedobj(b"".join(pieces))

    # The catalog and the page tree each refer forward to the object that is
    # written immediately after them.
    root_n = maker.beginobj()
    out.write(b'<<\n  /Type /Catalog\n  /Pages %i 0 R\n>>\nendobj\n\n\n' % (len(maker.xrefs)+1))
    pages_n = maker.beginobj()
    out.write(b'<<\n  /Type /Pages\n  /Count 1\n  /Kids [%i 0 R]\n>>\nendobj\n\n' % (len(maker.xrefs)+1))
    maker.beginobj()
    page_box = (width*MD5LEN, height)
    out.write(b'<<\n  /Type /Page\n  /Parent %i 0 R\n' % (pages_n))
    out.write(b"  /MediaBox [0 0 %i %i]\n" % page_box)
    out.write(b"  /CropBox [0 0 %i %i]\n" % page_box)
    out.write(b'  /Contents %i 0 R\n' % text_n)
    out.write(b'  /Resources %i 0 R\n' % res_n)
    out.write(b">>\nendobj\n\n")
    maker.writetrailer(root_n)

    # Select, per position, the path for the digit the digest has there.
    cs = md5(out.getvalue()).hexdigest()
    for slot_paths, hexdigit in zip(choices, cs):
        for pos in slot_paths[int(hexdigit, 16)]:
            dofixup(out, pos)
    assert md5(out.getvalue()).hexdigest() == cs
    return out.getvalue()

# Script entry point: build the PDF, print its MD5 and save it to disk.
if __name__ == '__main__':
    t = makepdf()
    print(md5(t).hexdigest())
    fname = "demo-selfmd5-text.pdf"
    # The with-block closes the file even if the write fails.
    with open(fname, "wb") as f:
        f.write(t)
    print("Created %s" % fname)
