#!/usr/bin/python
# Buggy hack to rewrite a PDF to include its own MD5 hash.
# No warranty. If it breaks you get to keep both pieces.

from selfmd5pdf import dofixup, othername, PDFMaker

from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure
from pdfminer.pdftypes import PDFStream, PDFObjRef
from pdfminer.psparser import PSLiteral

from struct import pack, unpack
from hashlib import md5
import zlib, random, cStringIO, sys

# Marker text to put in the input PDF where the digest should appear; the
# script overlays it with the file's actual MD5 value.
# - The page text is upper-cased before searching, so lowercase works too.
# - Use a fixed-width font: the scan asserts that all 32 glyphs share one
#   width and height.
# - The 16 characters after the leading dash are read back from the page and
#   reused as the glyphs for the sixteen hex digits.
placeholder = "-0123456789ABCDEF-ADD-MD5-PLEASE"

def _randname():
    """Return a random 10-character name made of ASCII digits and letters.

    The characters are drawn one at a time from the module-level ``random``
    generator, so seeding ``random`` makes the result reproducible.
    """
    alphabet = "0123456789ABCDEFGHIJLKMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    picked = []
    for _ in range(10):
        picked.append(random.choice(alphabet))
    return "".join(picked)

class PDFRemaker(PDFMaker):
    """PDFMaker that starts its output with the head of a prebuilt base PDF.

    The base PDF is scanned for "collision" XObject streams: streams of
    /Type /XObject whose decoded content starts with ``/<60-char name> Do(``.
    Their object id, position and name are queued in ``self.fixups`` and are
    handed out by getchoice() to build selectable branches.

    Attributes set here:
      fixups     -- list of (object id, position from xref.get_pos, name)
      base       -- the leading bytes of the base file that were copied to f
      basewidth, baseheight -- hard-coded size of one glyph cell
    """
    def __init__(self, f, basename, numcolls):
        """Copy the head of the base PDF to ``f`` and index its collisions.

        f        -- output file object (also handed to PDFMaker.__init__)
        basename -- path of the base PDF to read
        numcolls -- number of collision XObjects that must be found

        Raises AssertionError if the base file does not have the expected
        structure (too few collisions, object positions out of order, or a
        short read).
        """
        PDFMaker.__init__(self, f)

        self.fixups = []
        fbase = open(basename,"rb")
        # try/finally so the handle is released even when one of the
        # structural assertions below fails.
        try:
            parser = PDFParser(fbase)
            doc = PDFDocument(parser)
            xref = doc.xrefs[0]
            maxid = 1
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if not isinstance(obj, PDFStream) or "Type" not in obj or repr(obj["Type"]) != "/XObject":
                    continue
                if obj.data is None:
                    obj.decode()
                # A collision stream looks like "/<60 chars> Do(...".
                # Slicing (rather than indexing) tolerates an empty stream.
                if obj.data[:1] == "/" and obj.data[61:65] == " Do(":
                    if objid >= maxid:
                        maxid = objid+1
                    self.fixups.append((objid, xref.get_pos(objid)[1], obj.data[1:61]))
                if len(self.fixups) == numcolls:
                    break

            assert len(self.fixups) == numcolls
            # Everything before the object that follows the last collected
            # collision is copied verbatim into the output.
            baselen = xref.get_pos(maxid)[1]
            self.basewidth = 9.0; self.baseheight = 16.0 # FIXME - don't hardcode
            for objid in range(1, maxid):
                assert xref.get_pos(objid)[1] < baselen
                # self.xrefs is presumably created by PDFMaker.__init__ (not
                # visible here); the copied objects keep their positions.
                self.xrefs.append((1, xref.get_pos(objid)[1], 0))

            fbase.seek(0)
            self.base = fbase.read(baselen)
            assert len(self.base) == baselen
        finally:
            fbase.close()
        f.write(self.base)

    def getchoice(self, choices):
        """Build a binary selection tree over ``choices`` from collision blocks.

        choices -- non-empty sequence of object numbers to choose between

        Returns ``(n, paths)``. ``n`` is the object number at the root of the
        tree (the single choice itself when there is only one). ``paths[k]``
        is a tuple of byte offsets into the output; the caller passes each of
        them to dofixup() in order to select ``choices[k]``.

        Each internal node consumes one entry of ``self.fixups`` and records
        two (name, object number) pairs in ``self.xobjnames`` (presumably
        created by PDFMaker.__init__ -- not visible here): the collision's own
        name for the left subtree and othername(name) for the right one.

        Raises ValueError for an empty ``choices``, IndexError when the
        collisions run out, and AssertionError if the collision block is not
        where it is expected in the base data.
        """
        if not choices:
            # Without this guard the halving below would recurse forever.
            raise ValueError("getchoice() needs at least one choice")
        if len(choices) == 1:
            return (choices[0], ((),) )
        mid = len(choices) // 2
        # Order matters: both subtrees take their collisions before this node.
        left_n, left_paths = self.getchoice(choices[:mid])
        right_n, right_paths = self.getchoice(choices[mid:])
        coll_n, pos, namea = self.fixups.pop(0)
        collstart = self.base.find(">>\nstream\n/", pos)
        assert pos < collstart < pos + 128
        # Skip the 11 characters of ">>\nstream\n/" to land on the name itself.
        collstart += 11
        # The name must start on a 64-byte boundary (presumably the MD5 block
        # size that dofixup relies on -- confirm in selfmd5pdf).
        assert collstart % 64 == 0
        nameb = othername(namea)
        self.xobjnames.append((namea, left_n))
        self.xobjnames.append((nameb, right_n))
        # Left choices need no change at this node; right choices additionally
        # need this node's collision block fixed up.
        paths = left_paths + tuple( path+(collstart,) for path in right_paths )
        return coll_n, paths
        
def obj2str(obj, idmap):
    """Serialise a parsed PDF object back into PDF source text.

    obj   -- None, bool, int, float, str, list, dict, PDFObjRef, PSLiteral or
             PDFStream; containers are serialised recursively
    idmap -- dict mapping object ids of the source document to object numbers
             in the output; looked up for every PDFObjRef

    Returns the serialised text. Streams are returned as dictionary +
    "stream ... endstream" without the surrounding "obj"/"endobj".

    Raises TypeError (message: the offending type) for unsupported objects and
    KeyError when a reference points at an id missing from ``idmap``.
    """
    # None and bool are handled first: bool is a subclass of int, and str()
    # would give "True"/"False", which is not valid PDF syntax.
    if obj is None:
        return "null"
    elif isinstance(obj, bool):
        return "true" if obj else "false"
    elif isinstance(obj, dict):
        return "<< " + " ".join("/"+k+" "+obj2str(v,idmap) for k, v in obj.items()) + " >>"
    elif isinstance(obj, list):
        return "[" + " ".join(obj2str(v,idmap) for v in obj) + "]"
    elif isinstance(obj, int) or isinstance(obj, float):
        return str(obj)
    elif isinstance(obj, PDFObjRef):
        return "%i 0 R" % idmap[obj.objid]
    elif isinstance(obj, str):
        # Python 2 byte string, written as a PDF hex string.
        return "<"+obj.encode("hex").upper()+">" # FIXME - do we always need to escape?
    elif isinstance(obj, PSLiteral):
        return repr(obj) # FIXME - escaping
    elif isinstance(obj, PDFStream):
        attrs = dict(obj.attrs)
        if obj.data is None:
            # Never decoded: pass the raw bytes through with their original
            # attributes (filters included).
            return obj2str(attrs,idmap)+"\nstream\n"+obj.rawdata+"\nendstream"
        # The stream was decoded, so it is re-encoded from scratch below.
        # Predictor settings live in the decode-parameters dictionary; they
        # describe the old encoding (pdfminer's decode() is expected to have
        # undone the predictor -- confirm) and must not survive.
        for stale in ("Predictor", "DecodeParms", "DP"):
            attrs.pop(stale, None)
        if "Filter" in attrs and repr(attrs["Filter"]) == "/FlateDecode":
            data = zlib.compress(obj.data, 9)
        else:
            # Any other filter (or filter chain) is dropped and the decoded
            # data is written unfiltered.
            attrs.pop("Filter", None)
            data = obj.data
        attrs["Length"] = len(data)
        return obj2str(attrs,idmap)+"\nstream\n"+data+"\nendstream"
    else:
        raise TypeError(str(type(obj)))


# Exactly two arguments are expected: the PDF to read and the PDF to write.
if len(sys.argv) != 3:
    print("Usage: selfmd5ify <input> <output>")
    sys.exit(1)

# Open and parse the input document.
fin = open(sys.argv[1], 'rb')
parser = PDFParser(fin)
doc = PDFDocument(parser)

# Layout-analysis pipeline used to locate text on the pages.
laparams = LAParams()
rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

# Results of the placeholder scan; they keep these initial values until the
# placeholder text has been found.
width = height = pageid = None
fontsize = fontid = None
vshift = 3 # FIXME
hexchars = "0123456789ABCDEF"
posns = []

def unapply_matrix_pt(matrix, point):
    """Map a point back through the inverse of a 2-D affine matrix.

    matrix -- 6-tuple (a, b, c, d, e, f); the forward transform is
              x' = a*x + c*y + e,  y' = b*x + d*y + f
    point  -- (x', y') pair in transformed coordinates

    Returns the untransformed (x, y) as a pair of floats.
    Raises ZeroDivisionError when the matrix is singular (a*d == b*c).
    """
    # Unpack here rather than in the signature: tuple parameters are a
    # Python 2-only syntax. Callers pass both arguments positionally.
    a, b, c, d, tx, ty = matrix
    x, y = point
    det = float(a*d-b*c)
    # Undo the translation first, then apply the inverse of the 2x2 part.
    x -= tx; y -= ty
    return ((d*x-c*y)/det, (-b*x+a*y)/det)

# Walk every page looking for the placeholder text. When it is found, record
# the glyph size, the page, the font, and one (x, y, random name) entry in
# posns for each of the placeholder's characters.
for page in PDFPage.create_pages(doc):
    interpreter.process_page(page)
    # receive the LTPage object for the page.
    layout = device.get_result()
    for lt in layout:
        if not isinstance(lt, LTTextBox):
            continue
        for line in lt:
            assert isinstance(line, LTTextLine)

            # Upper-case the page text so a lowercase placeholder matches too.
            idx = line.get_text().upper().find(placeholder)
            if idx < 0:
                continue
            print("Found @ index %i, font %r" % (idx, line._objs[idx].fontname))
            # hexchars may already hold lowercase digits from an earlier
            # match, so normalise both sides before comparing.
            assert line.get_text()[idx+1:idx+17].upper() == hexchars.upper()
            # Keep the digits in the case the document actually uses.
            hexchars = line.get_text()[idx+1:idx+17]

            # Indexing _objs with a text index assumes one layout object per
            # character of the line.
            c = line._objs[idx]
            assert c.get_text() == placeholder[0]
            width = c.width
            height = c.height
            pageid = page.pageid

            # FIXME - may not work with rotations, etc
            x0, y0 = unapply_matrix_pt(c.matrix,(c.x0,c.y0))
            x1, y1 = unapply_matrix_pt(c.matrix,(c.x1,c.y1))

            print("Untransformed: (%f,%f) - (%f, %f)" % (x0, y0, x1, y1))

            for id, font in interpreter.fontmap.items():
                if font.fontname == c.fontname:
                    # FIXME - may be more than one font with this name
                    fontid = id
                    fontsize = abs(y1-y0) / font.get_height()
                    vshift = -font.get_descent() * fontsize # FIXME - rise?
                    print("Found font %s size %f vshift %f" % (fontid, fontsize, vshift))

            # FIXME - we currently ignore text colour, matrix, etc

            # One overlay position per placeholder character; all glyphs must
            # have identical metrics (i.e. a fixed-width font).
            for i in range(idx, idx+len(placeholder)):
                assert line._objs[i].width == width
                assert line._objs[i].height == height
                posns.append((line._objs[i].x0, line._objs[i].y0, _randname()))

# Stop with a clear message instead of failing later on None arithmetic.
if not posns:
    sys.exit("Placeholder text %r not found in %s" % (placeholder, sys.argv[1]))
if fontid is None:
    sys.exit("Could not identify the font used for the placeholder text")


# The output is assembled in memory so it can be hashed and patched before
# anything is written to disk.
f = cStringIO.StringIO()
maker = PDFRemaker(f, "basemd5.pdf", 480)

# Gather the ids of every object in the input, and the id of its root
# (taken from the first trailer that has a /Root entry).
idmap = {}
root_n = None
objids = set()
for xref in doc.xrefs:
    objids.update(xref.get_objids())
    trailer = xref.get_trailer()
    if root_n is None and trailer and "Root" in trailer:
        root_n = trailer["Root"].objid

# Cross-reference streams and object streams are left out of the copy; the
# remaining objects are re-serialised one by one further down.
stripobjs = set()
for id in objids:
    obj = doc.getobj(id)
    if not isinstance(obj, (dict,PDFStream)):
        continue
    if "Type" in obj and repr(obj["Type"]) in ("/XRef", "/ObjStm"):
        stripobjs.add(id)

# Ascending id order of the objects that will be copied.
objids = sorted(objids - stripobjs)
#print objids

# Offset added to an output object number to build a fake source id that
# idmap later translates straight back to that output number.
DIRECT_MAP = 0x80000000 # nasty hack to bypass our object ID mapping

# Emit one form XObject per hex digit: a white rectangle covering the glyph
# cell, then the digit drawn in the placeholder's font.
forms_n = []
for letter in hexchars:
    forms_n.append(maker.beginobj())
    # FIXME: deal with char mapping
    # The white rectangle whites-out the placeholder. FIXME: breaks italics
    formtext = "".join([
        "q 1.0 1.0 1.0 rg 0 0 %f %f re f Q " % (maker.basewidth, maker.baseheight),
        "%f 0 0 %f 0 0 cm " % (maker.basewidth/width, maker.baseheight/height),
        "BT\n/%s %f Tf\n0 %f Td\n(%s) Tj\nET\n" % (fontid, fontsize, vshift, letter),
    ])
    header = ("<</Type/XObject /Subtype/Form"
              + "/Length %i/BBox [0 0 %i %i]>>\n" % (len(formtext), maker.basewidth, maker.baseheight))
    f.write(header + "stream\n" + formtext + "\nendstream\nendobj\n\n")

# Find the content stream of the page holding the placeholder.
t = doc.getobj(pageid)["Contents"]
# FIXME - handle case with more than one stream?
if isinstance(t, list) and len(t) == 1:
    t = t[0]
if isinstance(t, list):
    # Fail with a clear message rather than an AttributeError on .objid.
    sys.exit("Pages with %i content streams are not supported" % len(t))
contentsid = t.objid

# Locate the page's /XObject resource dictionary, creating it if needed.
# Entries are added in place; this relies on doc.getobj() handing back the
# same dict objects that are serialised later (NOTE(review): confirm).
res = doc.getobj(pageid)["Resources"]
choices = []
if isinstance(res, PDFObjRef):
    res = doc.getobj(res.objid)
if "XObject" not in res:
    res["XObject"] = dict()
xobjects = res["XObject"]
if isinstance(xobjects, PDFObjRef):
    # The dictionary may itself be an indirect object; a bare reference
    # cannot take item assignment, so resolve it first.
    xobjects = doc.getobj(xobjects.objid)

# One selection tree per placeholder character, registered under the random
# name generated for that position.
for x, y, name in posns:
    coll_n, paths = maker.getchoice(forms_n)
    xobjects[name] = PDFObjRef(doc, coll_n+DIRECT_MAP, 0)
    choices.append(paths)
# Names the selection trees refer to internally.
for name, n in maker.xobjnames:
    xobjects[name] = PDFObjRef(doc, n+DIRECT_MAP, 0)
#print res


# Source objects are numbered after everything the maker has emitted so far;
# DIRECT_MAP ids translate straight to the maker's own object numbers.
first_new = len(maker.xrefs) + 1
for i, id in enumerate(objids):
    idmap[id] = first_new + i
for i in range(1, first_new):
    idmap[DIRECT_MAP+i] = i

# Copy every kept object, appending the overlay drawing commands to the
# content stream of the placeholder's page.
for id in objids:
    n = maker.beginobj()
    assert n == idmap[id]
    obj = doc.getobj(id)
    if id == contentsid:
        if obj.data is None:
            obj.decode()
        # One scaled, positioned "Do" per placeholder character.
        s = "".join(
            "q %f 0 0 %f %f %f cm /%s Do Q\n" % (width/maker.basewidth, height/maker.baseheight, x, y, name)
            for x, y, name in posns)
        # Wrap the original content in q/Q before adding the overlays.
        obj.data = "q " + obj.data + " Q " + s

    f.write(obj2str(obj, idmap) + "\nendobj\n")

maker.writetrailer(idmap[root_n])

# Hash the finished file, then for each digest character apply the fixups
# that select the matching digit at that position. dofixup comes from
# selfmd5pdf and is presumably expected to leave the MD5 unchanged.
cs = md5(f.getvalue()).hexdigest()
for i, paths in enumerate(choices):
    for pos in paths[int(cs[i], 16)]:
        dofixup(f, pos)

with open(sys.argv[2],"wb") as fout:
    fout.write(f.getvalue())
