#!/usr/bin/env python
"""Parse the PDF named on the command line and dump each page's operations.

For every page the script prints the content streams, the XObject
dictionary and each (pre-processed) content-stream operation. For
text-showing operations it additionally prints the values returned by
``pdf_tounicodes.get_values_for_characters``.
"""

#import psyco
#psyco.full()
import sys

# NOTE(review): the raised limit presumably accommodates deeply nested PDF
# structures handled recursively by the parser modules -- confirm.
sys.setrecursionlimit(10000)

import pdfs
import pprint
import symbols
import pdf_contents
import decompressors
import time
import pdf_fonts
import pdf_tounicodes


def process_font(resource_font):
    """Placeholder for per-font processing; currently does nothing.

    Attributes of ``resource_font`` noted as candidates for future use:
    ``BaseFont``, ``Encoding``, ``DescendantFonts[0]`` (FontDescriptor, W)
    and ``ToUnicode``.
    """


# Counter reserved for numbering extracted images (see the TODO in the
# ``Do`` branch of process_page); nothing increments it yet.
image_index = 0


def _preprocess_contents(contents):
    """Expand compound text operators into simpler operations.

    ``contents`` is an iterable of ``(operator, operands)`` pairs. Every
    yielded item is again an ``(operator, operands)`` pair so that consumers
    can always unpack two elements:

    * ``'``  -> ``T*`` followed by ``Tj`` with the same operands.
    * ``"``  -> ``Tw`` (operand 0), ``Tc`` (operand 1), ``T*``, then ``Tj``
      (operand 2).
    * ``TD`` -> ``TL`` with the negated second operand, then ``Td``.
    * ``BT`` is dropped for now.
    * Anything else is passed through unchanged.
    """
    # Look the two quote operators up once instead of on every iteration.
    quote = symbols.intern(b"'")
    double_quote = symbols.intern(b"\"")
    for operation in contents:
        operator_, operands = operation
        if operator_ == quote:
            # Emit the line move first, consistent with the `"` branch below.
            yield (symbols.Tstar, [])
            yield (symbols.Tj, operands)
        elif operator_ == double_quote:
            yield (symbols.Tw, [operands[0]])
            yield (symbols.Tc, [operands[1]])
            # Must carry an (empty) operand list: consumers index element 1.
            yield (symbols.Tstar, [])
            yield (symbols.Tj, [operands[2]])
        elif operator_ == symbols.TD:
            yield (symbols.TL, [-operands[1]])
            yield (symbols.Td, operands)
        elif operator_ == symbols.BT:
            pass  # TODO T_m = T_lm = I
        else:  # Tj, TJ and everything else
            yield operation


def process_page(page):
    """Print the streams, XObjects and content operations of one page.

    ``page`` must provide ``Resources`` (with a ``get`` method) and
    ``Contents`` (a single content object or a list of them, each with
    ``stream`` and ``object_ID`` attributes).
    """
    # Reserved for the image-extraction TODO in the ``Do`` branch below.
    global image_index

    resources = page.Resources
    font_resources = resources.get(symbols.Font) or {}
    fonts = {
        name: pdf_fonts.Font(source)
        for name, source in font_resources.items()
    }
    xobjects = resources.get(symbols.XObject) or {}
    # Not used yet:
    # ExtGState = resources.get(symbols.ExtGState)
    # MediaBox = resources.get(symbols.MediaBox)
    # ColorSpace = resources.get(symbols.ColorSpace)

    page_contents = page.Contents
    if not isinstance(page_contents, list):
        # A page may carry a single content object instead of a list.
        page_contents = [page_contents]

    content_parser = pdf_contents.Parser()
    for content in page_contents:
        print("stream", content.stream, content.object_ID)
    # Hand the parser a lazy iterator over the streams, as before.
    contents = content_parser.parse(
        content.stream for content in page_contents
    )
    pprint.pprint(xobjects)

    font = None
    font_size = None
    for operation in _preprocess_contents(contents):
        pprint.pprint(operation)
        operator_, operands = operation
        if operator_ == symbols.Tf:  # set font
            font = fonts[operands[0]]
            font_size = operands[1]
        elif operator_ in (symbols.TJ, symbols.Tj):
            # FIXME test TJ
            #if not isinstance(operands[0], list):
            #    operands = [[operands[0]]]
            for item in operands:
                # The callback maps a character code to a pair of
                # (char width * font_size / 1000, glyph) from the current font.
                values = pdf_tounicodes.get_values_for_characters(
                    font.ToUnicode,
                    item,
                    lambda code: (
                        font.get_char_width(code) * font_size / 1000.0,
                        font.get_char_glyph(code),
                    ),
                )
                print(values)
        elif operator_ == symbols.Td:
            # TODO T_m=T_lm=translate(operands) * T_lm
            pass
        elif operator_ == symbols.Do:
            image_name = operands[0]
            obj = xobjects.get(image_name)
            # TODO extract the image:
            #if obj is not None:
            #    print obj.stream
            #    image_index += 1
            #    # TODO if necessary, also check the ColorSpace palette.
            #    decompressors.copy_stream(obj.stream, open("U%04d" % (image_index), "wb"))


def main():
    """Parse the PDF given as the first argument and process every page."""
    if len(sys.argv) < 2:
        sys.exit("usage: %s FILE.pdf" % sys.argv[0])

    # Keep the file open for the whole run: page data may be read from it
    # lazily while pages are processed (NOTE(review): confirm), and close it
    # reliably afterwards.
    with open(sys.argv[1], "rb") as input_file:
        parser = pdfs.Parser()
        PDF = parser.parse(input_file)
        #print(PDF.Info.Title)
        Pages = PDF.Root.Pages
        pages = list(pdfs.linearize_pages(Pages))
        assert Pages.Count == len(pages), "page tree /Count does not match the number of pages found"
        #pprint.pprint(pages[0])
        for page in pages:
            process_page(page)


if __name__ == "__main__":
    main()