unit PDF_parsers; { PDF parser. Copyright (C) 2008 Danny Milosavljevic This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA } {$MODE OBJFPC} {$M+} { A PDF stream object is composed of a dictionary (<< >>), the keyword stream, a sequence of bytes and the keyword endstream. All streams must be indirect objects. } interface uses sysutils, classes, contnrs, scanners, variants, PDFs; type IPDFParser = interface function Parse() : TPDF; end; TFilterArray = array of String; TParser = class(TScanner, IPDFParser, IInterface) private //fObjectTable : TSymbolTable; fPDF : TPDF; private function SetOwner(aStream : TOwnerStream) : TOwnerStream; inline; protected function ObjectReference(aKey : Variant; aVersion : Cardinal) : IPDFObject; protected function EnsureXREF() : TXREF; function EnsureObjectLoaded(aName : String; aExistingBody : IPDFObject = nil) : IPDFObject; // object name: "%u %u". protected procedure Whitespace(); inline; procedure OptionalWhitespace(); // BUG inline; procedure SkipComment(); inline; //procedure Space(); inline; //procedure NewlineOrSpace(); inline; //procedure OptionalSpaces(); inline; //procedure OptionalNewlinesOrSpaces(); inline; procedure Newline(); inline; function PositiveInteger() : Cardinal; function Integer_() : Integer; function FloatingFraction() : Double; function Value() : Variant; function AttributeName() : UTF8String; function EscapedAttributeNameCharBody() : Char; inline; function SquareBracedValueList() : Variant{array}; function HexadecimalStringBody() : String; function BracedString() : String; function OctalEscapeBody(aFirstChar : Char) : Char; inline; function ValueOrBracedValueList() : Variant; function Attribute(aBAllowObjectReferences : Boolean; out aKey : String) : Variant; function Attributes(aBAllowObjectReferences : Boolean = True{sigh...}) : IAttributes; function AttributesBody(aBAllowObjectReferences : Boolean = True{sigh...}) : IAttributes; inline; function Stream_(aAttributes : IAttributes; const aFilters : TFilterArray) : TStream; function ObjectBody(const aName : String) : IPDFObject; function PDFObject(const aName : String) : IPDFObject; function SymbolDefinitionKey() : String; function Trailer(aBAllowObjectReferences : Boolean) : IAttributes; function XREF() : ISymbolTable; function XREFBody() : ISymbolTable; function XREFGroupBody(aStartingID, aCount : Cardinal; out aActualCount : Cardinal) : ISymbolTable; function XREFV5() : ISymbolTable; function XREFV5FromPDFObject(const aTable : IPDFObject) : ISymbolTable; function StartXREF() : Cardinal; function TrailingStartXREF() : Cardinal; // seekish. published constructor Create(aStream : TStream; aBOwnsStream : Boolean; aPDF : TPDF = nil); function Parse() : TPDF; public destructor Destroy(); override; end; implementation uses {paszlib.}zstream, ASCII85, CCITTFax, DCT, windowed_streams, GIFLZW; { TParser } constructor TParser.Create(aStream : TStream; aBOwnsStream : Boolean; aPDF : TPDF = nil); begin inherited Create(aStream, aBOwnsStream); fPDF := aPDF; EOFInputMark := #26; end; destructor TParser.Destroy(); begin inherited Destroy(); end; {procedure TParser.Space(); inline; begin //while ? Consume(' '); // #9 ? end;} procedure TParser.SkipComment(); inline; begin Consume('%'); while BInput and not (Input in [#10, #13]) do Consume(); end; procedure TParser.OptionalWhitespace(); // BUG inline; begin while Input in [#0, #9, #10, #12, #13, #32, '%'] do begin if Input = '%' then SkipComment() else Consume(); end; end; procedure TParser.Whitespace(); inline; // the official one. begin if Input in ['<', '/', '(', '[', '{'{, '>'}] then // exempt from whitespace. Exit; if Input in [#0, #9, #10, #12, #13, #32, '%'] then begin if Input = '%' then SkipComment() else Consume() end else Consume(#32); OptionalWhitespace(); end; {procedure TParser.NewlineOrSpace(); inline; begin if Input = ' ' then Space() else Newline(); end;} {procedure TParser.OptionalNewlinesOrSpaces(); begin while Input in [#10, #13, ' '] do Consume(); end;} {procedure TParser.OptionalSpaces(); inline; begin while Input = ' ' do Consume(); end;} procedure TParser.Newline(); inline; var vBMatched : Boolean; begin vBMatched := False; if Input = #13 then begin vBMatched := True; Consume(#13); end; if Input = #10 then begin vBMatched := True; Consume(#10); end; if not vBMatched then Consume(#10); end; function TParser.PositiveInteger() : Cardinal; var vDigit : Integer; begin Result := 0; if not (Input in ['0'..'9']) then Error(''); while Input in ['0'..'9'] do begin vDigit := ord(Input) - ord('0'); Result := Result * 10 + vDigit; Consume(); end; end; function TParser.Integer_() : Integer; var vAbsolute : Cardinal; begin Result := 1; if Input = '-' then begin Result := -1; Consume(); end else if Input = '+' then begin Consume(); end; vAbsolute := PositiveInteger(); Result := Result * vAbsolute; end; function TParser.FloatingFraction() : Double; var vString : String; begin vString := ''; { if Input = '-' then vString := Consume() else if Input = '+' then Consume(); if not (Input in ['0'..'9', '.']) then Error('[0-9.]'); } if Input <> '.' then Error('.'); while Input in ['0'..'9', '.'] do vString := vString + Consume(); Result := StrToFloat(vString); // loses precision and since object IDs are in this format too.... StrToFloat(vString); end; function TParser.Value() : Variant; begin // FIXME other values. if Input = 't' then begin Consume('true'); Result := true; end else if Input = 'f' then begin Consume('false'); Result := false; end else if Input = 'n' then begin Consume('null'); Result := variants.Null; end else begin Result := Integer_(); if Input = '.' then // floating-point, not integer. Result := Result + FloatingFraction(); // converts to Double, I hope. end; end; function Upper(aChar : Char) : Char; inline; begin if (aChar >= 'a') and (aChar <= 'z') then Result := chr(ord(aChar) - ord('a') + ord('A')) else Result := aChar; end; function TParser.EscapedAttributeNameCharBody() : Char; inline; var hexDigit1, hexDigit2 : Byte; begin if Input in ['0'..'9', 'A'..'F', 'a'..'f'] then hexDigit1 := Ord(Upper(Consume())) - Ord('0') else Error('[0-9A-Fa-f]'); if Input in ['0'..'9', 'A'..'F', 'a'..'f'] then hexDigit2 := Ord(Upper(Consume())) - Ord('0') else Error('[0-9A-Fa-f]'); if hexDigit1 > 9 then hexDigit1 := hexDigit1 - 7; // dist between '9' and 'A', plus 10. if hexDigit2 > 9 then hexDigit2 := hexDigit2 - 7; // dist between '9' and 'A', plus 10. Result := Chr((hexDigit1 shl 4) or hexDigit2); end; function TParser.AttributeName() : UTF8String; begin // TODO use something simpler than a string for attribute names (like an enum or whatever). Result := ''; //if Input = '/' then begin Result := '/'; Consume('/'); //end; // '!'..'~', then '#AB' // '#' is valid... while (Input >= '!') and (Input <= '~') and not (Input in ['[', ']', '<', '>', '(', ')', '{', '}', '/']) do begin if Input = '#' then begin // escape. Consume(); Result := Result + EscapedAttributeNameCharBody(); end else begin Result := Result + Input; Consume(); end; end; { '/Length' '/Filter' '/Type' '/Contents' '/Resources' } end; // [1 2 3] function TParser.SquareBracedValueList() : Variant{array}; const ChunkSize = 256; var i : Integer; vSize : Cardinal; procedure Grow(aNewSize : Cardinal); inline; begin while vSize < aNewSize do begin vSize := vSize + ChunkSize; end; VarArrayRedim(Result, vSize - 1); end; begin // ??? /Kids [2 0 R 16 0 R 20 0 R 24 0 R 30 0 R 33 0 R] Result := VarArrayCreate([0, ChunkSize - 1], VarVariant); vSize := ChunkSize; i := 0; Consume('['); OptionalWhitespace(); while BInput and (Input <> ']') do begin // /ID [ ] >> if Input = 'R' then begin //Grow(i + 1); Consume('R'); assert(i >= 2); Result[i - 2] := ObjectReference(Result[i - 2], Result[i - 1]); Result[i - 1] := variants.Null; //Result[i] := variants.Null; Dec(i, 1); end else begin Grow(i + 1); Result[i] := ValueOrBracedValueList(); // Value(); Inc(i); end; if Input in [#10, #13] then Newline() else if Input = ']' then else OptionalWhitespace(); // aaargh! '(In)26(tro)-26(duction)-302(to)-302(tensors)]TJ'. you can't be serious. {else if Input <> ']' then Break; some PDFs have: [15/foo] } end; OptionalWhitespace(); Consume(']'); //OptionalWhitespace(); VarArrayRedim(Result, i - 1); end; function TParser.HexadecimalStringBody() : String; var hexDigit1, hexDigit2 : Byte; begin Result := ''; while (Input <> '>') and BInput do begin if Input in ['0'..'9', 'A'..'F', 'a'..'f'] then hexDigit1 := Ord(Upper(Consume())) - Ord('0') else Error('[0-9A-F]'); if Input in ['0'..'9', 'A'..'F', 'a'..'f'] then hexDigit2 := Ord(Upper(Consume())) - Ord('0') else begin while (Input <> '>') and BInput do Consume(); hexDigit2 := 0; // as per standard. end; if hexDigit1 > 9 then hexDigit1 := hexDigit1 - 7; // dist between '9' and 'A', plus 10. if hexDigit2 > 9 then hexDigit2 := hexDigit2 - 7; // dist between '9' and 'A', plus 10. Result := Result + Chr((hexDigit1 shl 4) or hexDigit2); end; end; function TParser.OctalEscapeBody(aFirstChar : Char) : Char; inline; var vCode : Cardinal; begin if not (aFirstChar in ['0'..'7']) then Error('[01234567]') else vCode := ord(aFirstChar) - ord('0'); if Input in ['0'..'7'] then begin vCode := (vCode shl 3) or (ord(Consume()) - ord('0')); if Input in ['0'..'7'] then vCode := (vCode shl 3) or (ord(Consume()) - ord('0')); end; // can be out of bounds! Result := chr(vCode); end; function TParser.BracedString() : String; var vNestingLevel : Cardinal; vEscaped : Char; begin vNestingLevel := 0; Result := ''; Consume('('); while (Input <> ')') or (vNestingLevel > 0) do begin if not BInput then Break; if Input = '(' then Inc(vNestingLevel) else if Input = ')' then Dec(vNestingLevel) else if Input = #13 then begin // only leave #10 in the string. Consume(); if Input <> #10 then // whoops, killed the newline? Result := Result + #10; end else if Input = '\' then begin Consume(); vEscaped := Consume(); case vEscaped of 'n': vEscaped := #10; 'r': vEscaped := #13; 't': vEscaped := #9; 'b': vEscaped := #8; 'f': vEscaped := #12; '(': vEscaped := '('; ')': vEscaped := ')'; '\': vEscaped := '\'; #10, #13: begin OptionalWhitespace(); Continue; end; '0'..'7': vEscaped := OctalEscapeBody(vEscaped); else ; end; Result := Result + vEscaped; Continue; end; Result := Result + Consume(); end; Consume(')'); end; function TParser.ValueOrBracedValueList() : Variant; begin if Input = '[' then Result := SquareBracedValueList() else if Input = '/' then Result := AttributeName() // FIXME SymbolReference(AttributeName()) else if Input = '<' then begin Consume(); if Input = '<' then begin Consume('<'); Result := IAttributes(AttributesBody()); Consume('>>'); end else begin Result := HexadecimalStringBody(); Consume('>'); end; end else if Input = '(' then Result := BracedString() else Result := Value(); end; function TParser.ObjectReference(aKey : Variant; aVersion : Cardinal) : IPDFObject; var vNumber : Cardinal; vID : String; vObject : IPDFObject; begin if VarIsStr(aKey) then vNumber := StrToInt(aKey) else vNumber := Cardinal(aKey); vID := Format('%u %u', [vNumber, aVersion]); vObject := EnsureObjectLoaded(vID); if not Assigned(vObject) then Error('', vID); Result := vObject; end; function TParser.Attribute(aBAllowObjectReferences : Boolean; out aKey : String) : Variant; var vVersion : Cardinal; vID : Cardinal; begin aKey := AttributeName(); //Writeln('SYM', Result.Name); Whitespace(); // /Contents 3 0 R Result := ValueOrBracedValueList(); //OptionalWhitespace(); //Writeln('CURRENT', Input, ';', LineNumber, ';', ColumnNumber); if Input = ' ' then begin // FIXME. Whitespace(); // OptionalSpaces(); if Input in ['0'..'9'] then begin vVersion := PositiveInteger(); Whitespace(); Consume('R'); if aBAllowObjectReferences{avoid reference cycles; FIXME be nicer} then begin vID := StrToInt(Result); //Writeln(Format('(reference for %s is %d)', [aKey, vID])); Result := ObjectReference(Result, vVersion); if not VarIsNull(Result) then IPDFObject(Result).SetID(vID); end else Result := Null; // useful for Trailer parsing... end; end; OptionalWhitespace(); //Writeln('DONE', Input); end; function TParser.AttributesBody(aBAllowObjectReferences : Boolean = True) : IAttributes; inline; var vKey : String; vValue : Variant; begin Result := TAttributes.Create(); OptionalWhitespace(); while Input <> '>' do begin vValue := Attribute(aBAllowObjectReferences, {out} vKey); Result.Add(vKey, vValue); OptionalWhitespace(); {if Input in [#10, #13] then Newline();} end; end; function TParser.Attributes(aBAllowObjectReferences : Boolean = True) : IAttributes; begin Consume('<<'); Result := AttributesBody(aBAllowObjectReferences); Consume('>>'); end; function TParser.SetOwner(aStream : TOwnerStream) : TOwnerStream; inline; begin aStream.SourceOwner := aStream.Source <> Self.Stream; Result := aStream; end; procedure DebugStream(aStream : TStream); //var // vItem : Byte; begin {Writeln('BEGIN CODE'); while True do begin vItem := aStream.ReadByte(); Write(chr(vItem)); end; Writeln('END CODE'); } end; { no DEFINE SLOWLY_BUT_SURELY} function TParser.Stream_(aAttributes : IAttributes; const aFilters : TFilterArray) : TStream; var vFilterIndex : Integer; vFilterName : String; vStream : TStream; vOldPosition : Int64; vDecodingParameters : IAttributes; vLength : Int64; vColumns : Int64; vK : Int64; vEncodedByteAlign : Boolean; function GetIntegerValue(const aValue : Variant) : Int64; var vValue : Variant; begin if VarIsClear(aValue) or VarIsNull(aValue) then Error('', ''); vValue := Dereference(aValue); if VarIsClear(vValue) or VarIsNull(vValue) then Error('', ''); Result := vValue; end; function Fallback(const aValue : Variant; aFallback : Boolean) : Boolean; begin if VarIsClear(aValue) or VarIsNull(aValue) then Result := aFallback else Result := aValue; end; function Fallback(const aValue : Variant; aFallback : Int64) : Int64; begin if VarIsClear(aValue) or VarIsNull(aValue) then Result := aFallback else Result := aValue; end; var vCodeSize : Byte; { in bits. } begin vLength := GetIntegerValue(aAttributes.Get('/Length')); // 'length' doesn't neccessarily mean "stream length". vDecodingParameters := InterfaceFromVariant(aAttributes.Get('/DecodeParms')) as IAttributes; // dict. OptionalWhitespace(); Consume('stream'); {$IFDEF SLOWLY_BUT_SURELY} if Input = #13 then Consume(); Consume(#10); {$ELSE} if Input = #13 then Consume(); // Unload(#10); Consume(#10); {$ENDIF} vStream := Self.Stream; vOldPosition := vStream.Position; vStream.Position := Offset; // Limit stream to aLength. vStream := SetOwner(windowed_streams.TWindowedStream.Create(vStream, vLength)); // also remembers current position as starting position. for vFilterIndex := Low(aFilters) to High(aFilters) do begin vFilterName := aFilters[vFilterIndex]; //Writeln(Format('Filter#%d: %s', [vFilterIndex, vFilterName])); if vFilterName = '/FlateDecode' then // FIXME handle Predictor <> 1. vStream := SetOwner(zstream.Tdecompressionstream.Create(vStream, {skip header} false)) else if vFilterName = '/ASCII85Decode' then begin vStream := SetOwner(ASCII85.TDecoder.Create(vStream)); ASCII85.TDecoder(vStream).BExpectBoundary := True; // some weird streams in PDF files have a ending "~>" although they don't have a starting "<~". end else if vFilterName = '/CCITTFaxDecode' then begin if Assigned(vDecodingParameters) then begin vColumns := Fallback(vDecodingParameters.Get('/Columns'), 1728); vK := Fallback(vDecodingParameters.Get('/K'), 0); vEncodedByteAlign := Fallback(vDecodingParameters.Get('/EncodedByteAlign'), False); end else begin vK := 0; vColumns := 1728; end; {if ((vColumns and $7) <> 0) then Error('', IntToStr(vColumns)); ???? } // FIXME handle vDecodingParameters.Get('/Blackls1') vStream := SetOwner(CCITTFax.TDecoder.Create(vStream, vColumns, vK, vEncodedByteAlign)) end else if vFilterName = '/DCTDecode' then // vStream := SetOwner(DCT.TDecoder.Create(vStream)) else if vFilterName = '/LZWDecode' then begin { FIXME handle Predictor <> 1 (Predictor Function). 1: no prediction. 2: TIFF predictor 2. 10: PNG prediction (on encoding: PNG None). 11: PNG prediction (on encoding: PNG Sub). 12: PNG prediction (on encoding: PNG Up). 13: PNG prediction (on encoding: PNG Average). 14: PNG prediction (on encoding: PNG Paeth). 15: PNG prediction (on encoding: PNG optimum). } vCodeSize := 8; vStream := SetOwner(GIFLZW.TDecoder.Create(vStream, True, vCodeSize)) end else Error('(/FlateDecode|/ASCII85Decode|/CCITTFaxDecode|/LZWDecode)', vFilterName); { * ASCIIHexDecode : '2421ABfd>' * ASCII85Decode * LZWDecode !!!! * FlateDecode * RunLengthDecode * JBIG2Decode * DCTDecode !!! * JPXDecode * Crypt } end; DebugStream(vStream); //Writeln(Format('length %u', [aLength])); Self.Stream.Position := vOldPosition; {$IFDEF SLOWLY_BUT_SURELY} while vLength > 0 { leave one so we preload one} do begin Consume(); Dec(vLength); end; {Unload(Input); Stream.Position := Stream.Position - 1;} {$ELSE} ConsumeBlock(vLength); //Stream.Position := Stream.Position + aLength; //Consume(); {$ENDIF} OptionalWhitespace(); Consume('endstream'); OptionalWhitespace(); Result := vStream; end; function CreatePDFObject(const aType : String) : IPDFObject; inline; begin Result := IPDFObject(TPDFObject.Create()); // TODO other classes? end; function TParser.ObjectBody(const aName : String) : IPDFObject; var vLengthV : Variant; vAttributes : IAttributes; vMetaData : Variant; vFilters : TFilterArray; vFiltersV : Variant; vIndex : Cardinal; vKey : String; vValue : Variant; begin SetLength(vFilters, 0); OptionalWhitespace(); if Input = '<' then begin // TODO support hex literals? Consume('<<'); OptionalWhitespace(); if Input = '>' then begin vMetaData := AttributesBody(); // TAttributes.Create(); // empty AttributesBody(); vAttributes := InterfaceFromVariant(vMetaData) as IAttributes; Result := CreatePDFObject('/Object'); end else begin vValue := Attribute(True, vKey); // try to create the instance as early as possible: if vKey = '/Type' then begin Result := CreatePDFObject(vValue); end else Result := CreatePDFObject('/Object'); fPDF.Symbols.Delete(aName); // delete dummy entry. fPDF.Symbols.Add(aName, Result); // protect against infinite loops by adding it early. OptionalWhitespace(); vMetaData := AttributesBody(); vAttributes := InterfaceFromVariant(vMetaData) as IAttributes; vAttributes.Add(vKey, vValue); end; Consume('>>'); end else begin Result := IPDFObject(TPDFObject.Create()); fPDF.Symbols.Delete(aName); // delete dummy entry. fPDF.Symbols.Add(aName, Result); // protect against infinite loops by adding it early. vMetaData := ValueOrBracedValueList(); vAttributes := InterfaceFromVariant(vMetaData) as IAttributes; end; {if Input = '<' then Result.Attributes := Attributes(); else if Input = '[' then Result.List := SquareBracedValueList(); // or valueor... end;} Result.MetaData := vMetaData; if Assigned(vAttributes) then begin vLengthV := vAttributes.Get('/Length'); vFiltersV := vAttributes.Get('/Filter'); end else begin vLengthV := variants.Unassigned; vFiltersV := variants.Unassigned; end; if VarIsClear(vFiltersV) then SetLength(vFilters, 0) else if ((VarType(vFiltersV) and VarTypeMask) = varString) or ((VarType(vFiltersV) and VarTypeMask) = varOleStr) then begin SetLength(vFilters, 1); vFilters[0] := vFiltersV; end else begin if (VarArrayDimCount(vFiltersV) < 1) then SetLength(vFilters, 0) // ??? FIXME DynArrayFromVariant(vFilters, vFiltersV, nil{typeinfo}); else begin SetLength(vFilters, VarArrayHighBound(vFiltersV, 1) + 1); for vIndex := 0 to VarArrayHighBound(vFiltersV, 1) do vFilters[vIndex] := vFiltersV[vIndex]; // string. end; // = vFilters := vFiltersV; end; //Writeln(vFilters[0]); if not VarIsClear(vLengthV) and not VarIsNull(vLengthV) then begin //vLength := Dereference(vLengthSymbol.Value); // beware! this can also be an object reference (of an object only defined later) => jump there and parse it. // 'length' doesn't neccessarily mean "stream length". OptionalWhitespace(); if Input = 's' then begin // ..."tream"? //if VarIsClear(vLength) or VarIsNull(vLength) then // Error('', ''); Result.Stream := Stream_(vAttributes, vFilters); end; end else begin OptionalWhitespace(); // no stream, I hope. // debugging. I hope this doesn't catch false positives. if Input = 's' then Error('', 'stream'); //Writeln('SHIT'); end; end; function TParser.PDFObject(const aName : String) : IPDFObject; begin Consume('obj'); Whitespace(); Result := ObjectBody(aName); OptionalWhitespace(); Consume('endobj'); end; function TParser.SymbolDefinitionKey() : String; var vID1, vID2 : Cardinal; begin vID1 := PositiveInteger(); Whitespace(); vID2 := PositiveInteger(); Result := Format('%u %u', [vID1, vID2]); //assert(vID2 = 0); // FIXME handle versioning. end; { this especially contains '/Prev', an attribute with a value that represents the offset of the previous XREF table. } function TParser.Trailer(aBAllowObjectReferences : Boolean) : IAttributes; begin Consume('trailer'); Newline(); Result := Attributes(aBAllowObjectReferences); end; // supply a high count if you don't know the count; it will stop at the first 'trailer' is sees (not consuming it). function TParser.XREFGroupBody(aStartingID, aCount : Cardinal; out aActualCount : Cardinal) : ISymbolTable; var vObjectID : Cardinal; vOffset : Int64; vName : String; //vGeneration : Cardinal; begin aActualCount := 0; Result := TSymbolTable.Create(); //Writeln(Format('XREF entry count: %d', [vCount])); vObjectID := aStartingID; while (aCount > 0) and (Input <> 't'{railer}) do begin OptionalWhitespace(); if Input = 't' then Break; vOffset := PositiveInteger(); // 10-digit. //Whitespace(); Consume(#32); PositiveInteger(); // 5-digit. 0 or $FFFFF. Generation number. //Whitespace(); Consume(#32); if Input = 'f' then // first object: free list (linked list with ring semantics). // note vOffset is an Object Number in this case. Consume() // FIXME. else if Input = 'n' then begin Consume('n'); vName := Format('%u %u', [vObjectID, 0 { FIXME generation? } ]); Result.Add(vName, vOffset); end else Error('[fn]'); Inc(vObjectID); Inc(aActualCount); if Input = #32 then begin Consume(); if Input = #13 then Consume(#13) else Consume(#10); end else begin Consume(#13); Consume(#10); end; // the line should always have the same length, 20 bytes. //OptionalWhitespace(); // Spaces(); //Newline(); Dec(aCount); end; end; { before } function TParser.XREFBody() : ISymbolTable; var vStartingID : Cardinal; vBody : ISymbolTable; vCount : Cardinal; begin Result := TSymbolTable.Create(); while Input in ['0'..'9'] do begin // not "end" of xref (trailer). vStartingID := PositiveInteger(); Whitespace(); vCount := PositiveInteger(); Whitespace(); //Newline(); vBody := XREFGroupBody(vStartingID, vCount, vCount); //if Assigned(Result) then Result.MergeFrom(vBody); end; end; function TParser.XREFV5() : ISymbolTable; begin PositiveInteger(); OptionalWhitespace(); PositiveInteger(); OptionalWhitespace(); Result := XREFV5FromPDFObject(PDFObject('XREFV5'{FIXME number})); // chicken-and-egg problem. end; function TParser.XREFV5FromPDFObject(const aTable : IPDFObject) : ISymbolTable; function GetIntegerValue(const aValue : Variant) : Integer; begin if VarIsClear(aValue) or VarIsNull(aValue) then Error('', ''); Result := aValue; end; var vSize : Int64; vAttributes : IAttributes; vPrev : Int64; // 0=none vWs : array[0..9] of Int64; vWCount : Integer; vIndexBeginnings : array of Int64; vIndexSizes : array of Int64; function ReadWs() : Boolean; var i : Integer; vWO : Variant; begin vWO := vAttributes.Get('/W'); if VarIsNull(vWO) or not VarIsArray(vWO) then begin Result := False; Exit; end; // read 'W' // array of integer, required. vWCount := VarArrayHighBound(vWO, 1) + 1; if vWCount > 10 then vWCount := 10; if vWCount < 1 then begin Result := False; Exit; end; for i := 0 to VarArrayHighBound(vWO, 1) do vWs[i] := GetIntegerValue(vWO[i]); Result := True; end; function ReadIndex() : Boolean; var vIndexO : Variant; vIndexSize : Integer; i : Integer; begin vIndexO := vAttributes.Get('/Index'); if not VarIsNull(vIndexO) and VarIsArray(vIndexO) then begin // read 'Index' [(from, count), ...], optional vIndexSize := VarArrayHighBound(vIndexO, 1) + 1; if vIndexSize < 1 then begin Result := False; Exit; end; SetLength(vIndexBeginnings, vIndexSize); SetLength(vIndexSizes, vIndexSize); for i := 0 to vIndexSize - 1 do begin vIndexBeginnings[i] := GetIntegerValue(vIndexO[i shl 1]); vIndexSizes[i] := GetIntegerValue(vIndexO[(i shl 1) + 1]); end; end else begin SetLength(vIndexBeginnings, 1); SetLength(vIndexSizes, 1); vIndexBeginnings[0] := 0; vIndexSizes[0] := vSize; end; Result := True; end; function ReadBinaryInteger(sizeInBytes : Cardinal; out didAnything : Boolean) : Cardinal; var buffer : array[0..3] of Char; i : Integer; begin assert(sizeInBytes <= 4); assert(sizeof(Integer) >= 4); for i := 0 to sizeInBytes - 1 do buffer[i] := Consume(); if sizeInBytes > 0 then didAnything := True; Result := 0; for i := 0 to sizeInBytes - 1 do Result := (Result shl 8) or Ord(buffer[i]); end; procedure ReadIndexEntries(); var vW : Integer; vIndexSize : Integer; vIndexIndex : Integer; vObjectID : Integer; vBDidAnything : Boolean; vValues : array[0..9] of Integer; i : Integer; vName : String; begin vIndexSize := Length(vIndexBeginnings); vIndexIndex := 0; vObjectID := vIndexBeginnings[0]; vBDidAnything := True; vValues[0] := 0; vValues[1] := 0; vValues[2] := 0; vValues[3] := 0; vValues[4] := 0; while vBDidAnything do begin vBDidAnything := False; for i := 0 to vWCount - 1 do begin vW := vWs[i]; if vW > 0 then vValues[i] := ReadBinaryInteger(vW, vBDidAnything) else // TODO actually there's only one default 0, which is the generation number for type=1. Maybe not do too much? vValues[i] := 0; end; if vValues[0] = 0 then begin // fields: [type, object_next_free, next_generation_number] (like 'f). end else if vValues[0] = 1 then begin // fields: [type, offset, generation_number] (like 'n). // TODO: support 'xref' table compression (new in PDF 1.5). vName := Format('%u %u', [vObjectID, vValues[2] ]); Result.Add(vName, vValues[1]); end else begin // fields: [type, object, index_in_stream] (generation implicitly 0). // FIXME myCompressedObjectLocationMap[ID, 0] = vValues[1], vValues[2] end; Inc(vObjectID); Dec(vIndexSizes[vIndexIndex]); if vIndexSizes[vIndexIndex] = 0 then begin Inc(vIndexIndex); if vIndexIndex >= vIndexSize then Break; vObjectID := vIndexBeginnings[vIndexIndex]; end; end; // [0] == 1: myObjectLocationMap[std::pair(ID, V[2]/*generation*/)] = V[1]/*offset*/; // [0] == 2: myCompressedObjectLocationMap[std::pair(ID, 0) = make_pair(V[1] /* referenced object */, V[2] /* index in referenced object */); end; begin Result := TSymbolTable.Create(); vAttributes := aTable.MetaData; vSize := GetIntegerValue(vAttributes.Get('/Size')); vPrev := 0; if not VarIsNull(vAttributes.Get('/Prev')) then vPrev := GetIntegerValue(vAttributes.Get('/Prev')); if not ReadWs() then Exit; if not ReadIndex() then Exit; ReadIndexEntries(); Result.Add('/Prev', vPrev); end; function TParser.XREF() : ISymbolTable; begin if Input <> 'x' then begin Result := XREFV5(); Exit; end; Consume('xref'); Newline(); Result := XREFBody(); // NOT HERE Result.Trailer := Trailer(); end; function TParser.StartXREF() : Cardinal; begin Newline(); Consume('startxref'); Newline(); Result := PositiveInteger(); // this can be 0, which is wrong most of the time. end; { attrs : p-list p-list : name values p-list name : "/" ID value : !"/"... values : value | value value | value value value | "[" value value value value "]" } { %PDF-1.4 3 0 obj << /Length 3380 /Filter /FlateDecode ; or /Filter [/ASCII85Decode /FlateDecode] >> [file...???] stream xyzendstream endobj 2 0 obj << /Type /Page /Contents 3 0 R /Resources 1 0 R /MediaBox [0 0 612 792] /Parent 34 0 R >> endobj xref 0 317 0000000000 65535 f 0000003572 00000 n 0000003467 00000 n 0000000009 00000 n 0000292517 00000 n 0000287440 00000 n 0000292360 00000 n 0000286927 00000 n 0000279236 00000 n 0000286770 00000 n 0000278850 00000 n trailer << /Size 317 /Root 315 0 R /Info 316 0 R /ID [<8F77F1A4ECEB6D5976973DD4F719CD773C38463737> <8F77F1A4ECEB6D5976973DD4F719CD773C38463737>] >> startxref 293709 %%EOF obj << attrs >> [stream ... endstream] endobj } { Side effect: seeks and does not restore. Dangerous! } function TParser.TrailingStartXREF() : Cardinal; var vCount : Integer; vString : ShortString; vStartXREFIndex : Integer; vEndIndex : Integer; begin SetLength(vString, 44); Stream.Position := Stream.Size - 44; vCount := Stream.Read(vString[1], 44); SetLength(vString, vCount); vStartXREFIndex := Pos('startxref', vString); if vStartXREFIndex = -1 then Error('startxref'); vString := Copy(vString, vStartXREFIndex + 9, Length(vString)); vEndIndex := Pos('%', vString); if vEndIndex = -1 then Error('%'); vString := Copy(vString, 1, vEndIndex - 1); Result := StrToInt(Trim(vString)); end; function TParser.EnsureXREF() : TXREF; var vXREFParser : TParser; vOldPosition : Int64; vPreviousXREFPosition : Cardinal; vXREFPosition : Cardinal; vTrailer : IAttributes; vFirstTrailer : IAttributes; vBLinearized : Boolean; vObjectID : Cardinal; vCount : Cardinal; vSymbols : ISymbolTable; vFirstTrailerPosition : Cardinal; // 0 none. begin vFirstTrailerPosition := 0; // none. Result := fPDF.XREF; if Assigned(Result) then Exit; Result := TXREF.Create(); fPDF.XREF := Result; // avoid endless loop. // Oh my #!@* *puke*. // Can't be avoided when some funny PDFs have "/Length 3027 0 R" where object "3027" is not defined yet :( // note that even with multiple XREF tables, startxref at the very end of the first will still refer to the earliest XREF table in the file. Navigate the list by "/Prev" from there. vOldPosition := Stream.Position; // note that /Linearized documents specify the position of the first (freelist) ENTRY as the StartXREF (as opposed to the group header). Also applies to "/T" and "/Prev" pointers. vXREFPosition := TrailingStartXREF(); vBLinearized := False; vObjectID := 1; // move to the beginning of the XREF table. vFirstTrailer := nil; repeat // Note: the one with the root (the first-page xref) will be found first. // Note: the main XREF table (i.e. the second one) has the lower object IDs, starting at 1. // Note: the first-page XREF table has high object IDs, all higher than the main XREF table entries. Stream.Position := vXREFPosition; vXREFParser := TParser.Create(Stream, False{FIXME}, fPDF); if (not vBLinearized) or (Input = 'x') then begin vSymbols := vXREFParser.XREF() end else begin vSymbols := vXREFParser.XREFGroupBody(vObjectID, $FFFFFFF, vCount); Inc(vObjectID, vCount); end; //Writeln('DONE AT LEAST 1 ROUND'); Result.Symbols.MergeFrom(vSymbols); //Writeln('DONE AT LEAST 1 ROUNDx'); fPDF.XREF := Result; // avoid endless loop. if vFirstTrailerPosition = 0 then vFirstTrailerPosition := vXREFParser.Offset; //vXREFParser.Stream.Position - 1 {argh}; vTrailer := vXREFParser.Trailer(False{without refs}); {if not Assigned(vFirstTrailer) then vFirstTrailer := Result.Trailer else Result.Trailer := vFirstTrailer;} vPreviousXREFPosition := NVL(vTrailer.Get('/Prev'), 0); // 0: none. //Writeln('!!PREV', vPreviousXREFPosition); FreeAndNil(vXREFParser); vXREFPosition := vPreviousXREFPosition; until vXREFPosition = 0; //Writeln('DONE'); // read the actual trailer, with all the object references dereferenced: //Result.Trailer := vTrailer; // note that this will already resolve all the object references mentioned in it. assert(vFirstTrailerPosition <> 0); Stream.Position := vFirstTrailerPosition; vXREFParser := TParser.Create(Stream, False{FIXME}, fPDF); Result.Trailer := vXREFParser.Trailer(True{with refs}); FreeAndNil(vXREFParser); Stream.Position := vOldPosition; end; // FIXME clean up. function TParser.EnsureObjectLoaded(aName : String; aExistingBody : IPDFObject = nil) : IPDFObject; // object name: "%u 0". var vOldPosition : Int64; vParser : TParser; vOffset : Int64; vPosition : Variant; vExistingV : Variant; begin assert(Assigned(fPDF)); vExistingV := fPDF.Symbols.Get(aName); if not VarIsClear(vExistingV) then begin // can still be variants.Null, if the object is currently loading. Result := vExistingV; Exit; end; if Assigned(aExistingBody) then begin assert(VarIsClear(fPDF.Symbols.Get(aName))); Result := aExistingBody; fPDF.Symbols.Add(aName, aExistingBody); // protect against infinite loops by adding it early. Exit; end; fPDF.Symbols.Add(aName, variants.Null); // not Clear! // Oh my #!@* *puke*. // Can't be avoided when some funny PDFs have "/Length 3027 0 R" where object "3027" is not defined yet :( vOldPosition := Stream.Position; vPosition := EnsureXREF().Symbols.Get(aName); if VarIsClear(vPosition) then begin // don't know where it is... raise EReadError.Create(Format('unknown PDF object ''%s''', [aName])); //IPDFObject(vSymbol.Value).BFinished := True; // kind of... Result := nil; Exit; end; Stream.Position := vPosition; //Writeln(Format('jumping to object ''%s'' at offset %u', [aName, Stream.Position])); vOffset := Stream.Position; vParser := TParser.Create(Stream, False, fPDF); vParser.Offset := vOffset; vParser.LineNumber := 0; vParser.ColumnNumber := 0; // TODO move BLoading check to here. assert(vParser.SymbolDefinitionKey() = aName); vParser.Whitespace(); //aOutObject.BLoading := True; Result := vParser.PDFObject(aName); //aOutObject.BLoading := False; //(vSymbol.Value); FreeAndNil(vParser); //vSymbol.Value := ObjectBody(); Stream.Position := vOldPosition; end; { Parses one PDF. } function TParser.Parse() : TPDF; var vObject : IPDFObject; vKey : String; begin Result := fPDF; // unnecessary. if not Assigned(Result) then begin Result := TPDF.Create(); end; Consume('%PDF-1.'); Result.MajorVersion := 1; Result.MinorVersion := PositiveInteger(); assert(Result.MinorVersion < 100); // 1..7 ? fPDF := Result; assert(Assigned(fPDF)); if Input = #13 then Consume(); if Input = #10 then Consume(); {$IFDEF LINEAR} //while BInput do begin // TODO %%EOF, %%PDF- ? while True do begin if Input in ['0'..'9'] then begin vKey := SymbolDefinitionKey(); Whitespace(); // ???? vObject := TPDFObject.Create(); PDFObject(vObject); // we need to do this anyway so that the position is advanced. EnsureObjectLoaded(vKey, vObject); OptionalWhitespace(); vObject := nil; // FIXME the only reason the following is done again is so that the position is advanced. //Result.Symbols.Add(SymbolDefinition(TPDFObject.Create())); end else if Input = 'x' then // Xref, maybe. Break else Whitespace(); // or broken. end; { read over stuff we already know } XREF(); Trailer(); NVL(Result.Trailer.Get('/Prev'), 0); // 0: none. // junk here. { end read over stuff we already know } { Trailer especially contains '/Prev', an attribute with a value that represents the offset of the previous XREF table. } StartXREF(); // can be 0, which is wrong most of the time. OptionalWhitespace(); {%%EOF was already consumed as whitespace. if BInput then begin Consume('%%EOF'); end;} {$ENDIF} //end; // already used. Result.XREFOffset := PositiveInteger(); if not Assigned(Result.XREF) then // will already be loaded most of the time. Result.XREF := EnsureXREF(); end; end.