unit PDFs; { PDF parser. Copyright (C) 2008 Danny Milosavljevic This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA } {$MODE OBJFPC} {$M+} interface uses sysutils, classes, contnrs, variants; type IPDFObject = interface ['{f5935a22-cf50-11dd-a909-d73fe42b5db1}'] function GetMetaData() : Variant; procedure SetMetaData(const aMetaData : Variant); function GetBLoading() : Boolean; procedure SetBLoading(aValue : Boolean); function GetType_() : String; //function GetBFinished() : Boolean; //procedure SetBFinished(aValue : Boolean); property MetaData : Variant read GetMetaData write SetMetaData; //function GetSeekedStream() : TStream; function GetStream() : TStream; procedure SetStream(aStream : TStream); property Stream : TStream read GetStream write SetStream; //property BFinished : Boolean read GetBFinished write SetBFinished; property BLoading : Boolean read GetBLoading write SetBLoading; property Type_ : String read GetType_; function MetaDataGet(aName : String) : Variant; procedure SetID(aValue : Cardinal); // for debugging only. function GetID() : Cardinal; // for debugging only. end; // TODO create subclasses for convenience. IPages = IPDFObject; IPage = IPDFObject; IContents = IPDFObject; // IResources = IPDFObject; // '/Font', '/ProcSet' // not neccessarily its own object. //IProcSet = IPDFObject; // variant IFont = IPDFObject { 4 0 obj << /Type /Font /Subtype /Type1 /BaseFont /WMTTAV+CMR17 /FontDescriptor 144 0 R /FirstChar 73 /LastChar 117 /Widths 122 0 R >> endobj }; IFontDescriptor = IPDFObject { /Type /FontDescriptor /FontName /WMTTAV+CMR17 /Flags 4 /FontBBox [-33 -250 945 749] /Ascent 694 /CapHeight 683 /Descent -195 /ItalicAngle 0 /StemV 53 /XHeight 430 /CharSet (/I/c/d/e/i/n/o/r/s/t/u) /FontFile 143 0 R >> endobj }; IFontFile = IPDFObject; IToUnicode = IPDFObject { << /Filter ... >> stream ... endstream endobj }; TSymbolTableKey = String; ISymbolTable = interface ['{7ee52692-cf57-11dd-bc3b-e3fef4e169a5}'] function Get(const aName : TSymbolTableKey) : Variant; procedure Add(aName : TSymbolTableKey; aValue : Variant); function GetItemValue(aIndex : Cardinal) : Variant; function GetItemKey(aIndex : Cardinal) : TSymbolTableKey; function GetItemCount() : Cardinal; //property Item[aIndex : Integer] : TSymbol read GetItem; property ItemCount : Cardinal read GetItemCount; procedure MergeFrom(const aTable : ISymbolTable); procedure Delete(const aName : TSymbolTableKey); end; TSymbolTableKeys = array of TSymbolTableKey; TSymbolTable = class(TInterfacedObject, ISymbolTable, IInterface) private fSymbols : TFPHashObjectList; // . //fSymbols : TFPHashList; //fSymbols : TFPObjectHashTable; //fSymbols : TObjectList; protected function GetKeys() : TSymbolTableKeys; function GetItemCount() : Cardinal; public destructor Destroy(); override; published constructor Create(); function Get(const aName : TSymbolTableKey) : Variant; virtual; procedure Add(aName : TSymbolTableKey; aValue : Variant); virtual; procedure Delete(const aName : TSymbolTableKey); virtual; property ItemCount : Cardinal read GetItemCount; function GetItemValue(aIndex : Cardinal) : Variant; function GetItemKey(aIndex : Cardinal) : TSymbolTableKey; procedure MergeFrom(const aTable : ISymbolTable); public //property Item[aIndex : Cardinal] : TSymbol read GetItem; property Keys : TSymbolTableKeys read GetKeys; end; TAttributes = class(TSymbolTable) private fParent : Pointer{IInterface}; // avoids reference cycles. // TODO make accessible from the outside. published //function Get(aName : String) : TSymbol; override; procedure Add(aName : TSymbolTableKey; aValue : Variant); override; procedure Delete(const aName : TSymbolTableKey); override; end; // list>; // TODO map? IAttributes = ISymbolTable; TAttributeKeys = TSymbolTableKeys; TXREF = class private fSymbols : ISymbolTable; fTrailer : IAttributes; public destructor Destroy(); override; published constructor Create(); property Symbols : ISymbolTable read fSymbols write fSymbols; property Trailer : IAttributes read fTrailer write fTrailer; end; IPDF = interface ['{eb97a884-cf50-11dd-91ce-a7b92417dcd0}'] function GetRoot() : IPDFObject; function GetSymbols() : ISymbolTable; function GetXREF() : TXREF; function GetMajorVersion() : Cardinal; function GetMinorVersion() : Cardinal; function GetTrailer() : IAttributes; procedure Validate(); property Symbols : ISymbolTable read GetSymbols; property XREF : TXREF read GetXREF; property MajorVersion : Cardinal read GetMajorVersion; // deprecated; property MinorVersion : Cardinal read GetMinorVersion; // deprecated; // (version number: deprecated. See version number in catalog instead). property Trailer : IAttributes read GetTrailer; { << /Size 168 /Root 166 0 R <--- start here. /Info 167 0 R /ID [ ] >> } property Root : IPDFObject read GetRoot; end; //TCustomVariantType // IVarInstanceReference TPDFObject = class(TInterfacedObject, IPDFObject, IInterface) private //fAttributes : TAttributes; fMetaData : Variant; // either IAttributes or a variant array. fStream : TStream; //fStreamPosition : Int64; fBLoading : Boolean; fID : Cardinal; public destructor Destroy(); override; protected //function GetBFinished() : Boolean; //procedure SetBFinished(aValue : Boolean); function GetBLoading() : Boolean; procedure SetBLoading(aValue : Boolean); function GetMetaData() : Variant; procedure SetMetaData(const aMetaData : Variant); function GetStream() : TStream; procedure SetStream(aStream : TStream); // also remembers position. function GetType_() : String; public //procedure Clear(var V: TVarData); override; //procedure Copy(var Dest: TVarData; const Source: TVarData; const Indirect: Boolean); override; { if Indirect and VarDataIsByRef(Source) then VarDataCopyNoInd(Dest, Source) else } published function MetaDataGet(aName : String) : Variant; constructor Create(); // use a TWindowedStream ! function GetSeekedStream() : TStream; // this also seeks to the correct position. Beware. property MetaData : Variant read fMetaData write fMetaData; //property Attributes : TAttributes read fAttributes write fAttributes; property Stream : TStream read fStream write SetStream; // use a TWindowedStream so the position is stored... property StreamPosition : Int64 read fStreamPosition; //property BFinished : Boolean read fBFinished write fBFinished; property BLoading : Boolean read fBLoading write fBLoading; property Type_ : String read GetType_; public procedure SetID(aValue : Cardinal); function GetID() : Cardinal; end; TPDF = class(TInterfacedObject, IPDF, IInterface) private fSymbols : ISymbolTable; fXREF : TXREF; fMajorVersion : Cardinal; fMinorVersion : Cardinal; protected function GetRoot() : IPDFObject; function GetSymbols() : ISymbolTable; function GetXREF() : TXREF; function GetMajorVersion() : Cardinal; function GetMinorVersion() : Cardinal; function GetTrailer() : IAttributes; public destructor Destroy(); override; published constructor Create(); property Symbols : ISymbolTable read fSymbols write fSymbols; property XREF : TXREF read fXREF write fXREF; // not set WHILE loading, only at the end. property MajorVersion : Cardinal read fMajorVersion write fMajorVersion; property MinorVersion : Cardinal read fMinorVersion write fMinorVersion; // (version number: deprecated. See version number in catalog instead). property Trailer : IAttributes read GetTrailer; { << /Size 168 /Root 166 0 R <--- start here. /Info 167 0 R /ID [ ] >> } procedure Validate(); property Root : IPDFObject read GetRoot; end; function InterfaceFromVariant(const aValue : Variant) : IInterface; function NVL(const aValue, aFallback : Variant) : Variant; inline; function Dereference(const aValue : Variant) : Variant; implementation { given an IPDFObject, finds the metadata variant, checks whether its just a string (or number), and returns that. } function Dereference(const aValue : Variant) : Variant; var vObject : IPDFObject; vObjectI : IInterface; begin Result := aValue; vObjectI := InterfaceFromVariant(aValue); if Supports(vObjectI, IPDFObject) then vObject := vObjectI as IPDFObject else vObject := nil; if Assigned(vObject) then begin //VarIsEmpty(vObject.MetaData) ??? Result := vObject.MetaData; end; end; function InterfaceFromVariant(const aValue : Variant) : IInterface; begin Result := nil; if VarIsClear(aValue) or VarIsNull(aValue) then Exit; if ((VarType(aValue) and VarTypeMask) = varUnknown) then begin Result := IInterface(aValue); end; end; function NVL(const aValue : Variant; const aFallback : Variant) : Variant; inline; begin if not VarIsClear(aValue) and not VarIsNull(aValue) then Result := aValue else Result := aFallback; end; { TVariantWrapper } type TVariantWrapper = class private fValue : Variant; public property Value : Variant read fValue; constructor Create(aValue : Variant); end; constructor TVariantWrapper.Create(aValue : Variant); begin fValue := aValue; end; { TSymbolTable } function TSymbolTable.GetKeys() : TSymbolTableKeys; var vIndex : Integer; begin SetLength(Result, fSymbols.Count); for vIndex := 0 to fSymbols.Count - 1 do Result[vIndex] := fSymbols.NameOfIndex(vIndex); // TSymbol(fSymbols[vIndex]).Name; end; function TSymbolTable.GetItemValue(aIndex : Cardinal) : Variant; var vWrapper : TVariantWrapper; begin vWrapper := fSymbols.Items[aIndex] as TVariantWrapper; if Assigned(vWrapper) then Result := vWrapper.Value else Result := variants.Unassigned; end; function TSymbolTable.GetItemKey(aIndex : Cardinal) : TSymbolTableKey; begin Result := fSymbols.NameOfIndex(aIndex); end; function TSymbolTable.GetItemCount() : Cardinal; begin Result := fSymbols.Count; end; procedure TSymbolTable.MergeFrom(const aTable : ISymbolTable); var vOtherIndex : Cardinal; vName : TSymbolTableKey; vValue : Variant; begin if aTable.GetItemCount() > 0 then begin for vOtherIndex := 0 to aTable.GetItemCount() - 1 do begin vName := aTable.GetItemKey(vOtherIndex); vValue := aTable.GetItemValue(vOtherIndex); Self.Delete(vName); Self.Add(vName, vValue); end; end; {Writeln('='); if Self.GetItemCount() > 0 then for vOtherIndex := 0 to Self.GetItemCount() - 1 do begin vSymbol := Self.GetItem(vOtherIndex); Writeln(Format('#%u: %s -> ?', [vOtherIndex, vSymbol.Name])); end; Writeln('.');} end; destructor TSymbolTable.Destroy(); begin FreeAndNil(fSymbols); inherited Destroy; end; constructor TSymbolTable.Create(); begin fSymbols := TFPHashObjectList.Create(); fSymbols.OwnsObjects := True; //fSymbols := TFPHashList.Create(); //fSymbols := TFPObjectHashTable.Create(); //fSymbols := TObjectList.Create(); end; function TSymbolTable.Get(const aName : TSymbolTableKey) : Variant; // i : Integer; var vWrapper : TObject; begin vWrapper := fSymbols.Find(aName); if Assigned(vWrapper) then Result := TVariantWrapper(vWrapper).Value else Result := variants.Unassigned; // TODO or VarUnassigned {if fSymbols.Count > 0 then for i := fSymbols.Count - 1 downto 0 do begin Result := TSymbol(fSymbols[i]); if Result.Name = aName then Exit; end; Result := nil;} end; procedure TSymbolTable.Delete(const aName : TSymbolTableKey); var i : Integer; begin i := fSymbols.FindIndexOf(aName); if i <> -1 then fSymbols.Delete(i); end; { steals. } procedure TSymbolTable.Add(aName : TSymbolTableKey; aValue : Variant); begin fSymbols.Add(aName, TVariantWrapper.Create(aValue)); end; { TAttributes } procedure TAttributes.Add(aName : TSymbolTableKey; aValue : Variant); begin if aName = '/Parent' then fParent := InterfaceFromVariant(aValue) else inherited Add(aName, aValue); end; procedure TAttributes.Delete(const aName : TSymbolTableKey); begin inherited Delete(aName); end; { TPDFObject } destructor TPDFObject.Destroy(); begin FreeAndNil(fStream); // + Sources. //FreeAndNil(fAttributes); inherited Destroy(); end; constructor TPDFObject.Create(); begin fBLoading := False; fMetaData := variants.Unassigned; end; {function TPDFObject.GetBFinished() : Boolean; begin Result := fBFinished; end; procedure TPDFObject.SetBFinished(aValue : Boolean); begin fBFinished := aValue; end;} function TPDFObject.MetaDataGet(aName : String) : Variant; var vAttributes : IAttributes; begin vAttributes := InterfaceFromVariant(fMetaData) as IAttributes; // TODO cache. if Assigned(vAttributes) then Result := vAttributes.Get(aName) else Result := variants.Unassigned; end; function TPDFObject.GetBLoading() : Boolean; begin Result := fBLoading; end; procedure TPDFObject.SetBLoading(aValue : Boolean); begin fBLoading := aValue; end; function TPDFObject.GetType_() : String; var vAttributes : IAttributes; begin vAttributes := InterfaceFromVariant(fMetaData) as IAttributes; if Assigned(vAttributes) then Result := NVL(vAttributes.Get('/Type'), '') else Result := ''; end; procedure TPDFObject.SetMetaData(const aMetaData : Variant); begin fMetaData := aMetaData; end; function TPDFObject.GetMetaData() : Variant; begin Result := fMetaData; end; procedure TPDFObject.SetStream(aStream : TStream); // also remembers position. begin //fStreamPosition := aStream.Position; fStream := aStream; end; function TPDFObject.GetStream() : TStream; begin Result := fStream; end; {function TPDFObject.GetSeekedStream() : TStream; // this also seeks to the correct position. Beware. begin if fStream.Position <> fStreamPosition then fStream.Position := fStreamPosition; Result := fStream; end;} { TPDF } constructor TPDF.Create(); begin fSymbols := TSymbolTable.Create(); end; destructor TPDF.Destroy(); begin FreeAndNil(fXREF); inherited Destroy(); end; function TPDF.GetRoot() : IPDFObject; begin Result := InterfaceFromVariant(fXREF.Trailer.Get('/Root')) as IPDFObject; end; procedure TPDF.Validate(); var vPages : PDFs.IPDFObject; vCount : String; vKids : Variant; begin assert(Root.Type_ = '/Catalog'); assert(not VarIsClear(Root.MetaDataGet('/Pages'))); vPages := IPDFObject(InterfaceFromVariant(Root.MetaDataGet('/Pages'))); vCount := vPages.MetaDataGet('/Count'); vKids := vPages.MetaDataGet('/Kids'); { Pages: Pages: Pages: Page: 2 0 obj << /Type /Page /Contents 3 0 R /Resources 1 0 R /MediaBox [0 0 612 792] /Parent 14 0 R >> endobj } end; function TPDF.GetSymbols() : ISymbolTable; begin Result := fSymbols; end; function TPDF.GetXREF() : TXREF; begin Result := fXREF; end; function TPDF.GetMajorVersion() : Cardinal; begin Result := fMajorVersion; end; function TPDF.GetMinorVersion() : Cardinal; begin Result := fMinorVersion; end; function TPDF.GetTrailer() : IAttributes; begin Result := fXREF.Trailer; end; { TXREF } constructor TXREF.Create(); begin fSymbols := TSymbolTable.Create(); end; destructor TXREF.Destroy(); begin inherited Destroy(); end; procedure TPDFObject.SetID(aValue : Cardinal); begin fID := aValue; end; function TPDFObject.GetID() : Cardinal; begin Result := fID; end; end.