unit PDF_ToUnicode_map_parsers;

{PDF parser.
Copyright (C) 2008 Danny Milosavljevic <danny_milo@yahoo.com>

This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
}

{$MODE OBJFPC}
{$M+}

interface
uses PDF_parsers, PDF_ToUnicode_maps;

type
  
  TParser = class(PDF_parsers.TParser)
  private
    fMap : PDF_ToUnicode_maps.TMap;
  protected
    procedure Scan(aNeedle : String);
    procedure RangeBody();
    procedure CharBody();
  public
    destructor Destroy(); override;
  published
    function Parse() : PDF_ToUnicode_maps.TMap;
  end;
  
implementation
uses sysutils;

destructor TParser.Destroy();
begin
  FreeAndNil(fMap);
  inherited Destroy();
end;

// evil. TODO parse this in a nicer way.
procedure TParser.Scan(aNeedle : String);
var
  vMatchCount : Integer;
  vMatchMaximum : Integer;
begin
  vMatchMaximum := Length(aNeedle);
  vMatchCount := 0;
  
  assert(vMatchMaximum > 0);
  
  while (vMatchCount < vMatchMaximum) and BInput do begin
    if Input = aNeedle[1 + vMatchCount] then begin
      Inc(vMatchCount);
    end else
      vMatchCount := 0;
    
    Consume();
  end;
end;

{ UTF-16 -> UCS-4 

    * Braucht das Zeichen mehr als 16 bit wird's kompliziert:
          o Das Zeichen muss jetzt auf zwei 16-bit-Einheiten verteilt werden
          o Damit man erkennen kann, dass die Einheiten ein Paar bilden haben sie an den höchsten Stellen standardisierte bits:
                + 110110.......... (Die erste Einheit des Paares)
                + 110111.......... (Die zweite Einheit des Paares)
          o In jeder Einheit sind also noch 10 bits frei, in die man jeweils eine Hälfte des Unicode-Zeichens stecken kann

Jetzt wird's kompliziert. Anstatt das Unicode-Zeichen einfach zu zerteilen zieht man erst 65536 davon ab. Dies hat den Vorteil, dass man jetzt 65536 Zeichen mehr kodieren kann. Wenn man das Zeichen lesen möchte muss man daran denken, wieder 65536 zu addieren (Also 1101100000000000 1101110000000000 steht nicht für U+0000 sondern für U+10000). Hoffe dass es jetzt klarer ist. RedNifre 02:30, 15. Feb. 2008 (CET)

Der Codebereich von U+D800 bis U+DBFF (High-Surrogates) und der Bereich von U+DC00 bis U+DFFF (Low-Surrogates) ist für diese UTF-16-Ersatzzeichen reserviert und enthält keine eigenständigen Zeichen.
}

{ other direction would be:
    vHighSurrogate := ((aValue - $10000) div $400) + $D800;
    vLowSurrogate := ((aValue - $10000) % $400) + $DC00;
}
function UCS_4_code_from_UTF_16(aValue : String) : Cardinal;
var
  vLength : Integer;
  vHighSurrogate : Cardinal;
  vLowSurrogate : Cardinal;
begin
  vLength := Length(aValue);
  assert(vLength in [2, 4]);
  
  Result := (Ord(aValue[1]) shl 8) or Ord(aValue[2]);
  
  if vLength = 4 then begin
    vHighSurrogate := Result;
    vLowSurrogate := (Ord(aValue[3]) shl 8) or Ord(aValue[4]);
    assert((vHighSurrogate >= $D800) and (vHighSurrogate <= $DBFF));
    assert((vLowSurrogate <= $DC00) and (vLowSurrogate <= $DFFF));
    
    Result := $10000 + (vHighSurrogate - $D800) * $400 + (vLowSurrogate - $DC00);
    // $D800..$DFFF: "forbidden zone".
  end;
  
  // U+D800 bis U+DBFF (High-Surrogates) und der Bereich von U+DC00 bis U+DFFF (Low-Surrogates)
  
  // Lastly, if a CID does not map to a Unicode code point, the value 0xFFFD shall be used as its Unicode code point.
end;

function Hex(aValue : String) : String;
var
  i : Integer;
const
  digits : array[0..15] of Char = '0123456789ABCDEF';
begin
  SetLength(Result, 2 * Length(aValue));
  for i := 1 to Length(aValue) do begin
    Result[i * 2 - 1] := digits[Ord(aValue[i]) shr 4];
    Result[i * 2 - 0] := digits[Ord(aValue[i]) and $F];
  end;
end;

procedure TParser.RangeBody();
var
  vNativeBeginning : String;
  vNativeEnd : String;
  vUnicodeCharacter : TUnicodeCharacter;
begin
  while Input <> 'e' do begin
    vNativeBeginning := ValueOrBracedValueList();
    Whitespace();
    vNativeEnd := ValueOrBracedValueList();
    Whitespace();
    vUnicodeCharacter := UCS_4_code_from_UTF_16(ValueOrBracedValueList());
    Whitespace();
    
    //Writeln(Hex(vNativeBeginning), '..', Hex(vNativeEnd), ' => ', vUnicodeCharacter);
    fMap.AddRange(vNativeBeginning, vNativeEnd, vUnicodeCharacter);
  end;
end;

procedure TParser.CharBody();
var
  vNativeBeginning : String;
  vUnicodeCharacter : TUnicodeCharacter;
begin
  while Input <> 'e' do begin
    vNativeBeginning := ValueOrBracedValueList();
    Whitespace();
    vUnicodeCharacter := UCS_4_code_from_UTF_16(ValueOrBracedValueList());
    Whitespace();

    //Writeln(Hex(vNativeBeginning), ' => ', vUnicodeCharacter);
    fMap.AddRange(vNativeBeginning, vNativeBeginning, vUnicodeCharacter);
  end;
end;

function TParser.Parse() : PDF_ToUnicode_maps.TMap;
begin
  fMap := PDF_ToUnicode_maps.TMap.Create();

  // TODO... this isn't exactly nice.
  
  // "beginbfrange" || "beginbfchar"

  while BInput do begin
    Scan('beginbf');
  
    if not BInput then
      Break;
    
    if Input = 'r' then begin
      Consume('range');
      Whitespace();
      RangeBody();
      OptionalWhitespace();
      Consume('endbfrange');
    end else if Input = 'c' then begin
      Consume('char');
      Whitespace();
      CharBody();
      OptionalWhitespace();
      Consume('endbfchar');
    end else begin
      Error('beginbfrange|beginbfchar', 'beginbf' + Input);
    end;
    OptionalWhitespace();
  end;
  
  // "/CMapType 2 def"

  Result := fMap; // FIXME.
  fMap := nil;
end;

end.

implementation

end.