unit words;

{$M+}

interface

uses lexer_interfaces, interfaces, classes, sysutils;

const
  TokenWord = 10;
  TokenWhitespace = 11;
  
type
  TDefaultWordBreakLexer = class(TInterfacedObject, ILexer, ISourcePosition, IInterface, ICloneable)
  private
    fSourceStream : TStream;
    fSourceFile : TFileName;
    fSourceLine : Cardinal;
    fSourcePosition : Cardinal;
    fInputChar : TLexerChar;
    fInputCharAvailableP : Boolean;
    fState : TLexerState;
    fToken : TToken;
    fMatchBeginning : Cardinal; // inclusive.
    fMatchEnd : Cardinal; // exclusive.
    fMatchClearNextP : Boolean; // whether to clear the match range the next time. 
    fEOFP : Boolean;
    fInputConsumed : TLexerInputConsumed;
  protected
    { use the properties, not the getters! }

    function GetSourceStream : TStream;
    procedure SetSourceStream(stream : TStream);
    function GetEOF : Boolean;
    function GetState : TLexerState;
    procedure SetState(aValue : TLexerState);
    function GetToken : TToken; 
    procedure SetToken(aValue : TToken);
    function GetSourceFile : TFileName;
    function GetSourceLine : Cardinal;
    function GetSourcePosition : Cardinal;
    procedure SetSourcePosition(aValue : Cardinal);
    procedure SetSourceFile(aValue : TFileName);
    procedure SetSourceLine(aValue : Cardinal);
    function GetInputChar : TLexerChar;
    procedure SetInputChar(aValue : TLexerChar);
    function GetMatchedText : string;
    procedure SetMatchedText(aValue : string);
    procedure EnsureInputChar; inline;
    function GetMatchBeginning : Cardinal;
    procedure SetMatchBeginning(aValue : Cardinal);
    function GetMatchEnd : Cardinal;
    procedure SetMatchEnd(aValue : Cardinal);
    procedure ClearMatchedText; inline;
    procedure AddMatchedText; inline;
    
    function GetInputConsumed : TLexerInputConsumed;
    procedure SetInputConsumed(aValue : TLexerInputConsumed);
    
  published
    function Clone : ICloneable;
    function Consume : TToken; // TODO: pass token, ensure that it's there?
    
    function ConsumeOne : TToken; { returns: 0 if no token matched yet; usually you will just use Consume. }
    property State : TLexerState read GetState write SetState;
    
    property Token : TToken read GetToken write SetToken;
    property SourceFile : TFileName read GetSourceFile write SetSourceFile;
    property SourceLine : Cardinal read GetSourceLine write SetSourceLine;
    property SourcePosition : Cardinal read GetSourcePosition write SetSourcePosition;
    //property UTF8Mode : Boolean read fUTF8Mode write SetUTF8Mode;
    
    property SourceStream : TStream read GetSourceStream write SetSourceStream; // stored False;
    property EOF : Boolean read GetEOF stored False;
    property InputChar : TLexerChar read GetInputChar write SetInputChar; // stored False;
    property MatchedText : string read GetMatchedText write SetMatchedText; // stored False; // TODO remove this? Annoying...
    property MatchBeginning : Cardinal read GetMatchBeginning write SetMatchBeginning; // Position, inclusive.
    property MatchEnd : Cardinal read GetMatchEnd write SetMatchEnd; // Position, exclusive.
    // property LexerStates: TLexerStates; for streaming
    
    property InputConsumed : TLexerInputConsumed read GetInputConsumed write SetInputConsumed;
  end;
  
implementation

type
  TDefaultWordBreakLexerState = (sInvalid, sOutsideWord, sInsideWord);

function TDefaultWordBreakLexer.GetSourceStream : TStream;
begin
  Result := fSourceStream;
end;

procedure TDefaultWordBreakLexer.SetSourceStream(stream : TStream);
begin
  fSourceStream := stream;
  fMatchClearNextP := True; // FIXME is this nice?

  fEOFP := False;
  // fEOFPendingP := False;
  Self.Consume;
end;

function TDefaultWordBreakLexer.GetEOF : Boolean;
begin
  Result := fEOFP;
end;

function TDefaultWordBreakLexer.GetState : TLexerState;
begin
  Result := fState;
end;

procedure TDefaultWordBreakLexer.SetState(aValue : TLexerState);
begin
  fState := aValue;
end;

function TDefaultWordBreakLexer.GetToken : TToken; 
begin
  Result := fToken;
end;

procedure TDefaultWordBreakLexer.SetToken(aValue : TToken);
begin
  fToken := aValue;
end;

function TDefaultWordBreakLexer.GetSourceFile : TFileName;
begin
  Result := fSourceFile;
end;

function TDefaultWordBreakLexer.GetSourceLine : Cardinal;
begin
  Result := fSourceLine;
end;

procedure TDefaultWordBreakLexer.SetSourceFile(aValue : TFileName);
begin
  fSourceFile := aValue;
end;

procedure TDefaultWordBreakLexer.SetSourceLine(aValue : Cardinal);
begin
  fSourceLine := aValue;
end;

function TDefaultWordBreakLexer.GetInputChar : TLexerChar;
begin
  Result := fInputChar;
end;

procedure TDefaultWordBreakLexer.SetInputChar(aValue : TLexerChar);
begin
  fInputChar := aValue;
end;

function TDefaultWordBreakLexer.GetMatchedText : string;
begin
  Result := '';
end;

procedure TDefaultWordBreakLexer.SetMatchedText(aValue : string);
begin
end;

    
function TDefaultWordBreakLexer.Consume : TToken; // TODO: pass token, ensure that it's there?
begin
  fToken := TokenNone;
  ClearMatchedText;
  fMatchClearNextP := True;

  while (not Self.EOF) and (fToken = TokenNone) do
    ConsumeOne;

  Result := fToken;
end;

function TDefaultWordBreakLexer.GetMatchBeginning : Cardinal;
begin
  Result := fMatchBeginning;
end;

procedure TDefaultWordBreakLexer.SetMatchBeginning(aValue : Cardinal);
begin
  fMatchBeginning := aValue;
end;

function TDefaultWordBreakLexer.GetMatchEnd : Cardinal;
begin
  Result := fMatchEnd;
end;

procedure TDefaultWordBreakLexer.SetMatchEnd(aValue : Cardinal);
begin
  fMatchEnd := aValue;
end;
    
function TDefaultWordBreakLexer.ConsumeOne : TToken; { returns: 0 if no token matched yet; usually you will just use Consume. }
var
  token : TToken;
begin
{
    if self._eof_pending == True:
          self._eof = True
                return INVALID

    if self._source_stream == None:
          self._eof = True
                # fall

    if self._eof == True:
          raise ELexerEofError("Unexpected end of file")
}
                
                
  // TODO raise EOFError if so needed (not EStreamError?). // TODO do the last wish if there is EOF.
  token := TokenNone;
  Result := token;
  
  Self.EnsureInputChar; // or set eof_pending, emit one more token, or set eof.
  

  case TDefaultWordBreakLexerState(fState) of
  sInvalid:
    Exit;
    
  sOutsideWord: 
    if (fInputChar > #32) then begin
      fState := TLexerState(TDefaultWordBreakLexerState(sInsideWord));
      token := TokenWhitespace;
    end;

  sInsideWord:
    if (fInputChar <= #32) then begin // TODO filter non-space?
      fState := TLexerState(TDefaultWordBreakLexerState(sOutsideWord));
      token := TokenWord;
    end;
  end;
  
  if token <> TokenNone then begin
    fToken := token;
    fMatchClearNextP := True;
    fState := TLexerState(StateInitial);
    Result := token;
  end else begin // next state, next char.
    if fMatchClearNextP then begin
      fMatchClearNextP := False;
      ClearMatchedText;
    end;

    if Assigned(fInputConsumed) then
      fInputConsumed(Self, fInputChar);
      
    fInputCharAvailableP := False;
    Inc(fSourcePosition);

    AddMatchedText;
  end;

  // last state change?
  // previous state?

  Result := fToken;
end;

function TDefaultWordBreakLexer.GetSourcePosition : Cardinal;
begin
  Result := fSourcePosition;
end;

procedure TDefaultWordBreakLexer.SetSourcePosition(aValue : Cardinal);
begin
  fSourcePosition := aValue;
end;

function TDefaultWordBreakLexer.Clone : ICloneable;
var
  ResultLexer : TDefaultWordBreakLexer;
begin
  ResultLexer := TDefaultWordBreakLexer.Create;
  
  ResultLexer.fSourceStream := fSourceStream;
  ResultLexer.fSourceFile := fSourceFile;
  ResultLexer.fSourceLine := fSourceLine;
  ResultLexer.fInputChar := fInputChar;
  ResultLexer.fState := fState;
  ResultLexer.fToken := fToken;
  ResultLexer.fMatchBeginning := fMatchBeginning;
  ResultLexer.fMatchEnd := fMatchEnd;
  
  Result := ResultLexer;
end;

procedure TDefaultWordBreakLexer.EnsureInputChar; inline;
begin
  if not fInputCharAvailableP then begin
    // TODO do more buffering here?
    
    fInputChar := TLexerChar(fSourceStream.ReadByte);
    fInputCharAvailableP := True;
  end;
end;

procedure TDefaultWordBreakLexer.ClearMatchedText; inline;
begin
  // no, this is not wrong.
  fMatchBeginning := fMatchEnd;
  fMatchEnd := fMatchBeginning;
end;

procedure TDefaultWordBreakLexer.AddMatchedText; inline;
begin
  //assert(fInputChar = fSourceStream[fSourcePosition - 1]);
  assert(fMatchEnd = fSourcePosition); // ??
  Inc(fMatchEnd);
end;

function TDefaultWordBreakLexer.GetInputConsumed : TLexerInputConsumed;
begin
  Result := fInputConsumed;
end;

procedure TDefaultWordBreakLexer.SetInputConsumed(aValue : TLexerInputConsumed);
begin
  fInputConsumed := aValue;
end;

end.