unit words; {$M+} interface uses lexer_interfaces, interfaces, classes, sysutils; const TokenWord = 10; TokenWhitespace = 11; type TDefaultWordBreakLexer = class(TInterfacedObject, ILexer, ISourcePosition, IInterface, ICloneable) private fSourceStream : TStream; fSourceFile : TFileName; fSourceLine : Cardinal; fSourcePosition : Cardinal; fInputChar : TLexerChar; fInputCharAvailableP : Boolean; fState : TLexerState; fToken : TToken; fMatchBeginning : Cardinal; // inclusive. fMatchEnd : Cardinal; // exclusive. fMatchClearNextP : Boolean; // whether to clear the match range the next time. fEOFP : Boolean; fInputConsumed : TLexerInputConsumed; protected { use the properties, not the getters! } function GetSourceStream : TStream; procedure SetSourceStream(stream : TStream); function GetEOF : Boolean; function GetState : TLexerState; procedure SetState(aValue : TLexerState); function GetToken : TToken; procedure SetToken(aValue : TToken); function GetSourceFile : TFileName; function GetSourceLine : Cardinal; function GetSourcePosition : Cardinal; procedure SetSourcePosition(aValue : Cardinal); procedure SetSourceFile(aValue : TFileName); procedure SetSourceLine(aValue : Cardinal); function GetInputChar : TLexerChar; procedure SetInputChar(aValue : TLexerChar); function GetMatchedText : string; procedure SetMatchedText(aValue : string); procedure EnsureInputChar; inline; function GetMatchBeginning : Cardinal; procedure SetMatchBeginning(aValue : Cardinal); function GetMatchEnd : Cardinal; procedure SetMatchEnd(aValue : Cardinal); procedure ClearMatchedText; inline; procedure AddMatchedText; inline; function GetInputConsumed : TLexerInputConsumed; procedure SetInputConsumed(aValue : TLexerInputConsumed); published function Clone : ICloneable; function Consume : TToken; // TODO: pass token, ensure that it's there? function ConsumeOne : TToken; { returns: 0 if no token matched yet; usually you will just use Consume. } property State : TLexerState read GetState write SetState; property Token : TToken read GetToken write SetToken; property SourceFile : TFileName read GetSourceFile write SetSourceFile; property SourceLine : Cardinal read GetSourceLine write SetSourceLine; property SourcePosition : Cardinal read GetSourcePosition write SetSourcePosition; //property UTF8Mode : Boolean read fUTF8Mode write SetUTF8Mode; property SourceStream : TStream read GetSourceStream write SetSourceStream; // stored False; property EOF : Boolean read GetEOF stored False; property InputChar : TLexerChar read GetInputChar write SetInputChar; // stored False; property MatchedText : string read GetMatchedText write SetMatchedText; // stored False; // TODO remove this? Annoying... property MatchBeginning : Cardinal read GetMatchBeginning write SetMatchBeginning; // Position, inclusive. property MatchEnd : Cardinal read GetMatchEnd write SetMatchEnd; // Position, exclusive. // property LexerStates: TLexerStates; for streaming property InputConsumed : TLexerInputConsumed read GetInputConsumed write SetInputConsumed; end; implementation type TDefaultWordBreakLexerState = (sInvalid, sOutsideWord, sInsideWord); function TDefaultWordBreakLexer.GetSourceStream : TStream; begin Result := fSourceStream; end; procedure TDefaultWordBreakLexer.SetSourceStream(stream : TStream); begin fSourceStream := stream; fMatchClearNextP := True; // FIXME is this nice? fEOFP := False; // fEOFPendingP := False; Self.Consume; end; function TDefaultWordBreakLexer.GetEOF : Boolean; begin Result := fEOFP; end; function TDefaultWordBreakLexer.GetState : TLexerState; begin Result := fState; end; procedure TDefaultWordBreakLexer.SetState(aValue : TLexerState); begin fState := aValue; end; function TDefaultWordBreakLexer.GetToken : TToken; begin Result := fToken; end; procedure TDefaultWordBreakLexer.SetToken(aValue : TToken); begin fToken := aValue; end; function TDefaultWordBreakLexer.GetSourceFile : TFileName; begin Result := fSourceFile; end; function TDefaultWordBreakLexer.GetSourceLine : Cardinal; begin Result := fSourceLine; end; procedure TDefaultWordBreakLexer.SetSourceFile(aValue : TFileName); begin fSourceFile := aValue; end; procedure TDefaultWordBreakLexer.SetSourceLine(aValue : Cardinal); begin fSourceLine := aValue; end; function TDefaultWordBreakLexer.GetInputChar : TLexerChar; begin Result := fInputChar; end; procedure TDefaultWordBreakLexer.SetInputChar(aValue : TLexerChar); begin fInputChar := aValue; end; function TDefaultWordBreakLexer.GetMatchedText : string; begin Result := ''; end; procedure TDefaultWordBreakLexer.SetMatchedText(aValue : string); begin end; function TDefaultWordBreakLexer.Consume : TToken; // TODO: pass token, ensure that it's there? begin fToken := TokenNone; ClearMatchedText; fMatchClearNextP := True; while (not Self.EOF) and (fToken = TokenNone) do ConsumeOne; Result := fToken; end; function TDefaultWordBreakLexer.GetMatchBeginning : Cardinal; begin Result := fMatchBeginning; end; procedure TDefaultWordBreakLexer.SetMatchBeginning(aValue : Cardinal); begin fMatchBeginning := aValue; end; function TDefaultWordBreakLexer.GetMatchEnd : Cardinal; begin Result := fMatchEnd; end; procedure TDefaultWordBreakLexer.SetMatchEnd(aValue : Cardinal); begin fMatchEnd := aValue; end; function TDefaultWordBreakLexer.ConsumeOne : TToken; { returns: 0 if no token matched yet; usually you will just use Consume. } var token : TToken; begin { if self._eof_pending == True: self._eof = True return INVALID if self._source_stream == None: self._eof = True # fall if self._eof == True: raise ELexerEofError("Unexpected end of file") } // TODO raise EOFError if so needed (not EStreamError?). // TODO do the last wish if there is EOF. token := TokenNone; Result := token; Self.EnsureInputChar; // or set eof_pending, emit one more token, or set eof. case TDefaultWordBreakLexerState(fState) of sInvalid: Exit; sOutsideWord: if (fInputChar > #32) then begin fState := TLexerState(TDefaultWordBreakLexerState(sInsideWord)); token := TokenWhitespace; end; sInsideWord: if (fInputChar <= #32) then begin // TODO filter non-space? fState := TLexerState(TDefaultWordBreakLexerState(sOutsideWord)); token := TokenWord; end; end; if token <> TokenNone then begin fToken := token; fMatchClearNextP := True; fState := TLexerState(StateInitial); Result := token; end else begin // next state, next char. if fMatchClearNextP then begin fMatchClearNextP := False; ClearMatchedText; end; if Assigned(fInputConsumed) then fInputConsumed(Self, fInputChar); fInputCharAvailableP := False; Inc(fSourcePosition); AddMatchedText; end; // last state change? // previous state? Result := fToken; end; function TDefaultWordBreakLexer.GetSourcePosition : Cardinal; begin Result := fSourcePosition; end; procedure TDefaultWordBreakLexer.SetSourcePosition(aValue : Cardinal); begin fSourcePosition := aValue; end; function TDefaultWordBreakLexer.Clone : ICloneable; var ResultLexer : TDefaultWordBreakLexer; begin ResultLexer := TDefaultWordBreakLexer.Create; ResultLexer.fSourceStream := fSourceStream; ResultLexer.fSourceFile := fSourceFile; ResultLexer.fSourceLine := fSourceLine; ResultLexer.fInputChar := fInputChar; ResultLexer.fState := fState; ResultLexer.fToken := fToken; ResultLexer.fMatchBeginning := fMatchBeginning; ResultLexer.fMatchEnd := fMatchEnd; Result := ResultLexer; end; procedure TDefaultWordBreakLexer.EnsureInputChar; inline; begin if not fInputCharAvailableP then begin // TODO do more buffering here? fInputChar := TLexerChar(fSourceStream.ReadByte); fInputCharAvailableP := True; end; end; procedure TDefaultWordBreakLexer.ClearMatchedText; inline; begin // no, this is not wrong. fMatchBeginning := fMatchEnd; fMatchEnd := fMatchBeginning; end; procedure TDefaultWordBreakLexer.AddMatchedText; inline; begin //assert(fInputChar = fSourceStream[fSourcePosition - 1]); assert(fMatchEnd = fSourcePosition); // ?? Inc(fMatchEnd); end; function TDefaultWordBreakLexer.GetInputConsumed : TLexerInputConsumed; begin Result := fInputConsumed; end; procedure TDefaultWordBreakLexer.SetInputConsumed(aValue : TLexerInputConsumed); begin fInputConsumed := aValue; end; end.