unit DFAs; {$M+} interface uses lexer_interfaces, classes, sysutils; type TDFATransition = class; TDFAState = class private fTransitions : array of TDFATransition; { index = TLexerChar } fDefaultTransition : TDFATransition; protected function GetTransition(aCharacter : TLexerChar) : TDFATransition; function GetTransitionCount : Cardinal; published property Transitions[aCharacter : TLexerChar] : TDFATransition read GetTransition; // property TransitionKeys : string; // ARGH. property TransitionCount : Cardinal read GetTransitionCount; // evil JOE syntax optimization. property DefaultTransition : TDFATransition read fDefaultTransition write fDefaultTransition; procedure Add(aCharacter : TLexerChar; aTransition : TDFATransition); // if the list-handling stuff of Pascal wouldn't suck, I'd just return a list of all tokens, but alas... function GetHighestToken : TToken; end; TDFATransition = class private fNewState : TDFAState; // FIXME BAD BAD Type. fFinalToken : TToken; public constructor Create; published property NewState : TDFAState read fNewState write fNewState; // BAD BAD Type property FinalToken : TToken read fFinalToken write fFinalToken; // if any. end; IDFAAutomaton = interface function GetState(aIndex : Cardinal) : TDFAState; // TODO remove implementation detail index. function GetStateCount : Cardinal; // TODO remove implementation detail index. property States[Index : Cardinal] : TDFAState read GetState; // TODO remove implementation detail index. property StateCount : Cardinal read GetStateCount; procedure Add(aState : TDFAState); function GetInitialState : TDFAState; property InitialState : TDFAState read GetInitialState; // convenience. procedure AddLexed(aLexerPattern : String; aToken : TToken); // if the list-handling stuff of Pascal wouldn't suck, I'd just return a list of all tokens, but alas... function GetHighestToken : TToken; end; TDFAAutomaton = class(TInterfacedObject, IDFAAutomaton, IInterface) private fStates : TList; // TODO use a SET with reverse direction lookup table. protected function GetState(aIndex : Cardinal) : TDFAState; function GetStateCount : Cardinal; function GetInitialState : TDFAState; public constructor Create; destructor Destroy; override; published property States[Index : Cardinal] : TDFAState read GetState; property StateCount : Cardinal read GetStateCount; property InitialState : TDFAState read GetInitialState; procedure Add(aState : TDFAState); // convenience. procedure AddLexed(aLexerPattern : String; aToken : TToken); // if the list-handling stuff of Pascal wouldn't suck, I'd just return a list of all tokens, but alas... function GetHighestToken : TToken; end; TDFALexer = class(TInterfacedObject, ILexer, ISourcePosition, IInterface) private fAutomaton : IDFAAutomaton; fSourceStream : TStream; fSourcePosition : Cardinal; fState : TDFAState; fPreviousState : TDFAState; // used by "ResolveInvalid" fInputChar : TLexerChar; fInputCharAvailableP : Boolean; fEOFP : Boolean; fMatchBeginning : Cardinal; fMatchEnd : Cardinal; fToken : TToken; fSourceFile : TFileName; fInputConsumed : TLexerInputConsumed; protected procedure EnsureInputCharAvailable; inline; function GetAutomaton : IDFAAutomaton; procedure SetAutomaton(aItem : IDFAAutomaton); function GetSourceStream : TStream; procedure SetSourceStream(aItem : TStream); //function GetState : TLexerState; //procedure SetState(aItem : TLexerState); function GetInputChar : TLexerChar; procedure SetInputChar(aValue : TLexerChar); function GetEOF : Boolean; function GetToken : TToken; procedure SetToken(aToken : TToken); function GetMatchBeginning : Cardinal; function GetMatchEnd : Cardinal; procedure SetMatchBeginning(aValue : Cardinal); procedure SetMatchEnd(aValue : Cardinal); function GetSourcePosition : Cardinal; procedure SetSourcePosition(aValue : Cardinal); function GetSourceFile : TFileName; procedure SetSourceFile(aValue : TFileName); function GetInputConsumed : TLexerInputConsumed; procedure SetInputConsumed(aValue : TLexerInputConsumed); function ResolveInvalid : TDFATransition; virtual; published constructor Create(aAutomaton : IDFAAutomaton); procedure Reset; function ConsumeOne : TToken; function Consume : TToken; property Automaton : IDFAAutomaton read GetAutomaton write SetAutomaton; property SourceStream : TStream read GetSourceStream write SetSourceStream; property SourcePosition : Cardinal read GetSourcePosition write SetSourcePosition; property SourceFile : TFileName read GetSourceFile write SetSourceFile; // property State : TLexerState read GetState write SetState; property InputChar : TLexerChar read GetInputChar write SetInputChar; property EOF : Boolean read GetEOF; property Token : TToken read GetToken write SetToken; property InputConsumed : TLexerInputConsumed read GetInputConsumed write SetInputConsumed; property MatchBeginning : Cardinal read GetMatchBeginning write SetMatchBeginning; property MatchEnd : Cardinal read GetMatchEnd write SetMatchEnd; end; TAutoCreateDFALexer = class(TDFALexer, ILexer, ISourcePosition, IInterface) private fLexerPattern : String; fLexerPatternDone : Cardinal; // pos. published constructor Create(aAutomaton : IDFAAutomaton; aLexerPattern : String); protected function ResolveInvalid : TDFATransition; override; end; implementation { TDFATransition } constructor TDFATransition.Create; begin fNewState := nil; // StateInvalid; fFinalToken := TokenInvalid; end; { TDFAState } procedure TDFAState.Add(aCharacter : TLexerChar; aTransition : TDFATransition); begin // TODO sanity-check. if Length(fTransitions) <= Longint(aCharacter) then SetLength(fTransitions, Longint(aCharacter) + 1); fTransitions[Longint(aCharacter)] := aTransition; end; function TDFAState.GetHighestToken : TToken; var iTransition : Integer; xToken : TToken; begin Result := TokenInvalid; if Length(fTransitions) > 0 then for iTransition := 0 to High(fTransitions) do begin xToken := fTransitions[iTransition].FinalToken; if xToken > Result then Result := xToken; end; if Assigned(fDefaultTransition) then begin xToken := fDefaultTransition.FinalToken; if xToken > Result then Result := xToken; end; end; function TDFAState.GetTransitionCount : Cardinal; begin Result := Cardinal(Length(fTransitions)); end; function TDFAState.GetTransition(aCharacter : TLexerChar) : TDFATransition; begin if Longint(aCharacter) < Length(fTransitions) then begin Result := fTransitions[Longint(aCharacter)] end else Result := nil; end; { TDFAAutomaton } function TDFAAutomaton.GetInitialState : TDFAState; begin if fStates.Count >= 2 then Result := fStates[1] // 0=invalid. else Result := nil; end; function TDFAAutomaton.GetState(aIndex : Cardinal) : TDFAState; begin Result := fStates[aIndex]; end; function TDFAAutomaton.GetStateCount : Cardinal; begin Result := fStates.Count; end; destructor TDFAAutomaton.Destroy; var i : Cardinal; state : TDFAState; begin if fStates.Count > 0 then for i := fStates.Count - 1 downto 0 do begin state := fStates[i]; FreeAndNil(state); end; fStates.Clear; FreeAndNil(fStates); inherited; end; procedure TDFAAutomaton.Add(aState : TDFAState); begin if fStates.IndexOf(aState) = -1 then fStates.Add(aState); end; function TDFAAutomaton.GetHighestToken : TToken; var i : Integer; xToken : TToken; state : TDFAState; begin Result := TokenInvalid; if fStates.Count > 0 then for i := 0 to fStates.Count - 1 do begin state := fStates[i]; xToken := state.GetHighestToken; if xToken > Result then Result := xToken; end; end; constructor TDFAAutomaton.Create; begin fStates := TList.Create; end; procedure TDFAAutomaton.AddLexed(aLexerPattern : String; aToken : TToken); var lexer : TDFALexer; begin lexer := nil; try lexer := TAutoCreateDFALexer.Create(Self, aLexerPattern); // fPreviousState := nil; // TODO. // fStates :: TList of TDFAState // Transitions[x] :: TDFATransition // transition.NewState // transition.FinalToken finally FreeAndNil(lexer); end; end; { TDFALexer } function TDFALexer.GetAutomaton : IDFAAutomaton; begin Result := fAutomaton; end; procedure TDFALexer.SetAutomaton(aItem : IDFAAutomaton); begin fAutomaton := aItem; end; function TDFALexer.GetSourceStream : TStream; begin Result := fSourceStream; end; procedure TDFALexer.SetSourceStream(aItem : TStream); begin fSourceStream := aItem; end; {function TDFALexer.GetState : TLexerState; begin Result := fState; end; procedure TDFALexer.SetState(aItem : TLexerState); begin fState := aItem; end;} procedure TDFALexer.EnsureInputCharAvailable; begin if not fInputCharAvailableP then begin fInputChar := Chr(fSourceStream.ReadByte); fInputCharAvailableP := True; // TODO fEOFP := True; end; end; function TDFALexer.GetInputChar : TLexerChar; begin Self.EnsureInputCharAvailable; Result := fInputChar; end; procedure TDFALexer.SetInputChar(aValue : TLexerChar); begin fInputChar := aValue; fInputCharAvailableP := True; end; function TDFALexer.GetEOF : Boolean; begin Result := fEOFP; end; function TDFALexer.GetToken : TToken; begin Result := fToken; end; procedure TDFALexer.SetToken(aToken : TToken); begin fToken := aToken; end; var invalidTransition : TDFATransition; function TDFALexer.ResolveInvalid : TDFATransition; begin fPreviousState := fState; Result := invalidTransition; end; function TDFALexer.ConsumeOne : TToken; var newTransition : TDFATransition; i : Integer; begin Result := TokenNone; // process input. Self.EnsureInputCharAvailable; assert(Assigned(fState)); newTransition := fState.Transitions[fInputChar]; if Assigned(Self.fInputConsumed) then // FIXME faster. Self.fInputConsumed(Self, fInputChar); fInputCharAvailableP := False; if not Assigned(newTransition) then begin Writeln(Format('possible transitions on %c would have been: ', [fInputChar])); if fState.TransitionCount > 0 then for i := 0 to fState.TransitionCount - 1 do if Assigned(fState.Transitions[TLexerChar(i)]) then Writeln(Chr(i)); Writeln('--'); newTransition := fState.DefaultTransition; if not Assigned(newTransition) then newTransition := Self.ResolveInvalid; end; assert(Assigned(newTransition)); if Assigned(newTransition.NewState) then begin fState := newTransition.NewState; end else begin fToken := newTransition.FinalToken; Result := fToken; end; end; function TDFALexer.Consume : TToken; begin repeat Result := Self.ConsumeOne; until (Result <> TokenNone) or (not fInputCharAvailableP); end; function TDFALexer.GetMatchBeginning : Cardinal; begin Result := fMatchBeginning; end; function TDFALexer.GetMatchEnd : Cardinal; begin Result := fMatchEnd; end; procedure TDFALexer.SetMatchBeginning(aValue : Cardinal); begin fMatchBeginning := aValue; end; procedure TDFALexer.SetMatchEnd(aValue : Cardinal); begin fMatchEnd := aValue; end; function TDFALexer.GetSourcePosition : Cardinal; begin Result := fSourcePosition; end; procedure TDFALexer.SetSourcePosition(aValue : Cardinal); begin fSourcePosition := aValue; end; function TDFALexer.GetSourceFile : TFileName; begin Result := fSourceFile; end; procedure TDFALexer.SetSourceFile(aValue : TFileName); begin fSourceFile := aValue; end; constructor TDFALexer.Create(aAutomaton : IDFAAutomaton); begin fAutomaton := aAutomaton; // TDFAAutomaton.Create; fSourceStream := nil; fSourcePosition := 0; fState := nil; fInputChar := #0; fInputCharAvailableP := False; fEOFP := False; fMatchBeginning := 0; fMatchEnd := 0; fToken := TokenNone; fSourceFile := ''; fState := aAutomaton.InitialState; end; function TDFALexer.GetInputConsumed : TLexerInputConsumed; begin Result := fInputConsumed; end; procedure TDFALexer.SetInputConsumed(aValue : TLexerInputConsumed); begin fInputConsumed := aValue; end; procedure TDFALexer.Reset; begin fState := fAutomaton.States[0]; // TODO make default state settable. end; { TAutoCreateDFALexer } function TAutoCreateDFALexer.ResolveInvalid : TDFATransition; var c : Integer; {Char} rangeEnd : Integer; {Char} i : Integer; rangeStart : Integer; {Char} previousC : Integer; {Char} hasRangeStart : Boolean; transition : TDFATransition; begin if fLexerPatternDone <= Length(fLexerPattern) then begin c := Ord(fLexerPattern[fLexerPatternDone]); i := fLexerPatternDone; hasRangeStart := False; transition := nil; try transition := TDFATransition.Create; if c = ord('[') then begin Inc(i); c := 0; while i <= Length(fLexerPattern) do begin previousC := c; c := Ord(fLexerPattern[i]); if c = ord(']') then begin // TODO handle escaping. Inc(i); fLexerPatternDone := i; Break; end; if c = ord('-') then begin hasRangeStart := True; rangeStart := previousC; Inc(i); Continue; end; Inc(i); if hasRangeStart then begin rangeEnd := c; for c := ord(rangeStart) + 1 to ord(rangeEnd) do fState.Add(chr(c), transition); hasRangeStart := False; end else fState.Add(chr(c), transition); end; end else begin fState.Add(chr(c), transition); Inc(fLexerPatternDone); end; Result := transition; transition := nil; finally FreeAndNil(transition); end; end else Result := inherited ResolveInvalid; end; constructor TAutoCreateDFALexer.Create(aAutomaton : IDFAAutomaton; aLexerPattern : String); begin inherited Create(aAutomaton); fLexerPattern := aLexerPattern; fLexerPatternDone := 1; end; initialization invalidTransition := TDFATransition.Create; end.