Rewrite of tokenizers

This commit is contained in:
Ryan Nowak 2016-01-15 14:06:07 -08:00
parent 1ce9180a3e
commit e68c55ab41
5 changed files with 321 additions and 197 deletions

View File

@ -1,106 +0,0 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
namespace Microsoft.AspNet.Razor
{
    /// <summary>
    /// A small delegate-driven state machine. Each state is a <see cref="State"/> delegate that
    /// returns a <see cref="StateResult"/> naming the next state and, optionally, an output value.
    /// </summary>
    /// <typeparam name="TReturn">The type of value produced by a call to <see cref="Turn"/>.</typeparam>
    public abstract class StateMachine<TReturn>
    {
        /// <summary>
        /// A single state of the machine. Returning <c>null</c> (see <see cref="Stop"/>) terminates the machine.
        /// </summary>
        protected delegate StateResult State();

        /// <summary>
        /// Gets the state the machine is meant to begin in.
        /// </summary>
        protected abstract State StartState { get; }

        /// <summary>
        /// Gets or sets the state that the next call to <see cref="Turn"/> will invoke.
        /// A <c>null</c> value means the machine has terminated.
        /// </summary>
        protected State CurrentState { get; set; }

        /// <summary>
        /// Runs states until one of them produces output or the machine terminates.
        /// </summary>
        /// <returns>
        /// The output of the first state that produced one, or <c>default(TReturn)</c> when the
        /// machine has terminated (or was already terminated when called).
        /// </returns>
        protected virtual TReturn Turn()
        {
            while (CurrentState != null)
            {
                var result = CurrentState();
                if (result == null)
                {
                    // The state returned Stop(). Check for null BEFORE touching result.Next;
                    // the previous implementation dereferenced the result first and threw.
                    CurrentState = null;
                    return default(TReturn); // Terminated
                }

                CurrentState = result.Next;
                if (result.HasOutput)
                {
                    return result.Output;
                }

                // No output: keep running. The loop condition also guards against a
                // result whose Next is null, which would otherwise invoke a null delegate.
            }

            return default(TReturn);
        }

        /// <summary>
        /// Returns a result indicating that the machine should stop executing and return null output.
        /// </summary>
        protected StateResult Stop()
        {
            return null;
        }

        /// <summary>
        /// Returns a result indicating that this state has no output and the machine should immediately invoke the specified state
        /// </summary>
        /// <remarks>
        /// By returning no output, the state machine will invoke the next state immediately, before returning
        /// control to the caller of <see cref="Turn"/>
        /// </remarks>
        protected StateResult Transition(State newState)
        {
            return new StateResult(newState);
        }

        /// <summary>
        /// Returns a result containing the specified output and indicating that the next call to
        /// <see cref="Turn"/> should invoke the provided state.
        /// </summary>
        protected StateResult Transition(TReturn output, State newState)
        {
            return new StateResult(output, newState);
        }

        /// <summary>
        /// Returns a result indicating that this state has no output and the machine should remain in this state
        /// </summary>
        /// <remarks>
        /// By returning no output, the state machine will re-invoke the current state again before returning
        /// control to the caller of <see cref="Turn"/>
        /// </remarks>
        protected StateResult Stay()
        {
            return new StateResult(CurrentState);
        }

        /// <summary>
        /// Returns a result containing the specified output and indicating that the next call to
        /// <see cref="Turn"/> should re-invoke the current state.
        /// </summary>
        protected StateResult Stay(TReturn output)
        {
            return new StateResult(output, CurrentState);
        }

        /// <summary>
        /// The outcome of running one state: the state to run next and, optionally, an output value.
        /// </summary>
        protected class StateResult
        {
            /// <summary>
            /// Creates a result that carries no output.
            /// </summary>
            public StateResult(State next)
            {
                HasOutput = false;
                Next = next;
            }

            /// <summary>
            /// Creates a result that carries <paramref name="output"/>.
            /// </summary>
            public StateResult(TReturn output, State next)
            {
                HasOutput = true;
                Output = output;
                Next = next;
            }

            /// <summary>Gets or sets whether <see cref="Output"/> was supplied.</summary>
            public bool HasOutput { get; set; }

            /// <summary>Gets or sets the output value; only meaningful when <see cref="HasOutput"/> is true.</summary>
            public TReturn Output { get; set; }

            /// <summary>Gets or sets the state to invoke next.</summary>
            public State Next { get; set; }
        }
    }
}

View File

@ -17,12 +17,7 @@ namespace Microsoft.AspNet.Razor.Tokenizer
public CSharpTokenizer(ITextDocument source)
: base(source)
{
if (source == null)
{
throw new ArgumentNullException(nameof(source));
}
CurrentState = Data;
base.CurrentState = StartState;
_operatorHandlers = new Dictionary<char, Func<CSharpSymbolType>>()
{
@ -52,10 +47,9 @@ namespace Microsoft.AspNet.Razor.Tokenizer
};
}
protected override State StartState
{
get { return Data; }
}
protected override int StartState => (int)CSharpTokenizerState.Data;
private new CSharpTokenizerState? CurrentState => (CSharpTokenizerState?)base.CurrentState;
public override CSharpSymbolType RazorCommentType
{
@ -72,6 +66,36 @@ namespace Microsoft.AspNet.Razor.Tokenizer
get { return CSharpSymbolType.RazorCommentStar; }
}
/// <summary>
/// Invokes the handler method that corresponds to the current <see cref="CSharpTokenizerState"/>.
/// </summary>
/// <returns>
/// The handler's <see cref="StateResult"/>, or <c>default(StateResult)</c> when the current
/// state has no handler.
/// </returns>
protected override StateResult Dispatch()
{
    switch (CurrentState)
    {
        case CSharpTokenizerState.Data:
            return Data();
        case CSharpTokenizerState.BlockComment:
            return BlockComment();
        case CSharpTokenizerState.QuotedCharacterLiteral:
            return QuotedCharacterLiteral();
        case CSharpTokenizerState.QuotedStringLiteral:
            return QuotedStringLiteral();
        case CSharpTokenizerState.VerbatimStringLiteral:
            return VerbatimStringLiteral();
        case CSharpTokenizerState.AfterRazorCommentTransition:
            return AfterRazorCommentTransition();
        case CSharpTokenizerState.EscapedRazorCommentTransition:
            return EscapedRazorCommentTransition();
        case CSharpTokenizerState.RazorCommentBody:
            return RazorCommentBody();
        case CSharpTokenizerState.StarAfterRazorCommentBody:
            return StarAfterRazorCommentBody();
        case CSharpTokenizerState.AtSymbolAfterRazorCommentBody:
            return AtSymbolAfterRazorCommentBody();
    }

    // Every known state returned above; reaching this point means the state is unrecognized.
    Debug.Fail("Invalid TokenizerState");
    return default(StateResult);
}
protected override CSharpSymbol CreateSymbol(SourceLocation start, string content, CSharpSymbolType type, IReadOnlyList<RazorError> errors)
{
return new CSharpSymbol(start, content, type, errors);
@ -100,7 +124,7 @@ namespace Microsoft.AspNet.Razor.Tokenizer
{
return Identifier();
}
else if (Char.IsDigit(CurrentCharacter))
else if (char.IsDigit(CurrentCharacter))
{
return NumericLiteral();
}
@ -110,12 +134,12 @@ namespace Microsoft.AspNet.Razor.Tokenizer
return AtSymbol();
case '\'':
TakeCurrent();
return Transition(() => QuotedLiteral('\'', CSharpSymbolType.CharacterLiteral));
return Transition(CSharpTokenizerState.QuotedCharacterLiteral);
case '"':
TakeCurrent();
return Transition(() => QuotedLiteral('"', CSharpSymbolType.StringLiteral));
return Transition(CSharpTokenizerState.QuotedStringLiteral);
case '.':
if (Char.IsDigit(Peek()))
if (char.IsDigit(Peek()))
{
return RealLiteral();
}
@ -130,7 +154,7 @@ namespace Microsoft.AspNet.Razor.Tokenizer
else if (CurrentCharacter == '*')
{
TakeCurrent();
return Transition(BlockComment);
return Transition(CSharpTokenizerState.BlockComment);
}
else if (CurrentCharacter == '=')
{
@ -152,24 +176,31 @@ namespace Microsoft.AspNet.Razor.Tokenizer
if (CurrentCharacter == '"')
{
TakeCurrent();
return Transition(VerbatimStringLiteral);
return Transition(CSharpTokenizerState.VerbatimStringLiteral);
}
else if (CurrentCharacter == '*')
{
return Transition(EndSymbol(CSharpSymbolType.RazorCommentTransition), AfterRazorCommentTransition);
return Transition(
CSharpTokenizerState.AfterRazorCommentTransition,
EndSymbol(CSharpSymbolType.RazorCommentTransition));
}
else if (CurrentCharacter == '@')
{
// Could be escaped comment transition
return Transition(EndSymbol(CSharpSymbolType.Transition), () =>
{
TakeCurrent();
return Transition(EndSymbol(CSharpSymbolType.Transition), Data);
});
return Transition(
CSharpTokenizerState.EscapedRazorCommentTransition,
EndSymbol(CSharpSymbolType.Transition));
}
return Stay(EndSymbol(CSharpSymbolType.Transition));
}
/// <summary>
/// Handles the state entered from <c>AtSymbol</c> after an '@' that is followed by another '@':
/// takes the current character, emits it as a <see cref="CSharpSymbolType.Transition"/> symbol
/// and goes back to the Data state.
/// </summary>
private StateResult EscapedRazorCommentTransition()
{
    TakeCurrent();

    var transitionSymbol = EndSymbol(CSharpSymbolType.Transition);
    return Transition(CSharpTokenizerState.Data, transitionSymbol);
}
private CSharpSymbolType Operator()
{
var first = CurrentCharacter;
@ -274,9 +305,13 @@ namespace Microsoft.AspNet.Razor.Tokenizer
CurrentStart,
length: 1 /* end of file */));
}
return Transition(EndSymbol(CSharpSymbolType.StringLiteral), Data);
return Transition(CSharpTokenizerState.Data, EndSymbol(CSharpSymbolType.StringLiteral));
}
/// <summary>
/// State handler for a single-quoted literal; delegates to <c>QuotedLiteral</c> with the
/// <see cref="CSharpSymbolType.CharacterLiteral"/> symbol type.
/// </summary>
private StateResult QuotedCharacterLiteral()
{
    return QuotedLiteral('\'', CSharpSymbolType.CharacterLiteral);
}
/// <summary>
/// State handler for a double-quoted literal; delegates to <c>QuotedLiteral</c> with the
/// <see cref="CSharpSymbolType.StringLiteral"/> symbol type.
/// </summary>
private StateResult QuotedStringLiteral()
{
    return QuotedLiteral('"', CSharpSymbolType.StringLiteral);
}
private StateResult QuotedLiteral(char quote, CSharpSymbolType literalType)
{
TakeUntil(c => c == '\\' || c == quote || ParserHelpers.IsNewLine(c));
@ -303,7 +338,7 @@ namespace Microsoft.AspNet.Razor.Tokenizer
{
TakeCurrent(); // No-op if at EOF
}
return Transition(EndSymbol(literalType), Data);
return Transition(CSharpTokenizerState.Data, EndSymbol(literalType));
}
// CSharp Spec §2.3.2
@ -317,7 +352,7 @@ namespace Microsoft.AspNet.Razor.Tokenizer
RazorResources.ParseError_BlockComment_Not_Terminated,
CurrentStart,
length: 1 /* end of file */));
return Transition(EndSymbol(CSharpSymbolType.Comment), Data);
return Transition(CSharpTokenizerState.Data, EndSymbol(CSharpSymbolType.Comment));
}
if (CurrentCharacter == '*')
{
@ -325,7 +360,7 @@ namespace Microsoft.AspNet.Razor.Tokenizer
if (CurrentCharacter == '/')
{
TakeCurrent();
return Transition(EndSymbol(CSharpSymbolType.Comment), Data);
return Transition(CSharpTokenizerState.Data, EndSymbol(CSharpSymbolType.Comment));
}
}
return Stay();
@ -431,19 +466,45 @@ namespace Microsoft.AspNet.Razor.Tokenizer
Debug.Assert(CSharpHelpers.IsIdentifierStart(CurrentCharacter));
TakeCurrent();
TakeUntil(c => !CSharpHelpers.IsIdentifierPart(c));
CSharpSymbol sym = null;
CSharpSymbol symbol = null;
if (HaveContent)
{
var kwd = CSharpKeywordDetector.SymbolTypeForIdentifier(Buffer.ToString());
var keyword = CSharpKeywordDetector.SymbolTypeForIdentifier(Buffer.ToString());
var type = CSharpSymbolType.Identifier;
if (kwd != null)
if (keyword != null)
{
type = CSharpSymbolType.Keyword;
}
sym = new CSharpSymbol(CurrentStart, Buffer.ToString(), type) { Keyword = kwd };
symbol = new CSharpSymbol(CurrentStart, Buffer.ToString(), type) { Keyword = keyword };
}
StartSymbol();
return Stay(sym);
return Stay(symbol);
}
/// <summary>
/// Moves to <paramref name="state"/> without producing a symbol, by forwarding to the
/// integer-based overload.
/// </summary>
private StateResult Transition(CSharpTokenizerState state) => Transition((int)state, result: null);
/// <summary>
/// Moves to <paramref name="state"/> and hands back <paramref name="result"/> as the produced
/// symbol, by forwarding to the integer-based overload.
/// </summary>
private StateResult Transition(CSharpTokenizerState state, CSharpSymbol result) => Transition((int)state, result);
// The states this tokenizer can be in; Dispatch() maps each one to its handler method.
// The first five members take the implicit values 0-4. The Razor comment members reuse the
// numeric values declared by RazorCommentTokenizerState.
private enum CSharpTokenizerState
{
Data,
BlockComment,
QuotedCharacterLiteral,
QuotedStringLiteral,
VerbatimStringLiteral,
// Razor Comments - need to be the same for HTML and CSharp
AfterRazorCommentTransition = RazorCommentTokenizerState.AfterRazorCommentTransition,
EscapedRazorCommentTransition = RazorCommentTokenizerState.EscapedRazorCommentTransition,
RazorCommentBody = RazorCommentTokenizerState.RazorCommentBody,
StarAfterRazorCommentBody = RazorCommentTokenizerState.StarAfterRazorCommentBody,
AtSymbolAfterRazorCommentBody = RazorCommentTokenizerState.AtSymbolAfterRazorCommentBody,
}
}
}

View File

@ -18,18 +18,12 @@ namespace Microsoft.AspNet.Razor.Tokenizer
public HtmlTokenizer(ITextDocument source)
: base(source)
{
if (source == null)
{
throw new ArgumentNullException(nameof(source));
}
CurrentState = Data;
base.CurrentState = StartState;
}
protected override State StartState
{
get { return Data; }
}
protected override int StartState => (int)HtmlTokenizerState.Data;
private new HtmlTokenizerState? CurrentState => (HtmlTokenizerState?)base.CurrentState;
public override HtmlSymbolType RazorCommentType
{
@ -50,11 +44,11 @@ namespace Microsoft.AspNet.Razor.Tokenizer
{
using (SeekableTextReader reader = new SeekableTextReader(content))
{
var tok = new HtmlTokenizer(reader);
HtmlSymbol sym;
while ((sym = tok.NextSymbol()) != null)
var tokenizer = new HtmlTokenizer(reader);
HtmlSymbol symbol;
while ((symbol = tokenizer.NextSymbol()) != null)
{
yield return sym;
yield return symbol;
}
}
}
@ -64,6 +58,30 @@ namespace Microsoft.AspNet.Razor.Tokenizer
return new HtmlSymbol(start, content, type, errors);
}
/// <summary>
/// Invokes the handler method that corresponds to the current <see cref="HtmlTokenizerState"/>.
/// </summary>
/// <returns>
/// The handler's <see cref="StateResult"/>, or <c>default(StateResult)</c> when the current
/// state has no handler.
/// </returns>
protected override StateResult Dispatch()
{
    switch (CurrentState)
    {
        case HtmlTokenizerState.Data:
            return Data();
        case HtmlTokenizerState.Text:
            return Text();
        case HtmlTokenizerState.AfterRazorCommentTransition:
            return AfterRazorCommentTransition();
        case HtmlTokenizerState.EscapedRazorCommentTransition:
            return EscapedRazorCommentTransition();
        case HtmlTokenizerState.RazorCommentBody:
            return RazorCommentBody();
        case HtmlTokenizerState.StarAfterRazorCommentBody:
            return StarAfterRazorCommentBody();
        case HtmlTokenizerState.AtSymbolAfterRazorCommentBody:
            return AtSymbolAfterRazorCommentBody();
    }

    // Every known state returned above; reaching this point means the state is unrecognized.
    Debug.Fail("Invalid TokenizerState");
    return default(StateResult);
}
// http://dev.w3.org/html5/spec/Overview.html#data-state
private StateResult Data()
{
@ -80,17 +98,18 @@ namespace Microsoft.AspNet.Razor.Tokenizer
TakeCurrent();
if (CurrentCharacter == '*')
{
return Transition(EndSymbol(HtmlSymbolType.RazorCommentTransition), AfterRazorCommentTransition);
return Transition(
HtmlTokenizerState.AfterRazorCommentTransition,
EndSymbol(HtmlSymbolType.RazorCommentTransition));
}
else if (CurrentCharacter == '@')
{
// Could be escaped comment transition
return Transition(EndSymbol(HtmlSymbolType.Transition), () =>
{
TakeCurrent();
return Transition(EndSymbol(HtmlSymbolType.Transition), Data);
});
return Transition(
HtmlTokenizerState.EscapedRazorCommentTransition,
EndSymbol(HtmlSymbolType.Transition));
}
return Stay(EndSymbol(HtmlSymbolType.Transition));
}
else if (AtSymbol())
@ -99,10 +118,16 @@ namespace Microsoft.AspNet.Razor.Tokenizer
}
else
{
return Transition(Text);
return Transition(HtmlTokenizerState.Text);
}
}
/// <summary>
/// Handles the state entered from <c>Data</c> after an '@' that is followed by another '@':
/// takes the current character, emits it as a <see cref="HtmlSymbolType.Transition"/> symbol
/// and goes back to the Data state.
/// </summary>
private StateResult EscapedRazorCommentTransition()
{
    TakeCurrent();

    var transitionSymbol = EndSymbol(HtmlSymbolType.Transition);
    return Transition(HtmlTokenizerState.Data, transitionSymbol);
}
private StateResult Text()
{
var prev = '\0';
@ -123,7 +148,7 @@ namespace Microsoft.AspNet.Razor.Tokenizer
}
// Output the Text token and return to the Data state to tokenize the next character (if there is one)
return Transition(EndSymbol(HtmlSymbolType.Text), Data);
return Transition(HtmlTokenizerState.Data, EndSymbol(HtmlSymbolType.Text));
}
private HtmlSymbol Symbol()
@ -207,5 +232,28 @@ namespace Microsoft.AspNet.Razor.Tokenizer
CurrentCharacter == '@' ||
(CurrentCharacter == '-' && Peek() == '-');
}
/// <summary>
/// Moves to <paramref name="state"/> without producing a symbol, by forwarding to the
/// integer-based overload.
/// </summary>
private StateResult Transition(HtmlTokenizerState state) => Transition((int)state, result: null);
/// <summary>
/// Moves to <paramref name="state"/> and hands back <paramref name="result"/> as the produced
/// symbol, by forwarding to the integer-based overload.
/// </summary>
private StateResult Transition(HtmlTokenizerState state, HtmlSymbol result) => Transition((int)state, result);
// The states this tokenizer can be in; Dispatch() maps each one to its handler method.
// Data and Text take the implicit values 0 and 1. The Razor comment members reuse the
// numeric values declared by RazorCommentTokenizerState.
private enum HtmlTokenizerState
{
Data,
Text,
// Razor Comments - need to be the same for HTML and CSharp
AfterRazorCommentTransition = RazorCommentTokenizerState.AfterRazorCommentTransition,
EscapedRazorCommentTransition = RazorCommentTokenizerState.EscapedRazorCommentTransition,
RazorCommentBody = RazorCommentTokenizerState.RazorCommentBody,
StarAfterRazorCommentBody = RazorCommentTokenizerState.StarAfterRazorCommentBody,
AtSymbolAfterRazorCommentBody = RazorCommentTokenizerState.AtSymbolAfterRazorCommentBody,
}
}
}

View File

@ -7,17 +7,21 @@ using System.Diagnostics;
#if DEBUG
using System.Globalization;
#endif
using System.Linq;
using System.Text;
using Microsoft.AspNet.Razor.Text;
using Microsoft.AspNet.Razor.Tokenizer.Symbols;
namespace Microsoft.AspNet.Razor.Tokenizer
{
public abstract partial class Tokenizer<TSymbol, TSymbolType> : StateMachine<TSymbol>, ITokenizer
[DebuggerDisplay("{DebugDisplay}")]
public abstract partial class Tokenizer<TSymbol, TSymbolType> : ITokenizer
where TSymbolType : struct
where TSymbol : SymbolBase<TSymbolType>
{
#if DEBUG
private StringBuilder _read = new StringBuilder();
#endif
protected Tokenizer(ITextDocument source)
{
if (source == null)
@ -31,6 +35,12 @@ namespace Microsoft.AspNet.Razor.Tokenizer
StartSymbol();
}
protected abstract int StartState { get; }
protected int? CurrentState { get; set; }
protected TSymbol CurrentSymbol { get; private set; }
public TextDocumentReader Source { get; private set; }
protected StringBuilder Buffer { get; private set; }
@ -67,6 +77,10 @@ namespace Microsoft.AspNet.Razor.Tokenizer
protected SourceLocation CurrentStart { get; private set; }
protected abstract TSymbol CreateSymbol(SourceLocation start, string content, TSymbolType type, IReadOnlyList<RazorError> errors);
protected abstract StateResult Dispatch();
public virtual TSymbol NextSymbol()
{
// Post-Condition: Buffer should be empty at the start of Next()
@ -77,12 +91,38 @@ namespace Microsoft.AspNet.Razor.Tokenizer
{
return null;
}
var sym = Turn();
var symbol = Turn();
// Post-Condition: Buffer should be empty at the end of Next()
Debug.Assert(Buffer.Length == 0);
return sym;
return symbol;
}
/// <summary>
/// Dispatches states until one of them yields a symbol or the machine reaches the stop state.
/// </summary>
/// <returns>
/// The symbol produced by the last dispatched state, or <c>default(TSymbol)</c> when the
/// machine is (or becomes) stopped, i.e. <see cref="CurrentState"/> is null.
/// </returns>
protected virtual TSymbol Turn()
{
    if (CurrentState == null)
    {
        // Already stopped; nothing to run.
        return default(TSymbol);
    }

    while (true)
    {
        var step = Dispatch();
        CurrentState = step.State;
        CurrentSymbol = step.Result;

        if (CurrentState == null)
        {
            return default(TSymbol); // Terminated
        }

        if (CurrentSymbol != null)
        {
            return CurrentSymbol;
        }

        // No symbol yet: run the next state straight away.
    }
}
public void Reset()
@ -90,7 +130,65 @@ namespace Microsoft.AspNet.Razor.Tokenizer
CurrentState = StartState;
}
protected abstract TSymbol CreateSymbol(SourceLocation start, string content, TSymbolType type, IReadOnlyList<RazorError> errors);
/// <summary>
/// Produces the result that stops the machine: a default <see cref="StateResult"/>, whose
/// state and symbol are both null.
/// </summary>
protected StateResult Stop() => default(StateResult);
/// <summary>
/// Produces a result with no symbol whose next state is <paramref name="state"/>.
/// </summary>
/// <remarks>
/// Because the result carries no symbol, <see cref="Turn"/> invokes the next state immediately,
/// before returning control to its caller.
/// </remarks>
protected StateResult Transition(int state) => new StateResult(state, result: null);
/// <summary>
/// Produces a result that carries <paramref name="result"/> and makes the next call to
/// <see cref="Turn"/> start in <paramref name="state"/>.
/// </summary>
protected StateResult Transition(int state, TSymbol result) => new StateResult(state, result);
/// <summary>
/// Produces a result with no symbol whose next state is the given Razor comment state.
/// </summary>
protected StateResult Transition(RazorCommentTokenizerState state) => new StateResult((int)state, result: null);
/// <summary>
/// Produces a result that carries <paramref name="result"/> and whose next state is the given
/// Razor comment state.
/// </summary>
protected StateResult Transition(RazorCommentTokenizerState state, TSymbol result) => new StateResult((int)state, result);
/// <summary>
/// Produces a result with no symbol that keeps the machine in <see cref="CurrentState"/>.
/// </summary>
/// <remarks>
/// Because the result carries no symbol, <see cref="Turn"/> re-invokes the current state
/// before returning control to its caller.
/// </remarks>
protected StateResult Stay() => new StateResult(CurrentState, result: null);
/// <summary>
/// Produces a result that carries <paramref name="result"/> and makes the next call to
/// <see cref="Turn"/> re-invoke <see cref="CurrentState"/>.
/// </summary>
protected StateResult Stay(TSymbol result) => new StateResult(CurrentState, result);
protected TSymbol Single(TSymbolType type)
{
@ -179,9 +277,10 @@ namespace Microsoft.AspNet.Razor.Tokenizer
// We've been moved since last time we were asked for a symbol... reset the state
return Transition(StartState);
}
AssertCurrent('*');
TakeCurrent();
return Transition(EndSymbol(RazorCommentStarType), RazorCommentBody);
return Transition(1002, EndSymbol(RazorCommentStarType));
}
protected StateResult RazorCommentBody()
@ -189,42 +288,43 @@ namespace Microsoft.AspNet.Razor.Tokenizer
TakeUntil(c => c == '*');
if (CurrentCharacter == '*')
{
var star = CurrentCharacter;
var start = CurrentLocation;
MoveNext();
if (!EndOfFile && CurrentCharacter == '@')
if (Peek() == '@')
{
State next = () =>
{
Buffer.Append(star);
return Transition(EndSymbol(start, RazorCommentStarType), () =>
{
if (CurrentCharacter != '@')
{
// We've been moved since last time we were asked for a symbol... reset the state
return Transition(StartState);
}
TakeCurrent();
return Transition(EndSymbol(RazorCommentTransitionType), StartState);
});
};
if (HaveContent)
{
return Transition(EndSymbol(RazorCommentType), next);
return Transition(
RazorCommentTokenizerState.StarAfterRazorCommentBody,
EndSymbol(RazorCommentType));
}
else
{
return Transition(next);
return Transition(RazorCommentTokenizerState.StarAfterRazorCommentBody);
}
}
else
{
Buffer.Append(star);
TakeCurrent();
return Stay();
}
}
return Transition(EndSymbol(RazorCommentType), StartState);
return Transition(StartState, EndSymbol(RazorCommentType));
}
/// <summary>
/// State entered from <c>RazorCommentBody</c> when the body is followed by "*@": takes the '*',
/// emits it as a <see cref="RazorCommentStarType"/> symbol and moves on to the state that
/// handles the '@'.
/// </summary>
protected StateResult StarAfterRazorCommentBody()
{
    AssertCurrent('*');
    TakeCurrent();

    var star = EndSymbol(RazorCommentStarType);
    return Transition(RazorCommentTokenizerState.AtSymbolAfterRazorCommentBody, star);
}
/// <summary>
/// State entered after the '*' that follows a Razor comment body: takes the '@', emits it as a
/// <see cref="RazorCommentTransitionType"/> symbol and returns the machine to
/// <see cref="StartState"/>.
/// </summary>
protected StateResult AtSymbolAfterRazorCommentBody()
{
    AssertCurrent('@');
    TakeCurrent();

    var transitionSymbol = EndSymbol(RazorCommentTransitionType);
    return Transition(StartState, transitionSymbol);
}
/// <summary>
@ -235,7 +335,7 @@ namespace Microsoft.AspNet.Razor.Tokenizer
Func<char, char> filter = c => c;
if (!caseSensitive)
{
filter = Char.ToLowerInvariant;
filter = char.ToLowerInvariant;
}
if (expected.Length == 0 || filter(CurrentCharacter) != filter(expected[0]))
@ -298,14 +398,8 @@ namespace Microsoft.AspNet.Razor.Tokenizer
{
return (ISymbol)NextSymbol();
}
}
#if DEBUG
[DebuggerDisplay("{DebugDisplay}")]
public partial class Tokenizer<TSymbol, TSymbolType>
{
private StringBuilder _read = new StringBuilder();
public string DebugDisplay
{
get { return string.Format(CultureInfo.InvariantCulture, "[{0}] [{1}] [{2}]", _read.ToString(), CurrentCharacter, Remaining); }
@ -320,6 +414,28 @@ namespace Microsoft.AspNet.Razor.Tokenizer
return remaining;
}
}
}
#endif
// States used while tokenizing a Razor comment. Numbering starts at 1000 and each following
// member takes the next value. The derived tokenizers' own state enums assign their Razor
// comment members from these values, and the state is stored as a plain int in CurrentState.
protected enum RazorCommentTokenizerState
{
AfterRazorCommentTransition = 1000,
// Implicitly 1001.
EscapedRazorCommentTransition,
// Implicitly 1002.
RazorCommentBody,
// Implicitly 1003.
StarAfterRazorCommentBody,
// Implicitly 1004.
AtSymbolAfterRazorCommentBody,
}
/// <summary>
/// The outcome of dispatching one state: the state to run next and the symbol produced, if any.
/// The default value has a null <see cref="State"/> and a null <see cref="Result"/>.
/// </summary>
protected struct StateResult
{
    /// <summary>Gets the state to run next; null means the machine should stop.</summary>
    public int? State { get; }

    /// <summary>Gets the symbol produced by the state, or null when none was produced.</summary>
    public TSymbol Result { get; }

    /// <summary>
    /// Creates a result from the next <paramref name="state"/> and the produced <paramref name="result"/>.
    /// </summary>
    public StateResult(int? state, TSymbol result)
    {
        State = state;
        Result = result;
    }
}
}
}

View File

@ -126,7 +126,7 @@ namespace Microsoft.AspNet.Razor.Test.Tokenizer
}
}
protected override State StartState
protected override int StartState
{
get
{
@ -142,6 +142,11 @@ namespace Microsoft.AspNet.Razor.Test.Tokenizer
{
throw new NotImplementedException();
}
// Required override of the abstract Dispatch(); it is not implemented here,
// so any call throws NotImplementedException.
protected override StateResult Dispatch()
{
throw new NotImplementedException();
}
}
}
}