// aspnetcore/src/Microsoft.AspNetCore.Razor..../Legacy/HtmlTokenizer.cs
// 296 lines, 10 KiB, C#

// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System.Collections.Generic;
using System.Diagnostics;
namespace Microsoft.AspNetCore.Razor.Language.Legacy
{
// Tokenizer _loosely_ based on http://dev.w3.org/html5/spec/Overview.html#tokenization
/// <summary>
/// Splits HTML text into <see cref="HtmlSymbol"/> tokens: whitespace, newlines,
/// single-character markup symbols, double hyphens, Razor transitions ('@'),
/// Razor comment delimiters, and runs of plain text.
/// </summary>
internal class HtmlTokenizer : Tokenizer<HtmlSymbol, HtmlSymbolType>
{
    /// <summary>
    /// Character that starts a Razor transition, or a Razor comment when it is
    /// immediately followed by '*'.
    /// </summary>
    private const char TransitionChar = '@';

    /// <summary>
    /// Creates a tokenizer over <paramref name="source"/> and places it in the start state.
    /// </summary>
    /// <param name="source">Document to tokenize; handed to the base tokenizer unchanged.</param>
    public HtmlTokenizer(ITextDocument source)
        : base(source)
    {
        base.CurrentState = StartState;
    }

    /// <summary>The state the tokenizer begins in: <see cref="HtmlTokenizerState.Data"/>.</summary>
    protected override int StartState => (int)HtmlTokenizerState.Data;

    // Typed view over the base class's state value so Dispatch can switch on the enum.
    private new HtmlTokenizerState? CurrentState => (HtmlTokenizerState?)base.CurrentState;

    /// <summary>Symbol type used for a Razor comment.</summary>
    public override HtmlSymbolType RazorCommentType => HtmlSymbolType.RazorComment;

    /// <summary>Symbol type used for the '@' that opens or closes a Razor comment.</summary>
    public override HtmlSymbolType RazorCommentTransitionType => HtmlSymbolType.RazorCommentTransition;

    /// <summary>Symbol type used for the '*' that opens or closes a Razor comment.</summary>
    public override HtmlSymbolType RazorCommentStarType => HtmlSymbolType.RazorCommentStar;

    /// <summary>
    /// Builds an <see cref="HtmlSymbol"/> from the given content, type and diagnostics.
    /// </summary>
    /// <param name="content">Text of the symbol.</param>
    /// <param name="type">Kind of symbol being created.</param>
    /// <param name="errors">Diagnostics to attach to the symbol.</param>
    /// <returns>The newly constructed symbol.</returns>
    protected override HtmlSymbol CreateSymbol(string content, HtmlSymbolType type, IReadOnlyList<RazorDiagnostic> errors)
    {
        return new HtmlSymbol(content, type, errors);
    }

    /// <summary>
    /// Runs the handler for the current tokenizer state.
    /// </summary>
    /// <returns>
    /// The handler's result, or <c>default(StateResult)</c> (after a debug failure)
    /// when the current state is not one of the known states.
    /// </returns>
    protected override StateResult Dispatch()
    {
        switch (CurrentState)
        {
            case HtmlTokenizerState.Data:
                return Data();
            case HtmlTokenizerState.Text:
                return Text();

            // The Razor comment handlers below are not defined in this class.
            case HtmlTokenizerState.AfterRazorCommentTransition:
                return AfterRazorCommentTransition();
            case HtmlTokenizerState.EscapedRazorCommentTransition:
                return EscapedRazorCommentTransition();
            case HtmlTokenizerState.RazorCommentBody:
                return RazorCommentBody();
            case HtmlTokenizerState.StarAfterRazorCommentBody:
                return StarAfterRazorCommentBody();
            case HtmlTokenizerState.AtSymbolAfterRazorCommentBody:
                return AtSymbolAfterRazorCommentBody();

            default:
                Debug.Fail("Invalid TokenizerState");
                return default(StateResult);
        }
    }

    /// <summary>
    /// Returns the text of the symbol currently held in the buffer.
    /// </summary>
    /// <remarks>
    /// Optimizes memory allocation by returning constants for the most frequent cases;
    /// everything else is delegated to the base implementation.
    /// </remarks>
    /// <param name="type">Type of the symbol being ended.</param>
    /// <returns>The symbol's content.</returns>
    protected override string GetSymbolContent(HtmlSymbolType type)
    {
        var symbolLength = Buffer.Length;

        if (symbolLength == 1)
        {
            switch (type)
            {
                case HtmlSymbolType.OpenAngle:
                    return "<";
                case HtmlSymbolType.Bang:
                    return "!";
                case HtmlSymbolType.ForwardSlash:
                    return "/";
                case HtmlSymbolType.QuestionMark:
                    return "?";
                case HtmlSymbolType.LeftBracket:
                    return "[";
                case HtmlSymbolType.CloseAngle:
                    return ">";
                case HtmlSymbolType.RightBracket:
                    return "]";
                case HtmlSymbolType.Equals:
                    return "=";
                case HtmlSymbolType.DoubleQuote:
                    return "\"";
                case HtmlSymbolType.SingleQuote:
                    return "'";
                case HtmlSymbolType.WhiteSpace:
                    // Only the two commonest whitespace characters are special-cased.
                    if (Buffer[0] == ' ')
                    {
                        return " ";
                    }

                    if (Buffer[0] == '\t')
                    {
                        return "\t";
                    }

                    break;
                case HtmlSymbolType.NewLine:
                    if (Buffer[0] == '\n')
                    {
                        return "\n";
                    }

                    break;
            }
        }

        // Within this class, Newline() only yields a two-character token for "\r\n".
        if (symbolLength == 2 && type == HtmlSymbolType.NewLine)
        {
            return "\r\n";
        }

        return base.GetSymbolContent(type);
    }

    // http://dev.w3.org/html5/spec/Overview.html#data-state
    /// <summary>
    /// Handles the Data state: emits one whitespace, newline, transition or markup
    /// symbol token, or switches to the Text state when the current character is none of those.
    /// </summary>
    private StateResult Data()
    {
        if (ParserHelpers.IsWhitespace(CurrentCharacter))
        {
            return Stay(Whitespace());
        }

        if (ParserHelpers.IsNewLine(CurrentCharacter))
        {
            return Stay(Newline());
        }

        if (CurrentCharacter == TransitionChar)
        {
            TakeCurrent();

            if (CurrentCharacter == '*')
            {
                // "@*" opens a Razor comment.
                return Transition(
                    HtmlTokenizerState.AfterRazorCommentTransition,
                    EndSymbol(HtmlSymbolType.RazorCommentTransition));
            }

            if (CurrentCharacter == TransitionChar)
            {
                // Could be escaped comment transition
                return Transition(
                    HtmlTokenizerState.EscapedRazorCommentTransition,
                    EndSymbol(HtmlSymbolType.Transition));
            }

            return Stay(EndSymbol(HtmlSymbolType.Transition));
        }

        if (AtSymbol())
        {
            return Stay(Symbol());
        }

        // Nothing was consumed; the Text state picks up at the same character.
        return Transition(HtmlTokenizerState.Text);
    }

    /// <summary>
    /// Consumes the second '@' of an "@@" pair as its own transition token and
    /// returns to the Data state.
    /// </summary>
    private StateResult EscapedRazorCommentTransition()
    {
        TakeCurrent();
        return Transition(HtmlTokenizerState.Data, EndSymbol(HtmlSymbolType.Transition));
    }

    /// <summary>
    /// Handles the Text state: consumes characters up to end of file, whitespace,
    /// a newline or a symbol character, then emits them as a single Text token.
    /// </summary>
    /// <remarks>
    /// An '@' that sits directly between two letter/digit characters (for example
    /// the '@' in <c>user@example</c>) is kept as part of the text rather than
    /// ending the token.
    /// </remarks>
    private StateResult Text()
    {
        var prev = '\0';
        while (!EndOfFile &&
            !(ParserHelpers.IsWhitespace(CurrentCharacter) || ParserHelpers.IsNewLine(CurrentCharacter)) &&
            !AtSymbol())
        {
            prev = CurrentCharacter;
            TakeCurrent();
        }

        if (CurrentCharacter == TransitionChar)
        {
            var next = Peek();
            var prevIsAlphanumeric = ParserHelpers.IsLetter(prev) || ParserHelpers.IsDecimalDigit(prev);

            if (prevIsAlphanumeric &&
                (ParserHelpers.IsLetter(next) || ParserHelpers.IsDecimalDigit(next)))
            {
                TakeCurrent(); // Take the "@"
                return Stay(); // Stay in the Text state
            }
        }

        // Output the Text token and return to the Data state to tokenize the next character (if there is one)
        return Transition(HtmlTokenizerState.Data, EndSymbol(HtmlSymbolType.Text));
    }

    /// <summary>
    /// Consumes the markup symbol at the current position and returns its token.
    /// Must only be called when <see cref="AtSymbol"/> is true.
    /// </summary>
    /// <returns>
    /// The matching symbol token; a "--" pair becomes one DoubleHyphen token.
    /// An unrecognized character triggers a debug failure and yields an Unknown token.
    /// </returns>
    private HtmlSymbol Symbol()
    {
        Debug.Assert(AtSymbol());

        var sym = CurrentCharacter;
        TakeCurrent();

        switch (sym)
        {
            case '<':
                return EndSymbol(HtmlSymbolType.OpenAngle);
            case '!':
                return EndSymbol(HtmlSymbolType.Bang);
            case '/':
                return EndSymbol(HtmlSymbolType.ForwardSlash);
            case '?':
                return EndSymbol(HtmlSymbolType.QuestionMark);
            case '[':
                return EndSymbol(HtmlSymbolType.LeftBracket);
            case '>':
                return EndSymbol(HtmlSymbolType.CloseAngle);
            case ']':
                return EndSymbol(HtmlSymbolType.RightBracket);
            case '=':
                return EndSymbol(HtmlSymbolType.Equals);
            case '"':
                return EndSymbol(HtmlSymbolType.DoubleQuote);
            case '\'':
                return EndSymbol(HtmlSymbolType.SingleQuote);
            case '-':
                // AtSymbol() only accepts '-' when a second '-' follows it.
                Debug.Assert(CurrentCharacter == '-');
                TakeCurrent();
                return EndSymbol(HtmlSymbolType.DoubleHyphen);
            default:
                Debug.Fail("Unexpected symbol!");
                return EndSymbol(HtmlSymbolType.Unknown);
        }
    }

    /// <summary>
    /// Consumes a run of consecutive whitespace characters and returns it as one WhiteSpace token.
    /// </summary>
    private HtmlSymbol Whitespace()
    {
        while (ParserHelpers.IsWhitespace(CurrentCharacter))
        {
            TakeCurrent();
        }

        return EndSymbol(HtmlSymbolType.WhiteSpace);
    }

    /// <summary>
    /// Consumes one line terminator and returns it as a NewLine token.
    /// A '\r' immediately followed by '\n' is consumed as a single two-character newline.
    /// </summary>
    private HtmlSymbol Newline()
    {
        Debug.Assert(ParserHelpers.IsNewLine(CurrentCharacter));

        // CSharp Spec §2.3.1
        var checkTwoCharNewline = CurrentCharacter == '\r';
        TakeCurrent();
        if (checkTwoCharNewline && CurrentCharacter == '\n')
        {
            TakeCurrent();
        }

        return EndSymbol(HtmlSymbolType.NewLine);
    }

    /// <summary>
    /// Reports whether the current character starts a symbol: one of
    /// <c>&lt; ! / ? [ &gt; ] = " ' @</c>, or a '-' that is followed by another '-'.
    /// </summary>
    private bool AtSymbol()
    {
        switch (CurrentCharacter)
        {
            case '<':
            case '!':
            case '/':
            case '?':
            case '[':
            case '>':
            case ']':
            case '=':
            case '"':
            case '\'':
            case TransitionChar:
                return true;
            case '-':
                // A lone hyphen is ordinary text; only "--" counts as a symbol.
                return Peek() == '-';
            default:
                return false;
        }
    }

    /// <summary>Moves to <paramref name="state"/> without emitting a symbol.</summary>
    private StateResult Transition(HtmlTokenizerState state)
    {
        return Transition((int)state, result: null);
    }

    /// <summary>Moves to <paramref name="state"/> and emits <paramref name="result"/>.</summary>
    private StateResult Transition(HtmlTokenizerState state, HtmlSymbol result)
    {
        return Transition((int)state, result);
    }

    /// <summary>States of the HTML tokenizer's state machine.</summary>
    private enum HtmlTokenizerState
    {
        Data,
        Text,

        // Razor Comments - need to be the same for HTML and CSharp
        AfterRazorCommentTransition = RazorCommentTokenizerState.AfterRazorCommentTransition,
        EscapedRazorCommentTransition = RazorCommentTokenizerState.EscapedRazorCommentTransition,
        RazorCommentBody = RazorCommentTokenizerState.RazorCommentBody,
        StarAfterRazorCommentBody = RazorCommentTokenizerState.StarAfterRazorCommentBody,
        AtSymbolAfterRazorCommentBody = RazorCommentTokenizerState.AtSymbolAfterRazorCommentBody,
    }
}
}