// Copyright (c) .NET Foundation. All rights reserved. // Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. using System; using System.Collections.Generic; using System.Diagnostics; using System.Globalization; namespace Microsoft.AspNetCore.Razor.Language.Legacy { internal class CSharpTokenizer : Tokenizer { private Dictionary> _operatorHandlers; private static readonly Dictionary _keywords = new Dictionary(StringComparer.Ordinal) { { "await", CSharpKeyword.Await }, { "abstract", CSharpKeyword.Abstract }, { "byte", CSharpKeyword.Byte }, { "class", CSharpKeyword.Class }, { "delegate", CSharpKeyword.Delegate }, { "event", CSharpKeyword.Event }, { "fixed", CSharpKeyword.Fixed }, { "if", CSharpKeyword.If }, { "internal", CSharpKeyword.Internal }, { "new", CSharpKeyword.New }, { "override", CSharpKeyword.Override }, { "readonly", CSharpKeyword.Readonly }, { "short", CSharpKeyword.Short }, { "struct", CSharpKeyword.Struct }, { "try", CSharpKeyword.Try }, { "unsafe", CSharpKeyword.Unsafe }, { "volatile", CSharpKeyword.Volatile }, { "as", CSharpKeyword.As }, { "do", CSharpKeyword.Do }, { "is", CSharpKeyword.Is }, { "params", CSharpKeyword.Params }, { "ref", CSharpKeyword.Ref }, { "switch", CSharpKeyword.Switch }, { "ushort", CSharpKeyword.Ushort }, { "while", CSharpKeyword.While }, { "case", CSharpKeyword.Case }, { "const", CSharpKeyword.Const }, { "explicit", CSharpKeyword.Explicit }, { "float", CSharpKeyword.Float }, { "null", CSharpKeyword.Null }, { "sizeof", CSharpKeyword.Sizeof }, { "typeof", CSharpKeyword.Typeof }, { "implicit", CSharpKeyword.Implicit }, { "private", CSharpKeyword.Private }, { "this", CSharpKeyword.This }, { "using", CSharpKeyword.Using }, { "extern", CSharpKeyword.Extern }, { "return", CSharpKeyword.Return }, { "stackalloc", CSharpKeyword.Stackalloc }, { "uint", CSharpKeyword.Uint }, { "base", CSharpKeyword.Base }, { "catch", CSharpKeyword.Catch }, { "continue", CSharpKeyword.Continue }, { "double", CSharpKeyword.Double }, { "for", CSharpKeyword.For }, { "in", CSharpKeyword.In }, { "lock", CSharpKeyword.Lock }, { "object", CSharpKeyword.Object }, { "protected", CSharpKeyword.Protected }, { "static", CSharpKeyword.Static }, { "false", CSharpKeyword.False }, { "public", CSharpKeyword.Public }, { "sbyte", CSharpKeyword.Sbyte }, { "throw", CSharpKeyword.Throw }, { "virtual", CSharpKeyword.Virtual }, { "decimal", CSharpKeyword.Decimal }, { "else", CSharpKeyword.Else }, { "operator", CSharpKeyword.Operator }, { "string", CSharpKeyword.String }, { "ulong", CSharpKeyword.Ulong }, { "bool", CSharpKeyword.Bool }, { "char", CSharpKeyword.Char }, { "default", CSharpKeyword.Default }, { "foreach", CSharpKeyword.Foreach }, { "long", CSharpKeyword.Long }, { "void", CSharpKeyword.Void }, { "enum", CSharpKeyword.Enum }, { "finally", CSharpKeyword.Finally }, { "int", CSharpKeyword.Int }, { "out", CSharpKeyword.Out }, { "sealed", CSharpKeyword.Sealed }, { "true", CSharpKeyword.True }, { "goto", CSharpKeyword.Goto }, { "unchecked", CSharpKeyword.Unchecked }, { "interface", CSharpKeyword.Interface }, { "break", CSharpKeyword.Break }, { "checked", CSharpKeyword.Checked }, { "namespace", CSharpKeyword.Namespace }, { "when", CSharpKeyword.When } }; public CSharpTokenizer(ITextDocument source) : base(source) { base.CurrentState = StartState; _operatorHandlers = new Dictionary>() { { '-', MinusOperator }, { '<', LessThanOperator }, { '>', GreaterThanOperator }, { '&', CreateTwoCharOperatorHandler(CSharpSymbolType.And, '=', CSharpSymbolType.AndAssign, '&', CSharpSymbolType.DoubleAnd) }, { '|', CreateTwoCharOperatorHandler(CSharpSymbolType.Or, '=', CSharpSymbolType.OrAssign, '|', CSharpSymbolType.DoubleOr) }, { '+', CreateTwoCharOperatorHandler(CSharpSymbolType.Plus, '=', CSharpSymbolType.PlusAssign, '+', CSharpSymbolType.Increment) }, { '=', CreateTwoCharOperatorHandler(CSharpSymbolType.Assign, '=', CSharpSymbolType.Equals, '>', CSharpSymbolType.GreaterThanEqual) }, { '!', CreateTwoCharOperatorHandler(CSharpSymbolType.Not, '=', CSharpSymbolType.NotEqual) }, { '%', CreateTwoCharOperatorHandler(CSharpSymbolType.Modulo, '=', CSharpSymbolType.ModuloAssign) }, { '*', CreateTwoCharOperatorHandler(CSharpSymbolType.Star, '=', CSharpSymbolType.MultiplyAssign) }, { ':', CreateTwoCharOperatorHandler(CSharpSymbolType.Colon, ':', CSharpSymbolType.DoubleColon) }, { '?', CreateTwoCharOperatorHandler(CSharpSymbolType.QuestionMark, '?', CSharpSymbolType.NullCoalesce) }, { '^', CreateTwoCharOperatorHandler(CSharpSymbolType.Xor, '=', CSharpSymbolType.XorAssign) }, { '(', () => CSharpSymbolType.LeftParenthesis }, { ')', () => CSharpSymbolType.RightParenthesis }, { '{', () => CSharpSymbolType.LeftBrace }, { '}', () => CSharpSymbolType.RightBrace }, { '[', () => CSharpSymbolType.LeftBracket }, { ']', () => CSharpSymbolType.RightBracket }, { ',', () => CSharpSymbolType.Comma }, { ';', () => CSharpSymbolType.Semicolon }, { '~', () => CSharpSymbolType.Tilde }, { '#', () => CSharpSymbolType.Hash } }; } protected override int StartState => (int)CSharpTokenizerState.Data; private new CSharpTokenizerState? CurrentState => (CSharpTokenizerState?)base.CurrentState; public override CSharpSymbolType RazorCommentType => CSharpSymbolType.RazorComment; public override CSharpSymbolType RazorCommentTransitionType => CSharpSymbolType.RazorCommentTransition; public override CSharpSymbolType RazorCommentStarType => CSharpSymbolType.RazorCommentStar; protected override StateResult Dispatch() { switch (CurrentState) { case CSharpTokenizerState.Data: return Data(); case CSharpTokenizerState.BlockComment: return BlockComment(); case CSharpTokenizerState.QuotedCharacterLiteral: return QuotedCharacterLiteral(); case CSharpTokenizerState.QuotedStringLiteral: return QuotedStringLiteral(); case CSharpTokenizerState.VerbatimStringLiteral: return VerbatimStringLiteral(); case CSharpTokenizerState.AfterRazorCommentTransition: return AfterRazorCommentTransition(); case CSharpTokenizerState.EscapedRazorCommentTransition: return EscapedRazorCommentTransition(); case CSharpTokenizerState.RazorCommentBody: return RazorCommentBody(); case CSharpTokenizerState.StarAfterRazorCommentBody: return StarAfterRazorCommentBody(); case CSharpTokenizerState.AtSymbolAfterRazorCommentBody: return AtSymbolAfterRazorCommentBody(); default: Debug.Fail("Invalid TokenizerState"); return default(StateResult); } } // Optimize memory allocation by returning constants for the most frequent cases protected override string GetSymbolContent(CSharpSymbolType type) { var symbolLength = Buffer.Length; if (symbolLength == 1) { switch (type) { case CSharpSymbolType.IntegerLiteral: switch (Buffer[0]) { case '0': return "0"; case '1': return "1"; case '2': return "2"; case '3': return "3"; case '4': return "4"; case '5': return "5"; case '6': return "6"; case '7': return "7"; case '8': return "8"; case '9': return "9"; } break; case CSharpSymbolType.NewLine: if (Buffer[0] == '\n') { return "\n"; } break; case CSharpSymbolType.WhiteSpace: if (Buffer[0] == ' ') { return " "; } if (Buffer[0] == '\t') { return "\t"; } break; case CSharpSymbolType.Minus: return "-"; case CSharpSymbolType.Not: return "!"; case CSharpSymbolType.Modulo: return "%"; case CSharpSymbolType.And: return "&"; case CSharpSymbolType.LeftParenthesis: return "("; case CSharpSymbolType.RightParenthesis: return ")"; case CSharpSymbolType.Star: return "*"; case CSharpSymbolType.Comma: return ","; case CSharpSymbolType.Dot: return "."; case CSharpSymbolType.Slash: return "/"; case CSharpSymbolType.Colon: return ":"; case CSharpSymbolType.Semicolon: return ";"; case CSharpSymbolType.QuestionMark: return "?"; case CSharpSymbolType.RightBracket: return "]"; case CSharpSymbolType.LeftBracket: return "["; case CSharpSymbolType.Xor: return "^"; case CSharpSymbolType.LeftBrace: return "{"; case CSharpSymbolType.Or: return "|"; case CSharpSymbolType.RightBrace: return "}"; case CSharpSymbolType.Tilde: return "~"; case CSharpSymbolType.Plus: return "+"; case CSharpSymbolType.LessThan: return "<"; case CSharpSymbolType.Assign: return "="; case CSharpSymbolType.GreaterThan: return ">"; case CSharpSymbolType.Hash: return "#"; case CSharpSymbolType.Transition: return "@"; } } else if (symbolLength == 2) { switch (type) { case CSharpSymbolType.NewLine: return "\r\n"; case CSharpSymbolType.Arrow: return "->"; case CSharpSymbolType.Decrement: return "--"; case CSharpSymbolType.MinusAssign: return "-="; case CSharpSymbolType.NotEqual: return "!="; case CSharpSymbolType.ModuloAssign: return "%="; case CSharpSymbolType.AndAssign: return "&="; case CSharpSymbolType.DoubleAnd: return "&&"; case CSharpSymbolType.MultiplyAssign: return "*="; case CSharpSymbolType.DivideAssign: return "/="; case CSharpSymbolType.DoubleColon: return "::"; case CSharpSymbolType.NullCoalesce: return "??"; case CSharpSymbolType.XorAssign: return "^="; case CSharpSymbolType.OrAssign: return "|="; case CSharpSymbolType.DoubleOr: return "||"; case CSharpSymbolType.PlusAssign: return "+="; case CSharpSymbolType.Increment: return "++"; case CSharpSymbolType.LessThanEqual: return "<="; case CSharpSymbolType.LeftShift: return "<<"; case CSharpSymbolType.Equals: return "=="; case CSharpSymbolType.GreaterThanEqual: if (Buffer[0] == '=') { return "=>"; } return ">="; case CSharpSymbolType.RightShift: return ">>"; } } else if (symbolLength == 3) { switch (type) { case CSharpSymbolType.LeftShiftAssign: return "<<="; case CSharpSymbolType.RightShiftAssign: return ">>="; } } return base.GetSymbolContent(type); } protected override CSharpSymbol CreateSymbol(string content, CSharpSymbolType type, IReadOnlyList errors) { return new CSharpSymbol(content, type, errors); } private StateResult Data() { if (ParserHelpers.IsNewLine(CurrentCharacter)) { // CSharp Spec §2.3.1 var checkTwoCharNewline = CurrentCharacter == '\r'; TakeCurrent(); if (checkTwoCharNewline && CurrentCharacter == '\n') { TakeCurrent(); } return Stay(EndSymbol(CSharpSymbolType.NewLine)); } else if (ParserHelpers.IsWhitespace(CurrentCharacter)) { // CSharp Spec §2.3.3 TakeUntil(c => !ParserHelpers.IsWhitespace(c)); return Stay(EndSymbol(CSharpSymbolType.WhiteSpace)); } else if (IsIdentifierStart(CurrentCharacter)) { return Identifier(); } else if (char.IsDigit(CurrentCharacter)) { return NumericLiteral(); } switch (CurrentCharacter) { case '@': return AtSymbol(); case '\'': TakeCurrent(); return Transition(CSharpTokenizerState.QuotedCharacterLiteral); case '"': TakeCurrent(); return Transition(CSharpTokenizerState.QuotedStringLiteral); case '.': if (char.IsDigit(Peek())) { return RealLiteral(); } return Stay(Single(CSharpSymbolType.Dot)); case '/': TakeCurrent(); if (CurrentCharacter == '/') { TakeCurrent(); return SingleLineComment(); } else if (CurrentCharacter == '*') { TakeCurrent(); return Transition(CSharpTokenizerState.BlockComment); } else if (CurrentCharacter == '=') { TakeCurrent(); return Stay(EndSymbol(CSharpSymbolType.DivideAssign)); } else { return Stay(EndSymbol(CSharpSymbolType.Slash)); } default: return Stay(EndSymbol(Operator())); } } private StateResult AtSymbol() { TakeCurrent(); if (CurrentCharacter == '"') { TakeCurrent(); return Transition(CSharpTokenizerState.VerbatimStringLiteral); } else if (CurrentCharacter == '*') { return Transition( CSharpTokenizerState.AfterRazorCommentTransition, EndSymbol(CSharpSymbolType.RazorCommentTransition)); } else if (CurrentCharacter == '@') { // Could be escaped comment transition return Transition( CSharpTokenizerState.EscapedRazorCommentTransition, EndSymbol(CSharpSymbolType.Transition)); } return Stay(EndSymbol(CSharpSymbolType.Transition)); } private StateResult EscapedRazorCommentTransition() { TakeCurrent(); return Transition(CSharpTokenizerState.Data, EndSymbol(CSharpSymbolType.Transition)); } private CSharpSymbolType Operator() { var first = CurrentCharacter; TakeCurrent(); Func handler; if (_operatorHandlers.TryGetValue(first, out handler)) { return handler(); } return CSharpSymbolType.Unknown; } private CSharpSymbolType LessThanOperator() { if (CurrentCharacter == '=') { TakeCurrent(); return CSharpSymbolType.LessThanEqual; } return CSharpSymbolType.LessThan; } private CSharpSymbolType GreaterThanOperator() { if (CurrentCharacter == '=') { TakeCurrent(); return CSharpSymbolType.GreaterThanEqual; } return CSharpSymbolType.GreaterThan; } private CSharpSymbolType MinusOperator() { if (CurrentCharacter == '>') { TakeCurrent(); return CSharpSymbolType.Arrow; } else if (CurrentCharacter == '-') { TakeCurrent(); return CSharpSymbolType.Decrement; } else if (CurrentCharacter == '=') { TakeCurrent(); return CSharpSymbolType.MinusAssign; } return CSharpSymbolType.Minus; } private Func CreateTwoCharOperatorHandler(CSharpSymbolType typeIfOnlyFirst, char second, CSharpSymbolType typeIfBoth) { return () => { if (CurrentCharacter == second) { TakeCurrent(); return typeIfBoth; } return typeIfOnlyFirst; }; } private Func CreateTwoCharOperatorHandler(CSharpSymbolType typeIfOnlyFirst, char option1, CSharpSymbolType typeIfOption1, char option2, CSharpSymbolType typeIfOption2) { return () => { if (CurrentCharacter == option1) { TakeCurrent(); return typeIfOption1; } else if (CurrentCharacter == option2) { TakeCurrent(); return typeIfOption2; } return typeIfOnlyFirst; }; } private StateResult VerbatimStringLiteral() { TakeUntil(c => c == '"'); if (CurrentCharacter == '"') { TakeCurrent(); if (CurrentCharacter == '"') { TakeCurrent(); // Stay in the literal, this is an escaped " return Stay(); } } else if (EndOfFile) { CurrentErrors.Add( new RazorError( LegacyResources.ParseError_Unterminated_String_Literal, CurrentStart, length: 1 /* end of file */)); } return Transition(CSharpTokenizerState.Data, EndSymbol(CSharpSymbolType.StringLiteral)); } private StateResult QuotedCharacterLiteral() => QuotedLiteral('\'', CSharpSymbolType.CharacterLiteral); private StateResult QuotedStringLiteral() => QuotedLiteral('\"', CSharpSymbolType.StringLiteral); private StateResult QuotedLiteral(char quote, CSharpSymbolType literalType) { TakeUntil(c => c == '\\' || c == quote || ParserHelpers.IsNewLine(c)); if (CurrentCharacter == '\\') { TakeCurrent(); // Take the '\' // If the next char is the same quote that started this if (CurrentCharacter == quote || CurrentCharacter == '\\') { TakeCurrent(); // Take it so that we don't prematurely end the literal. } return Stay(); } else if (EndOfFile || ParserHelpers.IsNewLine(CurrentCharacter)) { CurrentErrors.Add( new RazorError( LegacyResources.ParseError_Unterminated_String_Literal, CurrentStart, length: 1 /* " */)); } else { TakeCurrent(); // No-op if at EOF } return Transition(CSharpTokenizerState.Data, EndSymbol(literalType)); } // CSharp Spec §2.3.2 private StateResult BlockComment() { TakeUntil(c => c == '*'); if (EndOfFile) { CurrentErrors.Add( new RazorError( LegacyResources.ParseError_BlockComment_Not_Terminated, CurrentStart, length: 1 /* end of file */)); return Transition(CSharpTokenizerState.Data, EndSymbol(CSharpSymbolType.Comment)); } if (CurrentCharacter == '*') { TakeCurrent(); if (CurrentCharacter == '/') { TakeCurrent(); return Transition(CSharpTokenizerState.Data, EndSymbol(CSharpSymbolType.Comment)); } } return Stay(); } // CSharp Spec §2.3.2 private StateResult SingleLineComment() { TakeUntil(c => ParserHelpers.IsNewLine(c)); return Stay(EndSymbol(CSharpSymbolType.Comment)); } // CSharp Spec §2.4.4 private StateResult NumericLiteral() { if (TakeAll("0x", caseSensitive: true)) { return HexLiteral(); } else { return DecimalLiteral(); } } private StateResult HexLiteral() { TakeUntil(c => !IsHexDigit(c)); TakeIntegerSuffix(); return Stay(EndSymbol(CSharpSymbolType.IntegerLiteral)); } private StateResult DecimalLiteral() { TakeUntil(c => !Char.IsDigit(c)); if (CurrentCharacter == '.' && Char.IsDigit(Peek())) { return RealLiteral(); } else if (IsRealLiteralSuffix(CurrentCharacter) || CurrentCharacter == 'E' || CurrentCharacter == 'e') { return RealLiteralExponentPart(); } else { TakeIntegerSuffix(); return Stay(EndSymbol(CSharpSymbolType.IntegerLiteral)); } } private StateResult RealLiteralExponentPart() { if (CurrentCharacter == 'E' || CurrentCharacter == 'e') { TakeCurrent(); if (CurrentCharacter == '+' || CurrentCharacter == '-') { TakeCurrent(); } TakeUntil(c => !Char.IsDigit(c)); } if (IsRealLiteralSuffix(CurrentCharacter)) { TakeCurrent(); } return Stay(EndSymbol(CSharpSymbolType.RealLiteral)); } // CSharp Spec §2.4.4.3 private StateResult RealLiteral() { AssertCurrent('.'); TakeCurrent(); Debug.Assert(Char.IsDigit(CurrentCharacter)); TakeUntil(c => !Char.IsDigit(c)); return RealLiteralExponentPart(); } private void TakeIntegerSuffix() { if (Char.ToLowerInvariant(CurrentCharacter) == 'u') { TakeCurrent(); if (Char.ToLowerInvariant(CurrentCharacter) == 'l') { TakeCurrent(); } } else if (Char.ToLowerInvariant(CurrentCharacter) == 'l') { TakeCurrent(); if (Char.ToLowerInvariant(CurrentCharacter) == 'u') { TakeCurrent(); } } } // CSharp Spec §2.4.2 private StateResult Identifier() { Debug.Assert(IsIdentifierStart(CurrentCharacter)); TakeCurrent(); TakeUntil(c => !IsIdentifierPart(c)); CSharpSymbol symbol = null; if (HaveContent) { CSharpKeyword keyword; var type = CSharpSymbolType.Identifier; var symbolContent = Buffer.ToString(); if (_keywords.TryGetValue(symbolContent, out keyword)) { type = CSharpSymbolType.Keyword; } symbol = new CSharpSymbol(symbolContent, type) { Keyword = type == CSharpSymbolType.Keyword ? (CSharpKeyword?)keyword : null, }; Buffer.Clear(); CurrentErrors.Clear(); } return Stay(symbol); } private StateResult Transition(CSharpTokenizerState state) { return Transition((int)state, result: null); } private StateResult Transition(CSharpTokenizerState state, CSharpSymbol result) { return Transition((int)state, result); } private static bool IsIdentifierStart(char character) { return char.IsLetter(character) || character == '_' || CharUnicodeInfo.GetUnicodeCategory(character) == UnicodeCategory.LetterNumber; } private static bool IsIdentifierPart(char character) { return char.IsDigit(character) || IsIdentifierStart(character) || IsIdentifierPartByUnicodeCategory(character); } private static bool IsRealLiteralSuffix(char character) { return character == 'F' || character == 'f' || character == 'D' || character == 'd' || character == 'M' || character == 'm'; } private static bool IsIdentifierPartByUnicodeCategory(char character) { var category = CharUnicodeInfo.GetUnicodeCategory(character); return category == UnicodeCategory.NonSpacingMark || // Mn category == UnicodeCategory.SpacingCombiningMark || // Mc category == UnicodeCategory.ConnectorPunctuation || // Pc category == UnicodeCategory.Format; // Cf } private static bool IsHexDigit(char value) { return (value >= '0' && value <= '9') || (value >= 'A' && value <= 'F') || (value >= 'a' && value <= 'f'); } private enum CSharpTokenizerState { Data, BlockComment, QuotedCharacterLiteral, QuotedStringLiteral, VerbatimStringLiteral, // Razor Comments - need to be the same for HTML and CSharp AfterRazorCommentTransition = RazorCommentTokenizerState.AfterRazorCommentTransition, EscapedRazorCommentTransition = RazorCommentTokenizerState.EscapedRazorCommentTransition, RazorCommentBody = RazorCommentTokenizerState.RazorCommentBody, StarAfterRazorCommentBody = RazorCommentTokenizerState.StarAfterRazorCommentBody, AtSymbolAfterRazorCommentBody = RazorCommentTokenizerState.AtSymbolAfterRazorCommentBody, } } }