// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using Microsoft.AspNetCore.Razor.Language.Syntax.InternalSyntax;

namespace Microsoft.AspNetCore.Razor.Language.Legacy
{
    /// <summary>
    /// Token-based parser for the HTML (markup) portions of a Razor document. Razor transitions
    /// found inside markup are delegated to <see cref="CodeParser"/>.
    /// </summary>
    // NOTE(review): the generic argument of the base class was missing in the source text; it is
    // restored here as HtmlTokenizer -- confirm against the TokenizerBackedParser declaration.
    internal class HtmlMarkupParser : TokenizerBackedParser<HtmlTokenizer>
    {
        private const string ScriptTagName = "script";

        // The tokens "<", "!", "-" in reverse order: comment content must not end with "<!-".
        private static readonly SyntaxToken[] nonAllowedHtmlCommentEnding = new[]
        {
            SyntaxFactory.Token(SyntaxKind.Text, "-"),
            SyntaxFactory.Token(SyntaxKind.Bang, "!"),
            SyntaxFactory.Token(SyntaxKind.OpenAngle, "<"),
        };

        private static readonly SyntaxToken[] singleHyphenArray = new[] { SyntaxFactory.Token(SyntaxKind.Text, "-") };

        private static readonly char[] ValidAfterTypeAttributeNameCharacters = { ' ', '\t', '\r', '\n', '\f', '=' };

        private SourceLocation _lastTagStart = SourceLocation.Zero;
        private SyntaxToken _bufferedOpenAngle;

        //From http://dev.w3.org/html5/spec/Overview.html#elements-0
        private readonly ISet<string> _voidElements = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
        {
            "area",
            "base",
            "br",
            "col",
            "command",
            "embed",
            "hr",
            "img",
            "input",
            "keygen",
            "link",
            "meta",
            "param",
            "source",
            "track",
            "wbr"
        };

        /// <summary>
        /// Creates the parser, selecting the first-directive language characteristics when
        /// <paramref name="context"/> requests parsing of leading directives only.
        /// </summary>
        public HtmlMarkupParser(ParserContext context)
            : base(
                context.ParseLeadingDirectives
                    ? FirstDirectiveHtmlLanguageCharacteristics.Instance
                    : HtmlLanguageCharacteristics.Instance,
                context)
        {
        }

        /// <summary>The parser invoked for code blocks encountered inside markup.</summary>
        public ParserBase CodeParser { get; set; }

        /// <summary>Case-insensitive set of void element names.</summary>
        public ISet<string> VoidElements
        {
            get { return _voidElements; }
        }

        private bool CaseSensitive { get; set; }

        private StringComparison Comparison
        {
            get { return CaseSensitive ? StringComparison.Ordinal : StringComparison.OrdinalIgnoreCase; }
        }

        protected override bool TokenKindEquals(SyntaxKind x, SyntaxKind y) => x == y;

        /// <summary>
        /// Marks <paramref name="span"/> as a markup span with a markup chunk generator, then defers to the base implementation.
        /// </summary>
        public override void BuildSpan(SpanBuilder span, SourceLocation start, string content)
        {
            span.Kind = SpanKindInternal.Markup;
            span.ChunkGenerator = new MarkupChunkGenerator();
            base.BuildSpan(span, start, content);
        }

        protected override void OutputSpanBeforeRazorComment()
        {
            Output(SpanKindInternal.Markup);
        }

        /// <summary>
        /// Accepts markup until a token of kind <paramref name="type"/> (or end of file) is reached.
        /// </summary>
        protected void SkipToAndParseCode(SyntaxKind type)
        {
            SkipToAndParseCode(token => token.Kind == type);
        }

        /// <summary>
        /// Accepts markup tokens until <paramref name="condition"/> holds for the current token or the end of
        /// the file is reached. Razor transitions are handed to the code parser and Razor comments are parsed
        /// along the way. One token (<c>last</c>) is held back so that whitespace at the start of a line can
        /// be put back or given a null chunk generator instead of being rendered.
        /// </summary>
        protected void SkipToAndParseCode(Func<SyntaxToken, bool> condition)
        {
            SyntaxToken last = null;
            var startOfLine = false;
            while (!EndOfFile && !condition(CurrentToken))
            {
                if (Context.NullGenerateWhitespaceAndNewLine)
                {
                    Context.NullGenerateWhitespaceAndNewLine = false;
                    Span.ChunkGenerator = SpanChunkGenerator.Null;
                    AcceptWhile(token => token.Kind == SyntaxKind.Whitespace);
                    if (At(SyntaxKind.NewLine))
                    {
                        AcceptAndMoveNext();
                    }

                    Output(SpanKindInternal.Markup);
                }
                else if (At(SyntaxKind.NewLine))
                {
                    if (last != null)
                    {
                        Accept(last);
                    }

                    // Mark the start of a new line
                    startOfLine = true;
                    last = null;
                    AcceptAndMoveNext();
                }
                else if (At(SyntaxKind.Transition))
                {
                    var transition = CurrentToken;
                    NextToken();
                    if (At(SyntaxKind.Transition))
                    {
                        // Two consecutive transitions: the first is output in its own span with a
                        // null chunk generator, the second is accepted as ordinary markup.
                        if (last != null)
                        {
                            Accept(last);
                            last = null;
                        }

                        Output(SpanKindInternal.Markup);
                        Accept(transition);
                        Span.ChunkGenerator = SpanChunkGenerator.Null;
                        Output(SpanKindInternal.Markup);
                        AcceptAndMoveNext();
                        continue; // while
                    }
                    else
                    {
                        if (!EndOfFile)
                        {
                            PutCurrentBack();
                        }

                        PutBack(transition);
                    }

                    // Handle whitespace rewriting
                    if (last != null)
                    {
                        if (!Context.DesignTimeMode && last.Kind == SyntaxKind.Whitespace && startOfLine)
                        {
                            // Put the whitespace back too
                            startOfLine = false;
                            PutBack(last);
                            last = null;
                        }
                        else
                        {
                            // Accept last
                            Accept(last);
                            last = null;
                        }
                    }

                    OtherParserBlock();
                }
                else if (At(SyntaxKind.RazorCommentTransition))
                {
                    if (last != null)
                    {
                        // Don't render the whitespace between the start of the line and the razor comment.
                        if (startOfLine && last.Kind == SyntaxKind.Whitespace)
                        {
                            AddMarkerTokenIfNecessary();

                            // Output the tokens that may have been accepted prior to the whitespace.
                            Output(SpanKindInternal.Markup);

                            Span.ChunkGenerator = SpanChunkGenerator.Null;
                        }

                        Accept(last);
                        last = null;
                    }

                    AddMarkerTokenIfNecessary();
                    Output(SpanKindInternal.Markup);

                    RazorComment();

                    // Handle the whitespace and newline at the end of a razor comment.
                    if (startOfLine &&
                        (At(SyntaxKind.NewLine) ||
                        (At(SyntaxKind.Whitespace) && NextIs(SyntaxKind.NewLine))))
                    {
                        AcceptWhile(IsSpacingToken(includeNewLines: false));
                        AcceptAndMoveNext();
                        Span.ChunkGenerator = SpanChunkGenerator.Null;
                        Output(SpanKindInternal.Markup);
                    }
                }
                else
                {
                    // As long as we see whitespace, we're still at the "start" of the line
                    startOfLine &= At(SyntaxKind.Whitespace);

                    // If there's a last token, accept it
                    if (last != null)
                    {
                        Accept(last);
                        last = null;
                    }

                    // Advance
                    last = CurrentToken;
                    NextToken();
                }
            }

            if (last != null)
            {
                Accept(last);
            }
        }

        /// <summary>
        /// Returns a predicate matching whitespace tokens and, when <paramref name="includeNewLines"/> is
        /// true, newline tokens as well.
        /// </summary>
        protected static Func<SyntaxToken, bool> IsSpacingToken(bool includeNewLines)
        {
            return token => token.Kind == SyntaxKind.Whitespace || (includeNewLines && token.Kind == SyntaxKind.NewLine);
        }

        /// <summary>
        /// Outputs the pending markup span, lets <see cref="CodeParser"/> parse a block, then
        /// re-initializes the current span at the new location and advances to the next token.
        /// </summary>
        private void OtherParserBlock()
        {
            AddMarkerTokenIfNecessary();
            Output(SpanKindInternal.Markup);

            using (PushSpanConfig())
            {
                CodeParser.ParseBlock();
            }

            Span.Start = CurrentLocation;
            Initialize(Span);
            NextToken();
        }

        /// <summary>
        /// Returns true when the token at <paramref name="lookahead"/> is a '!' that is followed by a text
        /// token other than "DOCTYPE" (compared case-insensitively).
        /// </summary>
        private bool IsBangEscape(int lookahead)
        {
            var potentialBang = Lookahead(lookahead);

            if (potentialBang != null &&
                potentialBang.Kind == SyntaxKind.Bang)
            {
                var afterBang = Lookahead(lookahead + 1);

                return afterBang != null &&
                    afterBang.Kind == SyntaxKind.Text &&
                    !string.Equals(afterBang.Content, "DOCTYPE", StringComparison.OrdinalIgnoreCase);
            }

            return false;
        }

        /// <summary>
        /// If the current token is a '!' escape (see <see cref="IsBangEscape"/>), outputs it as its own
        /// MetaCode span; otherwise does nothing.
        /// </summary>
        private void OptionalBangEscape()
        {
            if (IsBangEscape(lookahead: 0))
            {
                Output(SpanKindInternal.Markup);

                // Accept the parser escape character '!'.
                Assert(SyntaxKind.Bang);
                AcceptAndMoveNext();

                // Setup the metacode span that we will be outputting.
                Span.ChunkGenerator = SpanChunkGenerator.Null;
                Output(SpanKindInternal.MetaCode, AcceptedCharactersInternal.None);
            }
        }

        /// <summary>
        /// Parses a markup block. The block must start (after optional whitespace) with either an open
        /// angle bracket or a transition; anything else is reported to the error sink.
        /// </summary>
        /// <exception cref="InvalidOperationException">The parser context has not been set.</exception>
        public override void ParseBlock()
        {
            if (Context == null)
            {
                throw new InvalidOperationException(Resources.Parser_Context_Not_Set);
            }

            using (PushSpanConfig(DefaultMarkupSpan))
            {
                using (Context.Builder.StartBlock(BlockKindInternal.Markup))
                {
                    Span.Start = CurrentLocation;

                    if (!NextToken())
                    {
                        return;
                    }

                    AcceptWhile(IsSpacingToken(includeNewLines: true));

                    if (CurrentToken.Kind == SyntaxKind.OpenAngle)
                    {
                        // "<" => Implicit Tag Block
                        TagBlock(new Stack<Tuple<SyntaxToken, SourceLocation>>());
                    }
                    else if (CurrentToken.Kind == SyntaxKind.Transition)
                    {
                        // "@" => Explicit Tag/Single Line Block OR Template
                        Output(SpanKindInternal.Markup);

                        // Definitely have a transition span
                        Assert(SyntaxKind.Transition);
                        AcceptAndMoveNext();
                        Span.EditHandler.AcceptedCharacters = AcceptedCharactersInternal.None;
                        Span.ChunkGenerator = SpanChunkGenerator.Null;
                        Output(SpanKindInternal.Transition);
                        if (At(SyntaxKind.Transition))
                        {
                            Span.ChunkGenerator = SpanChunkGenerator.Null;
                            AcceptAndMoveNext();
                            Output(SpanKindInternal.MetaCode);
                        }

                        AfterTransition();
                    }
                    else
                    {
                        Context.ErrorSink.OnError(
                            RazorDiagnosticFactory.CreateParsing_MarkupBlockMustStartWithTag(
                                new SourceSpan(CurrentStart, CurrentToken.Content.Length)));
                    }

                    Output(SpanKindInternal.Markup);
                }
            }
        }

        // Span configuration used by ParseBlock: markup chunk generator, any characters accepted.
        private void DefaultMarkupSpan(SpanBuilder span)
        {
            span.ChunkGenerator = new MarkupChunkGenerator();
            span.EditHandler = new SpanEditHandler(Language.TokenizeString, AcceptedCharactersInternal.Any);
        }

        /// <summary>
        /// Handles what follows a transition: a text token starting with ':' begins a single-line markup
        /// block, an open angle bracket begins a tag block, and anything else is left untouched.
        /// </summary>
        private void AfterTransition()
        {
            // "@:" => Explicit Single Line Block
            if (CurrentToken.Kind == SyntaxKind.Text && CurrentToken.Content.Length > 0 && CurrentToken.Content[0] == ':')
            {
                // Split the token
                var split = Language.SplitToken(CurrentToken, 1, SyntaxKind.Colon);

                // The first part (left) is added to this span and we return a MetaCode span
                Accept(split.Item1);
                Span.ChunkGenerator = SpanChunkGenerator.Null;
                Output(SpanKindInternal.MetaCode);
                if (split.Item2 != null)
                {
                    Accept(split.Item2);
                }

                NextToken();
                SingleLineMarkup();
            }
            else if (CurrentToken.Kind == SyntaxKind.OpenAngle)
            {
                TagBlock(new Stack<Tuple<SyntaxToken, SourceLocation>>());
            }
        }

        /// <summary>
        /// Parses markup up to and including the next newline, temporarily flagging whitespace as
        /// significant to the ancestor block, and outputs the result as a markup span.
        /// </summary>
        private void SingleLineMarkup()
        {
            // Parse until a newline, it's that simple!
            // First, signal to code parser that whitespace is significant to us.
            var old = Context.WhiteSpaceIsSignificantToAncestorBlock;
            Context.WhiteSpaceIsSignificantToAncestorBlock = true;
            Span.EditHandler = new SpanEditHandler(Language.TokenizeString);
            SkipToAndParseCode(SyntaxKind.NewLine);
            if (!EndOfFile && CurrentToken.Kind == SyntaxKind.NewLine)
            {
                AcceptAndMoveNext();
                Span.EditHandler.AcceptedCharacters = AcceptedCharactersInternal.None;
            }

            PutCurrentBack();
            Context.WhiteSpaceIsSignificantToAncestorBlock = old;
            Output(SpanKindInternal.Markup);
        }

        /// <summary>
        /// Parses tags and the markup between them until the <paramref name="tags"/> stack is empty,
        /// wrapping each non-special tag in its own Tag block.
        /// </summary>
        // NOTE(review): the element type of the stack was lost in the source text ("Stack>"); it is
        // restored as Tuple<SyntaxToken, SourceLocation> -- confirm against RemoveTag/StartTag/EndTagBlock.
        private void TagBlock(Stack<Tuple<SyntaxToken, SourceLocation>> tags)
        {
            // Skip Whitespace and Text
            var complete = false;
            do
            {
                SkipToAndParseCode(SyntaxKind.OpenAngle);

                // Output everything prior to the OpenAngle into a markup span
                Output(SpanKindInternal.Markup);

                // Do not want to start a new tag block if we're at the end of the file.
                IDisposable tagBlockWrapper = null;
                try
                {
                    var atSpecialTag = AtSpecialTag;

                    if (!EndOfFile && !atSpecialTag)
                    {
                        // Start a Block tag.  This is used to wrap things like <p> or <a class="btn"> etc.
                        tagBlockWrapper = Context.Builder.StartBlock(BlockKindInternal.Tag);
                    }

                    if (EndOfFile)
                    {
                        EndTagBlock(tags, complete: true);
                    }
                    else
                    {
                        _bufferedOpenAngle = null;
                        _lastTagStart = CurrentStart;
                        Assert(SyntaxKind.OpenAngle);
                        _bufferedOpenAngle = CurrentToken;
                        var tagStart = CurrentStart;
                        if (!NextToken())
                        {
                            Accept(_bufferedOpenAngle);
                            EndTagBlock(tags, complete: false);
                        }
                        else
                        {
                            complete = AfterTagStart(tagStart, tags, atSpecialTag, tagBlockWrapper);
                        }
                    }

                    if (complete)
                    {
                        // Completed tags have no accepted characters inside of blocks.
                        Span.EditHandler.AcceptedCharacters = AcceptedCharactersInternal.None;
                    }

                    // Output the contents of the tag into its own markup span.
                    Output(SpanKindInternal.Markup);
                }
                finally
                {
                    // Will be null if we were at end of file or special tag when initially created.
                    if (tagBlockWrapper != null)
                    {
                        // End tag block
                        tagBlockWrapper.Dispose();
                    }
                }
            }
            while (tags.Count > 0);

            EndTagBlock(tags, complete);
        }

        /// <summary>
        /// Dispatches on the token following an open angle bracket: '/' is an end tag, '!' at a special tag is a
        /// bang tag, '?' is an XML processing instruction and anything else is a start tag. At end of
        /// file with no open tags an "outer tag missing name" error is reported.
        /// </summary>
        /// <returns>The result of the dispatched tag parser, or false at end of file.</returns>
        private bool AfterTagStart(
            SourceLocation tagStart,
            Stack<Tuple<SyntaxToken, SourceLocation>> tags,
            bool atSpecialTag,
            IDisposable tagBlockWrapper)
        {
            if (!EndOfFile)
            {
                switch (CurrentToken.Kind)
                {
                    case SyntaxKind.ForwardSlash:
                        // End Tag
                        return EndTag(tagStart, tags, tagBlockWrapper);
                    case SyntaxKind.Bang:
                        // Comment, CDATA, DOCTYPE, or a parser-escaped HTML tag.
                        if (atSpecialTag)
                        {
                            Accept(_bufferedOpenAngle);
                            return BangTag();
                        }
                        else
                        {
                            goto default;
                        }
                    case SyntaxKind.QuestionMark:
                        // XML PI
                        Accept(_bufferedOpenAngle);
                        return XmlPI();
                    default:
                        // Start Tag
                        return StartTag(tags, tagBlockWrapper);
                }
            }

            if (tags.Count == 0)
            {
                Context.ErrorSink.OnError(
                    RazorDiagnosticFactory.CreateParsing_OuterTagMissingName(
                        new SourceSpan(CurrentStart, contentLength: 1 /* end of file */)));
            }

            return false;
        }

        // Parses an XML processing instruction: accepts "?" and then everything up to the "?>" sequence.
        private bool XmlPI()
        {
            // Accept "?"
            Assert(SyntaxKind.QuestionMark);
            AcceptAndMoveNext();
            return AcceptUntilAll(SyntaxKind.QuestionMark, SyntaxKind.CloseAngle);
        }

        /// <summary>
        /// Parses what follows the '!' of a bang tag: an HTML comment (in its own HtmlComment block), a CDATA section, or
        /// any other declaration, which is accepted up to the closing angle bracket.
        /// </summary>
        /// <returns>The result of the selected sub-parser; false if the input ends first.</returns>
        private bool BangTag()
        {
            // Accept "!"
            Assert(SyntaxKind.Bang);

            if (AcceptAndMoveNext())
            {
                if (IsHtmlCommentAhead())
                {
                    using (Context.Builder.StartBlock(BlockKindInternal.HtmlComment))
                    {
                        // Accept the double-hyphen token at the beginning of the comment block.
                        AcceptAndMoveNext();
                        Output(SpanKindInternal.Markup, AcceptedCharactersInternal.None);

                        Span.EditHandler.AcceptedCharacters = AcceptedCharactersInternal.Whitespace;
                        while (!EndOfFile)
                        {
                            SkipToAndParseCode(SyntaxKind.DoubleHyphen);
                            var lastDoubleHyphen = AcceptAllButLastDoubleHyphens();

                            if (At(SyntaxKind.CloseAngle))
                            {
                                // Output the content in the comment block as a separate markup
                                Output(SpanKindInternal.Markup, AcceptedCharactersInternal.Whitespace);

                                // This is the end of a comment block
                                Accept(lastDoubleHyphen);
                                AcceptAndMoveNext();
                                Output(SpanKindInternal.Markup, AcceptedCharactersInternal.None);
                                return true;
                            }
                            else if (lastDoubleHyphen != null)
                            {
                                Accept(lastDoubleHyphen);
                            }
                        }
                    }
                }
                else if (CurrentToken.Kind == SyntaxKind.LeftBracket)
                {
                    if (AcceptAndMoveNext())
                    {
                        return CData();
                    }
                }
                else
                {
                    AcceptAndMoveNext();
                    return AcceptUntilAll(SyntaxKind.CloseAngle);
                }
            }

            return false;
        }

        /// <summary>
        /// Accepts a run of double-hyphen tokens except the last one, which is returned so the caller can
        /// decide whether it terminates a comment. A single trailing hyphen is accepted as well; when
        /// that hyphen is not followed by a closing angle bracket the held-back double hyphen is accepted first and
        /// null is returned.
        /// </summary>
        protected SyntaxToken AcceptAllButLastDoubleHyphens()
        {
            var lastDoubleHyphen = CurrentToken;
            AcceptWhile(s =>
            {
                if (NextIs(SyntaxKind.DoubleHyphen))
                {
                    lastDoubleHyphen = s;
                    return true;
                }

                return false;
            });

            NextToken();

            if (At(SyntaxKind.Text) && IsHyphen(CurrentToken))
            {
                // Doing this here to maintain the order of tokens
                if (!NextIs(SyntaxKind.CloseAngle))
                {
                    Accept(lastDoubleHyphen);
                    lastDoubleHyphen = null;
                }

                AcceptAndMoveNext();
            }

            return lastDoubleHyphen;
        }

        /// <summary>Returns true when <paramref name="token"/> is a text token whose content is exactly "-".</summary>
        internal static bool IsHyphen(SyntaxToken token)
        {
            return token.Kind == SyntaxKind.Text && token.Content == "-";
        }

        /// <summary>
        /// Looks ahead from the current token to decide whether a well-formed HTML comment starts here.
        /// </summary>
        protected bool IsHtmlCommentAhead()
        {
            // From HTML5 Specification, available at http://www.w3.org/TR/html52/syntax.html#comments
            //
            // Comments must have the following format:
            // 1. The string "<!--"
            // 2. Optionally, text, with the additional restriction that the text
            //      2.1 must not start with the string ">" nor start with the string "->"
            //      2.2 nor contain the strings
            //          2.2.1 "<!--"
            //          2.2.2 "-->" As we will be treating this as a comment ending, there is no need to handle this case at all.
            //          2.2.3 "--!>"
            //      2.3 nor end with the string "<!-".
            // 3. The string "-->"

            if (CurrentToken.Kind != SyntaxKind.DoubleHyphen)
            {
                return false;
            }

            // Check condition 2.1
            if (NextIs(SyntaxKind.CloseAngle) || NextIs(next => IsHyphen(next) && NextIs(SyntaxKind.CloseAngle)))
            {
                return false;
            }

            // Check condition 2.2
            var isValidComment = false;
            LookaheadUntil((token, prevTokens) =>
            {
                if (token.Kind == SyntaxKind.DoubleHyphen)
                {
                    if (NextIs(SyntaxKind.CloseAngle))
                    {
                        // Check condition 2.3: We're at the end of a comment. Check to make sure the text ending is allowed.
                        isValidComment = !IsCommentContentEndingInvalid(prevTokens);
                        return true;
                    }
                    else if (NextIs(ns => IsHyphen(ns) && NextIs(SyntaxKind.CloseAngle)))
                    {
                        // Check condition 2.3: we're at the end of a comment, which has an extra dash.
                        // Need to treat the dash as part of the content and check the ending.
                        // However, that case would have already been checked as part of check from 2.2.1 which
                        // would already fail this iteration and we wouldn't get here
                        isValidComment = true;
                        return true;
                    }
                    else if (NextIs(ns => ns.Kind == SyntaxKind.Bang && NextIs(SyntaxKind.CloseAngle)))
                    {
                        // This is condition 2.2.3
                        isValidComment = false;
                        return true;
                    }
                }
                else if (token.Kind == SyntaxKind.OpenAngle)
                {
                    // Checking condition 2.2.1
                    if (NextIs(ns => ns.Kind == SyntaxKind.Bang && NextIs(SyntaxKind.DoubleHyphen)))
                    {
                        isValidComment = false;
                        return true;
                    }
                }

                return false;
            });

            return isValidComment;
        }

/// <summary>
        /// Verifies, that the sequence doesn't end with the "&lt;!-" HtmlTokens. Note, the first token is an opening bracket token
        /// </summary>
        /// <param name="sequence">The tokens preceding the candidate comment ending, in source order.</param>
        /// <returns>True when the last three tokens of <paramref name="sequence"/> are "&lt;", "!", "-"; otherwise false.</returns>
        internal static bool IsCommentContentEndingInvalid(IEnumerable<SyntaxToken> sequence)
        {
            // Walk the sequence backwards and compare against the forbidden ending, which is stored reversed.
            var reversedSequence = sequence.Reverse();
            var index = 0;
            foreach (var item in reversedSequence)
            {
                if (!item.IsEquivalentTo(nonAllowedHtmlCommentEnding[index++]))
                {
                    return false;
                }

                if (index == nonAllowedHtmlCommentEnding.Length)
                {
                    return true;
                }
            }

            // Fewer tokens than the forbidden ending: cannot match.
            return false;
        }

        /// <summary>
        /// Parses the remainder of a CDATA section once the first "[" has been consumed: expects the text
        /// "cdata" (case-insensitive) followed by "[", then accepts everything up to "]]" and the closing angle bracket.
        /// </summary>
        /// <returns>The result of accepting up to the terminator; false if the CDATA prefix is not present.</returns>
        private bool CData()
        {
            if (CurrentToken.Kind == SyntaxKind.Text && string.Equals(CurrentToken.Content, "cdata", StringComparison.OrdinalIgnoreCase))
            {
                if (AcceptAndMoveNext())
                {
                    if (CurrentToken.Kind == SyntaxKind.LeftBracket)
                    {
                        return AcceptUntilAll(SyntaxKind.RightBracket, SyntaxKind.RightBracket, SyntaxKind.CloseAngle);
                    }
                }
            }

            return false;
        }

        /// <summary>
        /// Parses an end tag. The current token must be the "/" following the buffered open angle bracket.
        /// </summary>
        /// <returns>
        /// False when the input ends right after "/"; the result of <see cref="EndTextTag"/> when this
        /// closes the outermost text tag; otherwise whether the closing angle bracket was found.
        /// </returns>
        // NOTE(review): the element type of the stack was lost in the source text ("Stack>"); it is
        // restored as Tuple<SyntaxToken, SourceLocation> -- confirm against RemoveTag.
        private bool EndTag(SourceLocation tagStart, Stack<Tuple<SyntaxToken, SourceLocation>> tags, IDisposable tagBlockWrapper)
        {
            // Accept "/" and move next
            Assert(SyntaxKind.ForwardSlash);
            var forwardSlash = CurrentToken;
            if (!NextToken())
            {
                Accept(_bufferedOpenAngle);
                Accept(forwardSlash);
                return false;
            }
            else
            {
                var tagName = string.Empty;

                if (At(SyntaxKind.Bang))
                {
                    // Escaped end tag: keep the '!' as part of the name used for matching.
                    var nextToken = Lookahead(count: 1);

                    if (nextToken != null && nextToken.Kind == SyntaxKind.Text)
                    {
                        tagName = "!" + nextToken.Content;
                    }
                }
                else if (At(SyntaxKind.Text))
                {
                    tagName = CurrentToken.Content;
                }

                var matched = RemoveTag(tags, tagName, tagStart);

                if (tags.Count == 0 &&
                    // Note tagName may contain a '!' escape character. This ensures </!text> doesn't match here.
                    // </!text> tags are treated like any other escaped HTML end tag.
                    string.Equals(tagName, SyntaxConstants.TextTagName, StringComparison.OrdinalIgnoreCase) &&
                    matched)
                {
                    return EndTextTag(forwardSlash, tagBlockWrapper);
                }

                Accept(_bufferedOpenAngle);
                Accept(forwardSlash);

                OptionalBangEscape();

                AcceptUntil(SyntaxKind.CloseAngle);

                // Accept the ">"
                return Optional(SyntaxKind.CloseAngle);
            }
        }

        // Error recovery for a malformed text tag: skip to the closing angle bracket or the end of the line.
        private void RecoverTextTag()
        {
            // We don't want to skip-to and parse because there shouldn't be anything in the body of text tags.
            AcceptUntil(SyntaxKind.CloseAngle, SyntaxKind.NewLine);

            // Include the close angle in the text tag block if it's there, otherwise just move on
            Optional(SyntaxKind.CloseAngle);
        }

        /// <summary>
        /// Completes the end tag of a text tag as a Transition span. If the tag name is not immediately
        /// followed by a closing angle bracket, a "text tag cannot contain attributes" error is reported and the parser
        /// recovers via <see cref="RecoverTextTag"/>.
        /// </summary>
        /// <returns>True when the closing angle bracket directly followed the tag name.</returns>
        private bool EndTextTag(SyntaxToken solidus, IDisposable tagBlockWrapper)
        {
            Accept(_bufferedOpenAngle);
            Accept(solidus);

            var textLocation = CurrentStart;
            Assert(SyntaxKind.Text);
            AcceptAndMoveNext();

            var seenCloseAngle = Optional(SyntaxKind.CloseAngle);

            if (!seenCloseAngle)
            {
                Context.ErrorSink.OnError(
                    RazorDiagnosticFactory.CreateParsing_TextTagCannotContainAttributes(
                        new SourceSpan(textLocation, contentLength: 4 /* text */)));

                Span.EditHandler.AcceptedCharacters = AcceptedCharactersInternal.Any;
                RecoverTextTag();
            }
            else
            {
                Span.EditHandler.AcceptedCharacters = AcceptedCharactersInternal.None;
            }

            Span.ChunkGenerator = SpanChunkGenerator.Null;

            CompleteTagBlockWithSpan(tagBlockWrapper, Span.EditHandler.AcceptedCharacters, SpanKindInternal.Transition);

            return seenCloseAngle;
        }

        // Special tags include