// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;

namespace Microsoft.AspNetCore.Razor.Language.Legacy
{
    /// <summary>
    /// Markup-side parser: consumes <see cref="HtmlSymbol"/> tokens, emits markup spans and tag blocks,
    /// and hands control to <see cref="CodeParser"/> whenever a code transition is encountered.
    /// </summary>
    // NOTE(review): the generic arguments of the base class were missing from the damaged source.
    // <HtmlTokenizer, HtmlSymbol, HtmlSymbolType> matches the symbol types used throughout this class
    // -- confirm against TokenizerBackedParser's declaration.
    internal class HtmlMarkupParser : TokenizerBackedParser<HtmlTokenizer, HtmlSymbol, HtmlSymbolType>
    {
        private const string ScriptTagName = "script";

        // Reversed form of "<!-": the symbol sequence an HTML comment's content must not end with.
        // Stored back-to-front because IsCommentContentEndingInvalid walks the content in reverse.
        private static readonly HtmlSymbol[] nonAllowedHtmlCommentEnding = new[] { HtmlSymbol.Hyphen, new HtmlSymbol("!", HtmlSymbolType.Bang), new HtmlSymbol("<", HtmlSymbolType.OpenAngle) };
        private static readonly HtmlSymbol[] singleHyphenArray = new[] { HtmlSymbol.Hyphen };

        private static readonly char[] ValidAfterTypeAttributeNameCharacters = { ' ', '\t', '\r', '\n', '\f', '=' };

        // Start location of the most recent "<" seen by TagBlock.
        private SourceLocation _lastTagStart = SourceLocation.Zero;

        // The "<" symbol of the tag currently being parsed; it is held back (not yet accepted) until the
        // kind of tag is known.
        private HtmlSymbol _bufferedOpenAngle;

        //From http://dev.w3.org/html5/spec/Overview.html#elements-0
        private ISet<string> _voidElements = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
        {
            "area",
            "base",
            "br",
            "col",
            "command",
            "embed",
            "hr",
            "img",
            "input",
            "keygen",
            "link",
            "meta",
            "param",
            "source",
            "track",
            "wbr"
        };

        /// <summary>
        /// Creates the parser, selecting the first-directive language characteristics when
        /// <paramref name="context"/> requests leading-directive parsing and the regular HTML
        /// characteristics otherwise.
        /// </summary>
        /// <param name="context">The parser context shared with the base parser.</param>
        public HtmlMarkupParser(ParserContext context)
            : base(
                  context.ParseLeadingDirectives ? FirstDirectiveHtmlLanguageCharacteristics.Instance : HtmlLanguageCharacteristics.Instance,
                  context)
        {
        }

        /// <summary>
        /// The parser invoked (via <c>ParseBlock</c>) when markup transitions into code.
        /// </summary>
        public ParserBase CodeParser { get; set; }

        /// <summary>
        /// Case-insensitive set of void element names.
        /// </summary>
        public ISet<string> VoidElements
        {
            get { return _voidElements; }
        }

        private bool CaseSensitive { get; set; }

        /// <summary>
        /// Ordinal comparison when <see cref="CaseSensitive"/> is set, ordinal-ignore-case otherwise.
        /// </summary>
        private StringComparison Comparison
        {
            get { return CaseSensitive ? StringComparison.Ordinal : StringComparison.OrdinalIgnoreCase; }
        }

        /// <summary>
        /// Symbol types are compared by simple equality.
        /// </summary>
        protected override bool SymbolTypeEquals(HtmlSymbolType x, HtmlSymbolType y) => x == y;

        /// <summary>
        /// Configures <paramref name="span"/> as a markup span with a fresh
        /// <see cref="MarkupChunkGenerator"/> and then defers to the base implementation.
        /// </summary>
        public override void BuildSpan(SpanBuilder span, SourceLocation start, string content)
        {
            span.Kind = SpanKindInternal.Markup;
            span.ChunkGenerator = new MarkupChunkGenerator();
            base.BuildSpan(span, start, content);
        }

        /// <summary>
        /// Emits whatever has been accepted so far as a markup span.
        /// </summary>
        protected override void OutputSpanBeforeRazorComment()
        {
            Output(SpanKindInternal.Markup);
        }

        /// <summary>
        /// Skips forward until a symbol of the given <paramref name="type"/> is current, parsing any code
        /// transitions and Razor comments found on the way.
        /// </summary>
        protected void SkipToAndParseCode(HtmlSymbolType type)
        {
            SkipToAndParseCode(sym => sym.Type == type);
        }

        /// <summary>
        /// Skips forward until <paramref name="condition"/> holds for the current symbol or the end of file
        /// is reached. Along the way it accepts plain markup, parses code transitions through the code
        /// parser and handles Razor comments.
        /// </summary>
        /// <param name="condition">Predicate evaluated against the current symbol; the loop stops when it returns true.</param>
        protected void SkipToAndParseCode(Func<HtmlSymbol, bool> condition)
        {
            // "last" is a one-symbol delay buffer: a symbol is only accepted once the following symbol is
            // known, so leading whitespace can be handed back when a transition follows it.
            HtmlSymbol last = null;
            var startOfLine = false;
            while (!EndOfFile && !condition(CurrentSymbol))
            {
                if (Context.NullGenerateWhitespaceAndNewLine)
                {
                    Context.NullGenerateWhitespaceAndNewLine = false;
                    Span.ChunkGenerator = SpanChunkGenerator.Null;
                    AcceptWhile(symbol => symbol.Type == HtmlSymbolType.WhiteSpace);
                    if (At(HtmlSymbolType.NewLine))
                    {
                        AcceptAndMoveNext();
                    }

                    Output(SpanKindInternal.Markup);
                }
                else if (At(HtmlSymbolType.NewLine))
                {
                    if (last != null)
                    {
                        Accept(last);
                    }

                    // Mark the start of a new line
                    startOfLine = true;
                    last = null;
                    AcceptAndMoveNext();
                }
                else if (At(HtmlSymbolType.Transition))
                {
                    var transition = CurrentSymbol;
                    NextToken();
                    if (At(HtmlSymbolType.Transition))
                    {
                        // Two consecutive transitions: the first is emitted in its own span with a null
                        // chunk generator, the second is accepted as ordinary markup.
                        if (last != null)
                        {
                            Accept(last);
                            last = null;
                        }
                        Output(SpanKindInternal.Markup);
                        Accept(transition);
                        Span.ChunkGenerator = SpanChunkGenerator.Null;
                        Output(SpanKindInternal.Markup);
                        AcceptAndMoveNext();
                        continue; // while
                    }
                    else
                    {
                        if (!EndOfFile)
                        {
                            PutCurrentBack();
                        }
                        PutBack(transition);
                    }

                    // Handle whitespace rewriting
                    if (last != null)
                    {
                        if (!Context.DesignTimeMode && last.Type == HtmlSymbolType.WhiteSpace && startOfLine)
                        {
                            // Put the whitespace back too
                            startOfLine = false;
                            PutBack(last);
                            last = null;
                        }
                        else
                        {
                            // Accept last
                            Accept(last);
                            last = null;
                        }
                    }

                    OtherParserBlock();
                }
                else if (At(HtmlSymbolType.RazorCommentTransition))
                {
                    if (last != null)
                    {
                        // Don't render the whitespace between the start of the line and the razor comment.
                        if (startOfLine && last.Type == HtmlSymbolType.WhiteSpace)
                        {
                            AddMarkerSymbolIfNecessary();
                            // Output the symbols that may have been accepted prior to the whitespace.
                            Output(SpanKindInternal.Markup);

                            Span.ChunkGenerator = SpanChunkGenerator.Null;
                        }

                        Accept(last);
                        last = null;
                    }

                    AddMarkerSymbolIfNecessary();
                    Output(SpanKindInternal.Markup);

                    RazorComment();

                    // Handle the whitespace and newline at the end of a razor comment.
                    if (startOfLine &&
                        (At(HtmlSymbolType.NewLine) ||
                        (At(HtmlSymbolType.WhiteSpace) && NextIs(HtmlSymbolType.NewLine))))
                    {
                        AcceptWhile(IsSpacingToken(includeNewLines: false));
                        AcceptAndMoveNext();
                        Span.ChunkGenerator = SpanChunkGenerator.Null;
                        Output(SpanKindInternal.Markup);
                    }
                }
                else
                {
                    // As long as we see whitespace, we're still at the "start" of the line
                    startOfLine &= At(HtmlSymbolType.WhiteSpace);

                    // If there's a last token, accept it
                    if (last != null)
                    {
                        Accept(last);
                        last = null;
                    }

                    // Advance
                    last = CurrentSymbol;
                    NextToken();
                }
            }

            if (last != null)
            {
                Accept(last);
            }
        }

        /// <summary>
        /// Builds a predicate that matches whitespace symbols and, when
        /// <paramref name="includeNewLines"/> is true, newline symbols as well.
        /// </summary>
        protected static Func<HtmlSymbol, bool> IsSpacingToken(bool includeNewLines)
        {
            return sym => sym.Type == HtmlSymbolType.WhiteSpace || (includeNewLines && sym.Type == HtmlSymbolType.NewLine);
        }

        /// <summary>
        /// Flushes pending markup, lets <see cref="CodeParser"/> parse one block under a pushed span
        /// configuration, then restarts the span at the current location and advances to the next token.
        /// </summary>
        private void OtherParserBlock()
        {
            AddMarkerSymbolIfNecessary();
            Output(SpanKindInternal.Markup);

            using (PushSpanConfig())
            {
                CodeParser.ParseBlock();
            }

            Span.Start = CurrentLocation;
            Initialize(Span);
            NextToken();
        }

        /// <summary>
        /// Returns true when the symbol at <paramref name="lookahead"/> is a "!" that is immediately
        /// followed by a text symbol other than "DOCTYPE" (compared case-insensitively).
        /// </summary>
        private bool IsBangEscape(int lookahead)
        {
            var potentialBang = Lookahead(lookahead);

            if (potentialBang != null &&
                potentialBang.Type == HtmlSymbolType.Bang)
            {
                var afterBang = Lookahead(lookahead + 1);

                return afterBang != null &&
                    afterBang.Type == HtmlSymbolType.Text &&
                    !string.Equals(afterBang.Content, "DOCTYPE", StringComparison.OrdinalIgnoreCase);
            }

            return false;
        }

        /// <summary>
        /// If the current symbol starts a bang escape, outputs the pending markup and then emits the "!"
        /// as its own meta-code span that accepts no characters.
        /// </summary>
        private void OptionalBangEscape()
        {
            if (IsBangEscape(lookahead: 0))
            {
                Output(SpanKindInternal.Markup);

                // Accept the parser escape character '!'.
                Assert(HtmlSymbolType.Bang);
                AcceptAndMoveNext();

                // Setup the metacode span that we will be outputing.
                Span.ChunkGenerator = SpanChunkGenerator.Null;
                Output(SpanKindInternal.MetaCode, AcceptedCharactersInternal.None);
            }
        }

        /// <summary>
        /// Parses a single markup block. The block must start (after optional whitespace/newlines) with
        /// either "&lt;" or a transition; anything else is reported to the error sink.
        /// </summary>
        /// <exception cref="InvalidOperationException">Thrown when the parser context has not been set.</exception>
        public override void ParseBlock()
        {
            if (Context == null)
            {
                throw new InvalidOperationException(Resources.Parser_Context_Not_Set);
            }

            using (PushSpanConfig(DefaultMarkupSpan))
            {
                using (Context.Builder.StartBlock(BlockKindInternal.Markup))
                {
                    Span.Start = CurrentLocation;

                    if (!NextToken())
                    {
                        return;
                    }

                    AcceptWhile(IsSpacingToken(includeNewLines: true));

                    if (CurrentSymbol.Type == HtmlSymbolType.OpenAngle)
                    {
                        // "<" => Implicit Tag Block
                        TagBlock(new Stack<Tuple<HtmlSymbol, SourceLocation>>());
                    }
                    else if (CurrentSymbol.Type == HtmlSymbolType.Transition)
                    {
                        // "@" => Explicit Tag/Single Line Block OR Template
                        Output(SpanKindInternal.Markup);

                        // Definitely have a transition span
                        Assert(HtmlSymbolType.Transition);
                        AcceptAndMoveNext();
                        Span.EditHandler.AcceptedCharacters = AcceptedCharactersInternal.None;
                        Span.ChunkGenerator = SpanChunkGenerator.Null;
                        Output(SpanKindInternal.Transition);
                        if (At(HtmlSymbolType.Transition))
                        {
                            Span.ChunkGenerator = SpanChunkGenerator.Null;
                            AcceptAndMoveNext();
                            Output(SpanKindInternal.MetaCode);
                        }
                        AfterTransition();
                    }
                    else
                    {
                        Context.ErrorSink.OnError(
                            RazorDiagnosticFactory.CreateParsing_MarkupBlockMustStartWithTag(
                                new SourceSpan(CurrentStart, CurrentSymbol.Content.Length)));
                    }
                    Output(SpanKindInternal.Markup);
                }
            }
        }

        /// <summary>
        /// Span configuration used by <see cref="ParseBlock"/>: markup chunk generator and an edit handler
        /// that accepts any characters.
        /// </summary>
        private void DefaultMarkupSpan(SpanBuilder span)
        {
            span.ChunkGenerator = new MarkupChunkGenerator();
            span.EditHandler = new SpanEditHandler(Language.TokenizeString, AcceptedCharactersInternal.Any);
        }

        /// <summary>
        /// Continues parsing after a transition: ":" starts a single-line markup block and "&lt;" starts a
        /// tag block. Any other symbol is left untouched.
        /// </summary>
        private void AfterTransition()
        {
            // "@:" => Explicit Single Line Block
            if (CurrentSymbol.Type == HtmlSymbolType.Text && CurrentSymbol.Content.Length > 0 && CurrentSymbol.Content[0] == ':')
            {
                // Split the token
                Tuple<HtmlSymbol, HtmlSymbol> split = Language.SplitSymbol(CurrentSymbol, 1, HtmlSymbolType.Colon);

                // The first part (left) is added to this span and we return a MetaCode span
                Accept(split.Item1);
                Span.ChunkGenerator = SpanChunkGenerator.Null;
                Output(SpanKindInternal.MetaCode);
                if (split.Item2 != null)
                {
                    Accept(split.Item2);
                }
                NextToken();
                SingleLineMarkup();
            }
            else if (CurrentSymbol.Type == HtmlSymbolType.OpenAngle)
            {
                TagBlock(new Stack<Tuple<HtmlSymbol, SourceLocation>>());
            }
        }

        /// <summary>
        /// Parses markup up to and including the next newline, flagging whitespace as significant to the
        /// ancestor block for the duration and restoring the previous flag afterwards.
        /// </summary>
        private void SingleLineMarkup()
        {
            // Parse until a newline, it's that simple!
            // First, signal to code parser that whitespace is significant to us.
            var old = Context.WhiteSpaceIsSignificantToAncestorBlock;
            Context.WhiteSpaceIsSignificantToAncestorBlock = true;
            Span.EditHandler = new SpanEditHandler(Language.TokenizeString);
            SkipToAndParseCode(HtmlSymbolType.NewLine);
            if (!EndOfFile && CurrentSymbol.Type == HtmlSymbolType.NewLine)
            {
                AcceptAndMoveNext();
                Span.EditHandler.AcceptedCharacters = AcceptedCharactersInternal.None;
            }
            PutCurrentBack();
            Context.WhiteSpaceIsSignificantToAncestorBlock = old;
            Output(SpanKindInternal.Markup);
        }

        /// <summary>
        /// Parses tags until the <paramref name="tags"/> stack is empty, wrapping each non-special tag in
        /// its own tag block.
        /// </summary>
        /// <param name="tags">Stack of tags that are currently open.</param>
        // NOTE(review): the Stack element type was missing from the damaged source;
        // Tuple<HtmlSymbol, SourceLocation> is restored here -- confirm against RemoveTag/EndTagBlock.
        private void TagBlock(Stack<Tuple<HtmlSymbol, SourceLocation>> tags)
        {
            // Skip Whitespace and Text
            var complete = false;
            do
            {
                SkipToAndParseCode(HtmlSymbolType.OpenAngle);

                // Output everything prior to the OpenAngle into a markup span
                Output(SpanKindInternal.Markup);

                // Do not want to start a new tag block if we're at the end of the file.
                IDisposable tagBlockWrapper = null;
                try
                {
                    var atSpecialTag = AtSpecialTag;

                    if (!EndOfFile && !atSpecialTag)
                    {
                        // Start a Block tag.  This is used to wrap things like <p> or <a class="btn"> etc.
                        tagBlockWrapper = Context.Builder.StartBlock(BlockKindInternal.Tag);
                    }

                    if (EndOfFile)
                    {
                        EndTagBlock(tags, complete: true);
                    }
                    else
                    {
                        _bufferedOpenAngle = null;
                        _lastTagStart = CurrentStart;
                        Assert(HtmlSymbolType.OpenAngle);
                        _bufferedOpenAngle = CurrentSymbol;
                        var tagStart = CurrentStart;
                        if (!NextToken())
                        {
                            Accept(_bufferedOpenAngle);
                            EndTagBlock(tags, complete: false);
                        }
                        else
                        {
                            complete = AfterTagStart(tagStart, tags, atSpecialTag, tagBlockWrapper);
                        }
                    }

                    if (complete)
                    {
                        // Completed tags have no accepted characters inside of blocks.
                        Span.EditHandler.AcceptedCharacters = AcceptedCharactersInternal.None;
                    }

                    // Output the contents of the tag into its own markup span.
                    Output(SpanKindInternal.Markup);
                }
                finally
                {
                    // Will be null if we were at end of file or special tag when initially created.
                    if (tagBlockWrapper != null)
                    {
                        // End tag block
                        tagBlockWrapper.Dispose();
                    }
                }
            }
            while (tags.Count > 0);

            EndTagBlock(tags, complete);
        }

        /// <summary>
        /// Dispatches on the symbol following "&lt;": end tag, bang tag (only when at a special tag),
        /// XML processing instruction, or start tag. At end of file with no open tags, a missing-name
        /// error is reported.
        /// </summary>
        /// <returns>The result of the dispatched tag parser, or false at end of file.</returns>
        private bool AfterTagStart(SourceLocation tagStart,
                                   Stack<Tuple<HtmlSymbol, SourceLocation>> tags,
                                   bool atSpecialTag,
                                   IDisposable tagBlockWrapper)
        {
            if (!EndOfFile)
            {
                switch (CurrentSymbol.Type)
                {
                    case HtmlSymbolType.ForwardSlash:
                        // End Tag
                        return EndTag(tagStart, tags, tagBlockWrapper);
                    case HtmlSymbolType.Bang:
                        // Comment, CDATA, DOCTYPE, or a parser-escaped HTML tag.
                        if (atSpecialTag)
                        {
                            Accept(_bufferedOpenAngle);
                            return BangTag();
                        }
                        else
                        {
                            goto default;
                        }
                    case HtmlSymbolType.QuestionMark:
                        // XML PI
                        Accept(_bufferedOpenAngle);
                        return XmlPI();
                    default:
                        // Start Tag
                        return StartTag(tags, tagBlockWrapper);
                }
            }
            if (tags.Count == 0)
            {
                Context.ErrorSink.OnError(
                    RazorDiagnosticFactory.CreateParsing_OuterTagMissingName(
                        new SourceSpan(CurrentStart, contentLength: 1 /* end of file */)));
            }
            return false;
        }

        /// <summary>
        /// Accepts an XML processing instruction: the "?" and everything up to the "?" "&gt;" sequence.
        /// </summary>
        private bool XmlPI()
        {
            // Accept "?"
            Assert(HtmlSymbolType.QuestionMark);
            AcceptAndMoveNext();
            return AcceptUntilAll(HtmlSymbolType.QuestionMark, HtmlSymbolType.CloseAngle);
        }

        /// <summary>
        /// Parses what follows "&lt;!": an HTML comment (emitted as its own block), a CDATA section, or
        /// any other bang tag, which is accepted up to the closing "&gt;".
        /// </summary>
        /// <returns>True when the construct was terminated; false otherwise.</returns>
        private bool BangTag()
        {
            // Accept "!"
            Assert(HtmlSymbolType.Bang);

            if (AcceptAndMoveNext())
            {
                if (IsHtmlCommentAhead())
                {
                    using (Context.Builder.StartBlock(BlockKindInternal.HtmlComment))
                    {
                        // Accept the double-hyphen symbol at the beginning of the comment block.
                        AcceptAndMoveNext();
                        Output(SpanKindInternal.Markup, AcceptedCharactersInternal.None);

                        Span.EditHandler.AcceptedCharacters = AcceptedCharactersInternal.WhiteSpace;
                        while (!EndOfFile)
                        {
                            SkipToAndParseCode(HtmlSymbolType.DoubleHyphen);
                            var lastDoubleHyphen = AcceptAllButLastDoubleHyphens();

                            if (At(HtmlSymbolType.CloseAngle))
                            {
                                // Output the content in the comment block as a separate markup
                                Output(SpanKindInternal.Markup, AcceptedCharactersInternal.WhiteSpace);

                                // This is the end of a comment block
                                Accept(lastDoubleHyphen);
                                AcceptAndMoveNext();
                                Output(SpanKindInternal.Markup, AcceptedCharactersInternal.None);
                                return true;
                            }
                            else if (lastDoubleHyphen != null)
                            {
                                Accept(lastDoubleHyphen);
                            }
                        }
                    }
                }
                else if (CurrentSymbol.Type == HtmlSymbolType.LeftBracket)
                {
                    if (AcceptAndMoveNext())
                    {
                        return CData();
                    }
                }
                else
                {
                    AcceptAndMoveNext();
                    return AcceptUntilAll(HtmlSymbolType.CloseAngle);
                }
            }

            return false;
        }

        /// <summary>
        /// Accepts a run of double-hyphen symbols except the last one, which is returned so the caller can
        /// decide whether it terminates the comment.
        /// </summary>
        /// <returns>
        /// The last double-hyphen symbol, or null when it was already accepted because a single hyphen
        /// followed it that is not itself followed by "&gt;".
        /// </returns>
        protected HtmlSymbol AcceptAllButLastDoubleHyphens()
        {
            var lastDoubleHyphen = CurrentSymbol;
            AcceptWhile(s =>
            {
                if (NextIs(HtmlSymbolType.DoubleHyphen))
                {
                    lastDoubleHyphen = s;
                    return true;
                }

                return false;
            });

            NextToken();

            if (At(HtmlSymbolType.Text) && IsHyphen(CurrentSymbol))
            {
                // Doing this here to maintain the order of symbols
                if (!NextIs(HtmlSymbolType.CloseAngle))
                {
                    Accept(lastDoubleHyphen);
                    lastDoubleHyphen = null;
                }

                AcceptAndMoveNext();
            }

            return lastDoubleHyphen;
        }

        /// <summary>
        /// Returns true when <paramref name="symbol"/> equals <c>HtmlSymbol.Hyphen</c>.
        /// </summary>
        internal static bool IsHyphen(HtmlSymbol symbol)
        {
            return symbol.Equals(HtmlSymbol.Hyphen);
        }

        /// <summary>
        /// Looks ahead from the current double-hyphen symbol to decide whether a well-formed HTML comment
        /// follows, according to the rules listed in the method body.
        /// </summary>
        protected bool IsHtmlCommentAhead()
        {
            /*
             * From HTML5 Specification, available at http://www.w3.org/TR/html52/syntax.html#comments
             *
             * Comments must have the following format:
             * 1. The string "<!--"
             * 2. Optionally, text, with the additional restriction that the text
             *      2.1 must not start with the string ">" nor start with the string "->"
             *      2.2 nor contain the strings
             *          2.2.1 "<!--"
             *          2.2.2 "-->" // As we will be treating this as a comment ending, there is no need to handle this case at all.
             *          2.2.3 "--!>"
             *      2.3 nor end with the string "<!-".
             * 3. The string "-->"
             *
             * */

            if (CurrentSymbol.Type != HtmlSymbolType.DoubleHyphen)
            {
                return false;
            }

            // Check condition 2.1
            if (NextIs(HtmlSymbolType.CloseAngle) || NextIs(next => IsHyphen(next) && NextIs(HtmlSymbolType.CloseAngle)))
            {
                return false;
            }

            // Check condition 2.2
            var isValidComment = false;
            LookaheadUntil((symbol, prevSymbols) =>
            {
                if (symbol.Type == HtmlSymbolType.DoubleHyphen)
                {
                    if (NextIs(HtmlSymbolType.CloseAngle))
                    {
                        // Check condition 2.3: We're at the end of a comment. Check to make sure the text ending is allowed.
                        isValidComment = !IsCommentContentEndingInvalid(prevSymbols);
                        return true;
                    }
                    else if (NextIs(ns => IsHyphen(ns) && NextIs(HtmlSymbolType.CloseAngle)))
                    {
                        // Check condition 2.3: we're at the end of a comment, which has an extra dash.
                        // Need to treat the dash as part of the content and check the ending.
                        // However, that case would have already been checked as part of check from 2.2.1 which
                        // would already fail this iteration and we wouldn't get here
                        isValidComment = true;
                        return true;
                    }
                    else if (NextIs(ns => ns.Type == HtmlSymbolType.Bang && NextIs(HtmlSymbolType.CloseAngle)))
                    {
                        // This is condition 2.2.3
                        isValidComment = false;
                        return true;
                    }
                }
                else if (symbol.Type == HtmlSymbolType.OpenAngle)
                {
                    // Checking condition 2.2.1
                    if (NextIs(ns => ns.Type == HtmlSymbolType.Bang && NextIs(HtmlSymbolType.DoubleHyphen)))
                    {
                        isValidComment = false;
                        return true;
                    }
                }

                return false;
            });

            return isValidComment;
        }

        /// <summary>
        /// Verifies, that the sequence doesn't end with the "&lt;!-" HtmlSymbols. Note, the first symbol is an opening bracket symbol
        /// </summary>
        /// <param name="sequence">The symbols to inspect; they are walked from the end backwards.</param>
        /// <returns>True when the sequence ends with "&lt;", "!", "-" (in that order); false otherwise.</returns>
        internal static bool IsCommentContentEndingInvalid(IEnumerable<HtmlSymbol> sequence)
        {
            var reversedSequence = sequence.Reverse();
            var index = 0;
            foreach (var item in reversedSequence)
            {
                if (!item.Equals(nonAllowedHtmlCommentEnding[index++]))
                {
                    return false;
                }

                if (index == nonAllowedHtmlCommentEnding.Length)
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Parses the remainder of a CDATA section: expects the text "cdata" (case-insensitive) followed
        /// by "[", then accepts everything up to the "]" "]" "&gt;" sequence.
        /// </summary>
        private bool CData()
        {
            if (CurrentSymbol.Type == HtmlSymbolType.Text && string.Equals(CurrentSymbol.Content, "cdata", StringComparison.OrdinalIgnoreCase))
            {
                if (AcceptAndMoveNext())
                {
                    if (CurrentSymbol.Type == HtmlSymbolType.LeftBracket)
                    {
                        return AcceptUntilAll(HtmlSymbolType.RightBracket, HtmlSymbolType.RightBracket, HtmlSymbolType.CloseAngle);
                    }
                }
            }

            return false;
        }

        /// <summary>
        /// Parses an end tag. The tag name (including a leading "!" when escaped) is removed from
        /// <paramref name="tags"/>; a matched end text tag that closes the outermost tag is handled by
        /// <see cref="EndTextTag"/>, everything else is accepted up to and including "&gt;".
        /// </summary>
        /// <returns>True when the closing "&gt;" was seen; false otherwise.</returns>
        private bool EndTag(SourceLocation tagStart,
                            Stack<Tuple<HtmlSymbol, SourceLocation>> tags,
                            IDisposable tagBlockWrapper)
        {
            // Accept "/" and move next
            Assert(HtmlSymbolType.ForwardSlash);
            var forwardSlash = CurrentSymbol;
            if (!NextToken())
            {
                Accept(_bufferedOpenAngle);
                Accept(forwardSlash);
                return false;
            }
            else
            {
                var tagName = string.Empty;
                HtmlSymbol bangSymbol = null;

                if (At(HtmlSymbolType.Bang))
                {
                    bangSymbol = CurrentSymbol;

                    var nextSymbol = Lookahead(count: 1);

                    if (nextSymbol != null && nextSymbol.Type == HtmlSymbolType.Text)
                    {
                        tagName = "!" + nextSymbol.Content;
                    }
                }
                else if (At(HtmlSymbolType.Text))
                {
                    tagName = CurrentSymbol.Content;
                }

                var matched = RemoveTag(tags, tagName, tagStart);

                if (tags.Count == 0 &&
                    // Note tagName may contain a '!' escape character. This ensures </!text> doesn't match here.
                    // </!text> tags are treated like any other escaped HTML end tag.
                    string.Equals(tagName, SyntaxConstants.TextTagName, StringComparison.OrdinalIgnoreCase) &&
                    matched)
                {
                    return EndTextTag(forwardSlash, tagBlockWrapper);
                }
                Accept(_bufferedOpenAngle);
                Accept(forwardSlash);

                OptionalBangEscape();

                AcceptUntil(HtmlSymbolType.CloseAngle);

                // Accept the ">"
                return Optional(HtmlSymbolType.CloseAngle);
            }
        }

        /// <summary>
        /// Error recovery for a malformed text tag: accepts up to the next "&gt;" or newline and consumes
        /// the "&gt;" when present.
        /// </summary>
        private void RecoverTextTag()
        {
            // We don't want to skip-to and parse because there shouldn't be anything in the body of text tags.
            AcceptUntil(HtmlSymbolType.CloseAngle, HtmlSymbolType.NewLine);

            // Include the close angle in the text tag block if it's there, otherwise just move on
            Optional(HtmlSymbolType.CloseAngle);
        }

        /// <summary>
        /// Completes an end text tag as a transition span. When the tag name is not immediately followed
        /// by "&gt;", an error is reported and <see cref="RecoverTextTag"/> is used to resynchronize.
        /// </summary>
        /// <param name="solidus">The "/" symbol of the end tag.</param>
        /// <param name="tagBlockWrapper">The wrapper of the enclosing tag block.</param>
        /// <returns>True when the closing "&gt;" directly followed the tag name.</returns>
        private bool EndTextTag(HtmlSymbol solidus, IDisposable tagBlockWrapper)
        {
            Accept(_bufferedOpenAngle);
            Accept(solidus);

            var textLocation = CurrentStart;
            Assert(HtmlSymbolType.Text);
            AcceptAndMoveNext();

            var seenCloseAngle = Optional(HtmlSymbolType.CloseAngle);

            if (!seenCloseAngle)
            {
                Context.ErrorSink.OnError(
                    RazorDiagnosticFactory.CreateParsing_TextTagCannotContainAttributes(
                        new SourceSpan(textLocation, contentLength: 4 /* text */)));

                Span.EditHandler.AcceptedCharacters = AcceptedCharactersInternal.Any;
                RecoverTextTag();
            }
            else
            {
                Span.EditHandler.AcceptedCharacters = AcceptedCharactersInternal.None;
            }

            Span.ChunkGenerator = SpanChunkGenerator.Null;

            CompleteTagBlockWithSpan(tagBlockWrapper, Span.EditHandler.AcceptedCharacters, SpanKindInternal.Transition);

            return seenCloseAngle;
        }

        // Special tags include