// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using AngleSharp;
using AngleSharp.Extensions;
using AngleSharp.Html;
using AngleSharp.Parser.Html;
using Microsoft.AspNetCore.Razor.Language;
using Microsoft.AspNetCore.Razor.Language.Intermediate;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;

namespace Microsoft.AspNetCore.Blazor.Razor
{
    // Rewrites the standard IR to a format more suitable for Blazor
    //
    // HTML nodes are rewritten to contain more structure, instead of treating HTML as opaque content
    // it is structured into element/component nodes, and attribute nodes.
    internal class ComponentDocumentRewritePass : IntermediateNodePassBase, IRazorDocumentClassifierPass
    {
        // Per the HTML spec, the following elements are inherently self-closing
        // For example, <br> is the same as <br /> (and therefore it cannot contain descendants)
        private static readonly HashSet<string> VoidElements = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
        {
            "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr",
        };

        // Run as soon as possible after the Component document classifier
        public override int Order => ComponentDocumentClassifierPass.DefaultFeatureOrder + 1;

        // Rewrites the HTML content of a component document in place. Documents of any other
        // kind are left untouched.
        protected override void ExecuteCore(RazorCodeDocument codeDocument, DocumentIntermediateNode documentNode)
        {
            if (documentNode.DocumentKind != ComponentDocumentClassifierPass.ComponentDocumentKind)
            {
                return;
            }

            var visitor = new RewriteWalker(codeDocument.Source);
            visitor.Visit(documentNode);
        }

        // Visits nodes then rewrites them using a post-order traversal. The result is that the tree
        // is rewritten bottom up.
        //
        // This relies on a few invariants Razor already provides for correctness.
        // - Tag Helpers are the only real nesting construct
        // - Tag Helpers require properly nested HTML inside their body
        //
        // This means that when we find a 'container' for HTML content, we have the guarantee
        // that the content is properly nested, except at the top level of scope. And since the top
        // level isn't nested inside anything, we can't introduce any errors due to misunderstanding
        // the structure.
        private class RewriteWalker : IntermediateNodeWalker
        {
            private readonly RazorSourceDocument _source;

            public RewriteWalker(RazorSourceDocument source)
            {
                _source = source;
            }

            // Visits every child first (so nested containers are rewritten before their parents) and
            // then rewrites this node's children if any of them is raw HTML content.
            public override void VisitDefault(IntermediateNode node)
            {
                var foundHtml = false;
                for (var i = 0; i < node.Children.Count; i++)
                {
                    var child = node.Children[i];
                    Visit(child);

                    if (child is HtmlContentIntermediateNode)
                    {
                        foundHtml = true;
                    }
                }

                if (foundHtml)
                {
                    RewriteChildren(_source, node);
                }
            }

            public override void VisitHtmlAttribute(HtmlAttributeIntermediateNode node)
            {
                // Don't rewrite inside of attributes
            }

            public override void VisitTagHelperHtmlAttribute(TagHelperHtmlAttributeIntermediateNode node)
            {
                // Don't rewrite inside of attributes
            }

            public override void VisitTagHelperProperty(TagHelperPropertyIntermediateNode node)
            {
                // Don't rewrite inside of attributes
            }

            // Builds the span that starts at 'start' and ends just before 'end'.
            private static SourceSpan SpanBetween(SourceLocation start, SourceLocation end)
            {
                return new SourceSpan(start, end.AbsoluteIndex - start.AbsoluteIndex);
            }

            // Replaces the flat children of 'node' with a tree of element, attribute and text nodes.
            // Problems found on the way (unexpected/mismatched closing tags, unclosed tags, content
            // that could not be tokenized) are attached to the produced nodes as diagnostics.
            private void RewriteChildren(RazorSourceDocument source, IntermediateNode node)
            {
                // We expect all of the immediate children of a node (together) to comprise
                // a well-formed tree of elements and components.
                // The bottom of the stack is always 'node' itself; everything above it is an open element.
                var stack = new Stack<IntermediateNode>();
                stack.Push(node);

                // Make a copy, we will clear and rebuild the child collection of this node.
                var children = node.Children.ToArray();
                node.Children.Clear();

                // Due to the way Anglesharp parses HTML (tags at a time) we need to keep track of some state.
                // This handles cases like (illustrative):
                //
                //  <foo bar="17" baz="@baz" />
                //
                // where the tag is split across several children: HTML content for the start of the tag,
                // an HtmlAttribute node for the attribute with a C# value, and HTML content for the rest.
                //
                // We need to consume HTML until we see the end of the tag and then we can combine
                // the attributes from the parsed HTML and the buffered attribute nodes.
                var parser = new HtmlParser(source);
                var attributes = new List<HtmlAttributeIntermediateNode>();

                for (var i = 0; i < children.Length; i++)
                {
                    if (children[i] is HtmlContentIntermediateNode htmlNode)
                    {
                        parser.Push(htmlNode);
                        var tokens = parser.Get();
                        foreach (var token in tokens)
                        {
                            // We have to call this before get. Anglesharp doesn't return the start position
                            // of tokens.
                            var start = parser.GetCurrentLocation();

                            // We have to set the Location explicitly otherwise we would need to include
                            // the token in every call to the parser.
                            parser.SetLocation(token);

                            var end = parser.GetCurrentLocation();

                            if (token.Type == HtmlTokenType.EndOfFile)
                            {
                                break;
                            }

                            switch (token.Type)
                            {
                                case HtmlTokenType.Character:
                                    {
                                        // Text content
                                        var span = SpanBetween(start, end);
                                        stack.Peek().Children.Add(new HtmlContentIntermediateNode()
                                        {
                                            Children =
                                            {
                                                new IntermediateToken()
                                                {
                                                    Content = token.Data,
                                                    Kind = TokenKind.Html,
                                                    Source = span,
                                                }
                                            },
                                            Source = span,
                                        });
                                        break;
                                    }

                                case HtmlTokenType.StartTag:
                                case HtmlTokenType.EndTag:
                                    {
                                        var tag = token.AsTag();

                                        if (token.Type == HtmlTokenType.StartTag)
                                        {
                                            var elementNode = new HtmlElementIntermediateNode()
                                            {
                                                TagName = parser.GetTagNameOriginalCasing(tag),
                                                Source = SpanBetween(start, end),
                                            };

                                            stack.Peek().Children.Add(elementNode);
                                            stack.Push(elementNode);

                                            for (var j = 0; j < tag.Attributes.Count; j++)
                                            {
                                                // Unfortunately Anglesharp doesn't provide positions for attributes
                                                // so we can't record the spans here.
                                                var attribute = tag.Attributes[j];
                                                stack.Peek().Children.Add(CreateAttributeNode(attribute));
                                            }

                                            // Attribute nodes buffered while the tag was still incomplete belong to it.
                                            for (var j = 0; j < attributes.Count; j++)
                                            {
                                                stack.Peek().Children.Add(attributes[j]);
                                            }
                                            attributes.Clear();

                                            // Only a start tag may be closed implicitly. Doing this for an end tag
                                            // (e.g. '</input>') would pop an unrelated ancestor and could empty the
                                            // stack before the end-tag handling below pops again.
                                            if (tag.IsSelfClosing || VoidElements.Contains(tag.Data))
                                            {
                                                // We can't possibly hit an error here since we just added an element node.
                                                stack.Pop();
                                            }
                                        }
                                        else
                                        {
                                            var popped = stack.Pop();
                                            if (stack.Count == 0)
                                            {
                                                // If we managed to 'bottom out' the stack then we have an unbalanced end tag.
                                                // Put back the current node so we don't crash.
                                                stack.Push(popped);

                                                var tagName = parser.GetTagNameOriginalCasing(tag);
                                                var span = SpanBetween(start, end);
                                                var diagnostic = BlazorDiagnosticFactory.Create_UnexpectedClosingTag(span, tagName);
                                                popped.Children.Add(new HtmlElementIntermediateNode()
                                                {
                                                    Diagnostics =
                                                    {
                                                        diagnostic,
                                                    },
                                                    TagName = tagName,
                                                    Source = span,
                                                });
                                            }
                                            else
                                            {
                                                // Anything above the bottom of the stack was pushed as an element node.
                                                var element = (HtmlElementIntermediateNode)popped;
                                                if (!string.Equals(tag.Name, element.TagName, StringComparison.OrdinalIgnoreCase))
                                                {
                                                    var span = SpanBetween(start, end);
                                                    var diagnostic = BlazorDiagnosticFactory.Create_MismatchedClosingTag(span, element.TagName, token.Data);
                                                    popped.Diagnostics.Add(diagnostic);
                                                }
                                                else
                                                {
                                                    // Happy path.
                                                    //
                                                    // We need to compute a new source span because when we found the start tag
                                                    // we didn't yet know the end position of the element.
                                                    var length = end.AbsoluteIndex - popped.Source.Value.AbsoluteIndex;
                                                    popped.Source = new SourceSpan(
                                                        popped.Source.Value.FilePath,
                                                        popped.Source.Value.AbsoluteIndex,
                                                        popped.Source.Value.LineIndex,
                                                        popped.Source.Value.CharacterIndex,
                                                        length);
                                                }
                                            }
                                        }

                                        break;
                                    }

                                case HtmlTokenType.Comment:
                                    // Comments produce no nodes.
                                    break;

                                default:
                                    // NOTE(review): InvalidCastException is an odd choice for an unsupported token;
                                    // kept unchanged so the exception type seen by callers does not change.
                                    throw new InvalidCastException($"Unsupported token type: {token.Type.ToString()}");
                            }
                        }
                    }
                    else if (children[i] is HtmlAttributeIntermediateNode htmlAttribute)
                    {
                        // Buffer the attribute for now, it will get written out as part of a tag.
                        attributes.Add(htmlAttribute);
                    }
                    else
                    {
                        // not HTML, or already rewritten.
                        stack.Peek().Children.Add(children[i]);
                    }
                }

                var extraContent = parser.GetUnparsedContent();
                if (!string.IsNullOrEmpty(extraContent))
                {
                    // extra HTML - almost certainly invalid because it couldn't be parsed.
                    var start = parser.GetCurrentLocation();
                    var end = parser.GetCurrentLocation(extraContent.Length);
                    var span = SpanBetween(start, end);
                    stack.Peek().Children.Add(new HtmlContentIntermediateNode()
                    {
                        Children =
                        {
                            new IntermediateToken()
                            {
                                Content = extraContent,
                                Kind = TokenKind.Html,
                                Source = span,
                            }
                        },
                        Diagnostics =
                        {
                            BlazorDiagnosticFactory.Create_InvalidHtmlContent(span, extraContent),
                        },
                        Source = span,
                    });
                }

                while (stack.Count > 1)
                {
                    // not balanced
                    var popped = (HtmlElementIntermediateNode)stack.Pop();
                    var diagnostic = BlazorDiagnosticFactory.Create_UnclosedTag(popped.Source, popped.TagName);
                    popped.Diagnostics.Add(diagnostic);
                }
            }
        }

        // Wraps a parsed name/value pair in an attribute node holding a single HTML value token.
        // No source span is recorded because the tokenizer does not supply attribute positions.
        private static HtmlAttributeIntermediateNode CreateAttributeNode(KeyValuePair<string, string> attribute)
        {
            return new HtmlAttributeIntermediateNode()
            {
                AttributeName = attribute.Key,
                Children =
                {
                    new HtmlAttributeValueIntermediateNode()
                    {
                        Children =
                        {
                            new IntermediateToken()
                            {
                                Kind = TokenKind.Html,
                                Content = attribute.Value,
                            },
                        }
                    },
                }
            };
        }

        // Concatenates the content of all HTML tokens that are direct children of 'node'.
        private static string GetHtmlContent(HtmlContentIntermediateNode node)
        {
            var builder = new StringBuilder();
            for (var i = 0; i < node.Children.Count; i++)
            {
                if (node.Children[i] is IntermediateToken token && token.IsHtml)
                {
                    builder.Append(token.Content);
                }
            }

            return builder.ToString();
        }

        // Incremental tokenizer front-end. HTML content nodes are pushed one at a time; content that
        // was not consumed by the previous round is kept and prepended to the next one.
        [DebuggerDisplay("{DebuggerDisplay,nq}")]
        private class HtmlParser
        {
            private readonly RazorSourceDocument _source;

            // Tracks the offsets between the start of _content and the original source document.
            // Each entry maps an index in _content to the absolute source index of the same character;
            // entries are ordered by 'offset'.
            private List<(int offset, int sourceOffset)> _offsets;
            private TextSource _textSource;
            private int _position;
            private string _content;

            public HtmlParser(RazorSourceDocument source)
            {
                _source = source;
            }

            // Starts a new tokenizer round over the unconsumed remainder of the previous content
            // followed by the HTML tokens of 'node'.
            public void Push(HtmlContentIntermediateNode node)
            {
                var builder = new StringBuilder();
                var offsets = new List<(int offset, int sourceOffset)>();

                if (_content != null && _position < _content.Length)
                {
                    // Re-base the mapping of the leftover content onto the new buffer. The leftover may
                    // itself span several source runs, so carry forward every entry that starts inside it
                    // instead of assuming it is contiguous with the first run.
                    offsets.Add((0, GetAbsoluteIndex(_position)));
                    for (var i = 0; i < _offsets.Count; i++)
                    {
                        if (_offsets[i].offset > _position)
                        {
                            offsets.Add((_offsets[i].offset - _position, _offsets[i].sourceOffset));
                        }
                    }

                    builder.Append(_content, _position, _content.Length - _position);
                }

                for (var i = 0; i < node.Children.Count; i++)
                {
                    if (node.Children[i] is IntermediateToken token && token.IsHtml)
                    {
                        offsets.Add((builder.Length, token.Source.Value.AbsoluteIndex));
                        builder.Append(token.Content);
                    }
                }

                _content = builder.ToString();
                _offsets = offsets;
                _textSource = new TextSource(_content);
                _position = 0;
            }

            // Returns the part of the current content that lies after the current position, or an
            // empty string when everything was consumed or nothing has been pushed yet.
            public string GetUnparsedContent()
            {
                if (_content == null || _position >= _content.Length)
                {
                    return string.Empty;
                }

                return _content.Substring(_position);
            }

            // Returns the token sequence over the content of the last Push.
            // Throws InvalidOperationException when Push has not been called.
            public IEnumerable<HtmlToken> Get()
            {
                if (_textSource == null)
                {
                    throw new InvalidOperationException("You need to call Push first.");
                }

                return _textSource.Tokenize(HtmlEntityService.Resolver);
            }

            // Moves the current position to where the tokenizer stopped after producing 'token'.
            public void SetLocation(HtmlToken token)
            {
                // The tokenizer will advance to the end when you have an unclosed tag.
                // We don't want this, we want to resume before the unclosed tag.
                if (token.Type != HtmlTokenType.EndOfFile)
                {
                    _position = _textSource.Index;
                }
            }

            // Returns the source location of the current position shifted by 'offset' characters.
            // Line and character index are -1 when the index is not inside any line of the source.
            public SourceLocation GetCurrentLocation(int offset = 0)
            {
                var absoluteIndex = GetAbsoluteIndex(_position + offset);
                var (lineIndex, columnIndex) = GetLineAndColumn(absoluteIndex);

                return new SourceLocation(_source.FilePath, absoluteIndex, lineIndex, columnIndex);
            }

            // Returns the span from the position reported by 'token' up to the current position.
            public SourceSpan GetSpan(HtmlToken token)
            {
                var absoluteIndex = GetAbsoluteIndex(token.Position.Position);
                var (lineIndex, columnIndex) = GetLineAndColumn(absoluteIndex);

                var length = GetAbsoluteIndex(_position) - absoluteIndex;
                return new SourceSpan(_source.FilePath, absoluteIndex, lineIndex, columnIndex, length);
            }

            // Converts an absolute source index to (line, column) by walking the line lengths of the
            // source document. Returns (-1, -1) when the index lies past the last line.
            private (int lineIndex, int columnIndex) GetLineAndColumn(int absoluteIndex)
            {
                var remaining = absoluteIndex;
                for (var i = 0; i < _source.Lines.Count; i++)
                {
                    var lineLength = _source.Lines.GetLineLength(i);
                    if (lineLength > remaining)
                    {
                        return (i, remaining);
                    }

                    remaining -= lineLength;
                }

                return (-1, -1);
            }

            // Maps an index in _content to an absolute index in the source document using the last
            // offset entry that starts at or before it.
            private int GetAbsoluteIndex(int contentIndex)
            {
                for (var i = _offsets.Count - 1; i >= 0; i--)
                {
                    if (_offsets[i].offset <= contentIndex)
                    {
                        return _offsets[i].sourceOffset + (contentIndex - _offsets[i].offset);
                    }
                }

                throw new InvalidOperationException("Unexpected index value.");
            }

            // Anglesharp canonicalizes the case of tags, we want what the user typed.
            // NOTE(review): this currently returns the tokenizer's name unchanged, so the original
            // casing is not actually recovered here - confirm whether that is intended.
            public string GetTagNameOriginalCasing(HtmlTagToken tag)
            {
                return tag.Name;
            }

            // Debugger view of the content with '|' marking the current position.
            private string DebuggerDisplay
            {
                get
                {
                    if (_content == null)
                    {
                        return "Content={}";
                    }

                    var split = Math.Min(_position, _content.Length);

                    var builder = new StringBuilder();
                    builder.Append("Content=");
                    builder.Append("{");
                    builder.Append(_content.Substring(0, split));
                    builder.Append("|");
                    builder.Append(_content.Substring(split));
                    builder.Append("}");

                    return builder.ToString();
                }
            }
        }
    }
}