Decode HTML entities (dotnet/aspnetcore-tooling#379)

* Decode HTML entities \n\nCommit migrated from 6b7b9a3bc3
2019-04-03 10:03:42 -07:00 · 2019-04-03 10:03:42 -07:00 · 01e9d70207
parent 249752087e
commit 01e9d70207
2 changed files with 3839 additions and 4 deletions
--- a/src/Razor/Microsoft.AspNetCore.Razor.Language/src/Components/ComponentMarkupEncodingPass.cs
+++ b/src/Razor/Microsoft.AspNetCore.Razor.Language/src/Components/ComponentMarkupEncodingPass.cs
@ -1,8 +1,13 @@
 // Copyright (c) .NET Foundation. All rights reserved.
 // Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

+using System;
+using System.Collections.Generic;
+using System.Globalization;
 using System.Linq;
+using System.Text;
 using Microsoft.AspNetCore.Razor.Language.Intermediate;
+using Microsoft.AspNetCore.Razor.Language.Legacy;

 namespace Microsoft.AspNetCore.Razor.Language.Components
 {
@ -38,17 +43,18 @@ namespace Microsoft.AspNetCore.Razor.Language.Components
            // We want to use AddMarkupContent to avoid aggresive encoding during prerendering.
            // Specifically, when one of the following characters are in the content,
            // 1. New lines (\r, \n), tabs(\t) - so they get rendered as actual new lines, tabs instead of &#xA;
-            // 2. Ampersands (&) - so that HTML entities are rendered correctly without getting encoded
-            // 3. Any character outside the ASCII range
+            // 2. Any character outside the ASCII range

-            private static readonly char[] EncodedCharacters = new[] { '\r', '\n', '\t', '&' };
+            private static readonly char[] EncodedCharacters = new[] { '\r', '\n', '\t' };
+
+            private readonly Dictionary<string, string> _seenEntities = new Dictionary<string, string>(StringComparer.Ordinal);

            public override void VisitHtml(HtmlContentIntermediateNode node)
            {
                for (var i = 0; i < node.Children.Count; i++)
                {
                    var child = node.Children[i];
-                    if (!(child is IntermediateToken token) || !token.IsHtml)
+                    if (!(child is IntermediateToken token) || !token.IsHtml || string.IsNullOrEmpty(token.Content))
                    {
                        // We only care about Html tokens.
                        continue;
@ -65,6 +71,145 @@ namespace Microsoft.AspNetCore.Razor.Language.Components
                        }
                    }
                }
+
+                // If we reach here, we don't have newlines, tabs or non-ascii characters in this node.
+                // If every HTML entity (if any) in this node can be decoded, the node can safely be emitted via AddContent.
+                var decodedContent = new string[node.Children.Count];
+                for (var i = 0; i < node.Children.Count; i++)
+                {
+                    var child = node.Children[i];
+                    if (!(child is IntermediateToken token) || !token.IsHtml || string.IsNullOrEmpty(token.Content))
+                    {
+                        // We only care about Html tokens.
+                        continue;
+                    }
+
+                    if (TryDecodeHtmlEntities(token.Content, out var decoded))
+                    {
+                        decodedContent[i] = decoded;
+                    }
+                    else
+                    {
+                        node.SetEncoded();
+                        return;
+                    }
+                }
+
+                // If we reach here, it means we have successfully decoded all content.
+                // Replace all token content with the decoded value.
+                for (var i = 0; i < node.Children.Count; i++)
+                {
+                    var child = node.Children[i];
+                    if (!(child is IntermediateToken token) || !token.IsHtml || string.IsNullOrEmpty(token.Content))
+                    {
+                        // We only care about Html tokens.
+                        continue;
+                    }
+
+                    token.Content = decodedContent[i];
+                }
+            }
+
+            // Decodes every HTML entity in `content` in a single left-to-right pass.
+            //
+            // content: text of an HTML token; VisitHtml only passes non-null, non-empty strings.
+            // decoded: on success, `content` with each entity replaced by the value reported by
+            //          TryGetHtmlEntity (the same string instance when there is nothing to decode);
+            //          on failure, `content` unchanged.
+            // Returns false as soon as an '&' is found that TryGetHtmlEntity cannot resolve.
+            private bool TryDecodeHtmlEntities(string content, out string decoded)
+            {
+                decoded = content;
+
+                // Allocated lazily so content without any '&' costs nothing extra.
+                StringBuilder builder = null;
+
+                // Index of the first character of `content` that has not been copied to `builder` yet.
+                var copiedUpTo = 0;
+
+                var i = 0;
+                while (i < content.Length)
+                {
+                    if (content[i] != '&')
+                    {
+                        i++;
+                        continue;
+                    }
+
+                    if (!TryGetHtmlEntity(content, i, out var entity, out var replacement))
+                    {
+                        // We found a '&' that we don't know what to do with. Don't try to decode further.
+                        return false;
+                    }
+
+                    if (builder == null)
+                    {
+                        builder = new StringBuilder(content.Length);
+                    }
+
+                    // Copy the literal text preceding the entity, then the decoded value.
+                    // Decoded text is only ever appended and never rescanned, so the output of one
+                    // replacement cannot combine with the text that follows it to form another entity
+                    // (which is what repeated whole-string Replace calls would allow).
+                    builder.Append(content, copiedUpTo, i - copiedUpTo);
+                    builder.Append(replacement);
+
+                    i += entity.Length;
+                    copiedUpTo = i;
+                }
+
+                if (builder != null)
+                {
+                    // Flush whatever literal text follows the last entity.
+                    builder.Append(content, copiedUpTo, content.Length - copiedUpTo);
+                    decoded = builder.ToString();
+                }
+
+                return true;
+            }
+
+            // Checks whether the '&' at `position` starts an HTML entity that we know how to decode.
+            //
+            // content:     the text being scanned.
+            // position:    index of the '&' in `content`.
+            // entity:      the full entity text including '&' and ';' when a ';'-terminated candidate
+            //              was found (even if it turns out to be unknown); otherwise null.
+            // replacement: the decoded value when the method returns true; otherwise null.
+            //
+            // Numeric references are accepted as decimal ("&#205;") or as hexadecimal with an 'x'/'X'
+            // prefix ("&#xCD;") and resolved through ParserHelpers.HtmlEntityCodePoints. Everything else
+            // is looked up verbatim in ParserHelpers.NamedHtmlEntities. Malformed input never throws;
+            // it simply yields false.
+            private bool TryGetHtmlEntity(string content, int position, out string entity, out string replacement)
+            {
+                // We're at '&'. Check if it is the start of an HTML entity.
+                entity = null;
+                replacement = null;
+
+                // Walk over the entity body; it must be terminated by ';' to count as a candidate.
+                var endPosition = -1;
+                for (var i = position + 1; i < content.Length; i++)
+                {
+                    var ch = content[i];
+                    if (char.IsLetterOrDigit(ch) || ch == '#')
+                    {
+                        continue;
+                    }
+
+                    if (ch == ';')
+                    {
+                        endPosition = i;
+                    }
+
+                    // Either we found the terminator or hit a character that cannot be part of an entity.
+                    break;
+                }
+
+                if (endPosition == -1)
+                {
+                    // The '&' is not part of an HTML entity.
+                    return false;
+                }
+
+                entity = content.Substring(position, endPosition - position + 1);
+                if (!entity.StartsWith("&#", StringComparison.Ordinal))
+                {
+                    // Named entity; unknown names are reported as "not an entity".
+                    return ParserHelpers.NamedHtmlEntities.TryGetValue(entity, out replacement);
+                }
+
+                // `entity` is guaranteed to be of the format &#****; - strip the "&#" and the ';'.
+                var entityValue = entity.Substring(2, entity.Length - 3);
+
+                int codePoint;
+                bool parsed;
+                if (entityValue.Length > 1 && (entityValue[0] == 'x' || entityValue[0] == 'X'))
+                {
+                    // Hexadecimal form: AllowHexSpecifier parses bare hex digits, so drop the 'x' first.
+                    parsed = int.TryParse(
+                        entityValue.Substring(1),
+                        NumberStyles.AllowHexSpecifier,
+                        CultureInfo.InvariantCulture,
+                        out codePoint);
+                }
+                else
+                {
+                    // Decimal form: digits only. Empty, signed or out-of-range values fail to parse
+                    // instead of throwing.
+                    parsed = int.TryParse(
+                        entityValue,
+                        NumberStyles.None,
+                        CultureInfo.InvariantCulture,
+                        out codePoint);
+                }
+
+                // Only code points present in the known-entity table are decoded; anything else is
+                // treated as an unknown entity.
+                return parsed && ParserHelpers.HtmlEntityCodePoints.TryGetValue(codePoint, out replacement);
            }
        }
    }
--- a/src/Razor/Microsoft.AspNetCore.Razor.Language/src/Legacy/ParserHelpers.cs
+++ b/src/Razor/Microsoft.AspNetCore.Razor.Language/src/Legacy/ParserHelpers.cs