* Decode HTML entities
\n\nCommit migrated from 6b7b9a3bc3
This commit is contained in:
Ajay Bhargav Baaskaran 2019-04-03 10:03:42 -07:00 committed by GitHub
parent 249752087e
commit 01e9d70207
2 changed files with 3839 additions and 4 deletions

View File

@ -1,8 +1,13 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text;
using Microsoft.AspNetCore.Razor.Language.Intermediate;
using Microsoft.AspNetCore.Razor.Language.Legacy;
namespace Microsoft.AspNetCore.Razor.Language.Components
{
@ -38,17 +43,18 @@ namespace Microsoft.AspNetCore.Razor.Language.Components
// We want to use AddMarkupContent to avoid aggressive encoding during prerendering.
// Specifically, when one of the following characters are in the content,
// 1. New lines (\r, \n), tabs(\t) - so they get rendered as actual new lines, tabs instead of encoded entities such as &#xA;
// 2. Ampersands (&) - so that HTML entities are rendered correctly without getting encoded
// 3. Any character outside the ASCII range
// 2. Any character outside the ASCII range
private static readonly char[] EncodedCharacters = new[] { '\r', '\n', '\t', '&' };
private static readonly char[] EncodedCharacters = new[] { '\r', '\n', '\t' };
private readonly Dictionary<string, string> _seenEntities = new Dictionary<string, string>(StringComparer.Ordinal);
public override void VisitHtml(HtmlContentIntermediateNode node)
{
for (var i = 0; i < node.Children.Count; i++)
{
var child = node.Children[i];
if (!(child is IntermediateToken token) || !token.IsHtml)
if (!(child is IntermediateToken token) || !token.IsHtml || string.IsNullOrEmpty(token.Content))
{
// We only care about Html tokens.
continue;
@ -65,6 +71,145 @@ namespace Microsoft.AspNetCore.Razor.Language.Components
}
}
}
// If we reach here, we don't have newlines, tabs or non-ascii characters in this node.
// If we can successfully decode all HTML entities(if any) in this node, we can safely let it call AddContent.
var decodedContent = new string[node.Children.Count];
for (var i = 0; i < node.Children.Count; i++)
{
var child = node.Children[i];
if (!(child is IntermediateToken token) || !token.IsHtml || string.IsNullOrEmpty(token.Content))
{
// We only care about Html tokens.
continue;
}
if (TryDecodeHtmlEntities(token.Content, out var decoded))
{
decodedContent[i] = decoded;
}
else
{
node.SetEncoded();
return;
}
}
// If we reach here, it means we have successfully decoded all content.
// Replace all token content with the decoded value.
for (var i = 0; i < node.Children.Count; i++)
{
var child = node.Children[i];
if (!(child is IntermediateToken token) || !token.IsHtml || string.IsNullOrEmpty(token.Content))
{
// We only care about Html tokens.
continue;
}
token.Content = decodedContent[i];
}
}
/// <summary>
/// Attempts to replace every HTML entity in <paramref name="content"/> with its decoded text.
/// </summary>
/// <param name="content">The HTML token text to decode.</param>
/// <param name="decoded">
/// On success, <paramref name="content"/> with each recognized entity replaced by its decoded value.
/// On failure, the original <paramref name="content"/>, unchanged.
/// </param>
/// <returns>
/// <c>true</c> when every '&amp;' in <paramref name="content"/> starts an entity that
/// <c>TryGetHtmlEntity</c> recognizes (including when there is no '&amp;' at all); otherwise <c>false</c>.
/// </returns>
private bool TryDecodeHtmlEntities(string content, out string decoded)
{
    decoded = content;

    // Created lazily so that content without any entity is returned as-is without allocating.
    StringBuilder builder = null;

    var i = 0;
    while (i < content.Length)
    {
        var ch = content[i];
        if (ch != '&')
        {
            builder?.Append(ch);
            i++;
            continue;
        }

        if (!TryGetHtmlEntity(content, i, out var entity, out var replacement))
        {
            // We found a '&' that we don't know what to do with. Don't try to decode further.
            return false;
        }

        if (builder == null)
        {
            // First entity: start the output with everything that preceded it.
            builder = new StringBuilder(content.Length);
            builder.Append(content, 0, i);
        }

        // Decode in a single left-to-right pass. Applying string.Replace per entity over the whole
        // string would re-scan already decoded text, so "&amp;lt;" next to a real "&lt;" would be
        // decoded twice and come out as "<" instead of "&lt;".
        builder.Append(replacement);
        i += entity.Length;
    }

    if (builder != null)
    {
        decoded = builder.ToString();
    }

    return true;
}
/// <summary>
/// Checks whether the '&amp;' at <paramref name="position"/> begins an HTML entity with a known replacement.
/// </summary>
/// <param name="content">The text being scanned.</param>
/// <param name="position">Index of the '&amp;' character in <paramref name="content"/>.</param>
/// <param name="entity">
/// The candidate entity text, from '&amp;' through the terminating ';', whenever a terminating ';' is found;
/// otherwise <c>null</c>. Only meaningful to callers when the method returns <c>true</c>.
/// </param>
/// <param name="replacement">The value looked up for the entity, or <c>null</c> when it is not recognized.</param>
/// <returns><c>true</c> when the entity was found in the named or numeric lookup tables; otherwise <c>false</c>.</returns>
private bool TryGetHtmlEntity(string content, int position, out string entity, out string replacement)
{
    // We're at '&'. Check if it is the start of an HTML entity.
    entity = null;
    replacement = null;

    var endPosition = -1;
    for (var i = position + 1; i < content.Length; i++)
    {
        var ch = content[i];
        if (char.IsLetterOrDigit(ch) || ch == '#')
        {
            continue;
        }

        if (ch == ';')
        {
            endPosition = i;
        }

        // Stop at the terminating ';' or at the first character that cannot be part of an entity.
        break;
    }

    if (endPosition == -1)
    {
        // The '&' is not part of an HTML entity.
        return false;
    }

    entity = content.Substring(position, endPosition - position + 1);
    if (!entity.StartsWith("&#", StringComparison.Ordinal))
    {
        // Named entity; the lookup key includes the leading '&' and the trailing ';'.
        return ParserHelpers.NamedHtmlEntities.TryGetValue(entity, out replacement);
    }

    // Numeric entity. `entity` is guaranteed to be of the format &#****;
    // so strip the leading "&#" and the trailing ';' to get the code point text.
    var entityValue = entity.Substring(2, entity.Length - 3);
    if (!int.TryParse(entityValue, NumberStyles.Integer, CultureInfo.InvariantCulture, out var codePoint))
    {
        // Not decimal, so try hexadecimal. The standard HTML form is "&#xHHHH;"; bare hex digits and a
        // "0x" prefix are also accepted because earlier versions of this method accepted them.
        var hexDigits = entityValue;
        if (entityValue.StartsWith("0x", StringComparison.OrdinalIgnoreCase))
        {
            hexDigits = entityValue.Substring(2);
        }
        else if (entityValue.Length > 0 && (entityValue[0] == 'x' || entityValue[0] == 'X'))
        {
            hexDigits = entityValue.Substring(1);
        }

        // TryParse never throws, so empty ("&#;"), malformed or overflowing values simply fail here.
        if (!int.TryParse(hexDigits, NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture, out codePoint))
        {
            // TryParse leaves 0 in the out parameter on failure; use -1 so the lookup below cannot
            // match a real code point.
            codePoint = -1;
        }
    }

    // True only for a known html entity unicode codepoint; unknown code points are left undecoded.
    return ParserHelpers.HtmlEntityCodePoints.TryGetValue(codePoint, out replacement);
}
}
}