Decode HTML entities (dotnet/aspnetcore-tooling#379)
* Decode HTML entities
\n\nCommit migrated from 6b7b9a3bc3
This commit is contained in:
parent
249752087e
commit
01e9d70207
|
|
@ -1,8 +1,13 @@
|
|||
// Copyright (c) .NET Foundation. All rights reserved.
|
||||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Globalization;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using Microsoft.AspNetCore.Razor.Language.Intermediate;
|
||||
using Microsoft.AspNetCore.Razor.Language.Legacy;
|
||||
|
||||
namespace Microsoft.AspNetCore.Razor.Language.Components
|
||||
{
|
||||
|
|
@ -38,17 +43,18 @@ namespace Microsoft.AspNetCore.Razor.Language.Components
|
|||
// We want to use AddMarkupContent to avoid aggressive encoding during prerendering.
|
||||
// Specifically, when one of the following characters are in the content,
|
||||
// 1. New lines (\r, \n), tabs (\t) - so they get rendered as actual new lines, tabs instead of &#xA;
|
||||
// 2. Ampersands (&) - so that HTML entities are rendered correctly without getting encoded
|
||||
// 3. Any character outside the ASCII range
|
||||
// 2. Any character outside the ASCII range
|
||||
|
||||
private static readonly char[] EncodedCharacters = new[] { '\r', '\n', '\t', '&' };
|
||||
private static readonly char[] EncodedCharacters = new[] { '\r', '\n', '\t' };
|
||||
|
||||
private readonly Dictionary<string, string> _seenEntities = new Dictionary<string, string>(StringComparer.Ordinal);
|
||||
|
||||
public override void VisitHtml(HtmlContentIntermediateNode node)
|
||||
{
|
||||
for (var i = 0; i < node.Children.Count; i++)
|
||||
{
|
||||
var child = node.Children[i];
|
||||
if (!(child is IntermediateToken token) || !token.IsHtml)
|
||||
if (!(child is IntermediateToken token) || !token.IsHtml || string.IsNullOrEmpty(token.Content))
|
||||
{
|
||||
// We only care about Html tokens.
|
||||
continue;
|
||||
|
|
@ -65,6 +71,145 @@ namespace Microsoft.AspNetCore.Razor.Language.Components
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we reach here, we don't have newlines, tabs or non-ascii characters in this node.
|
||||
// If we can successfully decode all HTML entities(if any) in this node, we can safely let it call AddContent.
|
||||
var decodedContent = new string[node.Children.Count];
|
||||
for (var i = 0; i < node.Children.Count; i++)
|
||||
{
|
||||
var child = node.Children[i];
|
||||
if (!(child is IntermediateToken token) || !token.IsHtml || string.IsNullOrEmpty(token.Content))
|
||||
{
|
||||
// We only care about Html tokens.
|
||||
continue;
|
||||
}
|
||||
|
||||
if (TryDecodeHtmlEntities(token.Content, out var decoded))
|
||||
{
|
||||
decodedContent[i] = decoded;
|
||||
}
|
||||
else
|
||||
{
|
||||
node.SetEncoded();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// If we reach here, it means we have successfully decoded all content.
|
||||
// Replace all token content with the decoded value.
|
||||
for (var i = 0; i < node.Children.Count; i++)
|
||||
{
|
||||
var child = node.Children[i];
|
||||
if (!(child is IntermediateToken token) || !token.IsHtml || string.IsNullOrEmpty(token.Content))
|
||||
{
|
||||
// We only care about Html tokens.
|
||||
continue;
|
||||
}
|
||||
|
||||
token.Content = decodedContent[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Attempts to decode every HTML entity in `content`.
//
// content: the text to decode. Must not be null (the visible call site skips null/empty token content).
// decoded: on success, `content` with each entity replaced by the replacement string reported by
//          TryGetHtmlEntity; on failure, `content` unchanged.
// Returns: true when every '&' in `content` starts an entity that TryGetHtmlEntity recognizes
//          (trivially true when there is no '&' at all); false as soon as one '&' cannot be resolved.
//
// Decoding is done in a single left-to-right pass. Each entity is replaced exactly where it was
// found, and replacement text is never rescanned. Collecting the entities and then running
// string.Replace over the whole string once per entity is NOT equivalent: for "&amp;lt;&lt;" the
// "&amp;" -> "&" replacement manufactures a second "&lt;", which the following "&lt;" -> "<"
// replacement then decodes as well, producing "<<" instead of the correct "&lt;<".
//
// NOTE(review): this method no longer reads or writes the `_seenEntities` field.
private bool TryDecodeHtmlEntities(string content, out string decoded)
{
    // Until decoding fully succeeds the caller gets the input back untouched.
    decoded = content;

    var i = content.IndexOf('&');
    if (i < 0)
    {
        // No entities at all; nothing to allocate or copy.
        return true;
    }

    var builder = new StringBuilder(content.Length);

    // Start of the run of literal characters that has not been copied to `builder` yet.
    var literalStart = 0;

    while (i < content.Length)
    {
        if (content[i] != '&')
        {
            i++;
            continue;
        }

        if (!TryGetHtmlEntity(content, i, out var entity, out var replacement))
        {
            // We found a '&' that we don't know what to do with. Don't try to decode further.
            return false;
        }

        // Flush the literal text preceding the entity, then the entity's decoded form.
        builder.Append(content, literalStart, i - literalStart);
        builder.Append(replacement);

        // Resume scanning right after the entity's terminating ';'.
        i += entity.Length;
        literalStart = i;
    }

    // Copy whatever literal text follows the last entity.
    builder.Append(content, literalStart, content.Length - literalStart);

    decoded = builder.ToString();
    return true;
}
|
||||
|
||||
// Checks whether the '&' at `position` starts an HTML entity that can be decoded.
//
// content:     the text being scanned.
// position:    index of the '&' in `content`.
// entity:      the raw entity text from '&' through ';' (for example "&lt;" or "&#60;") whenever a
//              terminating ';' was found, even if the entity then turns out to be unknown;
//              null when no ';' terminates the candidate.
// replacement: the decoded text when the method returns true.
// Returns:     true only when the entity is terminated by ';' and is found in
//              ParserHelpers.NamedHtmlEntities (named form) or its code point is found in
//              ParserHelpers.HtmlEntityCodePoints (numeric form).
//
// Numeric references follow the HTML syntax: "&#" + decimal digits + ";" or
// "&#x" / "&#X" + hexadecimal digits + ";". Anything else after "&#" (no digits, a missing 'x'
// before hex digits, a "0x" prefix, a value that overflows Int32) is reported as "not an entity"
// instead of being guessed at or throwing. Returning false is the safe choice: the visible caller
// then marks the node encoded and leaves the text as written.
private bool TryGetHtmlEntity(string content, int position, out string entity, out string replacement)
{
    // We're at '&'. Check if it is the start of an HTML entity.
    entity = null;
    replacement = null;

    // Scan the candidate entity body; it must consist of letters, digits or '#' and end with ';'.
    var endPosition = -1;
    for (var i = position + 1; i < content.Length; i++)
    {
        var ch = content[i];
        if (char.IsLetterOrDigit(ch) || ch == '#')
        {
            continue;
        }

        if (ch == ';')
        {
            endPosition = i;
        }

        // Either we found the terminator or hit a character that cannot be part of an entity.
        break;
    }

    if (endPosition == -1)
    {
        // The '&' is not part of an HTML entity.
        return false;
    }

    entity = content.Substring(position, endPosition - position + 1);

    // Ordinal: this is a syntactic prefix check and must not depend on the current culture.
    if (!entity.StartsWith("&#", StringComparison.Ordinal))
    {
        // Named entity; the lookup key includes the leading '&' and trailing ';'.
        return ParserHelpers.NamedHtmlEntities.TryGetValue(entity, out replacement);
    }

    // `entity` is guaranteed to be of the format &#****; - strip "&#" and ";".
    var entityValue = entity.Substring(2, entity.Length - 3);

    int codePoint;
    bool parsed;
    if (entityValue.Length > 0 && (entityValue[0] == 'x' || entityValue[0] == 'X'))
    {
        // Hexadecimal form, e.g. &#x00CD;. TryParse reports empty, malformed or overflowing
        // input by returning false rather than throwing.
        parsed = int.TryParse(
            entityValue.Substring(1),
            NumberStyles.AllowHexSpecifier,
            CultureInfo.InvariantCulture,
            out codePoint);
    }
    else
    {
        // Decimal form, e.g. &#205;. NumberStyles.None accepts digits only.
        parsed = int.TryParse(
            entityValue,
            NumberStyles.None,
            CultureInfo.InvariantCulture,
            out codePoint);
    }

    if (!parsed)
    {
        // Not a well-formed numeric character reference.
        return false;
    }

    // True only for code points the table knows how to replace; otherwise an unknown entity.
    return ParserHelpers.HtmlEntityCodePoints.TryGetValue(codePoint, out replacement);
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue