diff --git a/src/Microsoft.AspNet.WebUtilities/Encoders/HtmlEncoder.cs b/src/Microsoft.AspNet.WebUtilities/Encoders/HtmlEncoder.cs
index b0559d7219..0f205b3e43 100644
--- a/src/Microsoft.AspNet.WebUtilities/Encoders/HtmlEncoder.cs
+++ b/src/Microsoft.AspNet.WebUtilities/Encoders/HtmlEncoder.cs
@@ -3,7 +3,6 @@
using System;
using System.Diagnostics;
-using System.Runtime.CompilerServices;
using System.Text;
using System.Threading;
@@ -21,14 +20,14 @@ namespace Microsoft.AspNet.WebUtilities.Encoders
// The default HtmlEncoder (Basic Latin), instantiated on demand
private static HtmlEncoder _defaultEncoder;
- // A bitmap of characters which are allowed to be returned unescaped.
- private readonly uint[] _allowedCharsBitmap = new uint[0x10000 / 32];
+ // The inner encoder, responsible for the actual encoding routines
+ private readonly HtmlUnicodeEncoder _innerUnicodeEncoder;
/// <summary>
/// Instantiates an encoder using the 'Basic Latin' code table as the allow list.
/// </summary>
public HtmlEncoder()
- : this(CodePointFilters.BasicLatin)
+ : this(HtmlUnicodeEncoder.BasicLatin)
{
}
@@ -36,41 +35,14 @@ namespace Microsoft.AspNet.WebUtilities.Encoders
/// Instantiates an encoder using a custom allow list of characters.
/// </summary>
public HtmlEncoder(params ICodePointFilter[] filters)
+ : this(new HtmlUnicodeEncoder(filters))
{
- if (filters == null)
- {
- return; // no characters are allowed, just no-op immediately
- }
+ }
- // Punch a hole for each allowed code point across all filters (this is an OR).
- // We don't allow supplementary (astral) characters for now.
- foreach (var filter in filters)
- {
- foreach (var codePoint in filter.GetAllowedCodePoints())
- {
- if (!UnicodeHelpers.IsSupplementaryCodePoint(codePoint))
- {
- AllowCharacter((char)codePoint);
- }
- }
- }
-
- // Forbid characters that are special in HTML
- ForbidCharacter('<');
- ForbidCharacter('>');
- ForbidCharacter('&');
- ForbidCharacter('\''); // can be used to escape attributes
- ForbidCharacter('\"'); // can be used to escape attributes
- ForbidCharacter('+'); // technically not HTML-specific, but can be used to perform UTF7-based attacks
-
- // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed
- // (includes categories Cc, Cs, Co, Cn, Zl, Zp)
- uint[] definedCharactersBitmap = UnicodeHelpers.GetDefinedCharacterBitmap();
- Debug.Assert(definedCharactersBitmap.Length == _allowedCharsBitmap.Length);
- for (int i = 0; i < _allowedCharsBitmap.Length; i++)
- {
- _allowedCharsBitmap[i] &= definedCharactersBitmap[i];
- }
+ private HtmlEncoder(HtmlUnicodeEncoder innerEncoder)
+ {
+ Debug.Assert(innerEncoder != null);
+ _innerUnicodeEncoder = innerEncoder;
}
/// <summary>
@@ -91,139 +63,78 @@ namespace Microsoft.AspNet.WebUtilities.Encoders
}
}
- // Marks a character as allowed (can be returned unencoded)
- private void AllowCharacter(char c)
- {
- uint codePoint = (uint)c;
- int index = (int)(codePoint >> 5);
- int offset = (int)(codePoint & 0x1FU);
- _allowedCharsBitmap[index] |= 0x1U << offset;
- }
-
- // Marks a character as forbidden (must be returned encoded)
- private void ForbidCharacter(char c)
- {
- uint codePoint = (uint)c;
- int index = (int)(codePoint >> 5);
- int offset = (int)(codePoint & 0x1FU);
- _allowedCharsBitmap[index] &= ~(0x1U << offset);
- }
-
/// <summary>
/// Everybody's favorite HtmlEncode routine.
/// </summary>
public string HtmlEncode(string value)
{
- if (String.IsNullOrEmpty(value))
- {
- return value;
- }
-
- // Quick check: does the string need to be encoded at all?
- // If not, just return the input string as-is.
- for (int i = 0; i < value.Length; i++)
- {
- if (!IsCharacterAllowed(value[i]))
- {
- return HtmlEncodeImpl(value, i);
- }
- }
- return value;
+ return _innerUnicodeEncoder.Encode(value);
}
- private string HtmlEncodeImpl(string input, int idxOfFirstCharWhichRequiresEncoding)
+ private sealed class HtmlUnicodeEncoder : UnicodeEncoderBase
{
- Debug.Assert(idxOfFirstCharWhichRequiresEncoding >= 0);
- Debug.Assert(idxOfFirstCharWhichRequiresEncoding < input.Length);
+ // A singleton instance of the basic latin encoder.
+ private static HtmlUnicodeEncoder _basicLatinSingleton;
// The worst case encoding is 8 output chars per input char: [input] U+FFFF -> [output] "&#xFFFF;"
// We don't need to worry about astral code points since they consume *two* input chars to
- // generate at most 10 output chars ("&#x10FFFF;"), which equates to 5 output per input.
- int numCharsWhichMayRequireEncoding = input.Length - idxOfFirstCharWhichRequiresEncoding;
- int sbCapacity = checked(idxOfFirstCharWhichRequiresEncoding + EncoderCommon.GetCapacityOfOutputStringBuilder(numCharsWhichMayRequireEncoding, worstCaseOutputCharsPerInputChar: 8));
- Debug.Assert(sbCapacity >= input.Length);
+ // generate at most 10 output chars ("&#x10FFFF;"), which equates to 5 output chars per input char.
+ private const int MaxOutputCharsPerInputChar = 8;
- // Allocate the StringBuilder with the first (known to not require encoding) part of the input string,
- // then begin encoding from the last (potentially requiring encoding) part of the input string.
- StringBuilder builder = new StringBuilder(input, 0, idxOfFirstCharWhichRequiresEncoding, sbCapacity);
- fixed (char* pInput = input)
+ internal HtmlUnicodeEncoder(ICodePointFilter[] filters)
+ : base(filters, MaxOutputCharsPerInputChar)
{
- return HtmlEncodeImpl2(builder, &pInput[idxOfFirstCharWhichRequiresEncoding], (uint)numCharsWhichMayRequireEncoding);
}
- }
- private string HtmlEncodeImpl2(StringBuilder builder, char* input, uint charsRemaining)
- {
- while (charsRemaining != 0)
+ internal static HtmlUnicodeEncoder BasicLatin
{
- int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(input, endOfString: (charsRemaining == 1));
- if (UnicodeHelpers.IsSupplementaryCodePoint(nextScalar))
+ get
{
- // Supplementary characters should always be encoded numerically.
- WriteScalarAsHtmlEncodedEntity(builder, (uint)nextScalar);
-
- // We consume two UTF-16 characters for a single supplementary character.
- input += 2;
- charsRemaining -= 2;
- }
- else
- {
- // Otherwise, this was a BMP character.
- input++;
- charsRemaining--;
- char c = (char)nextScalar;
- if (IsCharacterAllowed(c))
+ HtmlUnicodeEncoder encoder = Volatile.Read(ref _basicLatinSingleton);
+ if (encoder == null)
{
- builder.Append(c);
- }
- else
- {
- if (c == '<') { builder.Append("&lt;"); }
- else if (c == '>') { builder.Append("&gt;"); }
- else if (c == '&') { builder.Append("&amp;"); }
- else if (c == '\"') { builder.Append("&quot;"); }
- else { WriteScalarAsHtmlEncodedEntity(builder, (uint)nextScalar); }
+ encoder = new HtmlUnicodeEncoder(new[] { CodePointFilters.BasicLatin });
+ Volatile.Write(ref _basicLatinSingleton, encoder);
}
+ return encoder;
}
}
- return builder.ToString();
- }
-
- // Determines whether the given character can be returned unencoded.
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private bool IsCharacterAllowed(char c)
- {
- uint codePoint = (uint)c;
- int index = (int)(codePoint >> 5);
- int offset = (int)(codePoint & 0x1FU);
- return ((_allowedCharsBitmap[index] >> offset) & 0x1U) != 0;
- }
-
- // Writes a scalar value as "&#xFFFFFFFF;"
- private static void WriteScalarAsHtmlEncodedEntity(StringBuilder builder, uint value)
- {
- // We're building the characters up in reverse
- char* chars = stackalloc char[8 /* "FFFFFFFF" */];
- int numCharsWritten = 0;
- do
+ // Writes a scalar value as an HTML-encoded entity. The four characters that
+ // have a named entity here (double quote, ampersand, less-than, greater-than)
+ // are written in named form; every other value is delegated to
+ // WriteEncodedScalarAsNumericEntity.
+ protected override void WriteEncodedScalar(StringBuilder builder, uint value)
{
- Debug.Assert(numCharsWritten < 8, "Couldn't have written 8 characters out by this point.");
- // Pop off the last nibble
- chars[numCharsWritten++] = HexUtil.IntToChar(value & 0xFU);
- value >>= 4;
- } while (value != 0);
+ if (value == (uint)'\"') { builder.Append("&quot;"); }
+ else if (value == (uint)'&') { builder.Append("&amp;"); }
+ else if (value == (uint)'<') { builder.Append("&lt;"); }
+ else if (value == (uint)'>') { builder.Append("&gt;"); }
+ else { WriteEncodedScalarAsNumericEntity(builder, value); }
+ }
- // Finally, write out the HTML-encoded scalar value.
- builder.Append('&');
- builder.Append('#');
- builder.Append('x');
- Debug.Assert(numCharsWritten > 0, "At least one character should've been written.");
- do
+ // Writes a scalar value as an HTML-encoded numeric entity.
+ private static void WriteEncodedScalarAsNumericEntity(StringBuilder builder, uint value)
{
- builder.Append(chars[--numCharsWritten]);
- } while (numCharsWritten != 0);
- builder.Append(';');
+ // We're building the characters up in reverse
+ char* chars = stackalloc char[8 /* "FFFFFFFF" */];
+ int numCharsWritten = 0;
+ do
+ {
+ Debug.Assert(numCharsWritten < 8, "Couldn't have written 8 characters out by this point.");
+ // Pop off the last nibble
+ chars[numCharsWritten++] = HexUtil.IntToChar(value & 0xFU);
+ value >>= 4;
+ } while (value != 0);
+
+ // Finally, write out the HTML-encoded scalar value.
+ builder.Append('&');
+ builder.Append('#');
+ builder.Append('x');
+ Debug.Assert(numCharsWritten > 0, "At least one character should've been written.");
+ do
+ {
+ builder.Append(chars[--numCharsWritten]);
+ } while (numCharsWritten != 0);
+ builder.Append(';');
+ }
}
}
}