diff --git a/src/Microsoft.AspNet.WebUtilities/Encoders/HtmlEncoder.cs b/src/Microsoft.AspNet.WebUtilities/Encoders/HtmlEncoder.cs index b0559d7219..0f205b3e43 100644 --- a/src/Microsoft.AspNet.WebUtilities/Encoders/HtmlEncoder.cs +++ b/src/Microsoft.AspNet.WebUtilities/Encoders/HtmlEncoder.cs @@ -3,7 +3,6 @@ using System; using System.Diagnostics; -using System.Runtime.CompilerServices; using System.Text; using System.Threading; @@ -21,14 +20,14 @@ namespace Microsoft.AspNet.WebUtilities.Encoders // The default HtmlEncoder (Basic Latin), instantiated on demand private static HtmlEncoder _defaultEncoder; - // A bitmap of characters which are allowed to be returned unescaped. - private readonly uint[] _allowedCharsBitmap = new uint[0x10000 / 32]; + // The inner encoder, responsible for the actual encoding routines + private readonly HtmlUnicodeEncoder _innerUnicodeEncoder; /// /// Instantiates an encoder using the 'Basic Latin' code table as the allow list. /// public HtmlEncoder() - : this(CodePointFilters.BasicLatin) + : this(HtmlUnicodeEncoder.BasicLatin) { } @@ -36,41 +35,14 @@ namespace Microsoft.AspNet.WebUtilities.Encoders /// Instantiates an encoder using a custom allow list of characters. /// public HtmlEncoder(params ICodePointFilter[] filters) + : this(new HtmlUnicodeEncoder(filters)) { - if (filters == null) - { - return; // no characters are allowed, just no-op immediately - } + } - // Punch a hole for each allowed code point across all filters (this is an OR). - // We don't allow supplementary (astral) characters for now. - foreach (var filter in filters) - { - foreach (var codePoint in filter.GetAllowedCodePoints()) - { - if (!UnicodeHelpers.IsSupplementaryCodePoint(codePoint)) - { - AllowCharacter((char)codePoint); - } - } - } - - // Forbid characters that are special in HTML - ForbidCharacter('<'); - ForbidCharacter('>'); - ForbidCharacter('&'); - ForbidCharacter('\''); // can be used to escape attributes - ForbidCharacter('\"'); // can be used to escape attributes - ForbidCharacter('+'); // technically not HTML-specific, but can be used to perform UTF7-based attacks - - // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed - // (includes categories Cc, Cs, Co, Cn, Zl, Zp) - uint[] definedCharactersBitmap = UnicodeHelpers.GetDefinedCharacterBitmap(); - Debug.Assert(definedCharactersBitmap.Length == _allowedCharsBitmap.Length); - for (int i = 0; i < _allowedCharsBitmap.Length; i++) - { - _allowedCharsBitmap[i] &= definedCharactersBitmap[i]; - } + private HtmlEncoder(HtmlUnicodeEncoder innerEncoder) + { + Debug.Assert(innerEncoder != null); + _innerUnicodeEncoder = innerEncoder; } /// @@ -91,139 +63,78 @@ namespace Microsoft.AspNet.WebUtilities.Encoders } } - // Marks a character as allowed (can be returned unencoded) - private void AllowCharacter(char c) - { - uint codePoint = (uint)c; - int index = (int)(codePoint >> 5); - int offset = (int)(codePoint & 0x1FU); - _allowedCharsBitmap[index] |= 0x1U << offset; - } - - // Marks a character as forbidden (must be returned encoded) - private void ForbidCharacter(char c) - { - uint codePoint = (uint)c; - int index = (int)(codePoint >> 5); - int offset = (int)(codePoint & 0x1FU); - _allowedCharsBitmap[index] &= ~(0x1U << offset); - } - /// /// Everybody's favorite HtmlEncode routine. /// public string HtmlEncode(string value) { - if (String.IsNullOrEmpty(value)) - { - return value; - } - - // Quick check: does the string need to be encoded at all? - // If not, just return the input string as-is. - for (int i = 0; i < value.Length; i++) - { - if (!IsCharacterAllowed(value[i])) - { - return HtmlEncodeImpl(value, i); - } - } - return value; + return _innerUnicodeEncoder.Encode(value); } - private string HtmlEncodeImpl(string input, int idxOfFirstCharWhichRequiresEncoding) + private sealed class HtmlUnicodeEncoder : UnicodeEncoderBase { - Debug.Assert(idxOfFirstCharWhichRequiresEncoding >= 0); - Debug.Assert(idxOfFirstCharWhichRequiresEncoding < input.Length); + // A singleton instance of the basic latin encoder. + private static HtmlUnicodeEncoder _basicLatinSingleton; // The worst case encoding is 8 output chars per input char: [input] U+FFFF -> [output] "￿" // We don't need to worry about astral code points since they consume *two* input chars to - // generate at most 10 output chars ("􏿿"), which equates to 5 output per input. - int numCharsWhichMayRequireEncoding = input.Length - idxOfFirstCharWhichRequiresEncoding; - int sbCapacity = checked(idxOfFirstCharWhichRequiresEncoding + EncoderCommon.GetCapacityOfOutputStringBuilder(numCharsWhichMayRequireEncoding, worstCaseOutputCharsPerInputChar: 8)); - Debug.Assert(sbCapacity >= input.Length); + // generate at most 10 output chars ("􏿿"), which equates to 5 output chars per input char. + private const int MaxOutputCharsPerInputChar = 8; - // Allocate the StringBuilder with the first (known to not require encoding) part of the input string, - // then begin encoding from the last (potentially requiring encoding) part of the input string. - StringBuilder builder = new StringBuilder(input, 0, idxOfFirstCharWhichRequiresEncoding, sbCapacity); - fixed (char* pInput = input) + internal HtmlUnicodeEncoder(ICodePointFilter[] filters) + : base(filters, MaxOutputCharsPerInputChar) { - return HtmlEncodeImpl2(builder, &pInput[idxOfFirstCharWhichRequiresEncoding], (uint)numCharsWhichMayRequireEncoding); } - } - private string HtmlEncodeImpl2(StringBuilder builder, char* input, uint charsRemaining) - { - while (charsRemaining != 0) + internal static HtmlUnicodeEncoder BasicLatin { - int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(input, endOfString: (charsRemaining == 1)); - if (UnicodeHelpers.IsSupplementaryCodePoint(nextScalar)) + get { - // Supplementary characters should always be encoded numerically. - WriteScalarAsHtmlEncodedEntity(builder, (uint)nextScalar); - - // We consume two UTF-16 characters for a single supplementary character. - input += 2; - charsRemaining -= 2; - } - else - { - // Otherwise, this was a BMP character. - input++; - charsRemaining--; - char c = (char)nextScalar; - if (IsCharacterAllowed(c)) + HtmlUnicodeEncoder encoder = Volatile.Read(ref _basicLatinSingleton); + if (encoder == null) { - builder.Append(c); - } - else - { - if (c == '<') { builder.Append("<"); } - else if (c == '>') { builder.Append(">"); } - else if (c == '&') { builder.Append("&"); } - else if (c == '\"') { builder.Append("""); } - else { WriteScalarAsHtmlEncodedEntity(builder, (uint)nextScalar); } + encoder = new HtmlUnicodeEncoder(new[] { CodePointFilters.BasicLatin }); + Volatile.Write(ref _basicLatinSingleton, encoder); } + return encoder; } } - return builder.ToString(); - } - - // Determines whether the given character can be returned unencoded. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private bool IsCharacterAllowed(char c) - { - uint codePoint = (uint)c; - int index = (int)(codePoint >> 5); - int offset = (int)(codePoint & 0x1FU); - return ((_allowedCharsBitmap[index] >> offset) & 0x1U) != 0; - } - - // Writes a scalar value as "�" - private static void WriteScalarAsHtmlEncodedEntity(StringBuilder builder, uint value) - { - // We're building the characters up in reverse - char* chars = stackalloc char[8 /* "FFFFFFFF" */]; - int numCharsWritten = 0; - do + // Writes a scalar value as an HTML-encoded entity. + protected override void WriteEncodedScalar(StringBuilder builder, uint value) { - Debug.Assert(numCharsWritten < 8, "Couldn't have written 8 characters out by this point."); - // Pop off the last nibble - chars[numCharsWritten++] = HexUtil.IntToChar(value & 0xFU); - value >>= 4; - } while (value != 0); + if (value == (uint)'\"') { builder.Append("""); } + else if (value == (uint)'&') { builder.Append("&"); } + else if (value == (uint)'<') { builder.Append("<"); } + else if (value == (uint)'>') { builder.Append(">"); } + else { WriteEncodedScalarAsNumericEntity(builder, value); } + } - // Finally, write out the HTML-encoded scalar value. - builder.Append('&'); - builder.Append('#'); - builder.Append('x'); - Debug.Assert(numCharsWritten > 0, "At least one character should've been written."); - do + // Writes a scalar value as an HTML-encoded numeric entity. + private static void WriteEncodedScalarAsNumericEntity(StringBuilder builder, uint value) { - builder.Append(chars[--numCharsWritten]); - } while (numCharsWritten != 0); - builder.Append(';'); + // We're building the characters up in reverse + char* chars = stackalloc char[8 /* "FFFFFFFF" */]; + int numCharsWritten = 0; + do + { + Debug.Assert(numCharsWritten < 8, "Couldn't have written 8 characters out by this point."); + // Pop off the last nibble + chars[numCharsWritten++] = HexUtil.IntToChar(value & 0xFU); + value >>= 4; + } while (value != 0); + + // Finally, write out the HTML-encoded scalar value. + builder.Append('&'); + builder.Append('#'); + builder.Append('x'); + Debug.Assert(numCharsWritten > 0, "At least one character should've been written."); + do + { + builder.Append(chars[--numCharsWritten]); + } while (numCharsWritten != 0); + builder.Append(';'); + } } } }