Update HtmlEncoder to wrap UnicodeEncoderBase

2015-02-11 12:03:47 -08:00 · 2015-02-11 12:03:47 -08:00 · 14c872d981
parent fc7ed3a9cd
commit 14c872d981
1 changed files with 56 additions and 145 deletions
--- a/src/Microsoft.AspNet.WebUtilities/Encoders/HtmlEncoder.cs
+++ b/src/Microsoft.AspNet.WebUtilities/Encoders/HtmlEncoder.cs
@ -3,7 +3,6 @@

 using System;
 using System.Diagnostics;
-using System.Runtime.CompilerServices;
 using System.Text;
 using System.Threading;

@ -21,14 +20,14 @@ namespace Microsoft.AspNet.WebUtilities.Encoders
        // The default HtmlEncoder (Basic Latin), instantiated on demand
        private static HtmlEncoder _defaultEncoder;

-        // A bitmap of characters which are allowed to be returned unescaped.
-        private readonly uint[] _allowedCharsBitmap = new uint[0x10000 / 32];
+        // The inner encoder, responsible for the actual encoding routines
+        private readonly HtmlUnicodeEncoder _innerUnicodeEncoder;

        /// <summary>
        /// Instantiates an encoder using the 'Basic Latin' code table as the allow list.
        /// </summary>
        public HtmlEncoder()
-            : this(CodePointFilters.BasicLatin)
+            : this(HtmlUnicodeEncoder.BasicLatin)
        {
        }

@ -36,41 +35,14 @@ namespace Microsoft.AspNet.WebUtilities.Encoders
        /// Instantiates an encoder using a custom allow list of characters.
        /// </summary>
        public HtmlEncoder(params ICodePointFilter[] filters)
+            : this(new HtmlUnicodeEncoder(filters))
        {
-            if (filters == null)
-            {
-                return; // no characters are allowed, just no-op immediately
-            }
+        }

-            // Punch a hole for each allowed code point across all filters (this is an OR).
-            // We don't allow supplementary (astral) characters for now.
-            foreach (var filter in filters)
-            {
-                foreach (var codePoint in filter.GetAllowedCodePoints())
-                {
-                    if (!UnicodeHelpers.IsSupplementaryCodePoint(codePoint))
-                    {
-                        AllowCharacter((char)codePoint);
-                    }
-                }
-            }
-
-            // Forbid characters that are special in HTML
-            ForbidCharacter('<');
-            ForbidCharacter('>');
-            ForbidCharacter('&');
-            ForbidCharacter('\''); // can be used to escape attributes
-            ForbidCharacter('\"'); // can be used to escape attributes
-            ForbidCharacter('+'); // technically not HTML-specific, but can be used to perform UTF7-based attacks
-
-            // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed
-            // (includes categories Cc, Cs, Co, Cn, Zl, Zp)
-            uint[] definedCharactersBitmap = UnicodeHelpers.GetDefinedCharacterBitmap();
-            Debug.Assert(definedCharactersBitmap.Length == _allowedCharsBitmap.Length);
-            for (int i = 0; i < _allowedCharsBitmap.Length; i++)
-            {
-                _allowedCharsBitmap[i] &= definedCharactersBitmap[i];
-            }
+        private HtmlEncoder(HtmlUnicodeEncoder innerEncoder)
+        {
+            Debug.Assert(innerEncoder != null);
+            _innerUnicodeEncoder = innerEncoder;
        }

        /// <summary>
@ -91,139 +63,78 @@ namespace Microsoft.AspNet.WebUtilities.Encoders
            }
        }

-        // Marks a character as allowed (can be returned unencoded)
-        private void AllowCharacter(char c)
-        {
-            uint codePoint = (uint)c;
-            int index = (int)(codePoint >> 5);
-            int offset = (int)(codePoint & 0x1FU);
-            _allowedCharsBitmap[index] |= 0x1U << offset;
-        }
-
-        // Marks a character as forbidden (must be returned encoded)
-        private void ForbidCharacter(char c)
-        {
-            uint codePoint = (uint)c;
-            int index = (int)(codePoint >> 5);
-            int offset = (int)(codePoint & 0x1FU);
-            _allowedCharsBitmap[index] &= ~(0x1U << offset);
-        }
-
        /// <summary>
        /// Everybody's favorite HtmlEncode routine.
        /// </summary>
        public string HtmlEncode(string value)
        {
-            if (String.IsNullOrEmpty(value))
-            {
-                return value;
-            }
-
-            // Quick check: does the string need to be encoded at all?
-            // If not, just return the input string as-is.
-            for (int i = 0; i < value.Length; i++)
-            {
-                if (!IsCharacterAllowed(value[i]))
-                {
-                    return HtmlEncodeImpl(value, i);
-                }
-            }
-            return value;
+            return _innerUnicodeEncoder.Encode(value);
        }

-        private string HtmlEncodeImpl(string input, int idxOfFirstCharWhichRequiresEncoding)
+        private sealed class HtmlUnicodeEncoder : UnicodeEncoderBase
        {
-            Debug.Assert(idxOfFirstCharWhichRequiresEncoding >= 0);
-            Debug.Assert(idxOfFirstCharWhichRequiresEncoding < input.Length);
+            // A singleton instance of the basic latin encoder.
+            private static HtmlUnicodeEncoder _basicLatinSingleton;

            // The worst case encoding is 8 output chars per input char: [input] U+FFFF -> [output] "&#xFFFF;"
            // We don't need to worry about astral code points since they consume *two* input chars to
-            // generate at most 10 output chars ("&#x10FFFF;"), which equates to 5 output per input.
-            int numCharsWhichMayRequireEncoding = input.Length - idxOfFirstCharWhichRequiresEncoding;
-            int sbCapacity = checked(idxOfFirstCharWhichRequiresEncoding + EncoderCommon.GetCapacityOfOutputStringBuilder(numCharsWhichMayRequireEncoding, worstCaseOutputCharsPerInputChar: 8));
-            Debug.Assert(sbCapacity >= input.Length);
+            // generate at most 10 output chars ("&#x10FFFF;"), which equates to 5 output chars per input char.
+            private const int MaxOutputCharsPerInputChar = 8;

-            // Allocate the StringBuilder with the first (known to not require encoding) part of the input string,
-            // then begin encoding from the last (potentially requiring encoding) part of the input string.
-            StringBuilder builder = new StringBuilder(input, 0, idxOfFirstCharWhichRequiresEncoding, sbCapacity);
-            fixed (char* pInput = input)
+            internal HtmlUnicodeEncoder(ICodePointFilter[] filters)
+                : base(filters, MaxOutputCharsPerInputChar)
            {
-                return HtmlEncodeImpl2(builder, &pInput[idxOfFirstCharWhichRequiresEncoding], (uint)numCharsWhichMayRequireEncoding);
            }
-        }

-        private string HtmlEncodeImpl2(StringBuilder builder, char* input, uint charsRemaining)
-        {
-            while (charsRemaining != 0)
+            internal static HtmlUnicodeEncoder BasicLatin
            {
-                int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(input, endOfString: (charsRemaining == 1));
-                if (UnicodeHelpers.IsSupplementaryCodePoint(nextScalar))
+                get
                {
-                    // Supplementary characters should always be encoded numerically.
-                    WriteScalarAsHtmlEncodedEntity(builder, (uint)nextScalar);
-
-                    // We consume two UTF-16 characters for a single supplementary character.
-                    input += 2;
-                    charsRemaining -= 2;
-                }
-                else
-                {
-                    // Otherwise, this was a BMP character.
-                    input++;
-                    charsRemaining--;
-                    char c = (char)nextScalar;
-                    if (IsCharacterAllowed(c))
+                    HtmlUnicodeEncoder encoder = Volatile.Read(ref _basicLatinSingleton);
+                    if (encoder == null)
                    {
-                        builder.Append(c);
-                    }
-                    else
-                    {
-                        if (c == '<') { builder.Append("&lt;"); }
-                        else if (c == '>') { builder.Append("&gt;"); }
-                        else if (c == '&') { builder.Append("&amp;"); }
-                        else if (c == '\"') { builder.Append("&quot;"); }
-                        else { WriteScalarAsHtmlEncodedEntity(builder, (uint)nextScalar); }
+                        encoder = new HtmlUnicodeEncoder(new[] { CodePointFilters.BasicLatin });
+                        Volatile.Write(ref _basicLatinSingleton, encoder);
                    }
+                    return encoder;
                }
            }

-            return builder.ToString();
-        }
-
-        // Determines whether the given character can be returned unencoded.
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private bool IsCharacterAllowed(char c)
-        {
-            uint codePoint = (uint)c;
-            int index = (int)(codePoint >> 5);
-            int offset = (int)(codePoint & 0x1FU);
-            return ((_allowedCharsBitmap[index] >> offset) & 0x1U) != 0;
-        }
-
-        // Writes a scalar value as "&#xFFFFFFFF;"
-        private static void WriteScalarAsHtmlEncodedEntity(StringBuilder builder, uint value)
-        {
-            // We're building the characters up in reverse
-            char* chars = stackalloc char[8 /* "FFFFFFFF" */];
-            int numCharsWritten = 0;
-            do
+            // Writes a scalar value as an HTML-encoded entity.
+            protected override void WriteEncodedScalar(StringBuilder builder, uint value)
            {
-                Debug.Assert(numCharsWritten < 8, "Couldn't have written 8 characters out by this point.");
-                // Pop off the last nibble
-                chars[numCharsWritten++] = HexUtil.IntToChar(value & 0xFU);
-                value >>= 4;
-            } while (value != 0);
+                if (value == (uint)'\"') { builder.Append("&quot;"); }
+                else if (value == (uint)'&') { builder.Append("&amp;"); }
+                else if (value == (uint)'<') { builder.Append("&lt;"); }
+                else if (value == (uint)'>') { builder.Append("&gt;"); }
+                else { WriteEncodedScalarAsNumericEntity(builder, value); }
+            }

-            // Finally, write out the HTML-encoded scalar value.
-            builder.Append('&');
-            builder.Append('#');
-            builder.Append('x');
-            Debug.Assert(numCharsWritten > 0, "At least one character should've been written.");
-            do
+            // Writes a scalar value as an HTML-encoded numeric entity.
+            private static void WriteEncodedScalarAsNumericEntity(StringBuilder builder, uint value)
            {
-                builder.Append(chars[--numCharsWritten]);
-            } while (numCharsWritten != 0);
-            builder.Append(';');
+                // We're building the characters up in reverse
+                char* chars = stackalloc char[8 /* "FFFFFFFF" */];
+                int numCharsWritten = 0;
+                do
+                {
+                    Debug.Assert(numCharsWritten < 8, "Couldn't have written 8 characters out by this point.");
+                    // Pop off the last nibble
+                    chars[numCharsWritten++] = HexUtil.IntToChar(value & 0xFU);
+                    value >>= 4;
+                } while (value != 0);
+
+                // Finally, write out the HTML-encoded scalar value.
+                builder.Append('&');
+                builder.Append('#');
+                builder.Append('x');
+                Debug.Assert(numCharsWritten > 0, "At least one character should've been written.");
+                do
+                {
+                    builder.Append(chars[--numCharsWritten]);
+                } while (numCharsWritten != 0);
+                builder.Append(';');
+            }
        }
    }
 }