// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Diagnostics;
using System.IO;
using System.Runtime.CompilerServices;
using System.Text;

namespace Microsoft.AspNet.WebUtilities.Encoders
{
    /// <summary>
    /// Base class for encoders that pass "allowed" BMP characters through unchanged and
    /// delegate the escaping of every other scalar value to <see cref="WriteEncodedScalar"/>.
    /// </summary>
    internal unsafe abstract class UnicodeEncoderBase
    {
        // A bitmap of characters which are allowed to be returned unescaped.
        // One bit per UTF-16 code unit (0x10000 bits packed into 32-bit words).
        private readonly uint[] _allowedCharsBitmap = new uint[0x10000 / 32];

        // The worst-case number of output chars generated for any input char.
        private readonly int _maxOutputCharsPerInputChar;

        /// <summary>
        /// Instantiates an encoder using a custom allow list of characters.
        /// </summary>
        /// <param name="filters">
        /// Filters whose allowed code points are OR'ed together to form the allow list.
        /// May be null, in which case no character is allowed unescaped.
        /// </param>
        /// <param name="maxOutputCharsPerInputChar">
        /// The worst-case number of output chars generated for any input char; used to
        /// size the output buffer of <see cref="Encode(string)"/>.
        /// </param>
        protected UnicodeEncoderBase(ICodePointFilter[] filters, int maxOutputCharsPerInputChar)
        {
            _maxOutputCharsPerInputChar = maxOutputCharsPerInputChar;

            if (filters != null)
            {
                // Punch a hole for each allowed code point across all filters (this is an OR).
                // We don't allow supplementary (astral) characters for now.
                foreach (var filter in filters)
                {
                    foreach (var codePoint in filter.GetAllowedCodePoints())
                    {
                        if (!UnicodeHelpers.IsSupplementaryCodePoint(codePoint))
                        {
                            AllowCharacter((char)codePoint);
                        }
                    }
                }
            }

            // Forbid characters that are special in HTML.
            // Even though this is a common encoder used by everybody (including URL
            // and JavaScript strings), it's unfortunately common for developers to
            // forget to HTML-encode a string once it has been URL-encoded or
            // JavaScript string-escaped, so this offers extra protection.
            ForbidCharacter('<');
            ForbidCharacter('>');
            ForbidCharacter('&');
            ForbidCharacter('\''); // can be used to escape attributes
            ForbidCharacter('\"'); // can be used to escape attributes
            ForbidCharacter('+'); // technically not HTML-specific, but can be used to perform UTF7-based attacks

            // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed
            // (includes categories Cc, Cs, Co, Cn, Zl, Zp)
            uint[] definedCharactersBitmap = UnicodeHelpers.GetDefinedCharacterBitmap();
            Debug.Assert(definedCharactersBitmap.Length == _allowedCharsBitmap.Length);
            for (int i = 0; i < _allowedCharsBitmap.Length; i++)
            {
                // AND-ing keeps a bit set only if it is both allowed and defined.
                _allowedCharsBitmap[i] &= definedCharactersBitmap[i];
            }
        }

        // Marks a character as allowed (can be returned unencoded)
        private void AllowCharacter(char c)
        {
            uint codePoint = (uint)c;
            int index = (int)(codePoint >> 5);      // which 32-bit word
            int offset = (int)(codePoint & 0x1FU);  // which bit inside that word
            _allowedCharsBitmap[index] |= 0x1U << offset;
        }

        // Marks a character as forbidden (must be returned encoded)
        protected void ForbidCharacter(char c)
        {
            uint codePoint = (uint)c;
            int index = (int)(codePoint >> 5);      // which 32-bit word
            int offset = (int)(codePoint & 0x1FU);  // which bit inside that word
            _allowedCharsBitmap[index] &= ~(0x1U << offset);
        }

        /// <summary>
        /// Entry point to the encoder. Encodes <paramref name="charCount"/> chars of
        /// <paramref name="value"/>, beginning at <paramref name="startIndex"/>, and
        /// writes the result to <paramref name="output"/>.
        /// </summary>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="value"/> or <paramref name="output"/> is null.
        /// </exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="startIndex"/> or <paramref name="charCount"/> does not describe
        /// a range inside <paramref name="value"/>.
        /// </exception>
        public void Encode([NotNull] char[] value, int startIndex, int charCount, [NotNull] TextWriter output)
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException
            // (or a failure part-way through writing) further down.
            if (value == null)
            {
                throw new ArgumentNullException(nameof(value));
            }
            if (output == null)
            {
                throw new ArgumentNullException(nameof(output));
            }

            // Input checking
            ValidateInputs(startIndex, charCount, actualInputLength: value.Length);

            if (charCount != 0)
            {
                fixed (char* pChars = value)
                {
                    int indexOfFirstCharWhichRequiresEncoding = GetIndexOfFirstCharWhichRequiresEncoding(&pChars[startIndex], charCount);
                    if (indexOfFirstCharWhichRequiresEncoding < 0)
                    {
                        // All chars are valid - just copy the buffer as-is.
                        output.Write(value, startIndex, charCount);
                    }
                    else
                    {
                        // Flush all chars which are known to be valid, then encode the remainder individually
                        if (indexOfFirstCharWhichRequiresEncoding > 0)
                        {
                            output.Write(value, startIndex, indexOfFirstCharWhichRequiresEncoding);
                        }
                        EncodeCore(&pChars[startIndex + indexOfFirstCharWhichRequiresEncoding], (uint)(charCount - indexOfFirstCharWhichRequiresEncoding), output);
                    }
                }
            }
        }

        /// <summary>
        /// Entry point to the encoder. Returns the encoded form of <paramref name="value"/>.
        /// </summary>
        /// <returns>
        /// <paramref name="value"/> itself when it is null, empty, or consists solely of
        /// allowed characters; otherwise a new string with the remaining input encoded.
        /// </returns>
        public string Encode(string value)
        {
            if (String.IsNullOrEmpty(value))
            {
                return value;
            }

            // Quick check: does the string need to be encoded at all?
            // If not, just return the input string as-is.
            for (int i = 0; i < value.Length; i++)
            {
                if (!IsCharacterAllowed(value[i]))
                {
                    return EncodeCore(value, idxOfFirstCharWhichRequiresEncoding: i);
                }
            }
            return value;
        }

        /// <summary>
        /// Entry point to the encoder. Encodes <paramref name="charCount"/> chars of
        /// <paramref name="value"/>, beginning at <paramref name="startIndex"/>, and
        /// writes the result to <paramref name="output"/>.
        /// </summary>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="value"/> or <paramref name="output"/> is null.
        /// </exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="startIndex"/> or <paramref name="charCount"/> does not describe
        /// a range inside <paramref name="value"/>.
        /// </exception>
        public void Encode([NotNull] string value, int startIndex, int charCount, [NotNull] TextWriter output)
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException
            // (or a failure part-way through writing) further down.
            if (value == null)
            {
                throw new ArgumentNullException(nameof(value));
            }
            if (output == null)
            {
                throw new ArgumentNullException(nameof(output));
            }

            // Input checking
            ValidateInputs(startIndex, charCount, actualInputLength: value.Length);

            if (charCount != 0)
            {
                fixed (char* pChars = value)
                {
                    if (charCount == value.Length)
                    {
                        // Optimize for the common case: we're being asked to encode the entire input string
                        // (not just a subset). If all characters are safe, we can just spit it out as-is.
                        // (ValidateInputs guarantees startIndex == 0 here.)
                        int indexOfFirstCharWhichRequiresEncoding = GetIndexOfFirstCharWhichRequiresEncoding(pChars, charCount);
                        if (indexOfFirstCharWhichRequiresEncoding < 0)
                        {
                            output.Write(value);
                        }
                        else
                        {
                            // Flush all chars which are known to be valid, then encode the remainder individually
                            for (int i = 0; i < indexOfFirstCharWhichRequiresEncoding; i++)
                            {
                                output.Write(pChars[i]);
                            }
                            EncodeCore(&pChars[indexOfFirstCharWhichRequiresEncoding], (uint)(charCount - indexOfFirstCharWhichRequiresEncoding), output);
                        }
                    }
                    else
                    {
                        // We're being asked to encode a subset, so we need to go through the slow path of appending
                        // each character individually.
                        EncodeCore(&pChars[startIndex], (uint)charCount, output);
                    }
                }
            }
        }

        // Encodes 'input' into a new string. The chars before
        // 'idxOfFirstCharWhichRequiresEncoding' are copied verbatim; everything from
        // that index onward goes through the per-scalar encoding loop.
        private string EncodeCore(string input, int idxOfFirstCharWhichRequiresEncoding)
        {
            Debug.Assert(idxOfFirstCharWhichRequiresEncoding >= 0);
            Debug.Assert(idxOfFirstCharWhichRequiresEncoding < input.Length);

            int numCharsWhichMayRequireEncoding = input.Length - idxOfFirstCharWhichRequiresEncoding;
            int sbCapacity = checked(idxOfFirstCharWhichRequiresEncoding + EncoderCommon.GetCapacityOfOutputStringBuilder(numCharsWhichMayRequireEncoding, _maxOutputCharsPerInputChar));
            Debug.Assert(sbCapacity >= input.Length);

            // Allocate the StringBuilder with the first (known to not require encoding) part of the input string,
            // then begin encoding from the last (potentially requiring encoding) part of the input string.
            StringBuilder builder = new StringBuilder(input, 0, idxOfFirstCharWhichRequiresEncoding, sbCapacity);
            Writer writer = new Writer(builder);
            fixed (char* pInput = input)
            {
                EncodeCore(ref writer, &pInput[idxOfFirstCharWhichRequiresEncoding], (uint)numCharsWhichMayRequireEncoding);
            }
            return builder.ToString();
        }

        // Encodes 'charsRemaining' chars starting at 'input', writing the result to 'output'.
        private void EncodeCore(char* input, uint charsRemaining, TextWriter output)
        {
            Writer writer = new Writer(output);
            EncodeCore(ref writer, input, charsRemaining);
        }

        // Core loop: reads one scalar value at a time from 'input' and either writes it
        // unchanged (allowed BMP char) or hands it to WriteEncodedScalar.
        private void EncodeCore(ref Writer writer, char* input, uint charsRemaining)
        {
            while (charsRemaining != 0)
            {
                int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(input, endOfString: (charsRemaining == 1));
                if (UnicodeHelpers.IsSupplementaryCodePoint(nextScalar))
                {
                    // Supplementary characters should always be encoded numerically.
                    WriteEncodedScalar(ref writer, (uint)nextScalar);

                    // We consume two UTF-16 characters for a single supplementary character.
                    // NOTE(review): assumes GetScalarValueFromUtf16 never reports a supplementary
                    // code point when endOfString is true; otherwise this unsigned subtraction
                    // would wrap - confirm against UnicodeHelpers.
                    input += 2;
                    charsRemaining -= 2;
                }
                else
                {
                    // Otherwise, this was a BMP character.
                    input++;
                    charsRemaining--;
                    char c = (char)nextScalar;
                    if (IsCharacterAllowed(c))
                    {
                        writer.Write(c);
                    }
                    else
                    {
                        WriteEncodedScalar(ref writer, (uint)nextScalar);
                    }
                }
            }
        }

        // Returns the index (relative to 'input') of the first char that is not in the
        // allow list, or -1 if all 'inputLength' chars may be written unencoded.
        private int GetIndexOfFirstCharWhichRequiresEncoding(char* input, int inputLength)
        {
            for (int i = 0; i < inputLength; i++)
            {
                if (!IsCharacterAllowed(input[i]))
                {
                    return i;
                }
            }
            return -1; // no characters require encoding
        }

        // Determines whether the given character can be returned unencoded.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private bool IsCharacterAllowed(char c)
        {
            uint codePoint = (uint)c;
            int index = (int)(codePoint >> 5);      // which 32-bit word
            int offset = (int)(codePoint & 0x1FU);  // which bit inside that word
            return ((_allowedCharsBitmap[index] >> offset) & 0x1U) != 0;
        }

        // Throws ArgumentOutOfRangeException unless [startIndex, startIndex + charCount)
        // lies within an input of 'actualInputLength' chars.
        private static void ValidateInputs(int startIndex, int charCount, int actualInputLength)
        {
            if (startIndex < 0 || startIndex > actualInputLength)
            {
                throw new ArgumentOutOfRangeException(nameof(startIndex));
            }
            // Written as a subtraction so the comparison cannot overflow.
            if (charCount < 0 || charCount > (actualInputLength - startIndex))
            {
                throw new ArgumentOutOfRangeException(nameof(charCount));
            }
        }

        /// <summary>
        /// Writes the encoded representation of the scalar <paramref name="value"/> to
        /// <paramref name="writer"/>. Called for every supplementary code point and for
        /// every BMP character that is not in the allow list.
        /// </summary>
        protected abstract void WriteEncodedScalar(ref Writer writer, uint value);

        /// <summary>
        /// Provides an abstraction over both StringBuilder and TextWriter.
        /// Declared as a struct so we can allocate on the stack and pass by
        /// reference. Eliminates chatty virtual dispatches on hot paths.
        /// </summary>
        protected struct Writer
        {
            // Exactly one of these is set by the constructors below; the other is null.
            private readonly StringBuilder _innerBuilder;
            private readonly TextWriter _innerWriter;

            public Writer(StringBuilder innerBuilder)
            {
                _innerBuilder = innerBuilder;
                _innerWriter = null;
            }

            public Writer(TextWriter innerWriter)
            {
                _innerBuilder = null;
                _innerWriter = innerWriter;
            }

            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            public void Write(char value)
            {
                if (_innerBuilder != null)
                {
                    _innerBuilder.Append(value);
                }
                else
                {
                    _innerWriter.Write(value);
                }
            }

            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            public void Write(string value)
            {
                if (_innerBuilder != null)
                {
                    _innerBuilder.Append(value);
                }
                else
                {
                    _innerWriter.Write(value);
                }
            }
        }
    }
}