From 6c9055cadcd1a715482751c75660e05cadb693ee Mon Sep 17 00:00:00 2001 From: Levi B Date: Wed, 1 Apr 2015 11:45:52 -0700 Subject: [PATCH] Fix CJK Ideographs and Hangul Syllables representation Characters in these blocks weren't correctly identified as assigned characters, which caused the encoders to always encode them, even if the ranges were in the allow list. --- .../unicode-7.0.0-defined-characters.bin | Bin 8192 -> 8192 bytes .../UnicodeHelpersTests.cs | 28 ++++++- .../DefinedCharListGenerator/Program.cs | 77 +++++++++++++++++- 3 files changed, 101 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.Framework.WebEncoders.Core/compiler/resources/unicode-7.0.0-defined-characters.bin b/src/Microsoft.Framework.WebEncoders.Core/compiler/resources/unicode-7.0.0-defined-characters.bin index c9b36c871d6146d5c25044cd405807c15641d3f3..06530d73cf6848c91b0f41b326dccf833192412a 100644 GIT binary patch delta 148 zcmZp0XmHrjz&3dS%Q+-`hRL2`@= retVal.Length) { continue; // don't care about supplementary chars } + if (name.EndsWith(", First>", StringComparison.Ordinal) || name.EndsWith(", Last>", StringComparison.Ordinal)) + { + // ignore spans - we'll handle them separately + continue; + } + if (codePoint == (uint)' ') { retVal[codePoint] = true; // we allow U+0020 SPACE as our only valid Zs (whitespace) char @@ -184,6 +190,24 @@ namespace Microsoft.Framework.WebEncoders } } + // Handle known spans from Unicode 7.0.0's UnicodeData.txt + + // CJK Ideograph Extension A + for (int i = '\u3400'; i <= '\u4DB5'; i++) + { + retVal[i] = true; + } + // CJK Ideograph + for (int i = '\u4E00'; i <= '\u9FCC'; i++) + { + retVal[i] = true; + } + // Hangul Syllable + for (int i = '\uAC00'; i <= '\uD7A3'; i++) + { + retVal[i] = true; + } + // Finally, we need to make sure we've seen every category which contains // allowed characters. This provides extra defense against having a typo // in the list of categories. 
diff --git a/unicode/Generators/DefinedCharListGenerator/Program.cs b/unicode/Generators/DefinedCharListGenerator/Program.cs index f98dd32f5b..edcd0f3602 100644 --- a/unicode/Generators/DefinedCharListGenerator/Program.cs +++ b/unicode/Generators/DefinedCharListGenerator/Program.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Generic; using System.Diagnostics; using System.Globalization; using System.IO; @@ -19,6 +20,7 @@ namespace DefinedCharListGenerator const uint MAX_UNICODE_CHAR = 0x10FFFF; // Unicode range is U+0000 .. U+10FFFF bool[] definedChars = new bool[MAX_UNICODE_CHAR + 1]; + Dictionary spans = new Dictionary(); // Read all defined characters from the input file. string[] allLines = File.ReadAllLines("UnicodeData.txt"); @@ -28,11 +30,33 @@ namespace DefinedCharListGenerator foreach (string line in allLines) { string[] splitLine = line.Split(new char[] { ';' }, 4); + uint codepoint = uint.Parse(splitLine[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture); + string rawName = splitLine[1]; + string category = splitLine[2]; + + // spans go into their own dictionary for later processing + string spanName; + bool isStartOfSpan; + if (IsSpanDefinition(rawName, out spanName, out isStartOfSpan)) + { + if (isStartOfSpan) + { + spans.Add(spanName, new Span() { FirstCodePoint = codepoint, Category = category }); + } + else + { + var existingSpan = spans[spanName]; + Debug.Assert(existingSpan.FirstCodePoint != 0, "We should've seen the start of this span already."); + Debug.Assert(existingSpan.LastCodePoint == 0, "We shouldn't have seen the end of this span already."); + Debug.Assert(existingSpan.Category == category, "Span start Unicode category doesn't match span end Unicode category."); + existingSpan.LastCodePoint = codepoint; + } + continue; + } // We only allow certain categories of code points. 
// Zs (space separators) aren't included, but we allow U+0020 SPACE as a special case - uint codepoint = uint.Parse(splitLine[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture); - string category = splitLine[2]; + if (!(codepoint == (uint)' ' || IsAllowedUnicodeCategory(category))) { continue; @@ -42,6 +66,21 @@ namespace DefinedCharListGenerator definedChars[codepoint] = true; } + // Next, populate characters that weren't defined on their own lines + // but which are instead defined as members of a named span. + foreach (var span in spans.Values) + { + if (IsAllowedUnicodeCategory(span.Category)) + { + Debug.Assert(span.FirstCodePoint <= MAX_UNICODE_CHAR); + Debug.Assert(span.LastCodePoint <= MAX_UNICODE_CHAR); + for (uint i = span.FirstCodePoint; i <= span.LastCodePoint; i++) + { + definedChars[i] = true; + } + } + } + // Finally, write the list of defined characters out as a bitmap. // Each consecutive block of 8 chars is written as a single byte. // For instance, the first byte of the output file contains the @@ -103,5 +142,39 @@ namespace DefinedCharListGenerator || category == "So" || category == "Cf"; /* other */ } + + private static bool IsSpanDefinition(string rawName, out string spanName, out bool isStartOfSpan) + { + // Spans are represented within angle brackets, such as the following: + // DC00;<Low Surrogate, First>;Cs;0;L;;;;;N;;;;; + // DFFF;<Low Surrogate, Last>;Cs;0;L;;;;;N;;;;; + if (rawName.StartsWith("<", StringComparison.Ordinal)) + { + if (rawName.EndsWith(", First>", StringComparison.Ordinal)) + { + spanName = rawName.Substring(1, rawName.Length - 1 - ", First>".Length); + isStartOfSpan = true; + return true; + } + else if (rawName.EndsWith(", Last>", StringComparison.Ordinal)) + { + spanName = rawName.Substring(1, rawName.Length - 1 - ", Last>".Length); + isStartOfSpan = false; + return true; + } + } + + // not surrounded by <>, or <control> or some other non-span + spanName = null; + isStartOfSpan = false; + return false; + } + + private class Span + { + public uint
FirstCodePoint; + public uint LastCodePoint; + public string Category; + } } }