Fix CJK Ideographs and Hangul Syllables representation

Characters in these blocks weren't correctly identified as assigned characters, which caused the encoders to always encode them, even if the ranges were in the allow list.
2015-04-01 11:45:52 -07:00 · 2015-04-01 11:45:52 -07:00 · 6c9055cadc
parent 8da763a14a
commit 6c9055cadc
3 changed files with 101 additions and 4 deletions
--- a/src/Microsoft.Framework.WebEncoders.Core/compiler/resources/unicode-7.0.0-defined-characters.bin
+++ b/src/Microsoft.Framework.WebEncoders.Core/compiler/resources/unicode-7.0.0-defined-characters.bin
--- a/test/Microsoft.Framework.WebEncoders.Tests/UnicodeHelpersTests.cs
+++ b/test/Microsoft.Framework.WebEncoders.Tests/UnicodeHelpersTests.cs
@ -101,8 +101,7 @@ namespace Microsoft.Framework.WebEncoders
                bool actual = UnicodeHelpers.IsCharacterDefined((char)i);
                if (expected != actual)
                {
-                    string message = String.Format(CultureInfo.InvariantCulture, "Character U+{0:X4}: expected = {1}, actual = {2}", i, expected, actual);
-                    errors.Add(message);
+                    errors.Add($"Character U+{i:X4}: expected = {expected}, actual = {actual}");
                }
            }

@ -164,11 +163,18 @@ namespace Microsoft.Framework.WebEncoders
            {
                string[] splitLine = line.Split(';');
                uint codePoint = UInt32.Parse(splitLine[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture);
+                string name = splitLine[1];
                if (codePoint >= retVal.Length)
                {
                    continue; // don't care about supplementary chars
                }

+                if (name.EndsWith(", First>", StringComparison.Ordinal) || name.EndsWith(", Last>", StringComparison.Ordinal))
+                {
+                    // ignore spans - we'll handle them separately
+                    continue;
+                }
+
                if (codePoint == (uint)' ')
                {
                    retVal[codePoint] = true; // we allow U+0020 SPACE as our only valid Zs (whitespace) char
@ -184,6 +190,24 @@ namespace Microsoft.Framework.WebEncoders
                }
            }

+            // Handle known spans from Unicode 7.0.0's UnicodeData.txt
+
+            // CJK Ideograph Extension A
+            for (int i = '\u3400'; i <= '\u4DB5'; i++)
+            {
+                retVal[i] = true;
+            }
+            // CJK Ideograph
+            for (int i = '\u4E00'; i <= '\u9FCC'; i++)
+            {
+                retVal[i] = true;
+            }
+            // Hangul Syllable
+            for (int i = '\uAC00'; i <= '\uD7A3'; i++)
+            {
+                retVal[i] = true;
+            }
+
            // Finally, we need to make sure we've seen every category which contains
            // allowed characters. This provides extra defense against having a typo
            // in the list of categories.
--- a/unicode/Generators/DefinedCharListGenerator/Program.cs
+++ b/unicode/Generators/DefinedCharListGenerator/Program.cs
@ -1,4 +1,5 @@
 using System;
+using System.Collections.Generic;
 using System.Diagnostics;
 using System.Globalization;
 using System.IO;
@ -19,6 +20,7 @@ namespace DefinedCharListGenerator

            const uint MAX_UNICODE_CHAR = 0x10FFFF; // Unicode range is U+0000 .. U+10FFFF
            bool[] definedChars = new bool[MAX_UNICODE_CHAR + 1];
+            Dictionary<string, Span> spans = new Dictionary<string, Span>();

            // Read all defined characters from the input file.
            string[] allLines = File.ReadAllLines("UnicodeData.txt");
@ -28,11 +30,33 @@ namespace DefinedCharListGenerator
            foreach (string line in allLines)
            {
                string[] splitLine = line.Split(new char[] { ';' }, 4);
+                uint codepoint = uint.Parse(splitLine[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture);
+                string rawName = splitLine[1];
+                string category = splitLine[2];
+
+                // spans go into their own dictionary for later processing
+                string spanName;
+                bool isStartOfSpan;
+                if (IsSpanDefinition(rawName, out spanName, out isStartOfSpan))
+                {
+                    if (isStartOfSpan)
+                    {
+                        spans.Add(spanName, new Span() { FirstCodePoint = codepoint, Category = category });
+                    }
+                    else
+                    {
+                        var existingSpan = spans[spanName];
+                        Debug.Assert(existingSpan.FirstCodePoint != 0, "We should've seen the start of this span already.");
+                        Debug.Assert(existingSpan.LastCodePoint == 0, "We shouldn't have seen the end of this span already.");
+                        Debug.Assert(existingSpan.Category == category, "Span start Unicode category doesn't match span end Unicode category.");
+                        existingSpan.LastCodePoint = codepoint;
+                    }
+                    continue;
+                }

                // We only allow certain categories of code points.
                // Zs (space separators) aren't included, but we allow U+0020 SPACE as a special case
-                uint codepoint = uint.Parse(splitLine[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture);
-                string category = splitLine[2];
+
                if (!(codepoint == (uint)' ' || IsAllowedUnicodeCategory(category)))
                {
                    continue;
@ -42,6 +66,21 @@ namespace DefinedCharListGenerator
                definedChars[codepoint] = true;
            }

+            // Next, populate characters that weren't defined on their own lines
+            // but which are instead defined as members of a named span.
+            foreach (var span in spans.Values)
+            {
+                if (IsAllowedUnicodeCategory(span.Category))
+                {
+                    Debug.Assert(span.FirstCodePoint <= MAX_UNICODE_CHAR);
+                    Debug.Assert(span.LastCodePoint <= MAX_UNICODE_CHAR);
+                    for (uint i = span.FirstCodePoint; i <= span.LastCodePoint; i++)
+                    {
+                        definedChars[i] = true;
+                    }
+                }
+            }
+
            // Finally, write the list of defined characters out as a bitmap.
            // Each consecutive block of 8 chars is written as a single byte.
            // For instance, the first byte of the output file contains the
@ -103,5 +142,39 @@ namespace DefinedCharListGenerator
                || category == "So"
                || category == "Cf"; /* other */
        }
+
+        private static bool IsSpanDefinition(string rawName, out string spanName, out bool isStartOfSpan)
+        {
+            // Detects the "<Name, First>" / "<Name, Last>" marker names that bracket a range in
+            // UnicodeData.txt (e.g. "DC00;<Low Surrogate, First>;Cs;..." .. "DFFF;<Low Surrogate, Last>;Cs;...").
+            // On a match returns true, spanName = "Name", isStartOfSpan = true only for the First marker.
+            const string FirstSuffix = ", First>";
+            const string LastSuffix = ", Last>";
+
+            // Outputs for every non-marker name: not bracketed, "<control>", or some other non-span.
+            spanName = null;
+            isStartOfSpan = false;
+            if (!rawName.StartsWith("<", StringComparison.Ordinal))
+            {
+                return false;
+            }
+
+            isStartOfSpan = rawName.EndsWith(FirstSuffix, StringComparison.Ordinal);
+            if (!isStartOfSpan && !rawName.EndsWith(LastSuffix, StringComparison.Ordinal))
+            {
+                return false;
+            }
+            // Drop the leading '<' and the matched suffix, leaving only the span's name.
+            int suffixLength = isStartOfSpan ? FirstSuffix.Length : LastSuffix.Length;
+            spanName = rawName.Substring(1, rawName.Length - 1 - suffixLength);
+            return true;
+        }
+
+        private class Span // one "<Name, First>" .. "<Name, Last>" range read from UnicodeData.txt
+        {
+            public string Category;     // category field taken from the range's "First" line
+            public uint FirstCodePoint; // code point parsed from the "<Name, First>" line
+            public uint LastCodePoint;  // code point of the "<Name, Last>" line; stays 0 until that line is read
+        }
    }
 }