From 6c9055cadcd1a715482751c75660e05cadb693ee Mon Sep 17 00:00:00 2001
From: Levi B <levib@yahoo.com>
Date: Wed, 1 Apr 2015 11:45:52 -0700
Subject: [PATCH] Fix CJK Ideographs and Hangul Syllables representation
 Characters in these blocks weren't correctly identified as assigned
 characters, which caused the encoders to always encode them, even if the
 ranges were in the allow list.

---
 .../unicode-7.0.0-defined-characters.bin      | Bin 8192 -> 8192 bytes
 .../UnicodeHelpersTests.cs                    |  28 ++++++-
 .../DefinedCharListGenerator/Program.cs       |  77 +++++++++++++++++-
 3 files changed, 101 insertions(+), 4 deletions(-)

diff --git a/src/Microsoft.Framework.WebEncoders.Core/compiler/resources/unicode-7.0.0-defined-characters.bin b/src/Microsoft.Framework.WebEncoders.Core/compiler/resources/unicode-7.0.0-defined-characters.bin
index c9b36c871d6146d5c25044cd405807c15641d3f3..06530d73cf6848c91b0f41b326dccf833192412a 100644
GIT binary patch
delta 148
zcmZp0XmHrjz&3dS%Q+-`hRL2`@<f)qNJ7ZcxY%9n@(f@A)H`wVM3MKCjaYo~m=Vmw
X&+z|${g3}3z`yw*rvl4l1^EX6VjM%b

delta 91
zcmZp0XmHrjz{be9nNix5nOlM3KNNrj<XJWgNDF`nP9}B%1~8aBk!9lKi6Za8I=~7-
V7<o8=+UkFRb@GGtO_rB`2mp$yAO-*c

diff --git a/test/Microsoft.Framework.WebEncoders.Tests/UnicodeHelpersTests.cs b/test/Microsoft.Framework.WebEncoders.Tests/UnicodeHelpersTests.cs
index 1984a61a53..6be7c93bbc 100644
--- a/test/Microsoft.Framework.WebEncoders.Tests/UnicodeHelpersTests.cs
+++ b/test/Microsoft.Framework.WebEncoders.Tests/UnicodeHelpersTests.cs
@@ -101,8 +101,7 @@ namespace Microsoft.Framework.WebEncoders
                 bool actual = UnicodeHelpers.IsCharacterDefined((char)i);
                 if (expected != actual)
                 {
-                    string message = String.Format(CultureInfo.InvariantCulture, "Character U+{0:X4}: expected = {1}, actual = {2}", i, expected, actual);
-                    errors.Add(message);
+                    errors.Add($"Character U+{i:X4}: expected = {expected}, actual = {actual}");
                 }
             }
 
@@ -164,11 +163,18 @@ namespace Microsoft.Framework.WebEncoders
             {
                 string[] splitLine = line.Split(';');
                 uint codePoint = UInt32.Parse(splitLine[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture);
+                string name = splitLine[1];
                 if (codePoint >= retVal.Length)
                 {
                     continue; // don't care about supplementary chars
                 }
 
+                if (name.EndsWith(", First>", StringComparison.Ordinal) || name.EndsWith(", Last>", StringComparison.Ordinal))
+                {
+                    // ignore spans - we'll handle them separately
+                    continue;
+                }
+
                 if (codePoint == (uint)' ')
                 {
                     retVal[codePoint] = true; // we allow U+0020 SPACE as our only valid Zs (whitespace) char
@@ -184,6 +190,24 @@ namespace Microsoft.Framework.WebEncoders
                 }
             }
 
+            // Handle known spans from Unicode 7.0.0's UnicodeData.txt
+
+            // CJK Ideograph Extension A
+            for (int i = '\u3400'; i <= '\u4DB5'; i++)
+            {
+                retVal[i] = true;
+            }
+            // CJK Ideograph
+            for (int i = '\u4E00'; i <= '\u9FCC'; i++)
+            {
+                retVal[i] = true;
+            }
+            // Hangul Syllable
+            for (int i = '\uAC00'; i <= '\uD7A3'; i++)
+            {
+                retVal[i] = true;
+            }
+
             // Finally, we need to make sure we've seen every category which contains
             // allowed characters. This provides extra defense against having a typo
             // in the list of categories.
diff --git a/unicode/Generators/DefinedCharListGenerator/Program.cs b/unicode/Generators/DefinedCharListGenerator/Program.cs
index f98dd32f5b..edcd0f3602 100644
--- a/unicode/Generators/DefinedCharListGenerator/Program.cs
+++ b/unicode/Generators/DefinedCharListGenerator/Program.cs
@@ -1,4 +1,5 @@
 ﻿using System;
+using System.Collections.Generic;
 using System.Diagnostics;
 using System.Globalization;
 using System.IO;
@@ -19,6 +20,7 @@ namespace DefinedCharListGenerator
 
             const uint MAX_UNICODE_CHAR = 0x10FFFF; // Unicode range is U+0000 .. U+10FFFF
             bool[] definedChars = new bool[MAX_UNICODE_CHAR + 1];
+            Dictionary<string, Span> spans = new Dictionary<string, Span>();
 
             // Read all defined characters from the input file.
             string[] allLines = File.ReadAllLines("UnicodeData.txt");
@@ -28,11 +30,33 @@ namespace DefinedCharListGenerator
             foreach (string line in allLines)
             {
                 string[] splitLine = line.Split(new char[] { ';' }, 4);
+                uint codepoint = uint.Parse(splitLine[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture);
+                string rawName = splitLine[1];
+                string category = splitLine[2];
+
+                // spans go into their own dictionary for later processing
+                string spanName;
+                bool isStartOfSpan;
+                if (IsSpanDefinition(rawName, out spanName, out isStartOfSpan))
+                {
+                    if (isStartOfSpan)
+                    {
+                        spans.Add(spanName, new Span() { FirstCodePoint = codepoint, Category = category });
+                    }
+                    else
+                    {
+                        var existingSpan = spans[spanName];
+                        Debug.Assert(existingSpan.FirstCodePoint != 0, "We should've seen the start of this span already.");
+                        Debug.Assert(existingSpan.LastCodePoint == 0, "We shouldn't have seen the end of this span already.");
+                        Debug.Assert(existingSpan.Category == category, "Span start Unicode category doesn't match span end Unicode category.");
+                        existingSpan.LastCodePoint = codepoint;
+                    }
+                    continue;
+                }
 
                 // We only allow certain categories of code points.
                 // Zs (space separators) aren't included, but we allow U+0020 SPACE as a special case
-                uint codepoint = uint.Parse(splitLine[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture);
-                string category = splitLine[2];
+
                 if (!(codepoint == (uint)' ' || IsAllowedUnicodeCategory(category)))
                 {
                     continue;
@@ -42,6 +66,21 @@ namespace DefinedCharListGenerator
                 definedChars[codepoint] = true;
             }
 
+            // Next, populate characters that weren't defined on their own lines
+            // but which are instead defined as members of a named span.
+            foreach (var span in spans.Values)
+            {
+                if (IsAllowedUnicodeCategory(span.Category))
+                {
+                    Debug.Assert(span.FirstCodePoint <= MAX_UNICODE_CHAR);
+                    Debug.Assert(span.LastCodePoint <= MAX_UNICODE_CHAR);
+                    for (uint i = span.FirstCodePoint; i <= span.LastCodePoint; i++)
+                    {
+                        definedChars[i] = true;
+                    }
+                }
+            }
+
             // Finally, write the list of defined characters out as a bitmap.
             // Each consecutive block of 8 chars is written as a single byte.
             // For instance, the first byte of the output file contains the
@@ -103,5 +142,39 @@ namespace DefinedCharListGenerator
                 || category == "So"
                 || category == "Cf"; /* other */
         }
+
+        private static bool IsSpanDefinition(string rawName, out string spanName, out bool isStartOfSpan)
+        {
+            // Reports whether rawName is one end of a span, which UnicodeData.txt writes as:
+            // DC00;<Low Surrogate, First>;Cs;0;L;;;;;N;;;;;
+            // DFFF;<Low Surrogate, Last>;Cs;0;L;;;;;N;;;;;
+            const string firstSuffix = ", First>";
+            const string lastSuffix = ", Last>";
+            // Defaults for anything that isn't a span (plain names, <control>, ...).
+            spanName = null;
+            isStartOfSpan = false;
+            if (!rawName.StartsWith("<", StringComparison.Ordinal))
+            {
+                return false;
+            }
+
+            isStartOfSpan = rawName.EndsWith(firstSuffix, StringComparison.Ordinal);
+            if (!isStartOfSpan && !rawName.EndsWith(lastSuffix, StringComparison.Ordinal))
+            {
+                return false;
+            }
+
+            // The span name is what remains between the leading '<' and the suffix.
+            string suffix = isStartOfSpan ? firstSuffix : lastSuffix;
+            spanName = rawName.Substring(1, rawName.Length - 1 - suffix.Length);
+            return true;
+        }
+
+        private class Span // mutable record of one named span; filled in across its two UnicodeData.txt lines
+        {
+            public uint FirstCodePoint; // code point read from the span's ", First>" line
+            public uint LastCodePoint; // code point read from the span's ", Last>" line; 0 until then
+            public string Category; // Unicode general category recorded from the ", First>" line
+        }
     }
 }