Update WebEncoders from Unicode 7.0 to Unicode 8.0
Add "how to update" file detailing update steps
This commit is contained in:
parent
f3e828892d
commit
43d0b0f65b
|
|
@ -28,14 +28,14 @@ namespace Microsoft.Framework.WebEncoders
|
|||
|
||||
/// <summary>
|
||||
/// Helper method which creates a bitmap of all characters which are
|
||||
/// defined per version 7.0.0 of the Unicode specification.
|
||||
/// defined per version 8.0 of the Unicode specification.
|
||||
/// </summary>
|
||||
[MethodImpl(MethodImplOptions.NoInlining)]
|
||||
private static uint[] CreateDefinedCharacterBitmap()
|
||||
{
|
||||
// The stream should be exactly 8KB in size.
|
||||
var assembly = typeof(UnicodeHelpers).GetTypeInfo().Assembly;
|
||||
var resourceName = assembly.GetName().Name + ".compiler.resources.unicode-7.0.0-defined-characters.bin";
|
||||
var resourceName = assembly.GetName().Name + ".compiler.resources.unicode-defined-chars.bin";
|
||||
|
||||
var stream = assembly.GetManifestResourceStream(resourceName);
|
||||
if (stream.Length != 8 * 1024)
|
||||
|
|
@ -72,7 +72,7 @@ namespace Microsoft.Framework.WebEncoders
|
|||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a bitmap of all characters which are defined per version 7.0.0
|
||||
/// Returns a bitmap of all characters which are defined per version 8.0
|
||||
/// of the Unicode specification.
|
||||
/// </summary>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
|
|
@ -204,7 +204,7 @@ namespace Microsoft.Framework.WebEncoders
|
|||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a value stating whether a character is defined per version 7.0.0
|
||||
/// Returns a value stating whether a character is defined per version 8.0
|
||||
/// of the Unicode specification. Certain classes of characters (control chars,
|
||||
/// private use, surrogates, some whitespace) are considered "undefined" for
|
||||
/// our purposes.
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ namespace Microsoft.Framework.WebEncoders
|
|||
{
|
||||
/// <summary>
|
||||
/// Contains predefined <see cref="UnicodeRange"/> instances which correspond to blocks
|
||||
/// from the Unicode 7.0 specification.
|
||||
/// from the Unicode 8.0 specification.
|
||||
/// </summary>
|
||||
public static partial class UnicodeRanges
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1277,6 +1277,15 @@ namespace Microsoft.Framework.WebEncoders
|
|||
public static UnicodeRange LatinExtendedE => Volatile.Read(ref _latinExtendedE) ?? CreateRange(ref _latinExtendedE, first: '\uAB30', last: '\uAB6F');
|
||||
private static UnicodeRange _latinExtendedE;
|
||||
|
||||
/// <summary>
|
||||
/// A <see cref="UnicodeRange"/> corresponding to the 'Cherokee Supplement' Unicode block (U+AB70..U+ABBF).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// See http://www.unicode.org/charts/PDF/UAB70.pdf for the full set of characters in this block.
|
||||
/// </remarks>
|
||||
public static UnicodeRange CherokeeSupplement => Volatile.Read(ref _cherokeeSupplement) ?? CreateRange(ref _cherokeeSupplement, first: '\uAB70', last: '\uABBF');
|
||||
private static UnicodeRange _cherokeeSupplement;
|
||||
|
||||
/// <summary>
|
||||
/// A <see cref="UnicodeRange"/> corresponding to the 'Meetei Mayek' Unicode block (U+ABC0..U+ABFF).
|
||||
/// </summary>
|
||||
|
|
@ -1303,7 +1312,7 @@ namespace Microsoft.Framework.WebEncoders
|
|||
/// </remarks>
|
||||
public static UnicodeRange HangulJamoExtendedB => Volatile.Read(ref _hangulJamoExtendedB) ?? CreateRange(ref _hangulJamoExtendedB, first: '\uD7B0', last: '\uD7FF');
|
||||
private static UnicodeRange _hangulJamoExtendedB;
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// A <see cref="UnicodeRange"/> corresponding to the 'CJK Compatibility Ideographs' Unicode block (U+F900..U+FAFF).
|
||||
/// </summary>
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -193,7 +193,7 @@ namespace Microsoft.Framework.WebEncoders
|
|||
}
|
||||
}
|
||||
|
||||
// Handle known spans from Unicode 7.0.0's UnicodeData.txt
|
||||
// Handle known spans from Unicode 8.0's UnicodeData.txt
|
||||
|
||||
// CJK Ideograph Extension A
|
||||
for (int i = '\u3400'; i <= '\u4DB5'; i++)
|
||||
|
|
@ -201,7 +201,7 @@ namespace Microsoft.Framework.WebEncoders
|
|||
retVal[i] = true;
|
||||
}
|
||||
// CJK Ideograph
|
||||
for (int i = '\u4E00'; i <= '\u9FCC'; i++)
|
||||
for (int i = '\u4E00'; i <= '\u9FD5'; i++)
|
||||
{
|
||||
retVal[i] = true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -172,6 +172,7 @@ namespace Microsoft.Framework.WebEncoders
|
|||
[InlineData('\uAAE0', '\uAAFF', nameof(UnicodeRanges.MeeteiMayekExtensions))]
|
||||
[InlineData('\uAB00', '\uAB2F', nameof(UnicodeRanges.EthiopicExtendedA))]
|
||||
[InlineData('\uAB30', '\uAB6F', nameof(UnicodeRanges.LatinExtendedE))]
|
||||
[InlineData('\uAB70', '\uABBF', nameof(UnicodeRanges.CherokeeSupplement))]
|
||||
[InlineData('\uABC0', '\uABFF', nameof(UnicodeRanges.MeeteiMayek))]
|
||||
[InlineData('\uAC00', '\uD7AF', nameof(UnicodeRanges.HangulSyllables))]
|
||||
[InlineData('\uD7B0', '\uD7FF', nameof(UnicodeRanges.HangulJamoExtendedB))]
|
||||
|
|
|
|||
|
|
@ -1,14 +1,11 @@
|
|||
# Blocks-7.0.0.txt
|
||||
# Date: 2014-04-03, 23:23:00 GMT [RP, KW]
|
||||
# Blocks-8.0.0.txt
|
||||
# Date: 2014-11-10, 23:04:00 GMT [KW]
|
||||
#
|
||||
# Unicode Character Database
|
||||
# Copyright (c) 1991-2014 Unicode, Inc.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||
#
|
||||
# Note: The casing of block names is not normative.
|
||||
# For example, "Basic Latin" and "BASIC LATIN" are equivalent.
|
||||
#
|
||||
# Format:
|
||||
# Start Code..End Code; Block Name
|
||||
|
||||
|
|
@ -20,6 +17,14 @@
|
|||
# For more information on the comparison of property values,
|
||||
# see UAX #44: http://www.unicode.org/reports/tr44/
|
||||
#
|
||||
# All block ranges start with a value where (cp MOD 16) = 0,
|
||||
# and end with a value where (cp MOD 16) = 15. In other words,
|
||||
# the last hexadecimal digit of the start of range is ...0
|
||||
# and the last hexadecimal digit of the end of range is ...F.
|
||||
# This constraint on block ranges guarantees that allocations
|
||||
# are done in terms of whole columns, and that code chart display
|
||||
# never involves splitting columns in the charts.
|
||||
#
|
||||
# All code points not explicitly listed for Block
|
||||
# have the value No_Block.
|
||||
|
||||
|
|
@ -168,6 +173,7 @@ AA80..AADF; Tai Viet
|
|||
AAE0..AAFF; Meetei Mayek Extensions
|
||||
AB00..AB2F; Ethiopic Extended-A
|
||||
AB30..AB6F; Latin Extended-E
|
||||
AB70..ABBF; Cherokee Supplement
|
||||
ABC0..ABFF; Meetei Mayek
|
||||
AC00..D7AF; Hangul Syllables
|
||||
D7B0..D7FF; Hangul Jamo Extended-B
|
||||
|
|
@ -210,6 +216,7 @@ FFF0..FFFF; Specials
|
|||
10840..1085F; Imperial Aramaic
|
||||
10860..1087F; Palmyrene
|
||||
10880..108AF; Nabataean
|
||||
108E0..108FF; Hatran
|
||||
10900..1091F; Phoenician
|
||||
10920..1093F; Lydian
|
||||
10980..1099F; Meroitic Hieroglyphs
|
||||
|
|
@ -223,6 +230,7 @@ FFF0..FFFF; Specials
|
|||
10B60..10B7F; Inscriptional Pahlavi
|
||||
10B80..10BAF; Psalter Pahlavi
|
||||
10C00..10C4F; Old Turkic
|
||||
10C80..10CFF; Old Hungarian
|
||||
10E60..10E7F; Rumi Numeral Symbols
|
||||
11000..1107F; Brahmi
|
||||
11080..110CF; Kaithi
|
||||
|
|
@ -232,17 +240,21 @@ FFF0..FFFF; Specials
|
|||
11180..111DF; Sharada
|
||||
111E0..111FF; Sinhala Archaic Numbers
|
||||
11200..1124F; Khojki
|
||||
11280..112AF; Multani
|
||||
112B0..112FF; Khudawadi
|
||||
11300..1137F; Grantha
|
||||
11480..114DF; Tirhuta
|
||||
11580..115FF; Siddham
|
||||
11600..1165F; Modi
|
||||
11680..116CF; Takri
|
||||
11700..1173F; Ahom
|
||||
118A0..118FF; Warang Citi
|
||||
11AC0..11AFF; Pau Cin Hau
|
||||
12000..123FF; Cuneiform
|
||||
12400..1247F; Cuneiform Numbers and Punctuation
|
||||
12480..1254F; Early Dynastic Cuneiform
|
||||
13000..1342F; Egyptian Hieroglyphs
|
||||
14400..1467F; Anatolian Hieroglyphs
|
||||
16800..16A3F; Bamum Supplement
|
||||
16A40..16A6F; Mro
|
||||
16AD0..16AFF; Bassa Vah
|
||||
|
|
@ -257,6 +269,7 @@ FFF0..FFFF; Specials
|
|||
1D300..1D35F; Tai Xuan Jing Symbols
|
||||
1D360..1D37F; Counting Rod Numerals
|
||||
1D400..1D7FF; Mathematical Alphanumeric Symbols
|
||||
1D800..1DAAF; Sutton SignWriting
|
||||
1E800..1E8DF; Mende Kikakui
|
||||
1EE00..1EEFF; Arabic Mathematical Alphabetic Symbols
|
||||
1F000..1F02F; Mahjong Tiles
|
||||
|
|
@ -271,9 +284,11 @@ FFF0..FFFF; Specials
|
|||
1F700..1F77F; Alchemical Symbols
|
||||
1F780..1F7FF; Geometric Shapes Extended
|
||||
1F800..1F8FF; Supplemental Arrows-C
|
||||
1F900..1F9FF; Supplemental Symbols and Pictographs
|
||||
20000..2A6DF; CJK Unified Ideographs Extension B
|
||||
2A700..2B73F; CJK Unified Ideographs Extension C
|
||||
2B740..2B81F; CJK Unified Ideographs Extension D
|
||||
2B820..2CEAF; CJK Unified Ideographs Extension E
|
||||
2F800..2FA1F; CJK Compatibility Ideographs Supplement
|
||||
E0000..E007F; Tags
|
||||
E0100..E01EF; Variation Selectors Supplement
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,94 @@
|
|||
This document contains instructions for updating the Unicode data set used by
|
||||
the WebEncoders project.
|
||||
|
||||
1) Download the latest UnicodeData.txt and Blocks.txt from the Unicode
|
||||
Consortium web site. These files are normally found under
|
||||
http://www.unicode.org/Public/X.Y.Z/ucd/, where X.Y.Z is the version of the
|
||||
Unicode specification of interest. Replace the UnicodeData.txt and
|
||||
Blocks.txt files in this folder with the files you downloaded.
|
||||
|
||||
2) Update unicode-copyright.txt in this folder with the following information:
|
||||
- The exact URLs where you downloaded UnicodeData.txt and Blocks.txt.
|
||||
- The date on which you downloaded these two files.
|
||||
- The Unicode copyright and permission notice, if it has changed. The latest
|
||||
copyright and permission notice can be found at the bottom of
|
||||
http://www.unicode.org/copyright.html.
|
||||
|
||||
3) Open the Generators solution and run the DefinedCharListGenerator project.
|
||||
Running this will drop a file unicode-defined-chars.bin into the output
|
||||
folder. Move this file into the following directory, overwriting the
|
||||
existing file in that directory:
|
||||
src\Microsoft.Framework.WebEncoders.Core\compiler\resources
|
||||
|
||||
4) Open the Generators solution and run the UnicodeTablesGenerator project.
|
||||
Running this will drop two files UnicodeRanges.generated.txt and
|
||||
UnicodeRangesTests.generated.txt into the output folder.
|
||||
|
||||
5) Open UnicodeRanges.generated.txt in your favorite text editor. You'll see
|
||||
that the file contains all of the parsed Unicode block information in
|
||||
ascending code point order. Manually REMOVE the following blocks from this
|
||||
text file and re-save it.
|
||||
- High Surrogates (U+D800..U+DB7F)
|
||||
- High Private Use Surrogates (U+DB80..U+DBFF)
|
||||
- Low Surrogates (U+DC00..U+DFFF)
|
||||
- Private Use Area (U+E000..U+F8FF)
|
||||
|
||||
6) Open src\Microsoft.Framework.WebEncoders.Core\UnicodeRanges.generated.cs in
|
||||
your IDE. Delete everything within the partial class definition and replace
|
||||
it with the contents of UnicodeRanges.generated.txt. (Remember to remove
|
||||
the blocks mentioned in the previous step; otherwise, the unit tests will fail.)
|
||||
|
||||
Open src\Microsoft.Framework.WebEncoders.Core\UnicodeRanges.cs in your IDE.
|
||||
Update the doc comment at the top of the class to reflect the appropriate
|
||||
version of the Unicode specification.
|
||||
|
||||
7) Open UnicodeRangesTests.generated.txt in your favorite text editor. Just
|
||||
like in the previous .txt file, you'll need to remove the [InlineData]
|
||||
lines which map to the Unicode blocks which were manually removed.
|
||||
See step (5) for the list of which blocks must be removed. Then re-save
|
||||
this file.
|
||||
|
||||
8) Open test\Microsoft.Framework.WebEncoders.Tests\UnicodeRangesTests.cs in
|
||||
your IDE. Delete all of the [InlineData] attributes on the Range_Unicode
|
||||
test, then paste in the contents of UnicodeRangesTests.generated.txt
|
||||
to restore the new [InlineData] list.
|
||||
|
||||
IMPORTANT: Don't delete the [Theory] attribute on this method!
|
||||
|
||||
9) Open test\Microsoft.Framework.WebEncoders.Tests\UnicodeHelpersTests.cs in
|
||||
your IDE. Scroll to the bottom of the ReadListOfDefinedCharacters method,
|
||||
and you'll see a section where the test special-cases CJK Ideographs and
|
||||
Hangul Syllables. As more characters are added to the Unicode specification,
|
||||
the list of valid CJK Ideographs and Hangul Syllables can grow, so make sure
|
||||
these match up with the relevant lines in UnicodeData.txt. For instance, at
|
||||
the time of this writing UnicodeData.txt lists the valid Hangul Syllable
|
||||
character range as follows:
|
||||
|
||||
AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
|
||||
D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
|
||||
|
||||
If necessary, update the logic in the ReadListOfDefinedCharacters method to
|
||||
account for any changes to these lines in UnicodeData.txt.
|
||||
|
||||
That's it! Run the unit tests and everything should be good to go. If you find
|
||||
any stray comments throughout the code base that reference a specific version
|
||||
of the Unicode specification, go ahead and update them so that they correctly
|
||||
reflect the version you just submitted.
|
||||
|
||||
To recap, the files you should check in are:
|
||||
|
||||
src\Microsoft.Framework.WebEncoders.Core\compiler\resources\
|
||||
unicode-defined-chars.bin
|
||||
|
||||
src\Microsoft.Framework.WebEncoders.Core\
|
||||
UnicodeRanges.cs
|
||||
UnicodeRanges.generated.cs
|
||||
|
||||
test\Microsoft.Framework.WebEncoders.Tests\
|
||||
UnicodeHelpersTests.cs (if necessary, see step 9)
|
||||
UnicodeRangesTests.cs
|
||||
|
||||
unicode\
|
||||
Blocks.txt
|
||||
unicode-copyright.txt
|
||||
UnicodeData.txt
|
||||
|
|
@ -1,8 +1,8 @@
|
|||
The files Blocks.txt and UnicodeData.txt in this directory were
|
||||
retrieved from the following URLs on Saturday, February 7, 2015.
|
||||
retrieved from the following URLs on Saturday, September 5, 2015.
|
||||
|
||||
http://www.unicode.org/Public/7.0.0/ucd/Blocks.txt
|
||||
http://www.unicode.org/Public/7.0.0/ucd/UnicodeData.txt
|
||||
http://www.unicode.org/Public/8.0.0/ucd/Blocks.txt
|
||||
http://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt
|
||||
|
||||
The below copyright notice applies to these files.
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue