summaryrefslogtreecommitdiff
path: root/data/unicode/CompositionExclusions.txt
diff options
context:
space:
mode:
authorGravatar Jose Colon Rodriguez2024-02-19 09:11:56 -0400
committerGravatar Jose Colon Rodriguez2024-02-19 09:11:56 -0400
commit6013b2ded106521ee9cae6bd77dacbd5254ff763 (patch)
tree990f13cfbe4bfc20a08d2f097c4646984bffb565 /data/unicode/CompositionExclusions.txt
parentTried SIMD lower/upper string. Slower than linear. (diff)
downloadzg-6013b2ded106521ee9cae6bd77dacbd5254ff763.tar.gz
zg-6013b2ded106521ee9cae6bd77dacbd5254ff763.tar.xz
zg-6013b2ded106521ee9cae6bd77dacbd5254ff763.zip
Cleaned up directory structure
Diffstat (limited to 'data/unicode/CompositionExclusions.txt')
-rw-r--r--data/unicode/CompositionExclusions.txt221
1 files changed, 221 insertions, 0 deletions
diff --git a/data/unicode/CompositionExclusions.txt b/data/unicode/CompositionExclusions.txt
new file mode 100644
index 0000000..db708a7
--- /dev/null
+++ b/data/unicode/CompositionExclusions.txt
@@ -0,0 +1,221 @@
1# CompositionExclusions-15.1.0.txt
2# Date: 2023-01-05
3# © 2023 Unicode®, Inc.
4# For terms of use, see https://www.unicode.org/terms_of_use.html
5#
6# Unicode Character Database
7# For documentation, see https://www.unicode.org/reports/tr44/
8#
9# This file lists the characters for the Composition Exclusion Table
10# defined in UAX #15, Unicode Normalization Forms.
11#
12# This file is a normative contributory data file in the
13# Unicode Character Database.
14#
15# For more information, see
16# https://www.unicode.org/reports/tr15/#Primary_Exclusion_List_Table
17#
18# For a full derivation of composition exclusions, see the derived property
19# Full_Composition_Exclusion in DerivedNormalizationProps.txt
20#
21
22# ================================================
23# (1) Script Specifics
24#
25# This list of characters cannot be derived from the UnicodeData.txt file.
26#
27# Included are the following subcategories:
28#
29# - Many precomposed characters using a nukta diacritic in the Devanagari,
30# Bangla/Bengali, Gurmukhi, or Odia/Oriya scripts.
31# - Tibetan letters and subjoined letters with decompositions including
32# U+0FB7 TIBETAN SUBJOINED LETTER HA or U+0FB5 TIBETAN SUBJOINED LETTER SSA.
33# - Two two-part Tibetan vowel signs involving top and bottom pieces.
34# - A large collection of compatibility precomposed characters for Hebrew
35# involving dagesh and/or other combining marks.
36#
37# This list is unlikely to grow.
38#
39# ================================================
40
410958 # DEVANAGARI LETTER QA
420959 # DEVANAGARI LETTER KHHA
43095A # DEVANAGARI LETTER GHHA
44095B # DEVANAGARI LETTER ZA
45095C # DEVANAGARI LETTER DDDHA
46095D # DEVANAGARI LETTER RHA
47095E # DEVANAGARI LETTER FA
48095F # DEVANAGARI LETTER YYA
4909DC # BENGALI LETTER RRA
5009DD # BENGALI LETTER RHA
5109DF # BENGALI LETTER YYA
520A33 # GURMUKHI LETTER LLA
530A36 # GURMUKHI LETTER SHA
540A59 # GURMUKHI LETTER KHHA
550A5A # GURMUKHI LETTER GHHA
560A5B # GURMUKHI LETTER ZA
570A5E # GURMUKHI LETTER FA
580B5C # ORIYA LETTER RRA
590B5D # ORIYA LETTER RHA
600F43 # TIBETAN LETTER GHA
610F4D # TIBETAN LETTER DDHA
620F52 # TIBETAN LETTER DHA
630F57 # TIBETAN LETTER BHA
640F5C # TIBETAN LETTER DZHA
650F69 # TIBETAN LETTER KSSA
660F76 # TIBETAN VOWEL SIGN VOCALIC R
670F78 # TIBETAN VOWEL SIGN VOCALIC L
680F93 # TIBETAN SUBJOINED LETTER GHA
690F9D # TIBETAN SUBJOINED LETTER DDHA
700FA2 # TIBETAN SUBJOINED LETTER DHA
710FA7 # TIBETAN SUBJOINED LETTER BHA
720FAC # TIBETAN SUBJOINED LETTER DZHA
730FB9 # TIBETAN SUBJOINED LETTER KSSA
74FB1D # HEBREW LETTER YOD WITH HIRIQ
75FB1F # HEBREW LIGATURE YIDDISH YOD YOD PATAH
76FB2A # HEBREW LETTER SHIN WITH SHIN DOT
77FB2B # HEBREW LETTER SHIN WITH SIN DOT
78FB2C # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
79FB2D # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
80FB2E # HEBREW LETTER ALEF WITH PATAH
81FB2F # HEBREW LETTER ALEF WITH QAMATS
82FB30 # HEBREW LETTER ALEF WITH MAPIQ
83FB31 # HEBREW LETTER BET WITH DAGESH
84FB32 # HEBREW LETTER GIMEL WITH DAGESH
85FB33 # HEBREW LETTER DALET WITH DAGESH
86FB34 # HEBREW LETTER HE WITH MAPIQ
87FB35 # HEBREW LETTER VAV WITH DAGESH
88FB36 # HEBREW LETTER ZAYIN WITH DAGESH
89FB38 # HEBREW LETTER TET WITH DAGESH
90FB39 # HEBREW LETTER YOD WITH DAGESH
91FB3A # HEBREW LETTER FINAL KAF WITH DAGESH
92FB3B # HEBREW LETTER KAF WITH DAGESH
93FB3C # HEBREW LETTER LAMED WITH DAGESH
94FB3E # HEBREW LETTER MEM WITH DAGESH
95FB40 # HEBREW LETTER NUN WITH DAGESH
96FB41 # HEBREW LETTER SAMEKH WITH DAGESH
97FB43 # HEBREW LETTER FINAL PE WITH DAGESH
98FB44 # HEBREW LETTER PE WITH DAGESH
99FB46 # HEBREW LETTER TSADI WITH DAGESH
100FB47 # HEBREW LETTER QOF WITH DAGESH
101FB48 # HEBREW LETTER RESH WITH DAGESH
102FB49 # HEBREW LETTER SHIN WITH DAGESH
103FB4A # HEBREW LETTER TAV WITH DAGESH
104FB4B # HEBREW LETTER VAV WITH HOLAM
105FB4C # HEBREW LETTER BET WITH RAFE
106FB4D # HEBREW LETTER KAF WITH RAFE
107FB4E # HEBREW LETTER PE WITH RAFE
108
109# Total code points: 67
110
111# ================================================
112# (2) Post Composition Version precomposed characters
113#
114# These characters cannot be derived solely from the UnicodeData.txt file
115# in this version of Unicode.
116#
117# Note that characters added to the standard after the
118# Composition Version and which have canonical decomposition mappings
119# are not automatically added to this list of Post Composition
120# Version precomposed characters.
121# ================================================
122
1232ADC # FORKING
1241D15E # MUSICAL SYMBOL HALF NOTE
1251D15F # MUSICAL SYMBOL QUARTER NOTE
1261D160 # MUSICAL SYMBOL EIGHTH NOTE
1271D161 # MUSICAL SYMBOL SIXTEENTH NOTE
1281D162 # MUSICAL SYMBOL THIRTY-SECOND NOTE
1291D163 # MUSICAL SYMBOL SIXTY-FOURTH NOTE
1301D164 # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
1311D1BB # MUSICAL SYMBOL MINIMA
1321D1BC # MUSICAL SYMBOL MINIMA BLACK
1331D1BD # MUSICAL SYMBOL SEMIMINIMA WHITE
1341D1BE # MUSICAL SYMBOL SEMIMINIMA BLACK
1351D1BF # MUSICAL SYMBOL FUSA WHITE
1361D1C0 # MUSICAL SYMBOL FUSA BLACK
137
138# Total code points: 14
139
140# ================================================
141# (3) Singleton Decompositions
142#
143# These characters can be derived from the UnicodeData.txt file
144# by including all canonically decomposable characters whose
145# canonical decomposition consists of a single character.
146#
147# These characters are simply quoted here for reference.
148# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
149# ================================================
150
151# 0340..0341 [2] COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK
152# 0343 COMBINING GREEK KORONIS
153# 0374 GREEK NUMERAL SIGN
154# 037E GREEK QUESTION MARK
155# 0387 GREEK ANO TELEIA
156# 1F71 GREEK SMALL LETTER ALPHA WITH OXIA
157# 1F73 GREEK SMALL LETTER EPSILON WITH OXIA
158# 1F75 GREEK SMALL LETTER ETA WITH OXIA
159# 1F77 GREEK SMALL LETTER IOTA WITH OXIA
160# 1F79 GREEK SMALL LETTER OMICRON WITH OXIA
161# 1F7B GREEK SMALL LETTER UPSILON WITH OXIA
162# 1F7D GREEK SMALL LETTER OMEGA WITH OXIA
163# 1FBB GREEK CAPITAL LETTER ALPHA WITH OXIA
164# 1FBE GREEK PROSGEGRAMMENI
165# 1FC9 GREEK CAPITAL LETTER EPSILON WITH OXIA
166# 1FCB GREEK CAPITAL LETTER ETA WITH OXIA
167# 1FD3 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
168# 1FDB GREEK CAPITAL LETTER IOTA WITH OXIA
169# 1FE3 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
170# 1FEB GREEK CAPITAL LETTER UPSILON WITH OXIA
171# 1FEE..1FEF [2] GREEK DIALYTIKA AND OXIA..GREEK VARIA
172# 1FF9 GREEK CAPITAL LETTER OMICRON WITH OXIA
173# 1FFB GREEK CAPITAL LETTER OMEGA WITH OXIA
174# 1FFD GREEK OXIA
175# 2000..2001 [2] EN QUAD..EM QUAD
176# 2126 OHM SIGN
177# 212A..212B [2] KELVIN SIGN..ANGSTROM SIGN
178# 2329 LEFT-POINTING ANGLE BRACKET
179# 232A RIGHT-POINTING ANGLE BRACKET
180# F900..FA0D [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D
181# FA10 CJK COMPATIBILITY IDEOGRAPH-FA10
182# FA12 CJK COMPATIBILITY IDEOGRAPH-FA12
183# FA15..FA1E [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E
184# FA20 CJK COMPATIBILITY IDEOGRAPH-FA20
185# FA22 CJK COMPATIBILITY IDEOGRAPH-FA22
186# FA25..FA26 [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26
187# FA2A..FA6D [68] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA6D
188# FA70..FAD9 [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
189# 2F800..2FA1D [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
190
191# Total code points: 1035
192
193# ================================================
194# (4) Non-Starter Decompositions
195#
196# These characters can be derived from the UnicodeData.txt file
197# by including each expanding canonical decomposition
198# (i.e., those which canonically decompose to a sequence
199# of characters instead of a single character), such that:
200#
201# A. The character is not a Starter.
202#
203# OR (inclusive)
204#
205# B. The character's canonical decomposition begins
206# with a character that is not a Starter.
207#
208# Note that a "Starter" is any character with a zero combining class.
209#
210# These characters are simply quoted here for reference.
211# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
212# ================================================
213
214# 0344 COMBINING GREEK DIALYTIKA TONOS
215# 0F73 TIBETAN VOWEL SIGN II
216# 0F75 TIBETAN VOWEL SIGN UU
217# 0F81 TIBETAN VOWEL SIGN REVERSED II
218
219# Total code points: 4
220
221# EOF