Cleaned up directory structure

author: Jose Colon Rodriguez 2024-02-19 09:11:56 -0400
committer: Jose Colon Rodriguez 2024-02-19 09:11:56 -0400
commit: 6013b2ded106521ee9cae6bd77dacbd5254ff763 (patch)
tree: 990f13cfbe4bfc20a08d2f097c4646984bffb565 /data/unicode/PropertyAliases.txt
parent: Tried SIMD lower/upper string. Slower than linear. (diff)
download: zg-6013b2ded106521ee9cae6bd77dacbd5254ff763.tar.gz zg-6013b2ded106521ee9cae6bd77dacbd5254ff763.tar.xz zg-6013b2ded106521ee9cae6bd77dacbd5254ff763.zip
1 files changed, 217 insertions, 0 deletions
diff --git a/data/unicode/PropertyAliases.txt b/data/unicode/PropertyAliases.txt
new file mode 100644
index 0000000..686b25a
--- /dev/null
+++ b/data/unicode/PropertyAliases.txt
@@ -0,0 +1,217 @@
+# PropertyAliases-15.1.0.txt
+# Date: 2023-08-07, 15:21:34 GMT
+# © 2023 Unicode®, Inc.
+# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+# For terms of use, see https://www.unicode.org/terms_of_use.html
+#
+# Unicode Character Database
+#   For documentation, see https://www.unicode.org/reports/tr44/
+#
+# This file contains aliases for properties used in the UCD.
+# These names can be used for XML formats of UCD data, for regular-expression
+# property tests, and other programmatic textual descriptions of Unicode data.
+#
+# The names may be translated in appropriate environments, and additional
+# aliases may be useful.
+#
+# FORMAT
+#
+# Each line has two or more fields, separated by semicolons.
+#
+# First Field: The first field is the short name for the property.
+# It is typically an abbreviation, but in a number of cases it is simply
+# a duplicate of the "long name" in the second field.
+# For Unihan database tags, the short name is actually a longer string than
+# the tag specified in the second field.
+#
+# Second Field: The second field is the long name for the property,
+# typically the formal name used in documentation about the property.
+#
+# The above are the preferred aliases. Other aliases may be listed in additional fields.
+#
+# Loose matching should be applied to all property names and property values, with
+# the exception of String Property values. With loose matching of property names and
+# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property
+# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1".
+#
+# NOTE: Property value names are NOT unique across properties. For example:
+#
+#   AL means Arabic Letter for the Bidi_Class property, and
+#   AL means Above_Left for the Combining_Class property, and
+#   AL means Alphabetic for the Line_Break property.
+#
+# In addition, some property names may be the same as some property value names.
+# For example:
+#
+#   sc means the Script property, and
+#   Sc means the General_Category property value Currency_Symbol (Sc)
+#
+# The combination of property value and property name is, however, unique.
+#
+# For more information, see UAX #44, Unicode Character Database, and
+# UTS #18, Unicode Regular Expressions.
+# ================================================
+# ================================================
+# Numeric Properties
+# ================================================
+cjkAccountingNumeric     ; kAccountingNumeric
+cjkOtherNumeric          ; kOtherNumeric
+cjkPrimaryNumeric        ; kPrimaryNumeric
+nv                       ; Numeric_Value
+# ================================================
+# String Properties
+# ================================================
+bmg                      ; Bidi_Mirroring_Glyph
+bpb                      ; Bidi_Paired_Bracket
+cf                       ; Case_Folding
+cjkCompatibilityVariant  ; kCompatibilityVariant
+dm                       ; Decomposition_Mapping
+EqUIdeo                  ; Equivalent_Unified_Ideograph
+FC_NFKC                  ; FC_NFKC_Closure
+lc                       ; Lowercase_Mapping
+NFKC_CF                  ; NFKC_Casefold
+NFKC_SCF                 ; NFKC_Simple_Casefold
+scf                      ; Simple_Case_Folding         ; sfc
+slc                      ; Simple_Lowercase_Mapping
+stc                      ; Simple_Titlecase_Mapping
+suc                      ; Simple_Uppercase_Mapping
+tc                       ; Titlecase_Mapping
+uc                       ; Uppercase_Mapping
+# ================================================
+# Miscellaneous Properties
+# ================================================
+cjkIICore                ; kIICore
+cjkIRG_GSource           ; kIRG_GSource
+cjkIRG_HSource           ; kIRG_HSource
+cjkIRG_JSource           ; kIRG_JSource
+cjkIRG_KPSource          ; kIRG_KPSource
+cjkIRG_KSource           ; kIRG_KSource
+cjkIRG_MSource           ; kIRG_MSource
+cjkIRG_SSource           ; kIRG_SSource
+cjkIRG_TSource           ; kIRG_TSource
+cjkIRG_UKSource          ; kIRG_UKSource
+cjkIRG_USource           ; kIRG_USource
+cjkIRG_VSource           ; kIRG_VSource
+cjkRSUnicode             ; kRSUnicode                  ; Unicode_Radical_Stroke; URS
+isc                      ; ISO_Comment
+JSN                      ; Jamo_Short_Name
+na                       ; Name
+na1                      ; Unicode_1_Name
+Name_Alias               ; Name_Alias
+scx                      ; Script_Extensions
+# ================================================
+# Catalog Properties
+# ================================================
+age                      ; Age
+blk                      ; Block
+sc                       ; Script
+# ================================================
+# Enumerated Properties
+# ================================================
+bc                       ; Bidi_Class
+bpt                      ; Bidi_Paired_Bracket_Type
+ccc                      ; Canonical_Combining_Class
+dt                       ; Decomposition_Type
+ea                       ; East_Asian_Width
+gc                       ; General_Category
+GCB                      ; Grapheme_Cluster_Break
+hst                      ; Hangul_Syllable_Type
+InCB                     ; Indic_Conjunct_Break
+InPC                     ; Indic_Positional_Category
+InSC                     ; Indic_Syllabic_Category
+jg                       ; Joining_Group
+jt                       ; Joining_Type
+lb                       ; Line_Break
+NFC_QC                   ; NFC_Quick_Check
+NFD_QC                   ; NFD_Quick_Check
+NFKC_QC                  ; NFKC_Quick_Check
+NFKD_QC                  ; NFKD_Quick_Check
+nt                       ; Numeric_Type
+SB                       ; Sentence_Break
+vo                       ; Vertical_Orientation
+WB                       ; Word_Break
+# ================================================
+# Binary Properties
+# ================================================
+AHex                     ; ASCII_Hex_Digit
+Alpha                    ; Alphabetic
+Bidi_C                   ; Bidi_Control
+Bidi_M                   ; Bidi_Mirrored
+Cased                    ; Cased
+CE                       ; Composition_Exclusion
+CI                       ; Case_Ignorable
+Comp_Ex                  ; Full_Composition_Exclusion
+CWCF                     ; Changes_When_Casefolded
+CWCM                     ; Changes_When_Casemapped
+CWKCF                    ; Changes_When_NFKC_Casefolded
+CWL                      ; Changes_When_Lowercased
+CWT                      ; Changes_When_Titlecased
+CWU                      ; Changes_When_Uppercased
+Dash                     ; Dash
+Dep                      ; Deprecated
+DI                       ; Default_Ignorable_Code_Point
+Dia                      ; Diacritic
+EBase                    ; Emoji_Modifier_Base
+EComp                    ; Emoji_Component
+EMod                     ; Emoji_Modifier
+Emoji                    ; Emoji
+EPres                    ; Emoji_Presentation
+Ext                      ; Extender
+ExtPict                  ; Extended_Pictographic
+Gr_Base                  ; Grapheme_Base
+Gr_Ext                   ; Grapheme_Extend
+Gr_Link                  ; Grapheme_Link
+Hex                      ; Hex_Digit
+Hyphen                   ; Hyphen
+ID_Compat_Math_Continue  ; ID_Compat_Math_Continue
+ID_Compat_Math_Start     ; ID_Compat_Math_Start
+IDC                      ; ID_Continue
+Ideo                     ; Ideographic
+IDS                      ; ID_Start
+IDSB                     ; IDS_Binary_Operator
+IDST                     ; IDS_Trinary_Operator
+IDSU                     ; IDS_Unary_Operator
+Join_C                   ; Join_Control
+LOE                      ; Logical_Order_Exception
+Lower                    ; Lowercase
+Math                     ; Math
+NChar                    ; Noncharacter_Code_Point
+OAlpha                   ; Other_Alphabetic
+ODI                      ; Other_Default_Ignorable_Code_Point
+OGr_Ext                  ; Other_Grapheme_Extend
+OIDC                     ; Other_ID_Continue
+OIDS                     ; Other_ID_Start
+OLower                   ; Other_Lowercase
+OMath                    ; Other_Math
+OUpper                   ; Other_Uppercase
+Pat_Syn                  ; Pattern_Syntax
+Pat_WS                   ; Pattern_White_Space
+PCM                      ; Prepended_Concatenation_Mark
+QMark                    ; Quotation_Mark
+Radical                  ; Radical
+RI                       ; Regional_Indicator
+SD                       ; Soft_Dotted
+STerm                    ; Sentence_Terminal
+Term                     ; Terminal_Punctuation
+UIdeo                    ; Unified_Ideograph
+Upper                    ; Uppercase
+VS                       ; Variation_Selector
+WSpace                   ; White_Space                 ; space
+XIDC                     ; XID_Continue
+XIDS                     ; XID_Start
+XO_NFC                   ; Expands_On_NFC
+XO_NFD                   ; Expands_On_NFD
+XO_NFKC                  ; Expands_On_NFKC
+XO_NFKD                  ; Expands_On_NFKD
+# ================================================
+# Total:    134
+# EOF
author	Jose Colon Rodriguez	2024-02-19 09:11:56 -0400
committer	Jose Colon Rodriguez	2024-02-19 09:11:56 -0400
commit	6013b2ded106521ee9cae6bd77dacbd5254ff763 (patch)
tree	990f13cfbe4bfc20a08d2f097c4646984bffb565 /data/unicode/PropertyAliases.txt
parent	Tried SIMD lower/upper string. Slower than linear. (diff)
download	zg-6013b2ded106521ee9cae6bd77dacbd5254ff763.tar.gz zg-6013b2ded106521ee9cae6bd77dacbd5254ff763.tar.xz zg-6013b2ded106521ee9cae6bd77dacbd5254ff763.zip

diff --git a/data/unicode/PropertyAliases.txt b/data/unicode/PropertyAliases.txt
new file mode 100644
index 0000000..686b25a
--- /dev/null
+++ b/data/unicode/PropertyAliases.txt
@@ -0,0 +1,217 @@
	1	# PropertyAliases-15.1.0.txt
	2	# Date: 2023-08-07, 15:21:34 GMT
	3	# © 2023 Unicode®, Inc.
	4	# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
	5	# For terms of use, see https://www.unicode.org/terms_of_use.html
	6	#
	7	# Unicode Character Database
	8	# For documentation, see https://www.unicode.org/reports/tr44/
	9	#
	10	# This file contains aliases for properties used in the UCD.
	11	# These names can be used for XML formats of UCD data, for regular-expression
	12	# property tests, and other programmatic textual descriptions of Unicode data.
	13	#
	14	# The names may be translated in appropriate environments, and additional
	15	# aliases may be useful.
	16	#
	17	# FORMAT
	18	#
	19	# Each line has two or more fields, separated by semicolons.
	20	#
	21	# First Field: The first field is the short name for the property.
	22	# It is typically an abbreviation, but in a number of cases it is simply
	23	# a duplicate of the "long name" in the second field.
	24	# For Unihan database tags, the short name is actually a longer string than
	25	# the tag specified in the second field.
	26	#
	27	# Second Field: The second field is the long name for the property,
	28	# typically the formal name used in documentation about the property.
	29	#
	30	# The above are the preferred aliases. Other aliases may be listed in additional fields.
	31	#
	32	# Loose matching should be applied to all property names and property values, with
	33	# the exception of String Property values. With loose matching of property names and
	34	# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property
	35	# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1".
	36	#
	37	# NOTE: Property value names are NOT unique across properties. For example:
	38	#
	39	# AL means Arabic Letter for the Bidi_Class property, and
	40	# AL means Above_Left for the Combining_Class property, and
	41	# AL means Alphabetic for the Line_Break property.
	42	#
	43	# In addition, some property names may be the same as some property value names.
	44	# For example:
	45	#
	46	# sc means the Script property, and
	47	# Sc means the General_Category property value Currency_Symbol (Sc)
	48	#
	49	# The combination of property value and property name is, however, unique.
	50	#
	51	# For more information, see UAX #44, Unicode Character Database, and
	52	# UTS #18, Unicode Regular Expressions.
	53	# ================================================
	54
	55
	56	# ================================================
	57	# Numeric Properties
	58	# ================================================
	59	cjkAccountingNumeric ; kAccountingNumeric
	60	cjkOtherNumeric ; kOtherNumeric
	61	cjkPrimaryNumeric ; kPrimaryNumeric
	62	nv ; Numeric_Value
	63
	64	# ================================================
	65	# String Properties
	66	# ================================================
	67	bmg ; Bidi_Mirroring_Glyph
	68	bpb ; Bidi_Paired_Bracket
	69	cf ; Case_Folding
	70	cjkCompatibilityVariant ; kCompatibilityVariant
	71	dm ; Decomposition_Mapping
	72	EqUIdeo ; Equivalent_Unified_Ideograph
	73	FC_NFKC ; FC_NFKC_Closure
	74	lc ; Lowercase_Mapping
	75	NFKC_CF ; NFKC_Casefold
	76	NFKC_SCF ; NFKC_Simple_Casefold
	77	scf ; Simple_Case_Folding ; sfc
	78	slc ; Simple_Lowercase_Mapping
	79	stc ; Simple_Titlecase_Mapping
	80	suc ; Simple_Uppercase_Mapping
	81	tc ; Titlecase_Mapping
	82	uc ; Uppercase_Mapping
	83
	84	# ================================================
	85	# Miscellaneous Properties
	86	# ================================================
	87	cjkIICore ; kIICore
	88	cjkIRG_GSource ; kIRG_GSource
	89	cjkIRG_HSource ; kIRG_HSource
	90	cjkIRG_JSource ; kIRG_JSource
	91	cjkIRG_KPSource ; kIRG_KPSource
	92	cjkIRG_KSource ; kIRG_KSource
	93	cjkIRG_MSource ; kIRG_MSource
	94	cjkIRG_SSource ; kIRG_SSource
	95	cjkIRG_TSource ; kIRG_TSource
	96	cjkIRG_UKSource ; kIRG_UKSource
	97	cjkIRG_USource ; kIRG_USource
	98	cjkIRG_VSource ; kIRG_VSource
	99	cjkRSUnicode ; kRSUnicode ; Unicode_Radical_Stroke; URS
	100	isc ; ISO_Comment
	101	JSN ; Jamo_Short_Name
	102	na ; Name
	103	na1 ; Unicode_1_Name
	104	Name_Alias ; Name_Alias
	105	scx ; Script_Extensions
	106
	107	# ================================================
	108	# Catalog Properties
	109	# ================================================
	110	age ; Age
	111	blk ; Block
	112	sc ; Script
	113
	114	# ================================================
	115	# Enumerated Properties
	116	# ================================================
	117	bc ; Bidi_Class
	118	bpt ; Bidi_Paired_Bracket_Type
	119	ccc ; Canonical_Combining_Class
	120	dt ; Decomposition_Type
	121	ea ; East_Asian_Width
	122	gc ; General_Category
	123	GCB ; Grapheme_Cluster_Break
	124	hst ; Hangul_Syllable_Type
	125	InCB ; Indic_Conjunct_Break
	126	InPC ; Indic_Positional_Category
	127	InSC ; Indic_Syllabic_Category
	128	jg ; Joining_Group
	129	jt ; Joining_Type
	130	lb ; Line_Break
	131	NFC_QC ; NFC_Quick_Check
	132	NFD_QC ; NFD_Quick_Check
	133	NFKC_QC ; NFKC_Quick_Check
	134	NFKD_QC ; NFKD_Quick_Check
	135	nt ; Numeric_Type
	136	SB ; Sentence_Break
	137	vo ; Vertical_Orientation
	138	WB ; Word_Break
	139
	140	# ================================================
	141	# Binary Properties
	142	# ================================================
	143	AHex ; ASCII_Hex_Digit
	144	Alpha ; Alphabetic
	145	Bidi_C ; Bidi_Control
	146	Bidi_M ; Bidi_Mirrored
	147	Cased ; Cased
	148	CE ; Composition_Exclusion
	149	CI ; Case_Ignorable
	150	Comp_Ex ; Full_Composition_Exclusion
	151	CWCF ; Changes_When_Casefolded
	152	CWCM ; Changes_When_Casemapped
	153	CWKCF ; Changes_When_NFKC_Casefolded
	154	CWL ; Changes_When_Lowercased
	155	CWT ; Changes_When_Titlecased
	156	CWU ; Changes_When_Uppercased
	157	Dash ; Dash
	158	Dep ; Deprecated
	159	DI ; Default_Ignorable_Code_Point
	160	Dia ; Diacritic
	161	EBase ; Emoji_Modifier_Base
	162	EComp ; Emoji_Component
	163	EMod ; Emoji_Modifier
	164	Emoji ; Emoji
	165	EPres ; Emoji_Presentation
	166	Ext ; Extender
	167	ExtPict ; Extended_Pictographic
	168	Gr_Base ; Grapheme_Base
	169	Gr_Ext ; Grapheme_Extend
	170	Gr_Link ; Grapheme_Link
	171	Hex ; Hex_Digit
	172	Hyphen ; Hyphen
	173	ID_Compat_Math_Continue ; ID_Compat_Math_Continue
	174	ID_Compat_Math_Start ; ID_Compat_Math_Start
	175	IDC ; ID_Continue
	176	Ideo ; Ideographic
	177	IDS ; ID_Start
	178	IDSB ; IDS_Binary_Operator
	179	IDST ; IDS_Trinary_Operator
	180	IDSU ; IDS_Unary_Operator
	181	Join_C ; Join_Control
	182	LOE ; Logical_Order_Exception
	183	Lower ; Lowercase
	184	Math ; Math
	185	NChar ; Noncharacter_Code_Point
	186	OAlpha ; Other_Alphabetic
	187	ODI ; Other_Default_Ignorable_Code_Point
	188	OGr_Ext ; Other_Grapheme_Extend
	189	OIDC ; Other_ID_Continue
	190	OIDS ; Other_ID_Start
	191	OLower ; Other_Lowercase
	192	OMath ; Other_Math
	193	OUpper ; Other_Uppercase
	194	Pat_Syn ; Pattern_Syntax
	195	Pat_WS ; Pattern_White_Space
	196	PCM ; Prepended_Concatenation_Mark
	197	QMark ; Quotation_Mark
	198	Radical ; Radical
	199	RI ; Regional_Indicator
	200	SD ; Soft_Dotted
	201	STerm ; Sentence_Terminal
	202	Term ; Terminal_Punctuation
	203	UIdeo ; Unified_Ideograph
	204	Upper ; Uppercase
	205	VS ; Variation_Selector
	206	WSpace ; White_Space ; space
	207	XIDC ; XID_Continue
	208	XIDS ; XID_Start
	209	XO_NFC ; Expands_On_NFC
	210	XO_NFD ; Expands_On_NFD
	211	XO_NFKC ; Expands_On_NFKC
	212	XO_NFKD ; Expands_On_NFKD
	213
	214	# ================================================
	215	# Total: 134
	216
	217	# EOF