#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl expand first/last ranges
# 2001-01-19 fl added character name tables (2.1)
# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd use string methods
# 2002-10-18 mvl update to Unicode 3.2
# 2002-10-22 mvl generate NFC tables
# 2002-11-24 mvl expand all ranges, sort names version-independently
# 2002-11-25 mvl add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys

# Name and version of this generator; both are written into the banner
# comment of every generated header.
SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.1.0"

# Input file name templates.  The "%s" slot receives a version suffix:
# "" for the current database, "-<version>" for entries of old_versions.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

# Older database versions that are merged into the current one.
old_versions = ["3.2.0"]

# Category abbreviations; a record stores the position of its category in
# this list.  ("Cn" appears twice; list.index() resolves it to slot 0.)
CATEGORY_NAMES = [
    "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd", "Nl", "No",
    "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm", "Lo",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So",
]

# Bidirectional class abbreviations, stored by position as well.
BIDIRECTIONAL_NAMES = [
    "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO", "PDF", "EN",
    "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS", "ON",
]

# East Asian width abbreviations, stored by position as well.
EASTASIANWIDTH_NAMES = ["F", "H", "W", "Na", "A", "N"]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 1 << 0          # 0x01
DECIMAL_MASK = 1 << 1        # 0x02
DIGIT_MASK = 1 << 2          # 0x04
LOWER_MASK = 1 << 3          # 0x08
LINEBREAK_MASK = 1 << 4      # 0x10
SPACE_MASK = 1 << 5          # 0x20
TITLE_MASK = 1 << 6          # 0x40
UPPER_MASK = 1 << 7          # 0x80
XID_START_MASK = 1 << 8      # 0x100
XID_CONTINUE_MASK = 1 << 9   # 0x200
PRINTABLE_MASK = 1 << 10     # 0x400
NODELTA_MASK = 1 << 11       # 0x800
NUMERIC_MASK = 1 << 12       # 0x1000

def maketables(trace=0):
    """Load the Unicode data files and regenerate the three header files.

    The current database is read first.  Every version listed in
    old_versions is then read and folded into it with merge_old_version().
    Finally the name, property and type tables are written, in that order.

    trace -- passed through unchanged to the three table writers.
    """
    current_name = UNICODE_DATA % ""
    print("--- Reading", current_name, "...")

    unicode = UnicodeData(
        current_name,
        COMPOSITION_EXCLUSIONS % "",
        EASTASIAN_WIDTH % "",
        UNIHAN % "",
        DERIVED_CORE_PROPERTIES % "",
        DERIVEDNORMALIZATION_PROPS % "",
    )
    print(sum(1 for record in unicode.table if record), "characters")

    for version in old_versions:
        suffix = "-" + version
        old_name = UNICODE_DATA % suffix
        print("--- Reading", old_name, "...")
        # No normalization-properties file is passed for old versions.
        old_unicode = UnicodeData(
            old_name,
            COMPOSITION_EXCLUSIONS % suffix,
            EASTASIAN_WIDTH % suffix,
            UNIHAN % suffix,
            DERIVED_CORE_PROPERTIES % suffix,
        )
        print(sum(1 for record in old_unicode.table if record), "characters")
        merge_old_version(version, unicode, old_unicode)

    for writer in (makeunicodename, makeunicodedata, makeunicodetype):
        writer(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h from the loaded database.

    unicode -- database object; this function reads its ``chars``
               (code points, also used as list indices), ``table``
               (per-code-point record or a false value), ``exclusions``
               and ``changed`` attributes.
    trace   -- forwarded to splitbins() and Array.dump().

    The generated header contains the unique property records, the NFC
    reindexing ranges, the name string tables, the split index tables for
    records, decompositions and NFC pairs, and one set of delta tables per
    entry of ``unicode.changed``.

    Raises Exception if a decomposition has more than 19 fields.
    """

    # Record 0 is the all-zero default.  The cache maps record -> index, so
    # it has to be seeded in that direction; otherwise a character whose
    # properties equal the default would get a duplicate table entry.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: header word (prefix index + length << 8), then
                # the decomposed code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    _, first, last = decomp
                    comp_first[first] = 1
                    comp_last[last] = 1
                    comp_pairs.append((first, last, char))
                # NOTE(review): decomp_data is a flat list of ints while
                # decomp is a list, so this search looks like it can never
                # match and every decomposition is appended.  Left as is so
                # the generated tables do not change.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber the NFC first/last characters densely and collect the runs
    # of consecutive code points as (start, end) ranges.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    # Flush the last open range; skip it when no pair was found at all so
    # the range loops below never see the None sentinel.
    if prev_f is not None:
        comp_first_ranges.append(prev_f)
    if prev_l is not None:
        comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    # The with-block closes the file even if a later step raises.
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
        print("/* a list of unique database records */", file=fp)
        print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
        for item in table:
            print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
        print("};", file=fp)
        print(file=fp)

        print("/* Reindexing of NFC first characters. */", file=fp)
        print("#define TOTAL_FIRST",total_first, file=fp)
        print("#define TOTAL_LAST",total_last, file=fp)
        print("struct reindex{int start;short count,index;};", file=fp)
        print("static struct reindex nfc_first[] = {", file=fp)
        for start,end in comp_first_ranges:
            print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
        print(" {0,0,0}", file=fp)
        print("};\n", file=fp)
        print("static struct reindex nfc_last[] = {", file=fp)
        for start,end in comp_last_ranges:
            print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
        print(" {0,0,0}", file=fp)
        print("};\n", file=fp)

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        print("/* string literals */", file=fp)
        string_tables = (
            ("const char *_PyUnicode_CategoryNames[] = {", CATEGORY_NAMES),
            ("const char *_PyUnicode_BidirectionalNames[] = {", BIDIRECTIONAL_NAMES),
            ("const char *_PyUnicode_EastAsianWidthNames[] = {", EASTASIANWIDTH_NAMES),
            ("static const char *decomp_prefix[] = {", decomp_prefix),
        )
        for declaration, names in string_tables:
            print(declaration, file=fp)
            for name in names:
                print(" \"%s\"," % name, file=fp)
            print(" NULL", file=fp)
            print("};", file=fp)

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        print("/* index tables for the database records */", file=fp)
        print("#define SHIFT", shift, file=fp)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        print("/* decomposition data */", file=fp)
        Array("decomp_data", decomp_data).dump(fp, trace)

        print("/* index tables for the decomposition data */", file=fp)
        print("#define DECOMP_SHIFT", shift, file=fp)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        # split NFC pair table
        comp_index1, comp_index2, shift = splitbins(comp_data, trace)
        print("/* NFC pairs */", file=fp)
        print("#define COMP_SHIFT", shift, file=fp)
        Array("comp_index", comp_index1).dump(fp, trace)
        Array("comp_data", comp_index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, changes, normalization in unicode.changed:
            cversion = version.replace(".","_")
            # Deduplicate change records; slot 0 is the first record.
            records = [changes[0]]
            seen = {changes[0]: 0}
            change_index = [0] * len(changes)
            for i, record in enumerate(changes):
                try:
                    change_index[i] = seen[record]
                except KeyError:
                    change_index[i] = seen[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(change_index, trace)
            print("static const change_record change_records_%s[] = {" % cversion, file=fp)
            for record in records:
                print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
            print("};", file=fp)
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
            print("{", file=fp)
            print("\tint index;", file=fp)
            print("\tif (n >= 0x110000) index = 0;", file=fp)
            print("\telse {", file=fp)
            print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
            print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
                  (cversion, shift, ((1<<shift)-1)), file=fp)
            print("\t}", file=fp)
            print("\treturn change_records_%s+index;" % cversion, file=fp)
            print("}\n", file=fp)
            print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
            print("{", file=fp)
            print("\tswitch(n) {", file=fp)
            for k, v in normalization:
                print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
            print("\tdefault: return 0;", file=fp)
            print("\t}\n}\n", file=fp)

# --------------------------------------------------------------------
# unicode character type tables

def _print_case_group(fp, codepoints, statement):
    """Write sorted C ``case`` labels for codepoints, then one statement.

    Labels for code points >= 0x10000 are wrapped in
    ``#ifdef Py_UNICODE_WIDE``.  When the group also has labels below
    0x10000 the ``#endif`` comes before *statement*, so the statement is
    shared; when every label is wide the ``#endif`` comes after it, so the
    statement is conditional too.
    """
    has_narrow = False
    has_wide = False
    for codepoint in sorted(codepoints):
        if codepoint < 0x10000:
            has_narrow = True
        elif not has_wide:
            print('#ifdef Py_UNICODE_WIDE', file=fp)
            has_wide = True
        print(' case 0x%04X:' % (codepoint,), file=fp)
    if has_wide and has_narrow:
        print('#endif', file=fp)
    print(statement, file=fp)
    if has_wide and not has_narrow:
        print('#endif', file=fp)


def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h from the loaded database.

    unicode -- database object; this function reads its ``chars``
               (code points, also used as list indices) and ``table``
               (per-code-point record or a false value) attributes.
    trace   -- forwarded to splitbins() and Array.dump().

    The generated header contains the unique type records, the split index
    tables, and the C functions _PyUnicode_ToNumeric(),
    _PyUnicode_IsWhitespace() and _PyUnicode_IsLinebreak().
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    # Record 0 is the all-zero default.  The cache maps record -> index, so
    # it has to be seeded in that direction; otherwise a character whose
    # type record is all zeros would get a duplicate table entry.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title if it fits
            upper = int(record[12], 16) if record[12] else char
            lower = int(record[13], 16) if record[13] else char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas, stored as 16-bit two's complement values
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                # at least one delta does not fit: keep absolute values
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")

    print("--- Writing", FILE, "...")

    # The with-block closes the file even if a later step raises.
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print("/* a list of unique character type descriptors */", file=fp)
        print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
        for item in table:
            print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
        print("};", file=fp)
        print(file=fp)

        # split type index table
        index1, index2, shift = splitbins(index, trace)

        print("/* type indexes */", file=fp)
        print("#define SHIFT", shift, file=fp)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        print('/* Returns the numeric value as double for Unicode characters', file=fp)
        print(' * having this property, -1.0 otherwise.', file=fp)
        print(' */', file=fp)
        print('double _PyUnicode_ToNumeric(Py_UNICODE ch)', file=fp)
        print('{', file=fp)
        print(' switch (ch) {', file=fp)
        for value, codepoints in sorted(numeric.items()):
            # Turn text into float literals ("1/2" becomes "1.0/2.0")
            value = '/'.join(repr(float(part)) for part in value.split('/'))
            _print_case_group(fp, codepoints,
                              ' return (double) %s;' % (value,))
        print(' }', file=fp)
        print(' return -1.0;', file=fp)
        print('}', file=fp)
        print(file=fp)

        # Generate code for _PyUnicode_IsWhitespace()
        print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
        print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
        print(" */", file=fp)
        print('int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)', file=fp)
        print('{', file=fp)
        print('#ifdef WANT_WCTYPE_FUNCTIONS', file=fp)
        print(' return iswspace(ch);', file=fp)
        print('#else', file=fp)
        print(' switch (ch) {', file=fp)

        _print_case_group(fp, spaces, ' return 1;')

        print(' }', file=fp)
        print(' return 0;', file=fp)
        print('#endif', file=fp)
        print('}', file=fp)
        print(file=fp)

        # Generate code for _PyUnicode_IsLinebreak()
        print("/* Returns 1 for Unicode characters having the category 'Zl',", file=fp)
        print(" * 'Zp' or type 'B', 0 otherwise.", file=fp)
        print(" */", file=fp)
        print('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)', file=fp)
        print('{', file=fp)
        print(' switch (ch) {', file=fp)

        _print_case_group(fp, linebreaks, ' return 1;')

        print(' }', file=fp)
        print(' return 0;', file=fp)
        print('}', file=fp)
        print(file=fp)

# --------------------------------------------------------------------
# unicode name database

573def makeunicodename(unicode, trace):
574
575 FILE = "Modules/unicodename_db.h"
576
Collin Winter6afaeb72007-08-03 17:06:41 +0000577 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000578
579 # collect names
580 names = [None] * len(unicode.chars)
581
582 for char in unicode.chars:
583 record = unicode.table[char]
584 if record:
585 name = record[1].strip()
586 if name and name[0] != "<":
587 names[char] = name + chr(0)
588
Georg Brandl559e5d72008-06-11 18:37:52 +0000589 print(len(list(n for n in names if n is not None)), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000590
591 # collect unique words from names (note that we differ between
592 # words inside a sentence, and words ending a sentence. the
593 # latter includes the trailing null byte.
594
595 words = {}
596 n = b = 0
597 for char in unicode.chars:
598 name = names[char]
599 if name:
600 w = name.split()
601 b = b + len(name)
602 n = n + len(w)
603 for w in w:
604 l = words.get(w)
605 if l:
606 l.append(None)
607 else:
608 words[w] = [len(words)]
609
Collin Winter6afaeb72007-08-03 17:06:41 +0000610 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000611
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000612 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000613
Martin v. Löwis97225da2002-11-24 23:05:09 +0000614 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000615 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000616 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000617 return -len(alist), aword
618 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000619
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000620 # figure out how many phrasebook escapes we need
621 escapes = 0
622 while escapes * 256 < len(wordlist):
623 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000624 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000625
626 short = 256 - escapes
627
628 assert short > 0
629
Collin Winter6afaeb72007-08-03 17:06:41 +0000630 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000631
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000632 # statistics
633 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000634 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000635 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000636 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000637
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000638 # pick the most commonly used words, and sort the rest on falling
639 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000640
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000641 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000642 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000643 wordlist.extend(wordtail)
644
645 # generate lexicon from words
646
647 lexicon_offset = [0]
648 lexicon = ""
649 words = {}
650
651 # build a lexicon string
652 offset = 0
653 for w, x in wordlist:
654 # encoding: bit 7 indicates last character in word (chr(128)
655 # indicates the last character in an entire string)
656 ww = w[:-1] + chr(ord(w[-1])+128)
657 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000658 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000659 if o < 0:
660 o = offset
661 lexicon = lexicon + ww
662 offset = offset + len(w)
663 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000664 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000665
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000666 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000667
668 # generate phrasebook from names and lexicon
669 phrasebook = [0]
670 phrasebook_offset = [0] * len(unicode.chars)
671 for char in unicode.chars:
672 name = names[char]
673 if name:
674 w = name.split()
675 phrasebook_offset[char] = len(phrasebook)
676 for w in w:
677 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000678 if i < short:
679 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000680 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000681 # store as two bytes
682 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000683 phrasebook.append(i&255)
684
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000685 assert getsize(phrasebook) == 1
686
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000687 #
688 # unicode name hash table
689
690 # extract names
691 data = []
692 for char in unicode.chars:
693 record = unicode.table[char]
694 if record:
695 name = record[1].strip()
696 if name and name[0] != "<":
697 data.append((name, char))
698
699 # the magic number 47 was chosen to minimize the number of
700 # collisions on the current data set. if you like, change it
701 # and see what happens...
702
703 codehash = Hash("code", data, 47)
704
Collin Winter6afaeb72007-08-03 17:06:41 +0000705 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000706
707 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000708 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
709 print(file=fp)
710 print("#define NAME_MAXLEN", 256, file=fp)
711 print(file=fp)
712 print("/* lexicon */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000713 Array("lexicon", lexicon).dump(fp, trace)
714 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000715
716 # split decomposition index table
717 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
718
Collin Winter6afaeb72007-08-03 17:06:41 +0000719 print("/* code->name phrasebook */", file=fp)
720 print("#define phrasebook_shift", shift, file=fp)
721 print("#define phrasebook_short", short, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000722
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000723 Array("phrasebook", phrasebook).dump(fp, trace)
724 Array("phrasebook_offset1", offset1).dump(fp, trace)
725 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000726
Collin Winter6afaeb72007-08-03 17:06:41 +0000727 print("/* name->code dictionary */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000728 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000729
730 fp.close()
731
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000732
def merge_old_version(version, new, old):
    """Record on `new` how the older database `old` differs from it.

    Appends one (version, change_records, normalization_changes) tuple to
    new.changed.  change_records holds one
    (bidir, category, decimal, mirrored, numeric) tuple per code point;
    normalization_changes is a list of (code point, old decomposition)
    pairs.

    Raises NotImplementedError if the two exclusion tables differ, and a
    local Difference exception if a field changed that has no handler.
    """

    class Difference(Exception):
        pass

    # changes to the exclusion file are not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    span = 0x110000

    # in these change records, 0xFF means "no change"
    bidir_changes = [0xFF] * span
    category_changes = [0xFF] * span
    decimal_changes = [0xFF] * span
    mirrored_changes = [0xFF] * span
    # in numeric data, 0 means "no change" and
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * span
    # list of (code point, old decomposition) pairs
    normalization_changes = []

    # fields whose changes are deliberately ignored: ISO comment (11),
    # simple upper/lower/titlecase mappings (12-14), derived props (16)
    ignored_fields = (11, 12, 13, 14, 16)

    for code in range(span):
        new_record = new.table[code]
        old_record = old.table[code]

        if new_record is None:
            # characters unassigned in the new version ought to be
            # unassigned in the old one as well
            assert old_record is None
            continue

        if old_record is None:
            # unassigned in the old version; category 0 is "unassigned"
            category_changes[code] = 0
            continue

        if old_record == new_record:
            continue

        for k, value in enumerate(old_record):
            if value == new_record[k]:
                continue

            if k in ignored_fields:
                continue

            if k == 2:
                category_changes[code] = CATEGORY_NAMES.index(value)
            elif k == 4:
                bidir_changes[code] = BIDIRECTIONAL_NAMES.index(value)
            elif k == 5:
                # we assume that all normalization changes are 1:1 mappings
                assert " " not in value
                normalization_changes.append((code, value))
            elif k == 6:
                # only changes where the old value is a single digit
                # are supported
                assert value in "0123456789"
                decimal_changes[code] = int(value)
            elif k == 8:
                # since 0 encodes "no change", the old value had better
                # not be 0 (or -1, which encodes "no numeric value")
                if value:
                    numeric_changes[code] = float(value)
                    assert numeric_changes[code] not in (0, -1)
                else:
                    numeric_changes[code] = -1
            elif k == 9:
                mirrored_changes[code] = '1' if value == 'Y' else '0'
            else:
                raise Difference(hex(code), k, old_record, new_record)

    change_records = list(zip(bidir_changes, category_changes,
                              decimal_changes, mirrored_changes,
                              numeric_changes))
    new.changed.append((version, change_records, normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000815
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000816
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000817# --------------------------------------------------------------------
818# the following support code is taken from the unidb utilities
819# Copyright (c) 1999-2000 by Secret Labs AB
820
821# load a unicode-data file from disk
822
class UnicodeData:
    # load a set of unicode database files from disk
    #
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)
    #
    # when derivednormalizationprops is given, one more field holding the
    # packed normalization quick-check bits is appended to every record.
    #
    # public attributes set by the constructor:
    #   filename   -- the main data file name, as passed in
    #   table      -- list of 0x110000 records (None for unassigned)
    #   chars      -- list of code points to process
    #   exclusions -- dict mapping excluded code points to 1
    #   changed    -- initially empty list (filled by merge_old_version)

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivedprops, derivednormalizationprops=None, expand=1):
        self.changed = []

        # every input file is read inside a "with" block so that it is
        # closed as soon as it has been consumed
        table = [None] * 0x110000
        with open(filename) as file:
            for s in file:
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    # inside a range: clone the "First" record
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = list(range(0x110000))  # every code point

        self.exclusions = {}
        with open(exclusions) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0], 16)
                self.exclusions[char] = 1

        # east asian width: appended as field 15
        widths = [None] * 0x110000
        with open(eastasianwidth) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                for char in self._code_range(s[0]):
                    widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # derived properties: a set of property names, appended as field 16
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        with open(derivedprops) as file:
            for s in file:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                for char in self._code_range(r):
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        # normalization quick checks: two bits per form (in qc_order),
        # 0 = Yes, 1 = Maybe, 2 = No; appended as field 17
        if derivednormalizationprops:
            quickchecks = [0] * 0x110000  # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            with open(derivednormalizationprops) as file:
                for s in file:
                    if '#' in s:
                        s = s[:s.index('#')]
                    s = [i.strip() for i in s.split(';')]
                    if len(s) < 2 or s[1] not in qc_order:
                        continue
                    quickcheck = 'MN'.index(s[2]) + 1  # Maybe or No
                    quickcheck_shift = qc_order.index(s[1])*2
                    quickcheck <<= quickcheck_shift
                    for char in self._code_range(s[0]):
                        # each form may be set only once per code point
                        assert not (quickchecks[char]>>quickcheck_shift)&3
                        quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        # numeric values from the Unihan file override field 8
        with open(unihan, encoding='utf-8') as file:
            for line in file:
                if not line.startswith('U+'):
                    continue
                code, tag, value = line.split(None, 3)[:3]
                if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                               'kOtherNumeric'):
                    continue
                value = value.strip().replace(',', '')
                i = int(code[2:], 16)
                # Patch the numeric field
                if table[i] is not None:
                    table[i][8] = value

    @staticmethod
    def _code_range(spec):
        # return the code points named by "XXXX" or "XXXX..YYYY" (hex,
        # both ends inclusive) as a range object
        if '..' in spec:
            first, last = [int(c, 16) for c in spec.split('..')]
        else:
            first = last = int(spec, 16)
        return range(first, last + 1)

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000956
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000957# hash table tools
958
959# this is a straight-forward reimplementation of Python's built-in
960# dictionary type, using a static data structure, and a custom string
961# hash algorithm.
962
def myhash(s, magic):
    """Return the case-insensitive string hash of s for multiplier magic.

    The string is upper-cased first.  After each character is mixed in,
    any bits above the low 24 are folded back into the low byte, so the
    result always fits in 24 bits.
    """
    acc = 0
    for ch in s.upper():
        acc = acc * magic + ord(ch)
        high = acc & 0xff000000
        if high:
            # xor bits 24..31 into the low byte, then drop everything
            # above bit 23
            acc = (acc ^ ((high >> 24) & 0xff)) & 0x00ffffff
    return acc
971
# (size, poly) pairs consumed by Hash: size is a power-of-two table
# size; Hash picks the first entry whose size is larger than the number
# of items and uses size + poly to fold the probe increment back into
# range when it overflows the table mask.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
978
class Hash:
    # static hash table built from a list of (key, value) pairs; keys are
    # hashed with myhash and only the values are stored in the slots

    def __init__(self, name, data, magic):
        # name:  prefix used for the generated C identifiers
        # data:  list of (key, value) pairs
        # magic: multiplier handed to myhash

        # pick the first table size that is larger than the data set
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynominals")

        print(size, "slots in hash table")

        slots = [None] * size
        mask = size - 1
        collisions = 0

        # fill the table
        for key, value in data:
            h = myhash(key, magic)
            slot = (~h) & mask
            if slots[slot] is None:
                slots[slot] = value
                continue

            # primary slot taken: probe with a key-dependent step
            step = (h ^ (h >> 3)) & mask
            if not step:
                step = mask
            while True:
                collisions += 1
                slot = (slot + step) & mask
                if slots[slot] is None:
                    slots[slot] = value
                    break
                step <<= 1
                if step > mask:
                    step ^= poly

        print(collisions, "collisions")
        self.collisions = collisions

        # unused slots are written out as 0
        filled = [0 if entry is None else entry for entry in slots]

        self.data = Array(name + "_hash", filled)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write the table as a C array, followed by the #defines that
        # describe it (magic, size, poly)
        self.data.dump(file, trace)
        prefix = self.name
        file.write("#define %s_magic %d\n" % (prefix, self.magic))
        file.write("#define %s_size %d\n" % (prefix, self.size))
        file.write("#define %s_poly %d\n" % (prefix, self.poly))
1042
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001043# stuff to deal with arrays of unsigned integers
1044
class Array:
    # a named sequence of unsigned integers that can be written out as a
    # static C array

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array; the element type is the
        # narrowest unsigned type that getsize() reports for the data
        width = getsize(self.data)
        if trace:
            print(self.name+":", width*len(self.data), "bytes", file=sys.stderr)

        ctypes = {1: "unsigned char", 2: "unsigned short"}
        file.write("static ")
        file.write(ctypes.get(width, "unsigned int"))
        file.write(" " + self.name + "[] = {\n")

        if self.data:
            # pack as many "item, " cells as fit before column 78
            line = " "
            for item in self.data:
                cell = str(item) + ", "
                if len(line) + len(cell) > 78:
                    file.write(line + "\n")
                    line = " " + cell
                else:
                    line += cell
            if line.strip():
                file.write(line + "\n")

        file.write("};\n\n")
1076
def getsize(data):
    """Return the smallest integer size in bytes (1, 2 or 4) for data.

    The size is chosen from the largest item in the given array.
    """
    largest = max(data)
    if largest >= 65536:
        return 4
    if largest >= 256:
        return 2
    return 1
1086
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  Splitting pays off when many of the ints
    are the same.  t1 and t2 are lists of ints and shift is an int,
    chosen so that the combined size of t1 and t2 (in C code) is
    minimal, and such that for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If the optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    def report(index, bins, shift, nbytes):
        print("%d+%d bins at shift %d; %d bytes" % (
            len(index), len(bins), shift, nbytes), file=sys.stderr)

    if trace:
        print("Size of original table:", len(t)*getsize(t),
              "bytes", file=sys.stderr)

    # the most we can shift the last valid index and still have
    # something left
    last_index = len(t) - 1
    maxshift = last_index.bit_length() - 1 if last_index > 0 else 0

    smallest = sys.maxsize  # smallest total size so far
    t = tuple(t)            # so slices can be dict keys
    for shift in range(maxshift + 1):
        index = []
        bins = []
        width = 2**shift
        seen = {}
        for start in range(0, len(t), width):
            chunk = t[start:start + width]
            pos = seen.get(chunk)
            if pos is None:
                # first time this chunk shows up: store it
                pos = len(bins)
                seen[chunk] = pos
                bins.extend(chunk)
            index.append(pos >> shift)
        # determine memory size
        cost = len(index)*getsize(index) + len(bins)*getsize(bins)
        if trace > 1:
            report(index, bins, shift, cost)
        if cost < smallest:
            best = index, bins, shift
            smallest = cost

    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        report(t1, t2, shift, smallest)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1  # low-bit mask of shift bits
        for i, expected in enumerate(t):
            assert expected == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001147
# script entry point: only runs when the file is executed directly.
# maketables is defined earlier in this file; the argument 1 is
# presumably its trace level -- confirm against its definition.
if __name__ == "__main__":
    maketables(1)