blob: 464a4ebf772234b395e1883aaaae1c25420e6fc1 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
28
Ezio Melotti931b8aa2011-10-21 21:57:36 +030029import os
30import sys
31import zipfile
32
Stefan Behnelfaa29482019-06-01 21:49:03 +020033from functools import partial
Greg Priceef2af1a2019-08-12 22:20:56 -070034from textwrap import dedent
Greg Pricec03e6982019-08-13 19:28:38 -070035from typing import Iterator, List, Tuple
Fredrik Lundhf367cac2000-09-24 23:18:31 +000036
SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "12.1.0"
# File name templates for the UCD data files; "%s" receives either ""
# (current version) or "-<version>" for an old UCD snapshot.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use this ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Old UCD versions for which delta ("change record") tables are generated.
old_versions = ["3.2.0"]

# General-category codes; list position defines the numeric code written
# into the generated tables.  NOTE(review): "Cn" appears both at index 0
# and index 17 -- confirm intentional before touching the ordering.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class codes; position defines the numeric code.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

# East Asian Width codes; position defines the numeric code.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# Bit flags stored in the generated type records.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# CJK unified-ideograph first/last code points (hex strings).
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FEF'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
]
110
Stefan Behnelfaa29482019-06-01 21:49:03 +0200111
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000112def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000113
Collin Winter6afaeb72007-08-03 17:06:41 +0000114 print("--- Reading", UNICODE_DATA % "", "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000115
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000116 unicode = UnicodeData(UNIDATA_VERSION)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000117
Georg Brandl559e5d72008-06-11 18:37:52 +0000118 print(len(list(filter(None, unicode.table))), "characters")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000119
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000120 for version in old_versions:
Collin Winter6afaeb72007-08-03 17:06:41 +0000121 print("--- Reading", UNICODE_DATA % ("-"+version), "...")
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000122 old_unicode = UnicodeData(version, cjk_check=False)
Georg Brandl559e5d72008-06-11 18:37:52 +0000123 print(len(list(filter(None, old_unicode.table))), "characters")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000124 merge_old_version(version, unicode, old_unicode)
125
Fredrik Lundhb2dfd732001-01-21 23:31:52 +0000126 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000127 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +0000128 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000129
Stefan Behnelfaa29482019-06-01 21:49:03 +0200130
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000131# --------------------------------------------------------------------
132# unicode character properties
133
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h.

    Emits the deduplicated database records, the decomposition data, the
    NFC composition (first/last) tables, and one change-record delta table
    per old UCD version found in ``unicode.changed``.

    unicode -- a UnicodeData instance (table indexed by code point)
    trace   -- passed through to splitbins()/Array.dump() for diagnostics
    """

    # One record per *unique* property tuple; index[] maps each code point
    # to its record.  Slot 0 is the dummy record for unassigned characters.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties (category/bidi/EAW are stored as
            # indexes into the corresponding *_NAMES lists)
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix: a leading "<...>" tag marks a compatibility mapping
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs prefix index (low byte) and
                # decomposition length (<<8); the rest are code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions, not excluded, with a non-combining first
                # character
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints while
                # decomp is a list, so .index(decomp) can never match and
                # the except branch always runs -- the sharing attempt
                # looks ineffective (output is still correct); confirm
                # before relying on deduplication here.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber NFC first/last characters densely and record the contiguous
    # code-point ranges they cover (for the reindex structs below).
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense 2-D composition table: (first, last) -> composed character.
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("    {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            # deduplicate change records exactly like the main table above
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")
382
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000383
384# --------------------------------------------------------------------
385# unicode character type tables
386
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Emits the deduplicated character-type records (case mappings, digit
    values, flag bits), the extended-case array, and the generated C
    functions _PyUnicode_ToNumeric, _PyUnicode_IsWhitespace, and
    _PyUnicode_IsLinebreak.

    unicode -- a UnicodeData instance
    trace   -- passed through to splitbins()/Array.dump()
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types; slot 0 is the dummy record for unassigned
    # characters
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}            # numeric value (string) -> [code points]
    spaces = []             # code points with SPACE_MASK
    linebreaks = []         # code points with LINEBREAK_MASK
    extra_casing = []       # flat array for multi-character case mappings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            # sc: (lower, title, upper) lists from SpecialCasing.txt;
            # cf: case-folding list, defaulting to the character itself
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # a case folding different from the lowercase mapping forces
            # the extended-case representation below
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                # simple case: store signed deltas from the code point
                # (0, 0, 0) when the character maps to itself
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    # deltas must fit in a signed 32-bit int
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                # packing: offset into extra_casing in the low bits,
                # mapping length in bits 24+, and (for lower) the case
                # folding length in bits 20-23 when it differs
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split decomposition index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals (fractions stay "num/denom")
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()
595
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000596
597# --------------------------------------------------------------------
598# unicode name database
599
600def makeunicodename(unicode, trace):
601
602 FILE = "Modules/unicodename_db.h"
603
Collin Winter6afaeb72007-08-03 17:06:41 +0000604 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000605
606 # collect names
607 names = [None] * len(unicode.chars)
608
609 for char in unicode.chars:
610 record = unicode.table[char]
611 if record:
612 name = record[1].strip()
613 if name and name[0] != "<":
614 names[char] = name + chr(0)
615
Jon Dufresne39726282017-05-18 07:35:54 -0700616 print(len([n for n in names if n is not None]), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000617
618 # collect unique words from names (note that we differ between
619 # words inside a sentence, and words ending a sentence. the
620 # latter includes the trailing null byte.
621
622 words = {}
623 n = b = 0
624 for char in unicode.chars:
625 name = names[char]
626 if name:
627 w = name.split()
628 b = b + len(name)
629 n = n + len(w)
630 for w in w:
631 l = words.get(w)
632 if l:
633 l.append(None)
634 else:
635 words[w] = [len(words)]
636
Collin Winter6afaeb72007-08-03 17:06:41 +0000637 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000638
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000639 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000640
Martin v. Löwis97225da2002-11-24 23:05:09 +0000641 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000642 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000643 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000644 return -len(alist), aword
645 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000646
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000647 # figure out how many phrasebook escapes we need
648 escapes = 0
649 while escapes * 256 < len(wordlist):
650 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000651 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000652
653 short = 256 - escapes
654
655 assert short > 0
656
Collin Winter6afaeb72007-08-03 17:06:41 +0000657 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000658
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000659 # statistics
660 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000661 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000662 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000663 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000664
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000665 # pick the most commonly used words, and sort the rest on falling
666 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000667
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000668 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000669 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000670 wordlist.extend(wordtail)
671
672 # generate lexicon from words
673
674 lexicon_offset = [0]
675 lexicon = ""
676 words = {}
677
678 # build a lexicon string
679 offset = 0
680 for w, x in wordlist:
681 # encoding: bit 7 indicates last character in word (chr(128)
682 # indicates the last character in an entire string)
683 ww = w[:-1] + chr(ord(w[-1])+128)
684 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000685 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000686 if o < 0:
687 o = offset
688 lexicon = lexicon + ww
689 offset = offset + len(w)
690 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000691 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000692
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000693 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000694
695 # generate phrasebook from names and lexicon
696 phrasebook = [0]
697 phrasebook_offset = [0] * len(unicode.chars)
698 for char in unicode.chars:
699 name = names[char]
700 if name:
701 w = name.split()
702 phrasebook_offset[char] = len(phrasebook)
703 for w in w:
704 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000705 if i < short:
706 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000707 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000708 # store as two bytes
709 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000710 phrasebook.append(i&255)
711
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000712 assert getsize(phrasebook) == 1
713
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000714 #
715 # unicode name hash table
716
717 # extract names
718 data = []
719 for char in unicode.chars:
720 record = unicode.table[char]
721 if record:
722 name = record[1].strip()
723 if name and name[0] != "<":
724 data.append((name, char))
725
726 # the magic number 47 was chosen to minimize the number of
727 # collisions on the current data set. if you like, change it
728 # and see what happens...
729
730 codehash = Hash("code", data, 47)
731
Collin Winter6afaeb72007-08-03 17:06:41 +0000732 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000733
Stefan Behnelfaa29482019-06-01 21:49:03 +0200734 with open(FILE, "w") as fp:
735 fprint = partial(print, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000736
Stefan Behnelfaa29482019-06-01 21:49:03 +0200737 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
738 fprint()
739 fprint("#define NAME_MAXLEN", 256)
740 fprint()
741 fprint("/* lexicon */")
742 Array("lexicon", lexicon).dump(fp, trace)
743 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000744
Stefan Behnelfaa29482019-06-01 21:49:03 +0200745 # split decomposition index table
746 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000747
Stefan Behnelfaa29482019-06-01 21:49:03 +0200748 fprint("/* code->name phrasebook */")
749 fprint("#define phrasebook_shift", shift)
750 fprint("#define phrasebook_short", short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000751
Stefan Behnelfaa29482019-06-01 21:49:03 +0200752 Array("phrasebook", phrasebook).dump(fp, trace)
753 Array("phrasebook_offset1", offset1).dump(fp, trace)
754 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000755
Stefan Behnelfaa29482019-06-01 21:49:03 +0200756 fprint("/* name->code dictionary */")
757 codehash.dump(fp, trace)
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300758
Stefan Behnelfaa29482019-06-01 21:49:03 +0200759 fprint()
760 fprint('static const unsigned int aliases_start = %#x;' %
761 NAME_ALIASES_START)
762 fprint('static const unsigned int aliases_end = %#x;' %
763 (NAME_ALIASES_START + len(unicode.aliases)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300764
Stefan Behnelfaa29482019-06-01 21:49:03 +0200765 fprint('static const unsigned int name_aliases[] = {')
766 for name, codepoint in unicode.aliases:
767 fprint(' 0x%04X,' % codepoint)
768 fprint('};')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300769
Stefan Behnelfaa29482019-06-01 21:49:03 +0200770 # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
771 # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
772 # sequences or sequences with non-BMP chars are added.
773 # unicodedata_lookup should be adapted too.
774 fprint(dedent("""
775 typedef struct NamedSequence {
776 int seqlen;
777 Py_UCS2 seq[4];
778 } named_sequence;
779 """))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300780
Stefan Behnelfaa29482019-06-01 21:49:03 +0200781 fprint('static const unsigned int named_sequences_start = %#x;' %
782 NAMED_SEQUENCES_START)
783 fprint('static const unsigned int named_sequences_end = %#x;' %
784 (NAMED_SEQUENCES_START + len(unicode.named_sequences)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300785
Stefan Behnelfaa29482019-06-01 21:49:03 +0200786 fprint('static const named_sequence named_sequences[] = {')
787 for name, sequence in unicode.named_sequences:
788 seq_str = ', '.join('0x%04X' % cp for cp in sequence)
789 fprint(' {%d, {%s}},' % (len(sequence), seq_str))
790 fprint('};')
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000791
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000792
def merge_old_version(version, new, old):
    """Record per-codepoint differences between two UCD snapshots.

    *new* and *old* are UnicodeData instances; *version* is the version
    string of *old*.  Appends a (version, zipped-delta-arrays,
    normalization_changes) tuple to new.changed, covering the bidi class,
    category, decimal digit, mirrored flag, east-asian width and numeric
    value fields.  Fields that cannot change (or whose changes are
    deliberately ignored) raise or pass silently, per the k-dispatch below.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            # k indexes the UnicodeData record fields; see the record
            # structure comment on the UnicodeData class
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        # any other field difference is a bug in this script
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000888
Stefan Behnelfaa29482019-06-01 21:49:03 +0200889
Greg Price3e4498d2019-08-14 18:18:53 -0700890DATA_DIR = os.path.join('Tools', 'unicode', 'data')
891
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000892def open_data(template, version):
Greg Price3e4498d2019-08-14 18:18:53 -0700893 local = os.path.join(DATA_DIR, template % ('-'+version,))
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000894 if not os.path.exists(local):
895 import urllib.request
896 if version == '3.2.0':
897 # irregular url structure
Greg Price3e4498d2019-08-14 18:18:53 -0700898 url = ('http://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000899 else:
900 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
Greg Price3e4498d2019-08-14 18:18:53 -0700901 os.makedirs(DATA_DIR, exist_ok=True)
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000902 urllib.request.urlretrieve(url, filename=local)
903 if local.endswith('.txt'):
904 return open(local, encoding='utf-8')
905 else:
906 # Unihan.zip
907 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000908
Stefan Behnelfaa29482019-06-01 21:49:03 +0200909
Greg Pricec03e6982019-08-13 19:28:38 -0700910def expand_range(char_range: str) -> Iterator[int]:
911 '''
912 Parses ranges of code points, as described in UAX #44:
913 https://www.unicode.org/reports/tr44/#Code_Point_Ranges
914 '''
915 if '..' in char_range:
916 first, last = [int(c, 16) for c in char_range.split('..')]
917 else:
918 first = last = int(char_range, 16)
919 for char in range(first, last+1):
920 yield char
921
922
Greg Priceef2af1a2019-08-12 22:20:56 -0700923class UcdFile:
924 '''
925 A file in the standard format of the UCD.
926
927 See: https://www.unicode.org/reports/tr44/#Format_Conventions
928
929 Note that, as described there, the Unihan data files have their
930 own separate format.
931 '''
932
933 def __init__(self, template: str, version: str) -> None:
934 self.template = template
935 self.version = version
936
937 def records(self) -> Iterator[List[str]]:
938 with open_data(self.template, self.version) as file:
939 for line in file:
940 line = line.split('#', 1)[0].strip()
941 if not line:
942 continue
943 yield [field.strip() for field in line.split(';')]
944
945 def __iter__(self) -> Iterator[List[str]]:
946 return self.records()
947
Greg Pricec03e6982019-08-13 19:28:38 -0700948 def expanded(self) -> Iterator[Tuple[int, List[str]]]:
949 for record in self.records():
950 char_range, rest = record[0], record[1:]
951 for char in expand_range(char_range):
952 yield char, rest
953
Greg Priceef2af1a2019-08-12 22:20:56 -0700954
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000955# --------------------------------------------------------------------
956# the following support code is taken from the unidb utilities
957# Copyright (c) 1999-2000 by Secret Labs AB
958
959# load a unicode-data file from disk
960
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000961class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000962 # Record structure:
963 # [ID, name, category, combining, bidi, decomp, (6)
964 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
965 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
966 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000967
Greg Price99d208e2019-08-12 22:59:30 -0700968 def __init__(self, version, cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000969 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000970 table = [None] * 0x110000
Greg Priceef2af1a2019-08-12 22:20:56 -0700971 for s in UcdFile(UNICODE_DATA, version):
972 char = int(s[0], 16)
973 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000974
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000975 cjk_ranges_found = []
976
Martin v. Löwis97225da2002-11-24 23:05:09 +0000977 # expand first-last ranges
Greg Price99d208e2019-08-12 22:59:30 -0700978 field = None
979 for i in range(0, 0x110000):
Greg Pricec03e6982019-08-13 19:28:38 -0700980 # The file UnicodeData.txt has its own distinct way of
981 # expressing ranges. See:
982 # https://www.unicode.org/reports/tr44/#Code_Point_Ranges
Greg Price99d208e2019-08-12 22:59:30 -0700983 s = table[i]
984 if s:
985 if s[1][-6:] == "First>":
986 s[1] = ""
987 field = s
988 elif s[1][-5:] == "Last>":
989 if s[1].startswith("<CJK Ideograph"):
990 cjk_ranges_found.append((field[0],
991 s[0]))
992 s[1] = ""
993 field = None
994 elif field:
995 f2 = field[:]
996 f2[0] = "%X" % i
997 table[i] = f2
998 if cjk_check and cjk_ranges != cjk_ranges_found:
999 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001000
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001001 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001002 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001003 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +00001004 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001005
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001006 # check for name aliases and named sequences, see #12753
1007 # aliases and named sequences are not in 3.2.0
1008 if version != '3.2.0':
1009 self.aliases = []
1010 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
1011 # in order to take advantage of the compression and lookup
1012 # algorithms used for the other characters
1013 pua_index = NAME_ALIASES_START
Greg Priceef2af1a2019-08-12 22:20:56 -07001014 for char, name, abbrev in UcdFile(NAME_ALIASES, version):
1015 char = int(char, 16)
1016 self.aliases.append((name, char))
1017 # also store the name in the PUA 1
1018 self.table[pua_index][1] = name
1019 pua_index += 1
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001020 assert pua_index - NAME_ALIASES_START == len(self.aliases)
1021
1022 self.named_sequences = []
Ezio Melotti7c4a7e62013-08-26 01:32:56 +03001023 # store named sequences in the PUA 1, in range U+F0100..,
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001024 # in order to take advantage of the compression and lookup
1025 # algorithms used for the other characters.
1026
Benjamin Peterson71f660e2012-02-20 22:24:29 -05001027 assert pua_index < NAMED_SEQUENCES_START
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001028 pua_index = NAMED_SEQUENCES_START
Greg Priceef2af1a2019-08-12 22:20:56 -07001029 for name, chars in UcdFile(NAMED_SEQUENCES, version):
1030 chars = tuple(int(char, 16) for char in chars.split())
1031 # check that the structure defined in makeunicodename is OK
1032 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
1033 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
1034 "the NamedSequence struct and in unicodedata_lookup")
1035 self.named_sequences.append((name, chars))
1036 # also store these in the PUA 1
1037 self.table[pua_index][1] = name
1038 pua_index += 1
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001039 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
1040
Martin v. Löwis677bde22002-11-23 22:08:15 +00001041 self.exclusions = {}
Greg Priceef2af1a2019-08-12 22:20:56 -07001042 for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
1043 char = int(char, 16)
1044 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +00001045
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001046 widths = [None] * 0x110000
Greg Pricec03e6982019-08-13 19:28:38 -07001047 for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
1048 widths[char] = width
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001049
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001050 for i in range(0, 0x110000):
1051 if table[i] is not None:
1052 table[i].append(widths[i])
1053
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001054 for i in range(0, 0x110000):
1055 if table[i] is not None:
1056 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001057
Greg Pricec03e6982019-08-13 19:28:38 -07001058 for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
1059 if table[char]:
1060 # Some properties (e.g. Default_Ignorable_Code_Point)
1061 # apply to unassigned code points; ignore them
1062 table[char][-1].add(p)
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001063
Greg Pricec03e6982019-08-13 19:28:38 -07001064 for char_range, value in UcdFile(LINE_BREAK, version):
1065 if value not in MANDATORY_LINE_BREAKS:
Greg Priceef2af1a2019-08-12 22:20:56 -07001066 continue
Greg Pricec03e6982019-08-13 19:28:38 -07001067 for char in expand_range(char_range):
Greg Priceef2af1a2019-08-12 22:20:56 -07001068 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001069
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001070 # We only want the quickcheck properties
1071 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1072 # Yes is the default, hence only N and M occur
1073 # In 3.2.0, the format was different (NF?_NO)
1074 # The parsing will incorrectly determine these as
1075 # "yes", however, unicodedata.c will not perform quickchecks
1076 # for older versions, and no delta records will be created.
1077 quickchecks = [0] * 0x110000
1078 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Greg Priceef2af1a2019-08-12 22:20:56 -07001079 for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
1080 if len(s) < 2 or s[1] not in qc_order:
1081 continue
1082 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1083 quickcheck_shift = qc_order.index(s[1])*2
1084 quickcheck <<= quickcheck_shift
Greg Pricec03e6982019-08-13 19:28:38 -07001085 for char in expand_range(s[0]):
Greg Priceef2af1a2019-08-12 22:20:56 -07001086 assert not (quickchecks[char]>>quickcheck_shift)&3
1087 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001088 for i in range(0, 0x110000):
1089 if table[i] is not None:
1090 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001091
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001092 with open_data(UNIHAN, version) as file:
1093 zip = zipfile.ZipFile(file)
1094 if version == '3.2.0':
1095 data = zip.open('Unihan-3.2.0.txt').read()
1096 else:
1097 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001098 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001099 if not line.startswith('U+'):
1100 continue
1101 code, tag, value = line.split(None, 3)[:3]
1102 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1103 'kOtherNumeric'):
1104 continue
1105 value = value.strip().replace(',', '')
1106 i = int(code[2:], 16)
1107 # Patch the numeric field
1108 if table[i] is not None:
1109 table[i][8] = value
Greg Priceef2af1a2019-08-12 22:20:56 -07001110
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05001111 sc = self.special_casing = {}
Greg Priceef2af1a2019-08-12 22:20:56 -07001112 for data in UcdFile(SPECIAL_CASING, version):
1113 if data[4]:
1114 # We ignore all conditionals (since they depend on
1115 # languages) except for one, which is hardcoded. See
1116 # handle_capital_sigma in unicodeobject.c.
1117 continue
1118 c = int(data[0], 16)
1119 lower = [int(char, 16) for char in data[1].split()]
1120 title = [int(char, 16) for char in data[2].split()]
1121 upper = [int(char, 16) for char in data[3].split()]
1122 sc[c] = (lower, title, upper)
1123
Benjamin Petersond5890c82012-01-14 13:23:30 -05001124 cf = self.case_folding = {}
1125 if version != '3.2.0':
Greg Priceef2af1a2019-08-12 22:20:56 -07001126 for data in UcdFile(CASE_FOLDING, version):
1127 if data[1] in "CF":
1128 c = int(data[0], 16)
1129 cf[c] = [int(char, 16) for char in data[2].split()]
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001130
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001131 def uselatin1(self):
1132 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001133 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001134
Stefan Behnelfaa29482019-06-01 21:49:03 +02001135
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001136# hash table tools
1137
1138# this is a straight-forward reimplementation of Python's built-in
1139# dictionary type, using a static data structure, and a custom string
1140# hash algorithm.
1141
1142def myhash(s, magic):
1143 h = 0
Walter Dörwaldaaab30e2002-09-11 20:36:02 +00001144 for c in map(ord, s.upper()):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001145 h = (h * magic) + c
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001146 ix = h & 0xff000000
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001147 if ix:
1148 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
1149 return h
1150
Stefan Behnelfaa29482019-06-01 21:49:03 +02001151
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001152SIZES = [
1153 (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
1154 (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
1155 (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
1156 (2097152,5), (4194304,3), (8388608,33), (16777216,27)
1157]
1158
Stefan Behnelfaa29482019-06-01 21:49:03 +02001159
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001160class Hash:
1161 def __init__(self, name, data, magic):
1162 # turn a (key, value) list into a static hash table structure
1163
1164 # determine table size
1165 for size, poly in SIZES:
1166 if size > len(data):
1167 poly = size + poly
1168 break
1169 else:
Ezio Melotti13925002011-03-16 11:05:33 +02001170 raise AssertionError("ran out of polynomials")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001171
Collin Winter6afaeb72007-08-03 17:06:41 +00001172 print(size, "slots in hash table")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001173
1174 table = [None] * size
1175
1176 mask = size-1
1177
1178 n = 0
1179
1180 hash = myhash
1181
1182 # initialize hash table
1183 for key, value in data:
1184 h = hash(key, magic)
1185 i = (~h) & mask
1186 v = table[i]
1187 if v is None:
1188 table[i] = value
1189 continue
Stefan Behnelfaa29482019-06-01 21:49:03 +02001190 incr = (h ^ (h >> 3)) & mask
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001191 if not incr:
1192 incr = mask
1193 while 1:
1194 n = n + 1
1195 i = (i + incr) & mask
1196 v = table[i]
1197 if v is None:
1198 table[i] = value
1199 break
1200 incr = incr << 1
1201 if incr > mask:
1202 incr = incr ^ poly
1203
Collin Winter6afaeb72007-08-03 17:06:41 +00001204 print(n, "collisions")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001205 self.collisions = n
1206
1207 for i in range(len(table)):
1208 if table[i] is None:
1209 table[i] = 0
1210
1211 self.data = Array(name + "_hash", table)
1212 self.magic = magic
1213 self.name = name
1214 self.size = size
1215 self.poly = poly
1216
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001217 def dump(self, file, trace):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001218 # write data to file, as a C array
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001219 self.data.dump(file, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001220 file.write("#define %s_magic %d\n" % (self.name, self.magic))
1221 file.write("#define %s_size %d\n" % (self.name, self.size))
1222 file.write("#define %s_poly %d\n" % (self.name, self.poly))
1223
Stefan Behnelfaa29482019-06-01 21:49:03 +02001224
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001225# stuff to deal with arrays of unsigned integers
1226
class Array:
    """A named list of unsigned integers, dumpable as a static C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as a C array declaration.

        The element type (unsigned char/short/int) is the smallest that
        holds every value.  If *trace* is true, report the total byte
        size on stderr.
        """
        # Fix: getsize() takes max(data) and would raise ValueError on an
        # empty array; the `if self.data:` guard below shows empty arrays
        # are anticipated, so default to the 1-byte element type.
        size = getsize(self.data) if self.data else 1
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static const ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                # wrap lines at 78 columns
                if len(s) + len(i) > 78:
                    file.write(s.rstrip() + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s.rstrip() + "\n")
        file.write("};\n\n")
1258
Stefan Behnelfaa29482019-06-01 21:49:03 +02001259
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001260def getsize(data):
1261 # return smallest possible integer size for the given array
1262 maxdata = max(data)
1263 if maxdata < 256:
1264 return 1
1265 elif maxdata < 65536:
1266 return 2
1267 else:
1268 return 4
1269
Stefan Behnelfaa29482019-06-01 21:49:03 +02001270
Tim Peters21013482000-09-25 07:13:41 +00001271def splitbins(t, trace=0):
1272 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1273
1274 t is a sequence of ints. This function can be useful to save space if
1275 many of the ints are the same. t1 and t2 are lists of ints, and shift
1276 is an int, chosen to minimize the combined size of t1 and t2 (in C
1277 code), and where for each i in range(len(t)),
1278 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1279 where mask is a bitmask isolating the last "shift" bits.
1280
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001281 If optional arg trace is non-zero (default zero), progress info
1282 is printed to sys.stderr. The higher the value, the more info
1283 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001284 """
1285
Tim Peters21013482000-09-25 07:13:41 +00001286 if trace:
1287 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001288 print("%d+%d bins at shift %d; %d bytes" % (
1289 len(t1), len(t2), shift, bytes), file=sys.stderr)
Stefan Behnelfaa29482019-06-01 21:49:03 +02001290 print("Size of original table:", len(t)*getsize(t), "bytes",
1291 file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001292 n = len(t)-1 # last valid index
1293 maxshift = 0 # the most we can shift n and still have something left
1294 if n > 0:
1295 while n >> 1:
1296 n >>= 1
1297 maxshift += 1
1298 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001299 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001300 t = tuple(t) # so slices can be dict keys
1301 for shift in range(maxshift + 1):
1302 t1 = []
1303 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001304 size = 2**shift
1305 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001306 for i in range(0, len(t), size):
1307 bin = t[i:i+size]
1308 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001309 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001310 index = len(t2)
1311 bincache[bin] = index
1312 t2.extend(bin)
1313 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001314 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001315 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001316 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001317 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001318 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001319 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001320 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001321 t1, t2, shift = best
1322 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001323 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001324 dump(t1, t2, shift, bytes)
1325 if __debug__:
1326 # exhaustively verify that the decomposition is correct
1327 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001328 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001329 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1330 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001331
Stefan Behnelfaa29482019-06-01 21:49:03 +02001332
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001333if __name__ == "__main__":
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001334 maketables(1)