blob: cc2b2981ef5b76d0c01870b471fd399a54147f60 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
28
import os
import sys
import zipfile

from functools import partial
from textwrap import dedent
from typing import Iterator, List, Tuple

SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst, and
#   * Doc/library/unicodedata.rst
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "12.1.0"

# Input file name templates; %s is filled with "" for the current
# UNIDATA_VERSION or with "-<version>" for the old versions below.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FEF'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
]
110
Stefan Behnelfaa29482019-06-01 21:49:03 +0200111
def maketables(trace=0):
    """Read the Unicode database files and generate all three C headers.

    Loads the current UNIDATA_VERSION database, merges in the deltas for
    each entry of ``old_versions``, then writes the name, property, and
    type tables via the three ``makeunicode*`` helpers.
    """

    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    # Fold each supported old UCD version into the current database so
    # that version-dependent deltas can be emitted later.
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000129
Stefan Behnelfaa29482019-06-01 21:49:03 +0200130
# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h: database records, NFC composition
    tables, decomposition data, and change/normalization deltas for old
    UCD versions.

    *unicode* is a UnicodeData instance; *trace* is forwarded to
    splitbins()/Array.dump() for diagnostics.
    """

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs the prefix index (low byte) and
                # the decomposition length (high byte)
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber NFC first/last characters densely and record the
    # contiguous code-point ranges they occupy.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense (first x last) -> composed-character matrix.
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("    {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")
382
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000383
# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h: per-character type records
    (case mappings, digit/decimal values, flag bits), the extended case
    mapping array, and the generated C helpers _PyUnicode_ToNumeric(),
    _PyUnicode_IsWhitespace() and _PyUnicode_IsLinebreak().

    *unicode* is a UnicodeData instance; *trace* is forwarded to
    splitbins()/Array.dump() for diagnostics.
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    # store simple case mappings as deltas from the char
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split decomposition index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000596
597# --------------------------------------------------------------------
598# unicode name database
599
600def makeunicodename(unicode, trace):
601
602 FILE = "Modules/unicodename_db.h"
603
Collin Winter6afaeb72007-08-03 17:06:41 +0000604 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000605
606 # collect names
607 names = [None] * len(unicode.chars)
608
609 for char in unicode.chars:
610 record = unicode.table[char]
611 if record:
612 name = record[1].strip()
613 if name and name[0] != "<":
614 names[char] = name + chr(0)
615
Jon Dufresne39726282017-05-18 07:35:54 -0700616 print(len([n for n in names if n is not None]), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000617
618 # collect unique words from names (note that we differ between
619 # words inside a sentence, and words ending a sentence. the
620 # latter includes the trailing null byte.
621
622 words = {}
623 n = b = 0
624 for char in unicode.chars:
625 name = names[char]
626 if name:
627 w = name.split()
628 b = b + len(name)
629 n = n + len(w)
630 for w in w:
631 l = words.get(w)
632 if l:
633 l.append(None)
634 else:
635 words[w] = [len(words)]
636
Collin Winter6afaeb72007-08-03 17:06:41 +0000637 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000638
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000639 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000640
Martin v. Löwis97225da2002-11-24 23:05:09 +0000641 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000642 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000643 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000644 return -len(alist), aword
645 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000646
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000647 # figure out how many phrasebook escapes we need
648 escapes = 0
649 while escapes * 256 < len(wordlist):
650 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000651 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000652
653 short = 256 - escapes
654
655 assert short > 0
656
Collin Winter6afaeb72007-08-03 17:06:41 +0000657 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000658
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000659 # statistics
660 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000661 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000662 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000663 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000664
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000665 # pick the most commonly used words, and sort the rest on falling
666 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000667
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000668 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000669 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000670 wordlist.extend(wordtail)
671
672 # generate lexicon from words
673
674 lexicon_offset = [0]
675 lexicon = ""
676 words = {}
677
678 # build a lexicon string
679 offset = 0
680 for w, x in wordlist:
681 # encoding: bit 7 indicates last character in word (chr(128)
682 # indicates the last character in an entire string)
683 ww = w[:-1] + chr(ord(w[-1])+128)
684 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000685 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000686 if o < 0:
687 o = offset
688 lexicon = lexicon + ww
689 offset = offset + len(w)
690 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000691 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000692
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000693 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000694
695 # generate phrasebook from names and lexicon
696 phrasebook = [0]
697 phrasebook_offset = [0] * len(unicode.chars)
698 for char in unicode.chars:
699 name = names[char]
700 if name:
701 w = name.split()
702 phrasebook_offset[char] = len(phrasebook)
703 for w in w:
704 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000705 if i < short:
706 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000707 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000708 # store as two bytes
709 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000710 phrasebook.append(i&255)
711
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000712 assert getsize(phrasebook) == 1
713
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000714 #
715 # unicode name hash table
716
717 # extract names
718 data = []
719 for char in unicode.chars:
720 record = unicode.table[char]
721 if record:
722 name = record[1].strip()
723 if name and name[0] != "<":
724 data.append((name, char))
725
726 # the magic number 47 was chosen to minimize the number of
727 # collisions on the current data set. if you like, change it
728 # and see what happens...
729
730 codehash = Hash("code", data, 47)
731
Collin Winter6afaeb72007-08-03 17:06:41 +0000732 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000733
Stefan Behnelfaa29482019-06-01 21:49:03 +0200734 with open(FILE, "w") as fp:
735 fprint = partial(print, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000736
Stefan Behnelfaa29482019-06-01 21:49:03 +0200737 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
738 fprint()
739 fprint("#define NAME_MAXLEN", 256)
740 fprint()
741 fprint("/* lexicon */")
742 Array("lexicon", lexicon).dump(fp, trace)
743 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000744
Stefan Behnelfaa29482019-06-01 21:49:03 +0200745 # split decomposition index table
746 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000747
Stefan Behnelfaa29482019-06-01 21:49:03 +0200748 fprint("/* code->name phrasebook */")
749 fprint("#define phrasebook_shift", shift)
750 fprint("#define phrasebook_short", short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000751
Stefan Behnelfaa29482019-06-01 21:49:03 +0200752 Array("phrasebook", phrasebook).dump(fp, trace)
753 Array("phrasebook_offset1", offset1).dump(fp, trace)
754 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000755
Stefan Behnelfaa29482019-06-01 21:49:03 +0200756 fprint("/* name->code dictionary */")
757 codehash.dump(fp, trace)
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300758
Stefan Behnelfaa29482019-06-01 21:49:03 +0200759 fprint()
760 fprint('static const unsigned int aliases_start = %#x;' %
761 NAME_ALIASES_START)
762 fprint('static const unsigned int aliases_end = %#x;' %
763 (NAME_ALIASES_START + len(unicode.aliases)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300764
Stefan Behnelfaa29482019-06-01 21:49:03 +0200765 fprint('static const unsigned int name_aliases[] = {')
766 for name, codepoint in unicode.aliases:
767 fprint(' 0x%04X,' % codepoint)
768 fprint('};')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300769
Stefan Behnelfaa29482019-06-01 21:49:03 +0200770 # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
771 # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
772 # sequences or sequences with non-BMP chars are added.
773 # unicodedata_lookup should be adapted too.
774 fprint(dedent("""
775 typedef struct NamedSequence {
776 int seqlen;
777 Py_UCS2 seq[4];
778 } named_sequence;
779 """))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300780
Stefan Behnelfaa29482019-06-01 21:49:03 +0200781 fprint('static const unsigned int named_sequences_start = %#x;' %
782 NAMED_SEQUENCES_START)
783 fprint('static const unsigned int named_sequences_end = %#x;' %
784 (NAMED_SEQUENCES_START + len(unicode.named_sequences)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300785
Stefan Behnelfaa29482019-06-01 21:49:03 +0200786 fprint('static const named_sequence named_sequences[] = {')
787 for name, sequence in unicode.named_sequences:
788 seq_str = ', '.join('0x%04X' % cp for cp in sequence)
789 fprint(' {%d, {%s}},' % (len(sequence), seq_str))
790 fprint('};')
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000791
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000792
def merge_old_version(version, new, old):
    """Compute the delta between two UnicodeData snapshots.

    Compares every code point of *old* (an earlier UCD release) against
    *new* and appends a change record for *version* to ``new.changed``:
    per-character delta arrays for bidi class, category, decimal digit,
    mirrored flag, east-asian width and numeric value, plus a list of
    (char, decomposition) normalization changes.

    Raises NotImplementedError if the composition exclusions differ
    (handling that is not implemented).
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of (char, decomposition) pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ; the field index k follows the
        # record layout documented on the UnicodeData class
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        # general category
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        # bidirectional class
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # decomposition mapping
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # decimal digit value
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # numeric value
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag; stored as character '0'/'1'
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000888
Stefan Behnelfaa29482019-06-01 21:49:03 +0200889
def open_data(template, version):
    """Open a UCD data file for *version*, downloading it first if the
    local cached copy does not exist.

    *template* is a filename pattern with one %s slot for the version
    suffix.  Text files are returned as UTF-8 text streams; anything
    else (the Unihan zip archive) is opened in binary mode.
    """
    filename = template % ('-' + version,)
    if not os.path.exists(filename):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = 'http://www.unicode.org/Public/3.2-Update/' + filename
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        urllib.request.urlretrieve(url, filename=filename)
    if filename.endswith('.txt'):
        return open(filename, encoding='utf-8')
    # Unihan.zip
    return open(filename, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000905
Stefan Behnelfaa29482019-06-01 21:49:03 +0200906
def expand_range(char_range: str) -> Iterator[int]:
    '''
    Parses ranges of code points, as described in UAX #44:
    https://www.unicode.org/reports/tr44/#Code_Point_Ranges

    Yields each code point in the range as an int; a bare hex value
    yields just that single code point.
    '''
    first_hex, sep, last_hex = char_range.partition('..')
    first = int(first_hex, 16)
    last = int(last_hex, 16) if sep else first
    yield from range(first, last + 1)
918
919
class UcdFile:
    '''
    A file in the standard format of the UCD.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    Note that, as described there, the Unihan data files have their
    own separate format.
    '''

    def __init__(self, template: str, version: str) -> None:
        # template/version are exactly what open_data needs to locate
        # (and, if necessary, download) the file
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        '''Yield one list of semicolon-separated fields per data line,
        with comments and surrounding whitespace stripped.'''
        with open_data(self.template, self.version) as file:
            for raw_line in file:
                payload = raw_line.split('#', 1)[0].strip()
                if payload:
                    yield [field.strip() for field in payload.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        return self.records()

    def expanded(self) -> Iterator[Tuple[int, List[str]]]:
        '''Like records(), but the leading code-point-range field is
        expanded into one (code point, remaining fields) pair each.'''
        for record in self.records():
            rest = record[1:]
            for char in expand_range(record[0]):
                yield char, rest
950
Greg Priceef2af1a2019-08-12 22:20:56 -0700951
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000952# --------------------------------------------------------------------
953# the following support code is taken from the unidb utilities
954# Copyright (c) 1999-2000 by Secret Labs AB
955
956# load a unicode-data file from disk
957
class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, version, cjk_check=True):
        """Load and merge all UCD data files for *version*.

        Builds self.table: one record (list of fields, see the record
        structure above) per code point, None for unassigned ones.
        Extra fields are appended below: east-asian width, the set of
        derived properties (plus 'Line_Break'), and the packed
        normalization quickcheck bits.  Also populates name aliases,
        named sequences, composition exclusions, special casing and
        case folding.  When *cjk_check* is true, the CJK ideograph
        ranges found in the data must equal the module-level
        ``cjk_ranges`` list.
        """
        self.changed = []
        table = [None] * 0x110000
        for s in UcdFile(UNICODE_DATA, version):
            char = int(s[0], 16)
            table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        field = None
        for i in range(0, 0x110000):
            # The file UnicodeData.txt has its own distinct way of
            # expressing ranges.  See:
            #   https://www.unicode.org/reports/tr44/#Code_Point_Ranges
            s = table[i]
            if s:
                if s[1][-6:] == "First>":
                    s[1] = ""
                    field = s
                elif s[1][-5:] == "Last>":
                    if s[1].startswith("<CJK Ideograph"):
                        cjk_ranges_found.append((field[0],
                                                 s[0]))
                    s[1] = ""
                    field = None
            elif field:
                # inside a First..Last range: clone the First record
                f2 = field[:]
                f2[0] = "%X" % i
                table[i] = f2
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            # note: the abbreviation field of NameAliases.txt is unused here
            for char, name, abbrev in UcdFile(NAME_ALIASES, version):
                char = int(char, 16)
                self.aliases.append((name, char))
                # also store the name in the PUA 1
                self.table[pua_index][1] = name
                pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 1, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            for name, chars in UcdFile(NAMED_SEQUENCES, version):
                chars = tuple(int(char, 16) for char in chars.split())
                # check that the structure defined in makeunicodename is OK
                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                    "the NamedSequence struct and in unicodedata_lookup")
                self.named_sequences.append((name, chars))
                # also store these in the PUA 1
                self.table[pua_index][1] = name
                pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        # code points excluded from composition (used as a set)
        self.exclusions = {}
        for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
            char = int(char, 16)
            self.exclusions[char] = 1

        # append the east-asian width as a new record field
        widths = [None] * 0x110000
        for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
            widths[char] = width

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # append a set field that collects derived properties
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
            if table[char]:
                # Some properties (e.g. Default_Ignorable_Code_Point)
                # apply to unassigned code points; ignore them
                table[char][-1].add(p)

        # mandatory line breaks are recorded as a pseudo derived property
        for char_range, value in UcdFile(LINE_BREAK, version):
            if value not in MANDATORY_LINE_BREAKS:
                continue
            for char in expand_range(char_range):
                table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
            if len(s) < 2 or s[1] not in qc_order:
                continue
            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
            quickcheck_shift = qc_order.index(s[1])*2
            quickcheck <<= quickcheck_shift
            for char in expand_range(s[0]):
                # each property may be set at most once per code point
                assert not (quickchecks[char]>>quickcheck_shift)&3
                quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        # Unihan numeric values override the numeric field (index 8)
        with open_data(UNIHAN, version) as file:
            # NOTE(review): local name shadows the zip() builtin
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

        # full case mappings: char -> ([lower], [title], [upper])
        sc = self.special_casing = {}
        for data in UcdFile(SPECIAL_CASING, version):
            if data[4]:
                # We ignore all conditionals (since they depend on
                # languages) except for one, which is hardcoded. See
                # handle_capital_sigma in unicodeobject.c.
                continue
            c = int(data[0], 16)
            lower = [int(char, 16) for char in data[1].split()]
            title = [int(char, 16) for char in data[2].split()]
            upper = [int(char, 16) for char in data[3].split()]
            sc[c] = (lower, title, upper)

        # case folding: only Common and Full (C/F) foldings are kept
        cf = self.case_folding = {}
        if version != '3.2.0':
            for data in UcdFile(CASE_FOLDING, version):
                if data[1] in "CF":
                    c = int(data[0], 16)
                    cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001131
Stefan Behnelfaa29482019-06-01 21:49:03 +02001132
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001133# hash table tools
1134
1135# this is a straight-forward reimplementation of Python's built-in
1136# dictionary type, using a static data structure, and a custom string
1137# hash algorithm.
1138
def myhash(s, magic):
    """Case-insensitive 24-bit multiplicative string hash.

    Mirrors the hash used by the generated C lookup code: each
    character of the uppercased string is folded in with multiplier
    *magic*, and any bits overflowing into byte 3 are xor-folded back
    into the low 24 bits.
    """
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        overflow = h & 0xff000000
        if overflow:
            # fold the overflowing byte back into the 24-bit value
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
1147
Stefan Behnelfaa29482019-06-01 21:49:03 +02001148
# candidate hash-table sizes: (size, poly) pairs, where size is a power
# of two and poly is a constant from which Hash derives its probing
# polynomial (Hash picks the first size larger than the data set and
# uses size + poly as the polynomial).
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1155
Stefan Behnelfaa29482019-06-01 21:49:03 +02001156
class Hash:
    """Static open-addressing hash table, dumpable as C arrays.

    A static reimplementation of a dict: keys are hashed with
    myhash(key, magic) and collisions are resolved by probing with an
    increment that is permuted by the polynomial associated with the
    chosen table size (see SIZES).  Only the values are stored; the
    generated C code re-verifies keys on lookup.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the data
        # set, together with its probing polynomial
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot occupied: derive a probe increment from the hash
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                # vary the increment; reduce by the polynomial when it
                # overflows the table size
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots are encoded as 0 in the generated array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array plus the constants the C
        # lookup code needs to reproduce the probing sequence
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1220
Stefan Behnelfaa29482019-06-01 21:49:03 +02001221
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001222# stuff to deal with arrays of unsigned integers
1223
class Array:
    """A named sequence of unsigned integers, dumpable as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the data to *file* as a static const C array, choosing
        the smallest unsigned element type that fits and wrapping lines
        at 78 columns."""
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static const ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for item in self.data:
                text = str(item) + ", "
                if len(line) + len(text) > 78:
                    # flush the full line and start a new indented one
                    file.write(line.rstrip() + "\n")
                    line = "    " + text
                else:
                    line = line + text
            if line.strip():
                file.write(line.rstrip() + "\n")
        file.write("};\n\n")
1255
Stefan Behnelfaa29482019-06-01 21:49:03 +02001256
def getsize(data):
    """Return the smallest C integer size in bytes (1, 2 or 4) that can
    hold every value in *data*.

    An empty sequence packs into single bytes; previously ``max(data)``
    raised ValueError here, which crashed Array.dump for empty arrays
    (it calls getsize before its own emptiness check).
    """
    maxdata = max(data, default=0)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
1266
Stefan Behnelfaa29482019-06-01 21:49:03 +02001267
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)

    # the largest shift that still leaves at least one first-level bin:
    # floor(log2(last valid index)), or 0 for tiny tables
    maxshift = max((len(t) - 1).bit_length() - 1, 0)

    best = None
    bytes = sys.maxsize         # smallest total size so far
    t = tuple(t)                # so slices can be dict keys
    for shift in range(maxshift + 1):
        binsize = 2 ** shift
        index = []              # first-level table (t1)
        values = []             # second-level table (t2)
        bincache = {}           # bin contents -> offset in values
        for start in range(0, len(t), binsize):
            chunk = t[start:start + binsize]
            offset = bincache.get(chunk)
            if offset is None:
                # new bin: append it and remember where it went
                offset = len(values)
                bincache[chunk] = offset
                values.extend(chunk)
            index.append(offset >> shift)
        # determine memory size for this split
        b = len(index)*getsize(index) + len(values)*getsize(values)
        if trace > 1:
            dump(index, values, shift, b)
        if b < bytes:
            best = index, values, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001328
Stefan Behnelfaa29482019-06-01 21:49:03 +02001329
if __name__ == "__main__":
    # regenerate all database headers; the argument is presumably the
    # trace flag (maketables is defined earlier in this file) -- confirm
    maketables(1)