blob: 38c1f19a65671aa60f20233936163c115bd76fcf [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Benjamin Peterson7c69c1c2018-06-06 20:14:28 -07004# This script converts Unicode database files to Modules/unicodedata_db.h,
5# Modules/unicodename_db.h, and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00006#
7# history:
8# 2000-09-24 fl created (based on bits and pieces from unidb)
9# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000010# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000011# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000012# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000013# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000014# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000015# 2002-09-11 wd use string methods
16# 2002-10-18 mvl update to Unicode 3.2
17# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000018# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000019# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000020# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000021# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Georg Brandld52429f2008-07-04 15:55:02 +000022# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
Ezio Melotti931b8aa2011-10-21 21:57:36 +030023# 2011-10-21 ezio add support for name aliases and named sequences
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050024# 2012-01 benjamin add full case mappings
Fredrik Lundhcfcea492000-09-25 08:07:06 +000025#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000026# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000027#
28
Ezio Melotti931b8aa2011-10-21 21:57:36 +030029import os
30import sys
31import zipfile
32
Stefan Behnelfaa29482019-06-01 21:49:03 +020033from functools import partial
Greg Priceef2af1a2019-08-12 22:20:56 -070034from textwrap import dedent
35from typing import *
Fredrik Lundhf367cac2000-09-24 23:18:31 +000036
SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "12.1.0"

# Input-file name templates.  The "%s" slot is filled with "" for the
# current UCD, or "-<version>" when reading an old UCD snapshot.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = ["3.2.0"]

CATEGORY_NAMES = [
    "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd", "Nl", "No",
    "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm", "Lo",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So",
]

BIDIRECTIONAL_NAMES = [
    "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN",
    "B", "S", "WS", "ON", "LRI", "RLI", "FSI", "PDI",
]

EASTASIANWIDTH_NAMES = ["F", "H", "W", "Na", "A", "N"]

MANDATORY_LINE_BREAKS = ["BK", "CR", "LF", "NL"]

# Character-type flag bits.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FEF'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
]
110
Stefan Behnelfaa29482019-06-01 21:49:03 +0200111
def maketables(trace=0):
    """Drive generation of the three generated C headers.

    Reads the current UCD, merges in each old UCD version listed in
    ``old_versions`` (used for the version-delta tables), then writes the
    name, data and type tables.
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    current = UnicodeData(UNIDATA_VERSION)
    print(sum(1 for rec in current.table if rec), "characters")

    for old_version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + old_version), "...")
        previous = UnicodeData(old_version, cjk_check=False)
        print(sum(1 for rec in previous.table if rec), "characters")
        merge_old_version(old_version, current, previous)

    makeunicodename(current, trace)
    makeunicodedata(current, trace)
    makeunicodetype(current, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000129
Stefan Behnelfaa29482019-06-01 21:49:03 +0200130
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000131# --------------------------------------------------------------------
132# unicode character properties
133
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h from the parsed UCD.

    Emits: the deduplicated per-character database records and their
    two-level index, the decomposition data and its index, the NFC
    first/last reindex tables plus the composition pair table, and the
    delta/normalization tables for each old Unicode version recorded in
    ``unicode.changed``.
    """

    # Record 0 is a sentinel for unassigned code points.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            # (record fields follow UnicodeData.txt column order:
            #  [2]=category, [3]=combining class, [4]=bidi class,
            #  [9]=mirrored, [15]=east-asian width, [17]=NFC quick check)
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # nfd_nfkd in unicodedata.c has a fixed-size expansion buffer.
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix: a leading "<compat>"-style tag, if any
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                # prefix index must fit in the low byte of the header word
                assert prefix < 256
                # content: header word (length<<8 | prefix) followed by the
                # decomposed code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions whose char is not excluded and whose first
                # element has combining class 0
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data holds flat ints while `decomp` is
                # a list, so this .index() lookup always raises ValueError and
                # every decomposition is appended; the dedup appears to be
                # ineffective (output is still correct) -- confirm upstream.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber NFC first/last characters densely and record the contiguous
    # code-point ranges they span.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense (first x last) -> composed-character matrix.
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("  { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("  {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000383
384# --------------------------------------------------------------------
385# unicode character type tables
386
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h from the parsed UCD.

    Emits: the deduplicated per-character type records (case deltas,
    decimal/digit values and flag bits) with a two-level index, the
    extended-case array for multi-character mappings, and the generated
    C functions _PyUnicode_ToNumeric, _PyUnicode_IsWhitespace and
    _PyUnicode_IsLinebreak.
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    # Record 0 is a sentinel for unassigned code points.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            # ([2]=category, [4]=bidi class, [16]=derived properties set;
            #  [6]/[7]/[8]=decimal/digit/numeric, [12]/[13]/[14]=case maps)
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # printable: everything except other (C*) and separator (Z*)
            # categories, plus the plain space character itself
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # A case folding different from the simple lowercase mapping
            # forces the extended-case path below.
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    # store case mappings as deltas from the character
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    # deltas must fit in a signed 32-bit int
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                # Encoding: bits 0-23 = offset into extra_casing,
                # bits 24+ = mapping length; for `lower`, bits 20-23
                # additionally hold the case-folding length.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                # group code points by their textual numeric value
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split decomposition index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000596
597# --------------------------------------------------------------------
598# unicode name database
599
600def makeunicodename(unicode, trace):
601
602 FILE = "Modules/unicodename_db.h"
603
Collin Winter6afaeb72007-08-03 17:06:41 +0000604 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000605
606 # collect names
607 names = [None] * len(unicode.chars)
608
609 for char in unicode.chars:
610 record = unicode.table[char]
611 if record:
612 name = record[1].strip()
613 if name and name[0] != "<":
614 names[char] = name + chr(0)
615
Jon Dufresne39726282017-05-18 07:35:54 -0700616 print(len([n for n in names if n is not None]), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000617
618 # collect unique words from names (note that we differ between
619 # words inside a sentence, and words ending a sentence. the
620 # latter includes the trailing null byte.
621
622 words = {}
623 n = b = 0
624 for char in unicode.chars:
625 name = names[char]
626 if name:
627 w = name.split()
628 b = b + len(name)
629 n = n + len(w)
630 for w in w:
631 l = words.get(w)
632 if l:
633 l.append(None)
634 else:
635 words[w] = [len(words)]
636
Collin Winter6afaeb72007-08-03 17:06:41 +0000637 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000638
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000639 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000640
Martin v. Löwis97225da2002-11-24 23:05:09 +0000641 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000642 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000643 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000644 return -len(alist), aword
645 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000646
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000647 # figure out how many phrasebook escapes we need
648 escapes = 0
649 while escapes * 256 < len(wordlist):
650 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000651 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000652
653 short = 256 - escapes
654
655 assert short > 0
656
Collin Winter6afaeb72007-08-03 17:06:41 +0000657 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000658
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000659 # statistics
660 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000661 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000662 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000663 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000664
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000665 # pick the most commonly used words, and sort the rest on falling
666 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000667
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000668 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000669 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000670 wordlist.extend(wordtail)
671
672 # generate lexicon from words
673
674 lexicon_offset = [0]
675 lexicon = ""
676 words = {}
677
678 # build a lexicon string
679 offset = 0
680 for w, x in wordlist:
681 # encoding: bit 7 indicates last character in word (chr(128)
682 # indicates the last character in an entire string)
683 ww = w[:-1] + chr(ord(w[-1])+128)
684 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000685 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000686 if o < 0:
687 o = offset
688 lexicon = lexicon + ww
689 offset = offset + len(w)
690 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000691 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000692
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000693 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000694
695 # generate phrasebook from names and lexicon
696 phrasebook = [0]
697 phrasebook_offset = [0] * len(unicode.chars)
698 for char in unicode.chars:
699 name = names[char]
700 if name:
701 w = name.split()
702 phrasebook_offset[char] = len(phrasebook)
703 for w in w:
704 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000705 if i < short:
706 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000707 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000708 # store as two bytes
709 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000710 phrasebook.append(i&255)
711
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000712 assert getsize(phrasebook) == 1
713
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000714 #
715 # unicode name hash table
716
717 # extract names
718 data = []
719 for char in unicode.chars:
720 record = unicode.table[char]
721 if record:
722 name = record[1].strip()
723 if name and name[0] != "<":
724 data.append((name, char))
725
726 # the magic number 47 was chosen to minimize the number of
727 # collisions on the current data set. if you like, change it
728 # and see what happens...
729
730 codehash = Hash("code", data, 47)
731
Collin Winter6afaeb72007-08-03 17:06:41 +0000732 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000733
Stefan Behnelfaa29482019-06-01 21:49:03 +0200734 with open(FILE, "w") as fp:
735 fprint = partial(print, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000736
Stefan Behnelfaa29482019-06-01 21:49:03 +0200737 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
738 fprint()
739 fprint("#define NAME_MAXLEN", 256)
740 fprint()
741 fprint("/* lexicon */")
742 Array("lexicon", lexicon).dump(fp, trace)
743 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000744
Stefan Behnelfaa29482019-06-01 21:49:03 +0200745 # split decomposition index table
746 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000747
Stefan Behnelfaa29482019-06-01 21:49:03 +0200748 fprint("/* code->name phrasebook */")
749 fprint("#define phrasebook_shift", shift)
750 fprint("#define phrasebook_short", short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000751
Stefan Behnelfaa29482019-06-01 21:49:03 +0200752 Array("phrasebook", phrasebook).dump(fp, trace)
753 Array("phrasebook_offset1", offset1).dump(fp, trace)
754 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000755
Stefan Behnelfaa29482019-06-01 21:49:03 +0200756 fprint("/* name->code dictionary */")
757 codehash.dump(fp, trace)
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300758
Stefan Behnelfaa29482019-06-01 21:49:03 +0200759 fprint()
760 fprint('static const unsigned int aliases_start = %#x;' %
761 NAME_ALIASES_START)
762 fprint('static const unsigned int aliases_end = %#x;' %
763 (NAME_ALIASES_START + len(unicode.aliases)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300764
Stefan Behnelfaa29482019-06-01 21:49:03 +0200765 fprint('static const unsigned int name_aliases[] = {')
766 for name, codepoint in unicode.aliases:
767 fprint(' 0x%04X,' % codepoint)
768 fprint('};')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300769
Stefan Behnelfaa29482019-06-01 21:49:03 +0200770 # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
771 # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
772 # sequences or sequences with non-BMP chars are added.
773 # unicodedata_lookup should be adapted too.
774 fprint(dedent("""
775 typedef struct NamedSequence {
776 int seqlen;
777 Py_UCS2 seq[4];
778 } named_sequence;
779 """))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300780
Stefan Behnelfaa29482019-06-01 21:49:03 +0200781 fprint('static const unsigned int named_sequences_start = %#x;' %
782 NAMED_SEQUENCES_START)
783 fprint('static const unsigned int named_sequences_end = %#x;' %
784 (NAMED_SEQUENCES_START + len(unicode.named_sequences)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300785
Stefan Behnelfaa29482019-06-01 21:49:03 +0200786 fprint('static const named_sequence named_sequences[] = {')
787 for name, sequence in unicode.named_sequences:
788 seq_str = ', '.join('0x%04X' % cp for cp in sequence)
789 fprint(' {%d, {%s}},' % (len(sequence), seq_str))
790 fprint('};')
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000791
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000792
793def merge_old_version(version, new, old):
794 # Changes to exclusion file not implemented yet
795 if old.exclusions != new.exclusions:
Collin Wintera817e582007-08-22 23:05:06 +0000796 raise NotImplementedError("exclusions differ")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000797
798 # In these change records, 0xFF means "no change"
799 bidir_changes = [0xFF]*0x110000
800 category_changes = [0xFF]*0x110000
801 decimal_changes = [0xFF]*0x110000
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000802 mirrored_changes = [0xFF]*0x110000
Benjamin Peterson67752312016-09-14 23:53:47 -0700803 east_asian_width_changes = [0xFF]*0x110000
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000804 # In numeric data, 0 means "no change",
805 # -1 means "did not have a numeric value
806 numeric_changes = [0] * 0x110000
807 # normalization_changes is a list of key-value pairs
808 normalization_changes = []
809 for i in range(0x110000):
810 if new.table[i] is None:
811 # Characters unassigned in the new version ought to
812 # be unassigned in the old one
813 assert old.table[i] is None
814 continue
815 # check characters unassigned in the old version
816 if old.table[i] is None:
817 # category 0 is "unassigned"
818 category_changes[i] = 0
819 continue
820 # check characters that differ
821 if old.table[i] != new.table[i]:
822 for k in range(len(old.table[i])):
823 if old.table[i][k] != new.table[i][k]:
824 value = old.table[i][k]
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300825 if k == 1 and i in PUA_15:
826 # the name is not set in the old.table, but in the
827 # new.table we are using it for aliases and named seq
828 assert value == ''
829 elif k == 2:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000830 #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
831 category_changes[i] = CATEGORY_NAMES.index(value)
832 elif k == 4:
833 #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
834 bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
835 elif k == 5:
836 #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
837 # We assume that all normalization changes are in 1:1 mappings
838 assert " " not in value
839 normalization_changes.append((i, value))
840 elif k == 6:
841 #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
842 # we only support changes where the old value is a single digit
843 assert value in "0123456789"
844 decimal_changes[i] = int(value)
845 elif k == 8:
846 # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
847 # Since 0 encodes "no change", the old value is better not 0
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000848 if not value:
849 numeric_changes[i] = -1
850 else:
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000851 numeric_changes[i] = float(value)
852 assert numeric_changes[i] not in (0, -1)
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000853 elif k == 9:
854 if value == 'Y':
855 mirrored_changes[i] = '1'
856 else:
857 mirrored_changes[i] = '0'
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000858 elif k == 11:
859 # change to ISO comment, ignore
860 pass
861 elif k == 12:
862 # change to simple uppercase mapping; ignore
863 pass
864 elif k == 13:
865 # change to simple lowercase mapping; ignore
866 pass
867 elif k == 14:
868 # change to simple titlecase mapping; ignore
869 pass
Benjamin Peterson67752312016-09-14 23:53:47 -0700870 elif k == 15:
871 # change to east asian width
872 east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000873 elif k == 16:
874 # derived property changes; not yet
875 pass
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000876 elif k == 17:
877 # normalization quickchecks are not performed
878 # for older versions
879 pass
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000880 else:
881 class Difference(Exception):pass
Collin Wintera817e582007-08-22 23:05:06 +0000882 raise Difference(hex(i), k, old.table[i], new.table[i])
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000883 new.changed.append((version, list(zip(bidir_changes, category_changes,
Benjamin Peterson67752312016-09-14 23:53:47 -0700884 decimal_changes, mirrored_changes,
885 east_asian_width_changes,
886 numeric_changes)),
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000887 normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000888
Stefan Behnelfaa29482019-06-01 21:49:03 +0200889
def open_data(template, version):
    """Return an open file object for the UCD file *template* at *version*.

    The file is downloaded from unicode.org on first use and cached in the
    current directory as ``template % ('-' + version)``.  ``.txt`` files are
    opened in UTF-8 text mode; anything else (the Unihan zip archive) is
    opened in binary mode.
    """
    local = template % ('-' + version,)
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = 'http://www.unicode.org/Public/3.2-Update/' + local
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    # Unihan.zip
    return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000905
Stefan Behnelfaa29482019-06-01 21:49:03 +0200906
class UcdFile:
    '''
    A file in the standard format of the UCD.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    Note that, as described there, the Unihan data files have their
    own separate format.
    '''

    def __init__(self, template: str, version: str) -> None:
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        # Yield one list of ';'-separated fields per data line,
        # skipping comments ('#' to end of line) and blank lines.
        with open_data(self.template, self.version) as file:
            for raw in file:
                payload = raw.split('#', 1)[0].strip()
                if payload:
                    yield [field.strip() for field in payload.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        return self.records()
931
932
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000933# --------------------------------------------------------------------
934# the following support code is taken from the unidb utilities
935# Copyright (c) 1999-2000 by Secret Labs AB
936
937# load a unicode-data file from disk
938
class UnicodeData:
    """In-memory representation of the Unicode Character Database.

    ``self.table`` maps each code point to its (extended) UnicodeData.txt
    record; ``self.chars`` is the list of code points to process.
    """
    # Record structure:
    # [ID, name, category, combining, bidi, decomp, (6)
    # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    # derived-props] (17)

    def __init__(self, version, cjk_check=True):
        # list of (version, change-arrays, normalization-changes) delta
        # records, filled in later by merge_old_version()
        self.changed = []
        table = [None] * 0x110000
        for s in UcdFile(UNICODE_DATA, version):
            char = int(s[0], 16)
            table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        field = None
        for i in range(0, 0x110000):
            s = table[i]
            if s:
                if s[1][-6:] == "First>":
                    s[1] = ""
                    field = s
                elif s[1][-5:] == "Last>":
                    if s[1].startswith("<CJK Ideograph"):
                        cjk_ranges_found.append((field[0],
                                                 s[0]))
                    s[1] = ""
                    field = None
            elif field:
                # code point inside a First..Last range: clone the range's
                # record with this code point's own ID
                f2 = field[:]
                f2[0] = "%X" % i
                table[i] = f2
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            for char, name, abbrev in UcdFile(NAME_ALIASES, version):
                char = int(char, 16)
                self.aliases.append((name, char))
                # also store the name in the PUA 1
                self.table[pua_index][1] = name
                pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 1, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            for name, chars in UcdFile(NAMED_SEQUENCES, version):
                chars = tuple(int(char, 16) for char in chars.split())
                # check that the structure defined in makeunicodename is OK
                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                    "the NamedSequence struct and in unicodedata_lookup")
                self.named_sequences.append((name, chars))
                # also store these in the PUA 1
                self.table[pua_index][1] = name
                pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        # composition exclusions: set of code points excluded from
        # canonical composition (value is always 1)
        self.exclusions = {}
        for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
            char = int(char, 16)
            self.exclusions[char] = 1

        # east-asian width: ranges in the data file are given as X..Y
        widths = [None] * 0x110000
        for s in UcdFile(EASTASIAN_WIDTH, version):
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]

        # record field 15: east-asian width
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # record field 16: set of derived property names
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        for r, p in UcdFile(DERIVED_CORE_PROPERTIES, version):
            if ".." in r:
                first, last = [int(c, 16) for c in r.split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(r, 16)]
            for char in chars:
                if table[char]:
                    # Some properties (e.g. Default_Ignorable_Code_Point)
                    # apply to unassigned code points; ignore them
                    table[char][-1].add(p)

        # mandatory line breaks are folded into the derived-props set
        for s in UcdFile(LINE_BREAK, version):
            if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                continue
            if '..' not in s[0]:
                first = last = int(s[0], 16)
            else:
                first, last = [int(c, 16) for c in s[0].split('..')]
            for char in range(first, last+1):
                table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
            if len(s) < 2 or s[1] not in qc_order:
                continue
            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
            quickcheck_shift = qc_order.index(s[1])*2
            quickcheck <<= quickcheck_shift
            if '..' not in s[0]:
                first = last = int(s[0], 16)
            else:
                first, last = [int(c, 16) for c in s[0].split('..')]
            for char in range(first, last+1):
                # each property must be set at most once per character
                assert not (quickchecks[char]>>quickcheck_shift)&3
                quickchecks[char] |= quickcheck
        # record field 17: packed quickcheck bits (2 per property)
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        # Unihan numeric values (CJK ideographs); patches record field 8
        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

        # full case mappings: codepoint -> ([lower], [title], [upper])
        sc = self.special_casing = {}
        for data in UcdFile(SPECIAL_CASING, version):
            if data[4]:
                # We ignore all conditionals (since they depend on
                # languages) except for one, which is hardcoded. See
                # handle_capital_sigma in unicodeobject.c.
                continue
            c = int(data[0], 16)
            lower = [int(char, 16) for char in data[1].split()]
            title = [int(char, 16) for char in data[2].split()]
            upper = [int(char, 16) for char in data[3].split()]
            sc[c] = (lower, title, upper)

        # case folding (C and F statuses only); not present in 3.2.0
        cf = self.case_folding = {}
        if version != '3.2.0':
            for data in UcdFile(CASE_FOLDING, version):
                if data[1] in "CF":
                    c = int(data[0], 16)
                    cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001129
Stefan Behnelfaa29482019-06-01 21:49:03 +02001130
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001131# hash table tools
1132
1133# this is a straight-forward reimplementation of Python's built-in
1134# dictionary type, using a static data structure, and a custom string
1135# hash algorithm.
1136
def myhash(s, magic):
    """Case-insensitive string hash, folded to 24 bits after each step."""
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        # fold anything that spilled above bit 23 back into the low bits
        top = h & 0xff000000
        if top:
            h = (h ^ ((top >> 24) & 0xff)) & 0x00ffffff
    return h
1145
Stefan Behnelfaa29482019-06-01 21:49:03 +02001146
# candidate (table size, polynomial) pairs for the static hash table;
# Hash.__init__ picks the first size strictly larger than the data set
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1153
Stefan Behnelfaa29482019-06-01 21:49:03 +02001154
class Hash:
    """Static open-addressing hash table, dumpable as C arrays/defines."""

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the data
        # set, paired with its probing polynomial from SIZES
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        # collision counter
        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: probe with a key-derived increment, doubling it
            # (mod the polynomial) after each miss
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots are encoded as 0 in the generated C array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array plus the #defines the C-side
        # lookup needs to re-run the same probe sequence
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1218
Stefan Behnelfaa29482019-06-01 21:49:03 +02001219
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001220# stuff to deal with arrays of unsigned integers
1221
class Array:
    """A named sequence of unsigned integers, dumpable as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array of the smallest unsigned
        # integer type that can hold every element
        width = getsize(self.data)
        if trace:
            print(self.name+":", width*len(self.data), "bytes", file=sys.stderr)
        ctype = {1: "unsigned char", 2: "unsigned short"}.get(width, "unsigned int")
        file.write("static const " + ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            # accumulate items into lines of at most 78 columns
            line = " "
            for value in self.data:
                chunk = str(value) + ", "
                if len(line) + len(chunk) > 78:
                    file.write(line.rstrip() + "\n")
                    line = " " + chunk
                else:
                    line = line + chunk
            if line.strip():
                file.write(line.rstrip() + "\n")
        file.write("};\n\n")
1253
Stefan Behnelfaa29482019-06-01 21:49:03 +02001254
def getsize(data):
    """Return the smallest C integer size (1, 2 or 4 bytes) that can
    hold every value in *data*."""
    biggest = max(data)
    if biggest >= 65536:
        return 4
    return 2 if biggest >= 256 else 1
1264
Stefan Behnelfaa29482019-06-01 21:49:03 +02001265
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
    if trace:
        def report(a, b, shift, nbytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(a), len(b), shift, nbytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)

    # the largest shift that still leaves something in the index table
    top = len(t) - 1       # last valid index
    maxshift = 0
    if top > 0:
        while top >> 1:
            top >>= 1
            maxshift += 1

    best_bytes = sys.maxsize     # smallest combined size seen so far
    t = tuple(t)                 # so slices can be dict keys
    for shift in range(maxshift + 1):
        binsize = 1 << shift
        t1 = []
        t2 = []
        seen = {}                # bin contents -> offset in t2
        for start in range(0, len(t), binsize):
            chunk = t[start:start + binsize]
            pos = seen.get(chunk)
            if pos is None:
                pos = len(t2)
                seen[chunk] = pos
                t2.extend(chunk)
            t1.append(pos >> shift)
        # combined memory footprint of both tables
        total = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            report(t1, t2, shift, total)
        if total < best_bytes:
            best = t1, t2, shift
            best_bytes = total

    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        report(t1, t2, shift, best_bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1  # low-bit mask of "shift" bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001326
Stefan Behnelfaa29482019-06-01 21:49:03 +02001327
if __name__ == "__main__":
    # regenerate the database headers with tracing enabled
    maketables(1)