blob: 1be93ec479c8dc856dc874fa459b6ce6c66f417d [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Benjamin Peterson7c69c1c2018-06-06 20:14:28 -07004# This script converts Unicode database files to Modules/unicodedata_db.h,
5# Modules/unicodename_db.h, and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00006#
7# history:
8# 2000-09-24 fl created (based on bits and pieces from unidb)
9# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000010# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000011# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000012# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000013# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000014# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000015# 2002-09-11 wd use string methods
16# 2002-10-18 mvl update to Unicode 3.2
17# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000018# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000019# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000020# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000021# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Georg Brandld52429f2008-07-04 15:55:02 +000022# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
Ezio Melotti931b8aa2011-10-21 21:57:36 +030023# 2011-10-21 ezio add support for name aliases and named sequences
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050024# 2012-01 benjamin add full case mappings
Fredrik Lundhcfcea492000-09-25 08:07:06 +000025#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000026# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000027#
28
Ezio Melotti931b8aa2011-10-21 21:57:36 +030029import os
30import sys
31import zipfile
32
Stefan Behnelfaa29482019-06-01 21:49:03 +020033from functools import partial
Greg Priceef2af1a2019-08-12 22:20:56 -070034from textwrap import dedent
35from typing import *
Fredrik Lundhf367cac2000-09-24 23:18:31 +000036
SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "12.1.0"

# Input file name templates.  "%s" is filled with "" for the current UCD
# version or with "-<version>" for an older snapshot (see maketables()).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Older UCD versions for which delta tables are generated (see
# makeunicodedata() and merge_old_version()).
old_versions = ["3.2.0"]

# General_Category values; index in this list is the numeric code stored
# in the database records.  ("Cn" appears twice in the historical table.)
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidi_Class values; index is the numeric code stored in the records.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

# East_Asian_Width values; index is the numeric code stored in the records.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes that force a mandatory break (feed LINEBREAK_MASK).
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# character-type bit flags
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# CJK Unified Ideograph first/last code points, as hex strings.
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FEF'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
]
110
Stefan Behnelfaa29482019-06-01 21:49:03 +0200111
def maketables(trace=0):
    """Drive regeneration of all three generated headers.

    Reads the current UCD (UNIDATA_VERSION), folds in delta information
    for every entry in ``old_versions``, then writes the name, property,
    and type databases.

    trace: forwarded to the individual generators for packing diagnostics.
    """
    def count_defined(ucd):
        # Number of code points that have a UnicodeData record.
        return sum(1 for rec in ucd.table if rec)

    print("--- Reading", UNICODE_DATA % "", "...")
    unicode = UnicodeData(UNIDATA_VERSION)
    print(count_defined(unicode), "characters")

    # Merge each supported older UCD version so version-delta tables can
    # be generated later by makeunicodedata().
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(count_defined(old_unicode), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000129
Stefan Behnelfaa29482019-06-01 21:49:03 +0200130
# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h.

    Emits: the deduplicated per-character database records, the
    decomposition tables, the NFC first/last reindexing plus pair table,
    and per-version change/normalization delta tables.

    unicode: a UnicodeData instance for the current UCD version.
    trace:   forwarded to splitbins() for table-packing diagnostics.
    """
    # dummy record for unassigned code points (all fields zero)
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties
    # Each unique (category, combining, bidi, mirrored, east-asian-width,
    # quick-check) tuple is stored once; index[] maps char -> record slot.

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data
    # decomp_data is a flat int array: one header word per decomposition
    # (prefix code in the low byte, length in the next byte) followed by
    # the decomposed code points.

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # length is stored in one byte of the header word; the C
                # side (nfd_nfkd) cannot handle longer expansions
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix: compatibility tag such as "<compat>", if present
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: header word (prefix | length<<8) + code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions, excluding composition exclusions and
                # pairs whose first character has a nonzero combining class
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints, so
                # .index(decomp) (a list) always raises ValueError and the
                # sharing branch never fires; entries are always appended.
                # Output stays correct, only dedup never happens -- confirm
                # whether this is intended.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber NFC first/last characters densely (0..total-1) and record
    # the contiguous code-point ranges for the C-side reindex tables.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # 2D composition table, flattened: comp_data[first*total_last+last]
    # is the composed character (0 when the pair does not compose).
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("    {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            # accessor: two-level lookup into the change records
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            # per-version normalization exceptions as a switch statement
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")
382
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000383
# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Emits: the deduplicated character type records (case offsets, digit
    values, flag bits), the extended-case array for one-to-many mappings,
    and generated C helpers _PyUnicode_ToNumeric(),
    _PyUnicode_IsWhitespace(), and _PyUnicode_IsLinebreak().

    unicode: a UnicodeData instance for the current UCD version.
    trace:   forwarded to splitbins() for table-packing diagnostics.
    """
    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}            # Numeric_Value text -> [code points]
    spaces = []             # code points with SPACE_MASK
    linebreaks = []         # code points with LINEBREAK_MASK
    extra_casing = []       # flat array for one-to-many case mappings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            # (field indexes follow the UnicodeData.txt layout, UAX #44)
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # space is printable despite being in category Zs
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            # simple case mappings (fields 12-14), falling back to char
            # itself; full mappings come from SpecialCasing/CaseFolding
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # a case folding differing from the lowercase mapping forces
            # the extended-case representation below
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                # simple case: store signed deltas from the character
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    # deltas must fit in a signed 32-bit int
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                # Packed format: bits 0-19 offset into extra_casing,
                # bits 24-31 mapping length; for "lower", bits 20-23 hold
                # the length of a trailing case-folding sequence.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split decomposition index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals (fractions like "1/4" become
            # "0.25"-style literal divisions)
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()
595
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000596
597# --------------------------------------------------------------------
598# unicode name database
599
600def makeunicodename(unicode, trace):
601
602 FILE = "Modules/unicodename_db.h"
603
Collin Winter6afaeb72007-08-03 17:06:41 +0000604 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000605
606 # collect names
607 names = [None] * len(unicode.chars)
608
609 for char in unicode.chars:
610 record = unicode.table[char]
611 if record:
612 name = record[1].strip()
613 if name and name[0] != "<":
614 names[char] = name + chr(0)
615
Jon Dufresne39726282017-05-18 07:35:54 -0700616 print(len([n for n in names if n is not None]), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000617
618 # collect unique words from names (note that we differ between
619 # words inside a sentence, and words ending a sentence. the
620 # latter includes the trailing null byte.
621
622 words = {}
623 n = b = 0
624 for char in unicode.chars:
625 name = names[char]
626 if name:
627 w = name.split()
628 b = b + len(name)
629 n = n + len(w)
630 for w in w:
631 l = words.get(w)
632 if l:
633 l.append(None)
634 else:
635 words[w] = [len(words)]
636
Collin Winter6afaeb72007-08-03 17:06:41 +0000637 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000638
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000639 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000640
Martin v. Löwis97225da2002-11-24 23:05:09 +0000641 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000642 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000643 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000644 return -len(alist), aword
645 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000646
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000647 # figure out how many phrasebook escapes we need
648 escapes = 0
649 while escapes * 256 < len(wordlist):
650 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000651 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000652
653 short = 256 - escapes
654
655 assert short > 0
656
Collin Winter6afaeb72007-08-03 17:06:41 +0000657 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000658
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000659 # statistics
660 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000661 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000662 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000663 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000664
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000665 # pick the most commonly used words, and sort the rest on falling
666 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000667
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000668 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000669 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000670 wordlist.extend(wordtail)
671
672 # generate lexicon from words
673
674 lexicon_offset = [0]
675 lexicon = ""
676 words = {}
677
678 # build a lexicon string
679 offset = 0
680 for w, x in wordlist:
681 # encoding: bit 7 indicates last character in word (chr(128)
682 # indicates the last character in an entire string)
683 ww = w[:-1] + chr(ord(w[-1])+128)
684 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000685 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000686 if o < 0:
687 o = offset
688 lexicon = lexicon + ww
689 offset = offset + len(w)
690 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000691 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000692
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000693 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000694
695 # generate phrasebook from names and lexicon
696 phrasebook = [0]
697 phrasebook_offset = [0] * len(unicode.chars)
698 for char in unicode.chars:
699 name = names[char]
700 if name:
701 w = name.split()
702 phrasebook_offset[char] = len(phrasebook)
703 for w in w:
704 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000705 if i < short:
706 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000707 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000708 # store as two bytes
709 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000710 phrasebook.append(i&255)
711
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000712 assert getsize(phrasebook) == 1
713
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000714 #
715 # unicode name hash table
716
717 # extract names
718 data = []
719 for char in unicode.chars:
720 record = unicode.table[char]
721 if record:
722 name = record[1].strip()
723 if name and name[0] != "<":
724 data.append((name, char))
725
726 # the magic number 47 was chosen to minimize the number of
727 # collisions on the current data set. if you like, change it
728 # and see what happens...
729
730 codehash = Hash("code", data, 47)
731
Collin Winter6afaeb72007-08-03 17:06:41 +0000732 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000733
Stefan Behnelfaa29482019-06-01 21:49:03 +0200734 with open(FILE, "w") as fp:
735 fprint = partial(print, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000736
Stefan Behnelfaa29482019-06-01 21:49:03 +0200737 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
738 fprint()
739 fprint("#define NAME_MAXLEN", 256)
740 fprint()
741 fprint("/* lexicon */")
742 Array("lexicon", lexicon).dump(fp, trace)
743 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000744
Stefan Behnelfaa29482019-06-01 21:49:03 +0200745 # split decomposition index table
746 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000747
Stefan Behnelfaa29482019-06-01 21:49:03 +0200748 fprint("/* code->name phrasebook */")
749 fprint("#define phrasebook_shift", shift)
750 fprint("#define phrasebook_short", short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000751
Stefan Behnelfaa29482019-06-01 21:49:03 +0200752 Array("phrasebook", phrasebook).dump(fp, trace)
753 Array("phrasebook_offset1", offset1).dump(fp, trace)
754 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000755
Stefan Behnelfaa29482019-06-01 21:49:03 +0200756 fprint("/* name->code dictionary */")
757 codehash.dump(fp, trace)
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300758
Stefan Behnelfaa29482019-06-01 21:49:03 +0200759 fprint()
760 fprint('static const unsigned int aliases_start = %#x;' %
761 NAME_ALIASES_START)
762 fprint('static const unsigned int aliases_end = %#x;' %
763 (NAME_ALIASES_START + len(unicode.aliases)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300764
Stefan Behnelfaa29482019-06-01 21:49:03 +0200765 fprint('static const unsigned int name_aliases[] = {')
766 for name, codepoint in unicode.aliases:
767 fprint(' 0x%04X,' % codepoint)
768 fprint('};')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300769
Stefan Behnelfaa29482019-06-01 21:49:03 +0200770 # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
771 # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
772 # sequences or sequences with non-BMP chars are added.
773 # unicodedata_lookup should be adapted too.
774 fprint(dedent("""
775 typedef struct NamedSequence {
776 int seqlen;
777 Py_UCS2 seq[4];
778 } named_sequence;
779 """))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300780
Stefan Behnelfaa29482019-06-01 21:49:03 +0200781 fprint('static const unsigned int named_sequences_start = %#x;' %
782 NAMED_SEQUENCES_START)
783 fprint('static const unsigned int named_sequences_end = %#x;' %
784 (NAMED_SEQUENCES_START + len(unicode.named_sequences)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300785
Stefan Behnelfaa29482019-06-01 21:49:03 +0200786 fprint('static const named_sequence named_sequences[] = {')
787 for name, sequence in unicode.named_sequences:
788 seq_str = ', '.join('0x%04X' % cp for cp in sequence)
789 fprint(' {%d, {%s}},' % (len(sequence), seq_str))
790 fprint('};')
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000791
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000792
def merge_old_version(version, new, old):
    """Compare UCD snapshot *old* against *new* and append a delta record
    for *version* to new.changed.

    The appended record is (version, changes, normalization_changes),
    where changes is a per-codepoint list of (bidir, category, decimal,
    mirrored, east_asian_width, numeric) tuples.  In the byte-sized
    fields 0xFF encodes "no change"; in the numeric field 0 encodes
    "no change" and -1 "had no numeric value".  Several record fields
    (ISO comment, simple case mappings, derived properties, quickchecks)
    are deliberately ignored.  Raises NotImplementedError when the
    composition exclusions differ, and an ad-hoc Difference exception
    for any field change this function does not know how to encode.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            # k indexes the UnicodeData.txt record fields (see the
            # "Record structure" comment on class UnicodeData)
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag is stored as a character digit
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                       normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000888
Stefan Behnelfaa29482019-06-01 21:49:03 +0200889
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000890def open_data(template, version):
891 local = template % ('-'+version,)
892 if not os.path.exists(local):
893 import urllib.request
894 if version == '3.2.0':
895 # irregular url structure
896 url = 'http://www.unicode.org/Public/3.2-Update/' + local
897 else:
898 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
899 urllib.request.urlretrieve(url, filename=local)
900 if local.endswith('.txt'):
901 return open(local, encoding='utf-8')
902 else:
903 # Unihan.zip
904 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000905
Stefan Behnelfaa29482019-06-01 21:49:03 +0200906
Greg Priceef2af1a2019-08-12 22:20:56 -0700907class UcdFile:
908 '''
909 A file in the standard format of the UCD.
910
911 See: https://www.unicode.org/reports/tr44/#Format_Conventions
912
913 Note that, as described there, the Unihan data files have their
914 own separate format.
915 '''
916
917 def __init__(self, template: str, version: str) -> None:
918 self.template = template
919 self.version = version
920
921 def records(self) -> Iterator[List[str]]:
922 with open_data(self.template, self.version) as file:
923 for line in file:
924 line = line.split('#', 1)[0].strip()
925 if not line:
926 continue
927 yield [field.strip() for field in line.split(';')]
928
929 def __iter__(self) -> Iterator[List[str]]:
930 return self.records()
931
932
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000933# --------------------------------------------------------------------
934# the following support code is taken from the unidb utilities
935# Copyright (c) 1999-2000 by Secret Labs AB
936
937# load a unicode-data file from disk
938
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000939class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000940 # Record structure:
941 # [ID, name, category, combining, bidi, decomp, (6)
942 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
943 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
944 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000945
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000946 def __init__(self, version,
947 linebreakprops=False,
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000948 expand=1,
949 cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000950 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000951 table = [None] * 0x110000
Greg Priceef2af1a2019-08-12 22:20:56 -0700952 for s in UcdFile(UNICODE_DATA, version):
953 char = int(s[0], 16)
954 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000955
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000956 cjk_ranges_found = []
957
Martin v. Löwis97225da2002-11-24 23:05:09 +0000958 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000959 if expand:
960 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000961 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000962 s = table[i]
963 if s:
964 if s[1][-6:] == "First>":
965 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000966 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000967 elif s[1][-5:] == "Last>":
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000968 if s[1].startswith("<CJK Ideograph"):
969 cjk_ranges_found.append((field[0],
970 s[0]))
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000971 s[1] = ""
972 field = None
973 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000974 f2 = field[:]
975 f2[0] = "%X" % i
976 table[i] = f2
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000977 if cjk_check and cjk_ranges != cjk_ranges_found:
978 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000979
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000980 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000981 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000982 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000983 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000984
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300985 # check for name aliases and named sequences, see #12753
986 # aliases and named sequences are not in 3.2.0
987 if version != '3.2.0':
988 self.aliases = []
989 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
990 # in order to take advantage of the compression and lookup
991 # algorithms used for the other characters
992 pua_index = NAME_ALIASES_START
Greg Priceef2af1a2019-08-12 22:20:56 -0700993 for char, name, abbrev in UcdFile(NAME_ALIASES, version):
994 char = int(char, 16)
995 self.aliases.append((name, char))
996 # also store the name in the PUA 1
997 self.table[pua_index][1] = name
998 pua_index += 1
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300999 assert pua_index - NAME_ALIASES_START == len(self.aliases)
1000
1001 self.named_sequences = []
Ezio Melotti7c4a7e62013-08-26 01:32:56 +03001002 # store named sequences in the PUA 1, in range U+F0100..,
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001003 # in order to take advantage of the compression and lookup
1004 # algorithms used for the other characters.
1005
Benjamin Peterson71f660e2012-02-20 22:24:29 -05001006 assert pua_index < NAMED_SEQUENCES_START
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001007 pua_index = NAMED_SEQUENCES_START
Greg Priceef2af1a2019-08-12 22:20:56 -07001008 for name, chars in UcdFile(NAMED_SEQUENCES, version):
1009 chars = tuple(int(char, 16) for char in chars.split())
1010 # check that the structure defined in makeunicodename is OK
1011 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
1012 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
1013 "the NamedSequence struct and in unicodedata_lookup")
1014 self.named_sequences.append((name, chars))
1015 # also store these in the PUA 1
1016 self.table[pua_index][1] = name
1017 pua_index += 1
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001018 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
1019
Martin v. Löwis677bde22002-11-23 22:08:15 +00001020 self.exclusions = {}
Greg Priceef2af1a2019-08-12 22:20:56 -07001021 for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
1022 char = int(char, 16)
1023 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +00001024
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001025 widths = [None] * 0x110000
Greg Priceef2af1a2019-08-12 22:20:56 -07001026 for s in UcdFile(EASTASIAN_WIDTH, version):
1027 if '..' in s[0]:
1028 first, last = [int(c, 16) for c in s[0].split('..')]
1029 chars = list(range(first, last+1))
1030 else:
1031 chars = [int(s[0], 16)]
1032 for char in chars:
1033 widths[char] = s[1]
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001034
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001035 for i in range(0, 0x110000):
1036 if table[i] is not None:
1037 table[i].append(widths[i])
1038
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001039 for i in range(0, 0x110000):
1040 if table[i] is not None:
1041 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001042
Greg Priceef2af1a2019-08-12 22:20:56 -07001043 for r, p in UcdFile(DERIVED_CORE_PROPERTIES, version):
1044 if ".." in r:
1045 first, last = [int(c, 16) for c in r.split('..')]
1046 chars = list(range(first, last+1))
1047 else:
1048 chars = [int(r, 16)]
1049 for char in chars:
1050 if table[char]:
1051 # Some properties (e.g. Default_Ignorable_Code_Point)
1052 # apply to unassigned code points; ignore them
1053 table[char][-1].add(p)
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001054
Greg Priceef2af1a2019-08-12 22:20:56 -07001055 for s in UcdFile(LINE_BREAK, version):
1056 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1057 continue
1058 if '..' not in s[0]:
1059 first = last = int(s[0], 16)
1060 else:
1061 first, last = [int(c, 16) for c in s[0].split('..')]
1062 for char in range(first, last+1):
1063 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001064
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001065 # We only want the quickcheck properties
1066 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1067 # Yes is the default, hence only N and M occur
1068 # In 3.2.0, the format was different (NF?_NO)
1069 # The parsing will incorrectly determine these as
1070 # "yes", however, unicodedata.c will not perform quickchecks
1071 # for older versions, and no delta records will be created.
1072 quickchecks = [0] * 0x110000
1073 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Greg Priceef2af1a2019-08-12 22:20:56 -07001074 for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
1075 if len(s) < 2 or s[1] not in qc_order:
1076 continue
1077 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1078 quickcheck_shift = qc_order.index(s[1])*2
1079 quickcheck <<= quickcheck_shift
1080 if '..' not in s[0]:
1081 first = last = int(s[0], 16)
1082 else:
1083 first, last = [int(c, 16) for c in s[0].split('..')]
1084 for char in range(first, last+1):
1085 assert not (quickchecks[char]>>quickcheck_shift)&3
1086 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001087 for i in range(0, 0x110000):
1088 if table[i] is not None:
1089 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001090
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001091 with open_data(UNIHAN, version) as file:
1092 zip = zipfile.ZipFile(file)
1093 if version == '3.2.0':
1094 data = zip.open('Unihan-3.2.0.txt').read()
1095 else:
1096 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001097 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001098 if not line.startswith('U+'):
1099 continue
1100 code, tag, value = line.split(None, 3)[:3]
1101 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1102 'kOtherNumeric'):
1103 continue
1104 value = value.strip().replace(',', '')
1105 i = int(code[2:], 16)
1106 # Patch the numeric field
1107 if table[i] is not None:
1108 table[i][8] = value
Greg Priceef2af1a2019-08-12 22:20:56 -07001109
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05001110 sc = self.special_casing = {}
Greg Priceef2af1a2019-08-12 22:20:56 -07001111 for data in UcdFile(SPECIAL_CASING, version):
1112 if data[4]:
1113 # We ignore all conditionals (since they depend on
1114 # languages) except for one, which is hardcoded. See
1115 # handle_capital_sigma in unicodeobject.c.
1116 continue
1117 c = int(data[0], 16)
1118 lower = [int(char, 16) for char in data[1].split()]
1119 title = [int(char, 16) for char in data[2].split()]
1120 upper = [int(char, 16) for char in data[3].split()]
1121 sc[c] = (lower, title, upper)
1122
Benjamin Petersond5890c82012-01-14 13:23:30 -05001123 cf = self.case_folding = {}
1124 if version != '3.2.0':
Greg Priceef2af1a2019-08-12 22:20:56 -07001125 for data in UcdFile(CASE_FOLDING, version):
1126 if data[1] in "CF":
1127 c = int(data[0], 16)
1128 cf[c] = [int(char, 16) for char in data[2].split()]
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001129
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001130 def uselatin1(self):
1131 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001132 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001133
Stefan Behnelfaa29482019-06-01 21:49:03 +02001134
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001135# hash table tools
1136
1137# this is a straight-forward reimplementation of Python's built-in
1138# dictionary type, using a static data structure, and a custom string
1139# hash algorithm.
1140
def myhash(s, magic):
    """Case-insensitive string hash used by the static name lookup table.

    Multiplies the running hash by *magic* per character and, whenever a
    bit appears in the top byte (bits 24-31), folds that byte back into
    the low 24 bits with an XOR, keeping the result below 2**24.
    """
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        top = h & 0xff000000
        if top:
            h = (h ^ ((top >> 24) & 0xff)) & 0x00ffffff
    return h
1149
Stefan Behnelfaa29482019-06-01 21:49:03 +02001150
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001151SIZES = [
1152 (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
1153 (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
1154 (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
1155 (2097152,5), (4194304,3), (8388608,33), (16777216,27)
1156]
1157
Stefan Behnelfaa29482019-06-01 21:49:03 +02001158
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001159class Hash:
1160 def __init__(self, name, data, magic):
1161 # turn a (key, value) list into a static hash table structure
1162
1163 # determine table size
1164 for size, poly in SIZES:
1165 if size > len(data):
1166 poly = size + poly
1167 break
1168 else:
Ezio Melotti13925002011-03-16 11:05:33 +02001169 raise AssertionError("ran out of polynomials")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001170
Collin Winter6afaeb72007-08-03 17:06:41 +00001171 print(size, "slots in hash table")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001172
1173 table = [None] * size
1174
1175 mask = size-1
1176
1177 n = 0
1178
1179 hash = myhash
1180
1181 # initialize hash table
1182 for key, value in data:
1183 h = hash(key, magic)
1184 i = (~h) & mask
1185 v = table[i]
1186 if v is None:
1187 table[i] = value
1188 continue
Stefan Behnelfaa29482019-06-01 21:49:03 +02001189 incr = (h ^ (h >> 3)) & mask
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001190 if not incr:
1191 incr = mask
1192 while 1:
1193 n = n + 1
1194 i = (i + incr) & mask
1195 v = table[i]
1196 if v is None:
1197 table[i] = value
1198 break
1199 incr = incr << 1
1200 if incr > mask:
1201 incr = incr ^ poly
1202
Collin Winter6afaeb72007-08-03 17:06:41 +00001203 print(n, "collisions")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001204 self.collisions = n
1205
1206 for i in range(len(table)):
1207 if table[i] is None:
1208 table[i] = 0
1209
1210 self.data = Array(name + "_hash", table)
1211 self.magic = magic
1212 self.name = name
1213 self.size = size
1214 self.poly = poly
1215
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001216 def dump(self, file, trace):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001217 # write data to file, as a C array
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001218 self.data.dump(file, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001219 file.write("#define %s_magic %d\n" % (self.name, self.magic))
1220 file.write("#define %s_size %d\n" % (self.name, self.size))
1221 file.write("#define %s_poly %d\n" % (self.name, self.poly))
1222
Stefan Behnelfaa29482019-06-01 21:49:03 +02001223
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001224# stuff to deal with arrays of unsigned integers
1225
class Array:
    """A named sequence of unsigned integers, writable as a C array
    declaration via dump()."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as a static const C array, choosing
        the narrowest element type that holds every value and wrapping
        lines at 78 columns.  With trace set, report the byte size on
        stderr."""
        width = getsize(self.data)
        if trace:
            print(self.name+":", width*len(self.data), "bytes", file=sys.stderr)
        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static const " + ctype + " " + self.name + "[] = {\n")
        if self.data:
            line = " "
            for value in self.data:
                token = str(value) + ", "
                if len(line) + len(token) > 78:
                    file.write(line.rstrip() + "\n")
                    line = " " + token
                else:
                    line += token
            if line.strip():
                file.write(line.rstrip() + "\n")
        file.write("};\n\n")
1257
Stefan Behnelfaa29482019-06-01 21:49:03 +02001258
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001259def getsize(data):
1260 # return smallest possible integer size for the given array
1261 maxdata = max(data)
1262 if maxdata < 256:
1263 return 1
1264 elif maxdata < 65536:
1265 return 2
1266 else:
1267 return 4
1268
Stefan Behnelfaa29482019-06-01 21:49:03 +02001269
Tim Peters21013482000-09-25 07:13:41 +00001270def splitbins(t, trace=0):
1271 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1272
1273 t is a sequence of ints. This function can be useful to save space if
1274 many of the ints are the same. t1 and t2 are lists of ints, and shift
1275 is an int, chosen to minimize the combined size of t1 and t2 (in C
1276 code), and where for each i in range(len(t)),
1277 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1278 where mask is a bitmask isolating the last "shift" bits.
1279
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001280 If optional arg trace is non-zero (default zero), progress info
1281 is printed to sys.stderr. The higher the value, the more info
1282 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001283 """
1284
Tim Peters21013482000-09-25 07:13:41 +00001285 if trace:
1286 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001287 print("%d+%d bins at shift %d; %d bytes" % (
1288 len(t1), len(t2), shift, bytes), file=sys.stderr)
Stefan Behnelfaa29482019-06-01 21:49:03 +02001289 print("Size of original table:", len(t)*getsize(t), "bytes",
1290 file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001291 n = len(t)-1 # last valid index
1292 maxshift = 0 # the most we can shift n and still have something left
1293 if n > 0:
1294 while n >> 1:
1295 n >>= 1
1296 maxshift += 1
1297 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001298 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001299 t = tuple(t) # so slices can be dict keys
1300 for shift in range(maxshift + 1):
1301 t1 = []
1302 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001303 size = 2**shift
1304 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001305 for i in range(0, len(t), size):
1306 bin = t[i:i+size]
1307 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001308 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001309 index = len(t2)
1310 bincache[bin] = index
1311 t2.extend(bin)
1312 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001313 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001314 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001315 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001316 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001317 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001318 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001319 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001320 t1, t2, shift = best
1321 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001322 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001323 dump(t1, t2, shift, bytes)
1324 if __debug__:
1325 # exhaustively verify that the decomposition is correct
1326 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001327 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001328 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1329 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001330
Stefan Behnelfaa29482019-06-01 21:49:03 +02001331
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001332if __name__ == "__main__":
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001333 maketables(1)