blob: 5b9427acd3900342413cb9b245157221b9a1de4e [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl    created (based on bits and pieces from unidb)
# 2000-09-25 fl    merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl    added character type table
# 2000-09-26 fl    added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl    expand first/last ranges
# 2001-01-19 fl    added character name tables (2.1)
# 2001-01-21 fl    added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd    use string methods
# 2002-10-18 mvl   update to Unicode 3.2
# 2002-10-22 mvl   generate NFC tables
# 2002-11-24 mvl   expand all ranges, sort names version-independently
# 2002-11-25 mvl   add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl   update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb    add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio  add support for name aliases and named sequences
# 2012-01 benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
28
import os
import sys
import zipfile

from textwrap import dedent
from functools import partial

# Name of this script and generator version, embedded in the generated headers.
SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst, and
#   * Doc/library/unicodedata.rst
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "12.1.0"

# Filename templates for the UCD input files; "%s" receives "" for the
# current version or "-<version>" for an old UCD snapshot.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Older UCD versions for which delta tables are generated.
old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FEF'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
]
109
Stefan Behnelfaa29482019-06-01 21:49:03 +0200110
def maketables(trace=0):
    """Regenerate all three Unicode database headers.

    Reads the current UCD (UNIDATA_VERSION) plus every snapshot listed in
    ``old_versions`` (used to build version-delta tables), then writes
    Modules/unicodename_db.h, Modules/unicodedata_db.h and
    Objects/unicodetype_db.h.  *trace* is passed through to the table
    generators for splitbins diagnostics.
    """

    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        # cjk_check=False: old snapshots predate the current CJK ranges.
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000128
Stefan Behnelfaa29482019-06-01 21:49:03 +0200129
# --------------------------------------------------------------------
# unicode character properties

133def makeunicodedata(unicode, trace):
134
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000135 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000136 table = [dummy]
137 cache = {0: dummy}
138 index = [0] * len(unicode.chars)
139
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000140 FILE = "Modules/unicodedata_db.h"
141
Collin Winter6afaeb72007-08-03 17:06:41 +0000142 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000143
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000144 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000145
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000146 for char in unicode.chars:
147 record = unicode.table[char]
148 if record:
149 # extract database properties
150 category = CATEGORY_NAMES.index(record[2])
151 combining = int(record[3])
152 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
153 mirrored = record[9] == "Y"
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000154 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000155 normalizationquickcheck = record[17]
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000156 item = (
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000157 category, combining, bidirectional, mirrored, eastasianwidth,
158 normalizationquickcheck
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000159 )
160 # add entry to index and item tables
161 i = cache.get(item)
162 if i is None:
163 cache[item] = i = len(table)
164 table.append(item)
165 index[char] = i
166
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000167 # 2) decomposition data
168
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000169 decomp_data = [0]
170 decomp_prefix = [""]
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000171 decomp_index = [0] * len(unicode.chars)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000172 decomp_size = 0
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000173
Martin v. Löwis677bde22002-11-23 22:08:15 +0000174 comp_pairs = []
175 comp_first = [None] * len(unicode.chars)
176 comp_last = [None] * len(unicode.chars)
177
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000178 for char in unicode.chars:
179 record = unicode.table[char]
180 if record:
181 if record[5]:
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000182 decomp = record[5].split()
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000183 if len(decomp) > 19:
Collin Wintera817e582007-08-22 23:05:06 +0000184 raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000185 # prefix
186 if decomp[0][0] == "<":
187 prefix = decomp.pop(0)
188 else:
189 prefix = ""
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000190 try:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000191 i = decomp_prefix.index(prefix)
192 except ValueError:
193 i = len(decomp_prefix)
194 decomp_prefix.append(prefix)
195 prefix = i
196 assert prefix < 256
197 # content
Georg Brandlbf82e372008-05-16 17:02:34 +0000198 decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
Martin v. Löwis677bde22002-11-23 22:08:15 +0000199 # Collect NFC pairs
200 if not prefix and len(decomp) == 3 and \
201 char not in unicode.exclusions and \
202 unicode.table[decomp[1]][3] == "0":
203 p, l, r = decomp
204 comp_first[l] = 1
205 comp_last[r] = 1
206 comp_pairs.append((l,r,char))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000207 try:
208 i = decomp_data.index(decomp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000209 except ValueError:
210 i = len(decomp_data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000211 decomp_data.extend(decomp)
212 decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000213 else:
214 i = 0
215 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000216
Martin v. Löwis677bde22002-11-23 22:08:15 +0000217 f = l = 0
218 comp_first_ranges = []
219 comp_last_ranges = []
220 prev_f = prev_l = None
221 for i in unicode.chars:
222 if comp_first[i] is not None:
223 comp_first[i] = f
224 f += 1
225 if prev_f is None:
226 prev_f = (i,i)
227 elif prev_f[1]+1 == i:
228 prev_f = prev_f[0],i
229 else:
230 comp_first_ranges.append(prev_f)
231 prev_f = (i,i)
232 if comp_last[i] is not None:
233 comp_last[i] = l
234 l += 1
235 if prev_l is None:
236 prev_l = (i,i)
237 elif prev_l[1]+1 == i:
238 prev_l = prev_l[0],i
239 else:
240 comp_last_ranges.append(prev_l)
241 prev_l = (i,i)
242 comp_first_ranges.append(prev_f)
243 comp_last_ranges.append(prev_l)
244 total_first = f
245 total_last = l
246
247 comp_data = [0]*(total_first*total_last)
248 for f,l,char in comp_pairs:
249 f = comp_first[f]
250 l = comp_last[l]
251 comp_data[f*total_last+l] = char
252
Collin Winter6afaeb72007-08-03 17:06:41 +0000253 print(len(table), "unique properties")
254 print(len(decomp_prefix), "unique decomposition prefixes")
255 print(len(decomp_data), "unique decomposition entries:", end=' ')
256 print(decomp_size, "bytes")
257 print(total_first, "first characters in NFC")
258 print(total_last, "last characters in NFC")
259 print(len(comp_pairs), "NFC pairs")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000260
Collin Winter6afaeb72007-08-03 17:06:41 +0000261 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000262
Stefan Behnelfaa29482019-06-01 21:49:03 +0200263 with open(FILE, "w") as fp:
264 fprint = partial(print, file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000265
Stefan Behnelfaa29482019-06-01 21:49:03 +0200266 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
267 fprint()
268 fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
269 fprint("/* a list of unique database records */")
270 fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
271 for item in table:
272 fprint(" {%d, %d, %d, %d, %d, %d}," % item)
273 fprint("};")
274 fprint()
Martin v. Löwis677bde22002-11-23 22:08:15 +0000275
Stefan Behnelfaa29482019-06-01 21:49:03 +0200276 fprint("/* Reindexing of NFC first characters. */")
277 fprint("#define TOTAL_FIRST",total_first)
278 fprint("#define TOTAL_LAST",total_last)
279 fprint("struct reindex{int start;short count,index;};")
280 fprint("static struct reindex nfc_first[] = {")
281 for start,end in comp_first_ranges:
282 fprint(" { %d, %d, %d}," % (start,end-start,comp_first[start]))
283 fprint(" {0,0,0}")
284 fprint("};\n")
285 fprint("static struct reindex nfc_last[] = {")
286 for start,end in comp_last_ranges:
287 fprint(" { %d, %d, %d}," % (start,end-start,comp_last[start]))
288 fprint(" {0,0,0}")
289 fprint("};\n")
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000290
Stefan Behnelfaa29482019-06-01 21:49:03 +0200291 # FIXME: <fl> the following tables could be made static, and
292 # the support code moved into unicodedatabase.c
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000293
Stefan Behnelfaa29482019-06-01 21:49:03 +0200294 fprint("/* string literals */")
295 fprint("const char *_PyUnicode_CategoryNames[] = {")
296 for name in CATEGORY_NAMES:
297 fprint(" \"%s\"," % name)
298 fprint(" NULL")
299 fprint("};")
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000300
Stefan Behnelfaa29482019-06-01 21:49:03 +0200301 fprint("const char *_PyUnicode_BidirectionalNames[] = {")
302 for name in BIDIRECTIONAL_NAMES:
303 fprint(" \"%s\"," % name)
304 fprint(" NULL")
305 fprint("};")
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000306
Stefan Behnelfaa29482019-06-01 21:49:03 +0200307 fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
308 for name in EASTASIANWIDTH_NAMES:
309 fprint(" \"%s\"," % name)
310 fprint(" NULL")
311 fprint("};")
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000312
Stefan Behnelfaa29482019-06-01 21:49:03 +0200313 fprint("static const char *decomp_prefix[] = {")
314 for name in decomp_prefix:
315 fprint(" \"%s\"," % name)
316 fprint(" NULL")
317 fprint("};")
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000318
Stefan Behnelfaa29482019-06-01 21:49:03 +0200319 # split record index table
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000320 index1, index2, shift = splitbins(index, trace)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000321
Stefan Behnelfaa29482019-06-01 21:49:03 +0200322 fprint("/* index tables for the database records */")
323 fprint("#define SHIFT", shift)
324 Array("index1", index1).dump(fp, trace)
325 Array("index2", index2).dump(fp, trace)
326
327 # split decomposition index table
328 index1, index2, shift = splitbins(decomp_index, trace)
329
330 fprint("/* decomposition data */")
331 Array("decomp_data", decomp_data).dump(fp, trace)
332
333 fprint("/* index tables for the decomposition data */")
334 fprint("#define DECOMP_SHIFT", shift)
335 Array("decomp_index1", index1).dump(fp, trace)
336 Array("decomp_index2", index2).dump(fp, trace)
337
338 index, index2, shift = splitbins(comp_data, trace)
339 fprint("/* NFC pairs */")
340 fprint("#define COMP_SHIFT", shift)
341 Array("comp_index", index).dump(fp, trace)
342 Array("comp_data", index2).dump(fp, trace)
343
344 # Generate delta tables for old versions
345 for version, table, normalization in unicode.changed:
346 cversion = version.replace(".","_")
347 records = [table[0]]
348 cache = {table[0]:0}
349 index = [0] * len(table)
350 for i, record in enumerate(table):
351 try:
352 index[i] = cache[record]
353 except KeyError:
354 index[i] = cache[record] = len(records)
355 records.append(record)
356 index1, index2, shift = splitbins(index, trace)
357 fprint("static const change_record change_records_%s[] = {" % cversion)
358 for record in records:
359 fprint(" { %s }," % ", ".join(map(str,record)))
360 fprint("};")
361 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
362 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
363 fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
364 fprint("{")
365 fprint(" int index;")
366 fprint(" if (n >= 0x110000) index = 0;")
367 fprint(" else {")
368 fprint(" index = changes_%s_index[n>>%d];" % (cversion, shift))
369 fprint(" index = changes_%s_data[(index<<%d)+(n & %d)];" % \
370 (cversion, shift, ((1<<shift)-1)))
371 fprint(" }")
372 fprint(" return change_records_%s+index;" % cversion)
373 fprint("}\n")
374 fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
375 fprint("{")
376 fprint(" switch(n) {")
377 for k, v in normalization:
378 fprint(" case %s: return 0x%s;" % (hex(k), v))
379 fprint(" default: return 0;")
380 fprint(" }\n}\n")
381
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000382
# --------------------------------------------------------------------
# unicode character type tables

386def makeunicodetype(unicode, trace):
387
388 FILE = "Objects/unicodetype_db.h"
389
Collin Winter6afaeb72007-08-03 17:06:41 +0000390 print("--- Preparing", FILE, "...")
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000391
392 # extract unicode types
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000393 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000394 table = [dummy]
395 cache = {0: dummy}
396 index = [0] * len(unicode.chars)
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000397 numeric = {}
398 spaces = []
399 linebreaks = []
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500400 extra_casing = []
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000401
402 for char in unicode.chars:
403 record = unicode.table[char]
404 if record:
405 # extract database properties
406 category = record[2]
407 bidirectional = record[4]
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000408 properties = record[16]
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000409 flags = 0
410 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
411 flags |= ALPHA_MASK
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500412 if "Lowercase" in properties:
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000413 flags |= LOWER_MASK
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000414 if 'Line_Break' in properties or bidirectional == "B":
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000415 flags |= LINEBREAK_MASK
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000416 linebreaks.append(char)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000417 if category == "Zs" or bidirectional in ("WS", "B", "S"):
418 flags |= SPACE_MASK
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000419 spaces.append(char)
Fredrik Lundh375732c2000-09-25 23:03:34 +0000420 if category == "Lt":
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000421 flags |= TITLE_MASK
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500422 if "Uppercase" in properties:
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000423 flags |= UPPER_MASK
Benjamin Peterson09832742009-03-26 17:15:46 +0000424 if char == ord(" ") or category[0] not in ("C", "Z"):
Georg Brandld52429f2008-07-04 15:55:02 +0000425 flags |= PRINTABLE_MASK
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000426 if "XID_Start" in properties:
427 flags |= XID_START_MASK
428 if "XID_Continue" in properties:
429 flags |= XID_CONTINUE_MASK
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500430 if "Cased" in properties:
431 flags |= CASED_MASK
432 if "Case_Ignorable" in properties:
433 flags |= CASE_IGNORABLE_MASK
434 sc = unicode.special_casing.get(char)
Benjamin Petersond5890c82012-01-14 13:23:30 -0500435 cf = unicode.case_folding.get(char, [char])
436 if record[12]:
437 upper = int(record[12], 16)
438 else:
439 upper = char
440 if record[13]:
441 lower = int(record[13], 16)
442 else:
443 lower = char
444 if record[14]:
445 title = int(record[14], 16)
446 else:
447 title = upper
448 if sc is None and cf != [lower]:
449 sc = ([lower], [title], [upper])
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500450 if sc is None:
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500451 if upper == lower == title:
452 upper = lower = title = 0
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500453 else:
454 upper = upper - char
455 lower = lower - char
456 title = title - char
457 assert (abs(upper) <= 2147483647 and
458 abs(lower) <= 2147483647 and
459 abs(title) <= 2147483647)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000460 else:
Benjamin Petersond5890c82012-01-14 13:23:30 -0500461 # This happens either when some character maps to more than one
462 # character in uppercase, lowercase, or titlecase or the
463 # casefolded version of the character is different from the
464 # lowercase. The extra characters are stored in a different
465 # array.
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500466 flags |= EXTENDED_CASE_MASK
467 lower = len(extra_casing) | (len(sc[0]) << 24)
468 extra_casing.extend(sc[0])
Benjamin Petersond5890c82012-01-14 13:23:30 -0500469 if cf != sc[0]:
470 lower |= len(cf) << 20
471 extra_casing.extend(cf)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500472 upper = len(extra_casing) | (len(sc[2]) << 24)
473 extra_casing.extend(sc[2])
474 # Title is probably equal to upper.
475 if sc[1] == sc[2]:
476 title = upper
477 else:
478 title = len(extra_casing) | (len(sc[1]) << 24)
479 extra_casing.extend(sc[1])
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000480 # decimal digit, integer digit
481 decimal = 0
482 if record[6]:
483 flags |= DECIMAL_MASK
484 decimal = int(record[6])
485 digit = 0
486 if record[7]:
487 flags |= DIGIT_MASK
488 digit = int(record[7])
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000489 if record[8]:
490 flags |= NUMERIC_MASK
491 numeric.setdefault(record[8], []).append(char)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000492 item = (
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000493 upper, lower, title, decimal, digit, flags
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000494 )
495 # add entry to index and item tables
496 i = cache.get(item)
497 if i is None:
498 cache[item] = i = len(table)
499 table.append(item)
500 index[char] = i
501
Collin Winter6afaeb72007-08-03 17:06:41 +0000502 print(len(table), "unique character type entries")
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000503 print(sum(map(len, numeric.values())), "numeric code points")
504 print(len(spaces), "whitespace code points")
505 print(len(linebreaks), "linebreak code points")
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500506 print(len(extra_casing), "extended case array")
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000507
Collin Winter6afaeb72007-08-03 17:06:41 +0000508 print("--- Writing", FILE, "...")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000509
Stefan Behnelfaa29482019-06-01 21:49:03 +0200510 with open(FILE, "w") as fp:
511 fprint = partial(print, file=fp)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000512
Stefan Behnelfaa29482019-06-01 21:49:03 +0200513 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
514 fprint()
515 fprint("/* a list of unique character type descriptors */")
516 fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
517 for item in table:
518 fprint(" {%d, %d, %d, %d, %d, %d}," % item)
519 fprint("};")
520 fprint()
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500521
Stefan Behnelfaa29482019-06-01 21:49:03 +0200522 fprint("/* extended case mappings */")
523 fprint()
524 fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
525 for c in extra_casing:
526 fprint(" %d," % c)
527 fprint("};")
528 fprint()
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000529
Stefan Behnelfaa29482019-06-01 21:49:03 +0200530 # split decomposition index table
531 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000532
Stefan Behnelfaa29482019-06-01 21:49:03 +0200533 fprint("/* type indexes */")
534 fprint("#define SHIFT", shift)
535 Array("index1", index1).dump(fp, trace)
536 Array("index2", index2).dump(fp, trace)
Amaury Forgeot d'Arc919765a2009-10-13 23:18:53 +0000537
Stefan Behnelfaa29482019-06-01 21:49:03 +0200538 # Generate code for _PyUnicode_ToNumeric()
539 numeric_items = sorted(numeric.items())
540 fprint('/* Returns the numeric value as double for Unicode characters')
541 fprint(' * having this property, -1.0 otherwise.')
542 fprint(' */')
543 fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
544 fprint('{')
545 fprint(' switch (ch) {')
546 for value, codepoints in numeric_items:
547 # Turn text into float literals
548 parts = value.split('/')
549 parts = [repr(float(part)) for part in parts]
550 value = '/'.join(parts)
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000551
Stefan Behnelfaa29482019-06-01 21:49:03 +0200552 codepoints.sort()
553 for codepoint in codepoints:
554 fprint(' case 0x%04X:' % (codepoint,))
555 fprint(' return (double) %s;' % (value,))
556 fprint(' }')
557 fprint(' return -1.0;')
558 fprint('}')
559 fprint()
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000560
Stefan Behnelfaa29482019-06-01 21:49:03 +0200561 # Generate code for _PyUnicode_IsWhitespace()
562 fprint("/* Returns 1 for Unicode characters having the bidirectional")
563 fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
564 fprint(" */")
565 fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
566 fprint('{')
567 fprint(' switch (ch) {')
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000568
Stefan Behnelfaa29482019-06-01 21:49:03 +0200569 for codepoint in sorted(spaces):
570 fprint(' case 0x%04X:' % (codepoint,))
571 fprint(' return 1;')
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000572
Stefan Behnelfaa29482019-06-01 21:49:03 +0200573 fprint(' }')
574 fprint(' return 0;')
575 fprint('}')
576 fprint()
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000577
Stefan Behnelfaa29482019-06-01 21:49:03 +0200578 # Generate code for _PyUnicode_IsLinebreak()
579 fprint("/* Returns 1 for Unicode characters having the line break")
580 fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
581 fprint(" * type 'B', 0 otherwise.")
582 fprint(" */")
583 fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
584 fprint('{')
585 fprint(' switch (ch) {')
586 for codepoint in sorted(linebreaks):
587 fprint(' case 0x%04X:' % (codepoint,))
588 fprint(' return 1;')
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000589
Stefan Behnelfaa29482019-06-01 21:49:03 +0200590 fprint(' }')
591 fprint(' return 0;')
592 fprint('}')
593 fprint()
594
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000595
# --------------------------------------------------------------------
# unicode name database

599def makeunicodename(unicode, trace):
600
601 FILE = "Modules/unicodename_db.h"
602
Collin Winter6afaeb72007-08-03 17:06:41 +0000603 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000604
605 # collect names
606 names = [None] * len(unicode.chars)
607
608 for char in unicode.chars:
609 record = unicode.table[char]
610 if record:
611 name = record[1].strip()
612 if name and name[0] != "<":
613 names[char] = name + chr(0)
614
Jon Dufresne39726282017-05-18 07:35:54 -0700615 print(len([n for n in names if n is not None]), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000616
617 # collect unique words from names (note that we differ between
618 # words inside a sentence, and words ending a sentence. the
619 # latter includes the trailing null byte.
620
621 words = {}
622 n = b = 0
623 for char in unicode.chars:
624 name = names[char]
625 if name:
626 w = name.split()
627 b = b + len(name)
628 n = n + len(w)
629 for w in w:
630 l = words.get(w)
631 if l:
632 l.append(None)
633 else:
634 words[w] = [len(words)]
635
Collin Winter6afaeb72007-08-03 17:06:41 +0000636 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000637
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000638 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000639
Martin v. Löwis97225da2002-11-24 23:05:09 +0000640 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000641 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000642 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000643 return -len(alist), aword
644 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000645
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000646 # figure out how many phrasebook escapes we need
647 escapes = 0
648 while escapes * 256 < len(wordlist):
649 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000650 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000651
652 short = 256 - escapes
653
654 assert short > 0
655
Collin Winter6afaeb72007-08-03 17:06:41 +0000656 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000657
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000658 # statistics
659 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000660 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000661 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000662 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000663
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000664 # pick the most commonly used words, and sort the rest on falling
665 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000666
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000667 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000668 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000669 wordlist.extend(wordtail)
670
671 # generate lexicon from words
672
673 lexicon_offset = [0]
674 lexicon = ""
675 words = {}
676
677 # build a lexicon string
678 offset = 0
679 for w, x in wordlist:
680 # encoding: bit 7 indicates last character in word (chr(128)
681 # indicates the last character in an entire string)
682 ww = w[:-1] + chr(ord(w[-1])+128)
683 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000684 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000685 if o < 0:
686 o = offset
687 lexicon = lexicon + ww
688 offset = offset + len(w)
689 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000690 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000691
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000692 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000693
694 # generate phrasebook from names and lexicon
695 phrasebook = [0]
696 phrasebook_offset = [0] * len(unicode.chars)
697 for char in unicode.chars:
698 name = names[char]
699 if name:
700 w = name.split()
701 phrasebook_offset[char] = len(phrasebook)
702 for w in w:
703 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000704 if i < short:
705 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000706 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000707 # store as two bytes
708 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000709 phrasebook.append(i&255)
710
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000711 assert getsize(phrasebook) == 1
712
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000713 #
714 # unicode name hash table
715
716 # extract names
717 data = []
718 for char in unicode.chars:
719 record = unicode.table[char]
720 if record:
721 name = record[1].strip()
722 if name and name[0] != "<":
723 data.append((name, char))
724
725 # the magic number 47 was chosen to minimize the number of
726 # collisions on the current data set. if you like, change it
727 # and see what happens...
728
729 codehash = Hash("code", data, 47)
730
Collin Winter6afaeb72007-08-03 17:06:41 +0000731 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000732
Stefan Behnelfaa29482019-06-01 21:49:03 +0200733 with open(FILE, "w") as fp:
734 fprint = partial(print, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000735
Stefan Behnelfaa29482019-06-01 21:49:03 +0200736 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
737 fprint()
738 fprint("#define NAME_MAXLEN", 256)
739 fprint()
740 fprint("/* lexicon */")
741 Array("lexicon", lexicon).dump(fp, trace)
742 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000743
Stefan Behnelfaa29482019-06-01 21:49:03 +0200744 # split decomposition index table
745 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000746
Stefan Behnelfaa29482019-06-01 21:49:03 +0200747 fprint("/* code->name phrasebook */")
748 fprint("#define phrasebook_shift", shift)
749 fprint("#define phrasebook_short", short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000750
Stefan Behnelfaa29482019-06-01 21:49:03 +0200751 Array("phrasebook", phrasebook).dump(fp, trace)
752 Array("phrasebook_offset1", offset1).dump(fp, trace)
753 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000754
Stefan Behnelfaa29482019-06-01 21:49:03 +0200755 fprint("/* name->code dictionary */")
756 codehash.dump(fp, trace)
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300757
Stefan Behnelfaa29482019-06-01 21:49:03 +0200758 fprint()
759 fprint('static const unsigned int aliases_start = %#x;' %
760 NAME_ALIASES_START)
761 fprint('static const unsigned int aliases_end = %#x;' %
762 (NAME_ALIASES_START + len(unicode.aliases)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300763
Stefan Behnelfaa29482019-06-01 21:49:03 +0200764 fprint('static const unsigned int name_aliases[] = {')
765 for name, codepoint in unicode.aliases:
766 fprint(' 0x%04X,' % codepoint)
767 fprint('};')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300768
Stefan Behnelfaa29482019-06-01 21:49:03 +0200769 # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
770 # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
771 # sequences or sequences with non-BMP chars are added.
772 # unicodedata_lookup should be adapted too.
773 fprint(dedent("""
774 typedef struct NamedSequence {
775 int seqlen;
776 Py_UCS2 seq[4];
777 } named_sequence;
778 """))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300779
Stefan Behnelfaa29482019-06-01 21:49:03 +0200780 fprint('static const unsigned int named_sequences_start = %#x;' %
781 NAMED_SEQUENCES_START)
782 fprint('static const unsigned int named_sequences_end = %#x;' %
783 (NAMED_SEQUENCES_START + len(unicode.named_sequences)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300784
Stefan Behnelfaa29482019-06-01 21:49:03 +0200785 fprint('static const named_sequence named_sequences[] = {')
786 for name, sequence in unicode.named_sequences:
787 seq_str = ', '.join('0x%04X' % cp for cp in sequence)
788 fprint(' {%d, {%s}},' % (len(sequence), seq_str))
789 fprint('};')
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000790
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000791
def merge_old_version(version, new, old):
    """Compute the property deltas between an old UCD and the current one.

    Compares the per-codepoint records of *old* against *new* and appends
    a change record of the form
        (version, [(bidir, category, decimal, mirrored, east_asian_width,
                    numeric), ...], normalization_changes)
    to new.changed, for later emission of the version-delta tables.
    Only the field changes handled below are supported; any other
    difference raises an exception.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ; k indexes the record fields
        # (see the record-structure comment on class UnicodeData)
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        # general category
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        # bidirectional class
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # decomposition
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # decimal digit value
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # numeric value
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag; NOTE: stored as the strings
                        # '0'/'1', unlike the int sentinels above
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                       normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000887
Stefan Behnelfaa29482019-06-01 21:49:03 +0200888
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000889def open_data(template, version):
890 local = template % ('-'+version,)
891 if not os.path.exists(local):
892 import urllib.request
893 if version == '3.2.0':
894 # irregular url structure
895 url = 'http://www.unicode.org/Public/3.2-Update/' + local
896 else:
897 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
898 urllib.request.urlretrieve(url, filename=local)
899 if local.endswith('.txt'):
900 return open(local, encoding='utf-8')
901 else:
902 # Unihan.zip
903 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000904
Stefan Behnelfaa29482019-06-01 21:49:03 +0200905
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000906# --------------------------------------------------------------------
907# the following support code is taken from the unidb utilities
908# Copyright (c) 1999-2000 by Secret Labs AB
909
910# load a unicode-data file from disk
911
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000912class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000913 # Record structure:
914 # [ID, name, category, combining, bidi, decomp, (6)
915 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
916 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
917 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000918
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000919 def __init__(self, version,
920 linebreakprops=False,
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000921 expand=1,
922 cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000923 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000924 table = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300925 with open_data(UNICODE_DATA, version) as file:
926 while 1:
927 s = file.readline()
928 if not s:
929 break
930 s = s.strip().split(";")
931 char = int(s[0], 16)
932 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000933
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000934 cjk_ranges_found = []
935
Martin v. Löwis97225da2002-11-24 23:05:09 +0000936 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000937 if expand:
938 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000939 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000940 s = table[i]
941 if s:
942 if s[1][-6:] == "First>":
943 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000944 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000945 elif s[1][-5:] == "Last>":
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000946 if s[1].startswith("<CJK Ideograph"):
947 cjk_ranges_found.append((field[0],
948 s[0]))
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000949 s[1] = ""
950 field = None
951 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000952 f2 = field[:]
953 f2[0] = "%X" % i
954 table[i] = f2
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000955 if cjk_check and cjk_ranges != cjk_ranges_found:
956 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000957
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000958 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000959 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000960 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000961 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000962
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300963 # check for name aliases and named sequences, see #12753
964 # aliases and named sequences are not in 3.2.0
965 if version != '3.2.0':
966 self.aliases = []
967 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
968 # in order to take advantage of the compression and lookup
969 # algorithms used for the other characters
970 pua_index = NAME_ALIASES_START
971 with open_data(NAME_ALIASES, version) as file:
972 for s in file:
973 s = s.strip()
974 if not s or s.startswith('#'):
975 continue
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500976 char, name, abbrev = s.split(';')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300977 char = int(char, 16)
978 self.aliases.append((name, char))
979 # also store the name in the PUA 1
980 self.table[pua_index][1] = name
981 pua_index += 1
982 assert pua_index - NAME_ALIASES_START == len(self.aliases)
983
984 self.named_sequences = []
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300985 # store named sequences in the PUA 1, in range U+F0100..,
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300986 # in order to take advantage of the compression and lookup
987 # algorithms used for the other characters.
988
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500989 assert pua_index < NAMED_SEQUENCES_START
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300990 pua_index = NAMED_SEQUENCES_START
991 with open_data(NAMED_SEQUENCES, version) as file:
992 for s in file:
993 s = s.strip()
994 if not s or s.startswith('#'):
995 continue
996 name, chars = s.split(';')
997 chars = tuple(int(char, 16) for char in chars.split())
998 # check that the structure defined in makeunicodename is OK
999 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
1000 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
1001 "the NamedSequence struct and in unicodedata_lookup")
1002 self.named_sequences.append((name, chars))
1003 # also store these in the PUA 1
1004 self.table[pua_index][1] = name
1005 pua_index += 1
1006 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
1007
Martin v. Löwis677bde22002-11-23 22:08:15 +00001008 self.exclusions = {}
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001009 with open_data(COMPOSITION_EXCLUSIONS, version) as file:
1010 for s in file:
1011 s = s.strip()
1012 if not s:
1013 continue
1014 if s[0] == '#':
1015 continue
1016 char = int(s.split()[0],16)
1017 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +00001018
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001019 widths = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001020 with open_data(EASTASIAN_WIDTH, version) as file:
1021 for s in file:
1022 s = s.strip()
1023 if not s:
1024 continue
1025 if s[0] == '#':
1026 continue
1027 s = s.split()[0].split(';')
1028 if '..' in s[0]:
1029 first, last = [int(c, 16) for c in s[0].split('..')]
1030 chars = list(range(first, last+1))
1031 else:
1032 chars = [int(s[0], 16)]
1033 for char in chars:
1034 widths[char] = s[1]
1035
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001036 for i in range(0, 0x110000):
1037 if table[i] is not None:
1038 table[i].append(widths[i])
1039
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001040 for i in range(0, 0x110000):
1041 if table[i] is not None:
1042 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001043
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001044 with open_data(DERIVED_CORE_PROPERTIES, version) as file:
1045 for s in file:
1046 s = s.split('#', 1)[0].strip()
1047 if not s:
1048 continue
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001049
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001050 r, p = s.split(";")
1051 r = r.strip()
1052 p = p.strip()
1053 if ".." in r:
1054 first, last = [int(c, 16) for c in r.split('..')]
1055 chars = list(range(first, last+1))
1056 else:
1057 chars = [int(r, 16)]
1058 for char in chars:
1059 if table[char]:
1060 # Some properties (e.g. Default_Ignorable_Code_Point)
1061 # apply to unassigned code points; ignore them
1062 table[char][-1].add(p)
1063
1064 with open_data(LINE_BREAK, version) as file:
1065 for s in file:
1066 s = s.partition('#')[0]
1067 s = [i.strip() for i in s.split(';')]
1068 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1069 continue
1070 if '..' not in s[0]:
1071 first = last = int(s[0], 16)
1072 else:
1073 first, last = [int(c, 16) for c in s[0].split('..')]
1074 for char in range(first, last+1):
1075 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001076
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001077 # We only want the quickcheck properties
1078 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1079 # Yes is the default, hence only N and M occur
1080 # In 3.2.0, the format was different (NF?_NO)
1081 # The parsing will incorrectly determine these as
1082 # "yes", however, unicodedata.c will not perform quickchecks
1083 # for older versions, and no delta records will be created.
1084 quickchecks = [0] * 0x110000
1085 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001086 with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
1087 for s in file:
1088 if '#' in s:
1089 s = s[:s.index('#')]
1090 s = [i.strip() for i in s.split(';')]
1091 if len(s) < 2 or s[1] not in qc_order:
1092 continue
1093 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1094 quickcheck_shift = qc_order.index(s[1])*2
1095 quickcheck <<= quickcheck_shift
1096 if '..' not in s[0]:
1097 first = last = int(s[0], 16)
1098 else:
1099 first, last = [int(c, 16) for c in s[0].split('..')]
1100 for char in range(first, last+1):
1101 assert not (quickchecks[char]>>quickcheck_shift)&3
1102 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001103 for i in range(0, 0x110000):
1104 if table[i] is not None:
1105 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001106
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001107 with open_data(UNIHAN, version) as file:
1108 zip = zipfile.ZipFile(file)
1109 if version == '3.2.0':
1110 data = zip.open('Unihan-3.2.0.txt').read()
1111 else:
1112 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001113 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001114 if not line.startswith('U+'):
1115 continue
1116 code, tag, value = line.split(None, 3)[:3]
1117 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1118 'kOtherNumeric'):
1119 continue
1120 value = value.strip().replace(',', '')
1121 i = int(code[2:], 16)
1122 # Patch the numeric field
1123 if table[i] is not None:
1124 table[i][8] = value
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05001125 sc = self.special_casing = {}
1126 with open_data(SPECIAL_CASING, version) as file:
1127 for s in file:
1128 s = s[:-1].split('#', 1)[0]
1129 if not s:
1130 continue
1131 data = s.split("; ")
1132 if data[4]:
1133 # We ignore all conditionals (since they depend on
1134 # languages) except for one, which is hardcoded. See
1135 # handle_capital_sigma in unicodeobject.c.
1136 continue
1137 c = int(data[0], 16)
1138 lower = [int(char, 16) for char in data[1].split()]
1139 title = [int(char, 16) for char in data[2].split()]
1140 upper = [int(char, 16) for char in data[3].split()]
1141 sc[c] = (lower, title, upper)
Benjamin Petersond5890c82012-01-14 13:23:30 -05001142 cf = self.case_folding = {}
1143 if version != '3.2.0':
1144 with open_data(CASE_FOLDING, version) as file:
1145 for s in file:
1146 s = s[:-1].split('#', 1)[0]
1147 if not s:
1148 continue
1149 data = s.split("; ")
1150 if data[1] in "CF":
1151 c = int(data[0], 16)
1152 cf[c] = [int(char, 16) for char in data[2].split()]
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001153
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001154 def uselatin1(self):
1155 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001156 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001157
Stefan Behnelfaa29482019-06-01 21:49:03 +02001158
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001159# hash table tools
1160
1161# this is a straight-forward reimplementation of Python's built-in
1162# dictionary type, using a static data structure, and a custom string
1163# hash algorithm.
1164
1165def myhash(s, magic):
1166 h = 0
Walter Dörwaldaaab30e2002-09-11 20:36:02 +00001167 for c in map(ord, s.upper()):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001168 h = (h * magic) + c
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001169 ix = h & 0xff000000
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001170 if ix:
1171 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
1172 return h
1173
Stefan Behnelfaa29482019-06-01 21:49:03 +02001174
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001175SIZES = [
1176 (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
1177 (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
1178 (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
1179 (2097152,5), (4194304,3), (8388608,33), (16777216,27)
1180]
1181
Stefan Behnelfaa29482019-06-01 21:49:03 +02001182
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001183class Hash:
1184 def __init__(self, name, data, magic):
1185 # turn a (key, value) list into a static hash table structure
1186
1187 # determine table size
1188 for size, poly in SIZES:
1189 if size > len(data):
1190 poly = size + poly
1191 break
1192 else:
Ezio Melotti13925002011-03-16 11:05:33 +02001193 raise AssertionError("ran out of polynomials")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001194
Collin Winter6afaeb72007-08-03 17:06:41 +00001195 print(size, "slots in hash table")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001196
1197 table = [None] * size
1198
1199 mask = size-1
1200
1201 n = 0
1202
1203 hash = myhash
1204
1205 # initialize hash table
1206 for key, value in data:
1207 h = hash(key, magic)
1208 i = (~h) & mask
1209 v = table[i]
1210 if v is None:
1211 table[i] = value
1212 continue
Stefan Behnelfaa29482019-06-01 21:49:03 +02001213 incr = (h ^ (h >> 3)) & mask
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001214 if not incr:
1215 incr = mask
1216 while 1:
1217 n = n + 1
1218 i = (i + incr) & mask
1219 v = table[i]
1220 if v is None:
1221 table[i] = value
1222 break
1223 incr = incr << 1
1224 if incr > mask:
1225 incr = incr ^ poly
1226
Collin Winter6afaeb72007-08-03 17:06:41 +00001227 print(n, "collisions")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001228 self.collisions = n
1229
1230 for i in range(len(table)):
1231 if table[i] is None:
1232 table[i] = 0
1233
1234 self.data = Array(name + "_hash", table)
1235 self.magic = magic
1236 self.name = name
1237 self.size = size
1238 self.poly = poly
1239
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001240 def dump(self, file, trace):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001241 # write data to file, as a C array
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001242 self.data.dump(file, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001243 file.write("#define %s_magic %d\n" % (self.name, self.magic))
1244 file.write("#define %s_size %d\n" % (self.name, self.size))
1245 file.write("#define %s_poly %d\n" % (self.name, self.poly))
1246
Stefan Behnelfaa29482019-06-01 21:49:03 +02001247
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001248# stuff to deal with arrays of unsigned integers
1249
class Array:
    """A named sequence of unsigned integers, emitted as a static C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as a `static const` C declaration.

        The element type is the narrowest unsigned type (char/short/int)
        that fits every value, and output lines wrap at 78 columns.  If
        *trace* is true, the total byte size is reported on stderr.
        """
        width = getsize(self.data)
        if trace:
            print(self.name+":", width*len(self.data), "bytes", file=sys.stderr)
        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static const ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for value in self.data:
                token = str(value) + ", "
                if len(line) + len(token) > 78:
                    # current line is full: flush it and start a new one
                    file.write(line.rstrip() + "\n")
                    line = "    " + token
                else:
                    line = line + token
            if line.strip():
                file.write(line.rstrip() + "\n")
        file.write("};\n\n")
1281
Stefan Behnelfaa29482019-06-01 21:49:03 +02001282
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001283def getsize(data):
1284 # return smallest possible integer size for the given array
1285 maxdata = max(data)
1286 if maxdata < 256:
1287 return 1
1288 elif maxdata < 65536:
1289 return 2
1290 else:
1291 return 4
1292
Stefan Behnelfaa29482019-06-01 21:49:03 +02001293
Tim Peters21013482000-09-25 07:13:41 +00001294def splitbins(t, trace=0):
1295 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1296
1297 t is a sequence of ints. This function can be useful to save space if
1298 many of the ints are the same. t1 and t2 are lists of ints, and shift
1299 is an int, chosen to minimize the combined size of t1 and t2 (in C
1300 code), and where for each i in range(len(t)),
1301 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1302 where mask is a bitmask isolating the last "shift" bits.
1303
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001304 If optional arg trace is non-zero (default zero), progress info
1305 is printed to sys.stderr. The higher the value, the more info
1306 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001307 """
1308
Tim Peters21013482000-09-25 07:13:41 +00001309 if trace:
1310 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001311 print("%d+%d bins at shift %d; %d bytes" % (
1312 len(t1), len(t2), shift, bytes), file=sys.stderr)
Stefan Behnelfaa29482019-06-01 21:49:03 +02001313 print("Size of original table:", len(t)*getsize(t), "bytes",
1314 file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001315 n = len(t)-1 # last valid index
1316 maxshift = 0 # the most we can shift n and still have something left
1317 if n > 0:
1318 while n >> 1:
1319 n >>= 1
1320 maxshift += 1
1321 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001322 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001323 t = tuple(t) # so slices can be dict keys
1324 for shift in range(maxshift + 1):
1325 t1 = []
1326 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001327 size = 2**shift
1328 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001329 for i in range(0, len(t), size):
1330 bin = t[i:i+size]
1331 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001332 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001333 index = len(t2)
1334 bincache[bin] = index
1335 t2.extend(bin)
1336 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001337 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001338 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001339 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001340 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001341 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001342 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001343 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001344 t1, t2, shift = best
1345 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001346 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001347 dump(t1, t2, shift, bytes)
1348 if __debug__:
1349 # exhaustively verify that the decomposition is correct
1350 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001351 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001352 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1353 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001354
Stefan Behnelfaa29482019-06-01 21:49:03 +02001355
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001356if __name__ == "__main__":
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001357 maketables(1)