blob: a8e92be496292decc54e042ed66e8cc197220dbf [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
28
Greg Pricea65678c2019-09-12 02:23:43 -070029import dataclasses
Ezio Melotti931b8aa2011-10-21 21:57:36 +030030import os
31import sys
32import zipfile
33
Stefan Behnelfaa29482019-06-01 21:49:03 +020034from functools import partial
Greg Priceef2af1a2019-08-12 22:20:56 -070035from textwrap import dedent
Greg Pricea65678c2019-09-12 02:23:43 -070036from typing import Iterator, List, Optional, Set, Tuple
Fredrik Lundhf367cac2000-09-24 23:18:31 +000037
# Script identity; both values are embedded in the header comment of every
# generated C file.
SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "12.1.0"

# Filename templates for the UCD input files; the %s placeholder receives ""
# for the current version or "-<version>" for an old version.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use this range of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Older UCD versions for which delta ("change") tables are generated.
old_versions = ["3.2.0"]

# Index position in these lists is the numeric code stored in the generated
# tables; order therefore must not change between releases.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# Per-character bit flags stored in the type record.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# CJK unified-ideograph first/last code points (hex strings).
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FEF'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
]
Stefan Behnelfaa29482019-06-01 21:49:03 +0200112
def maketables(trace=0):
    """Drive the whole generation process.

    Reads the current UCD plus every version listed in *old_versions*,
    merges the old data in as deltas, then writes the three generated
    headers (name, database, and type tables).
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    current = UnicodeData(UNIDATA_VERSION)
    # Count the assigned code points (entries that are not None).
    print(sum(1 for rec in current.table if rec), "characters")

    for old in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + old), "...")
        previous = UnicodeData(old, cjk_check=False)
        print(sum(1 for rec in previous.table if rec), "characters")
        merge_old_version(old, current, previous)

    makeunicodename(current, trace)
    makeunicodedata(current, trace)
    makeunicodetype(current, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000130
Stefan Behnelfaa29482019-06-01 21:49:03 +0200131
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000132# --------------------------------------------------------------------
133# unicode character properties
134
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h.

    Emits: the deduplicated database-record table plus its two-level
    (splitbins) index, the compressed decomposition data and its index,
    the NFC first/last reindex tables and composition-pair matrix, the
    category/bidi/width/prefix string tables, and, for each entry in
    unicode.changed, delta tables and lookup functions for old UCD
    versions.
    """

    # Record 0 is the shared "unassigned" entry; cache maps a record
    # tuple to its index in `table` so identical records are stored once.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record.general_category)
            combining = int(record.canonical_combining_class)
            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
            mirrored = record.bidi_mirrored == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
            normalizationquickcheck = record.quick_check
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    # decomp_data is a flat array: each entry is a header word
    # (prefix index | length<<8) followed by the decomposed code points.
    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # NOTE(review): despite the field name, decomposition_type
            # holds the full UnicodeData.txt decomposition field
            # (optional <tag> plus mapping) — the dataclass fields follow
            # the raw column split.
            if record.decomposition_type:
                decomp = record.decomposition_type.split()
                # Length must fit the nfd_nfkd buffer in unicodedata.c.
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions not excluded from composition, whose first
                # character is a starter (combining class 0).
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]].canonical_combining_class == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber NFC first/last characters densely and record the contiguous
    # code-point ranges they occupy.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense (first x last) matrix holding the composed character for each
    # NFC pair; 0 where no composition exists.
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("  { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("  {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("  { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("  {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000384
385# --------------------------------------------------------------------
386# unicode character type tables
387
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Builds the deduplicated per-character type records
    (upper/lower/title deltas or extended-case offsets, decimal/digit
    values, and the flag bits defined at module level) plus their
    splitbins index, the extended case-mapping array, and the generated
    C functions _PyUnicode_ToNumeric, _PyUnicode_IsWhitespace and
    _PyUnicode_IsLinebreak.
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric-value string -> list of code points
    spaces = []         # code points with the whitespace flag
    linebreaks = []     # code points with the linebreak flag
    extra_casing = []   # flattened multi-character case mappings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record.general_category
            bidirectional = record.bidi_class
            properties = record.binary_properties
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # Space is the only C/Z-category character that is printable.
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record.simple_uppercase_mapping:
                upper = int(record.simple_uppercase_mapping, 16)
            else:
                upper = char
            if record.simple_lowercase_mapping:
                lower = int(record.simple_lowercase_mapping, 16)
            else:
                lower = char
            if record.simple_titlecase_mapping:
                title = int(record.simple_titlecase_mapping, 16)
            else:
                title = upper
            # A case folding different from the simple lowercase mapping
            # forces the character through the extended-case path below.
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    # Store mappings as signed deltas from the character.
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                # Packed value: offset into extra_casing | length << 24
                # (and, for lower, folding length << 20 when it differs).
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            # NOTE(review): despite their names, decomposition_mapping and
            # numeric_type appear to hold the UnicodeData.txt decimal-digit
            # and digit columns here (the dataclass fields follow the raw
            # column split) — confirm against the UcdRecord definition.
            decimal = 0
            if record.decomposition_mapping:
                flags |= DECIMAL_MASK
                decimal = int(record.decomposition_mapping)
            digit = 0
            if record.numeric_type:
                flags |= DIGIT_MASK
                digit = int(record.numeric_type)
            if record.numeric_value:
                flags |= NUMERIC_MASK
                numeric.setdefault(record.numeric_value, []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split decomposition index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()
596
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000597
598# --------------------------------------------------------------------
599# unicode name database
600
601def makeunicodename(unicode, trace):
602
603 FILE = "Modules/unicodename_db.h"
604
Collin Winter6afaeb72007-08-03 17:06:41 +0000605 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000606
607 # collect names
608 names = [None] * len(unicode.chars)
609
610 for char in unicode.chars:
611 record = unicode.table[char]
612 if record:
Greg Pricea65678c2019-09-12 02:23:43 -0700613 name = record.name.strip()
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000614 if name and name[0] != "<":
615 names[char] = name + chr(0)
616
Jon Dufresne39726282017-05-18 07:35:54 -0700617 print(len([n for n in names if n is not None]), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000618
619 # collect unique words from names (note that we differ between
620 # words inside a sentence, and words ending a sentence. the
621 # latter includes the trailing null byte.
622
623 words = {}
624 n = b = 0
625 for char in unicode.chars:
626 name = names[char]
627 if name:
628 w = name.split()
629 b = b + len(name)
630 n = n + len(w)
631 for w in w:
632 l = words.get(w)
633 if l:
634 l.append(None)
635 else:
636 words[w] = [len(words)]
637
Collin Winter6afaeb72007-08-03 17:06:41 +0000638 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000639
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000640 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000641
Martin v. Löwis97225da2002-11-24 23:05:09 +0000642 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000643 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000644 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000645 return -len(alist), aword
646 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000647
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000648 # figure out how many phrasebook escapes we need
649 escapes = 0
650 while escapes * 256 < len(wordlist):
651 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000652 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000653
654 short = 256 - escapes
655
656 assert short > 0
657
Collin Winter6afaeb72007-08-03 17:06:41 +0000658 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000659
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000660 # statistics
661 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000662 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000663 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000664 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000665
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000666 # pick the most commonly used words, and sort the rest on falling
667 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000668
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000669 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000670 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000671 wordlist.extend(wordtail)
672
673 # generate lexicon from words
674
675 lexicon_offset = [0]
676 lexicon = ""
677 words = {}
678
679 # build a lexicon string
680 offset = 0
681 for w, x in wordlist:
682 # encoding: bit 7 indicates last character in word (chr(128)
683 # indicates the last character in an entire string)
684 ww = w[:-1] + chr(ord(w[-1])+128)
685 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000686 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000687 if o < 0:
688 o = offset
689 lexicon = lexicon + ww
690 offset = offset + len(w)
691 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000692 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000693
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000694 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000695
696 # generate phrasebook from names and lexicon
697 phrasebook = [0]
698 phrasebook_offset = [0] * len(unicode.chars)
699 for char in unicode.chars:
700 name = names[char]
701 if name:
702 w = name.split()
703 phrasebook_offset[char] = len(phrasebook)
704 for w in w:
705 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000706 if i < short:
707 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000708 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000709 # store as two bytes
710 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000711 phrasebook.append(i&255)
712
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000713 assert getsize(phrasebook) == 1
714
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000715 #
716 # unicode name hash table
717
718 # extract names
719 data = []
720 for char in unicode.chars:
721 record = unicode.table[char]
722 if record:
Greg Pricea65678c2019-09-12 02:23:43 -0700723 name = record.name.strip()
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000724 if name and name[0] != "<":
725 data.append((name, char))
726
727 # the magic number 47 was chosen to minimize the number of
728 # collisions on the current data set. if you like, change it
729 # and see what happens...
730
731 codehash = Hash("code", data, 47)
732
Collin Winter6afaeb72007-08-03 17:06:41 +0000733 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000734
Stefan Behnelfaa29482019-06-01 21:49:03 +0200735 with open(FILE, "w") as fp:
736 fprint = partial(print, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000737
Stefan Behnelfaa29482019-06-01 21:49:03 +0200738 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
739 fprint()
740 fprint("#define NAME_MAXLEN", 256)
741 fprint()
742 fprint("/* lexicon */")
743 Array("lexicon", lexicon).dump(fp, trace)
744 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000745
Stefan Behnelfaa29482019-06-01 21:49:03 +0200746 # split decomposition index table
747 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000748
Stefan Behnelfaa29482019-06-01 21:49:03 +0200749 fprint("/* code->name phrasebook */")
750 fprint("#define phrasebook_shift", shift)
751 fprint("#define phrasebook_short", short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000752
Stefan Behnelfaa29482019-06-01 21:49:03 +0200753 Array("phrasebook", phrasebook).dump(fp, trace)
754 Array("phrasebook_offset1", offset1).dump(fp, trace)
755 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000756
Stefan Behnelfaa29482019-06-01 21:49:03 +0200757 fprint("/* name->code dictionary */")
758 codehash.dump(fp, trace)
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300759
Stefan Behnelfaa29482019-06-01 21:49:03 +0200760 fprint()
761 fprint('static const unsigned int aliases_start = %#x;' %
762 NAME_ALIASES_START)
763 fprint('static const unsigned int aliases_end = %#x;' %
764 (NAME_ALIASES_START + len(unicode.aliases)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300765
Stefan Behnelfaa29482019-06-01 21:49:03 +0200766 fprint('static const unsigned int name_aliases[] = {')
767 for name, codepoint in unicode.aliases:
768 fprint(' 0x%04X,' % codepoint)
769 fprint('};')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300770
Stefan Behnelfaa29482019-06-01 21:49:03 +0200771 # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
772 # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
773 # sequences or sequences with non-BMP chars are added.
774 # unicodedata_lookup should be adapted too.
775 fprint(dedent("""
776 typedef struct NamedSequence {
777 int seqlen;
778 Py_UCS2 seq[4];
779 } named_sequence;
780 """))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300781
Stefan Behnelfaa29482019-06-01 21:49:03 +0200782 fprint('static const unsigned int named_sequences_start = %#x;' %
783 NAMED_SEQUENCES_START)
784 fprint('static const unsigned int named_sequences_end = %#x;' %
785 (NAMED_SEQUENCES_START + len(unicode.named_sequences)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300786
Stefan Behnelfaa29482019-06-01 21:49:03 +0200787 fprint('static const named_sequence named_sequences[] = {')
788 for name, sequence in unicode.named_sequences:
789 seq_str = ', '.join('0x%04X' % cp for cp in sequence)
790 fprint(' {%d, {%s}},' % (len(sequence), seq_str))
791 fprint('};')
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000792
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000793
def merge_old_version(version, new, old):
    """Record per-codepoint differences between two UCD snapshots.

    Compares *old* (an older UnicodeData instance for *version*) against
    *new* and appends a delta record to ``new.changed``; unicodedata.c
    uses these records to answer queries against old Unicode versions.

    Fields are identified positionally: ``k`` is the index of the field
    in ``dataclasses.fields(UcdRecord)``, so this function must stay in
    sync with the field order declared on UcdRecord.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k, field in enumerate(dataclasses.fields(UcdRecord)):
                value = getattr(old.table[i], field.name)
                new_value = getattr(new.table[i], field.name)
                if value != new_value:
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:  # general category
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:  # bidirectional class
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:  # decomposition
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:  # decimal digit value
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:  # numeric value
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:  # bidi mirrored flag, stored as '0'/'1'
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        # Any other field difference is a bug in this merge logic.
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    # Delta record layout: (version, zipped per-codepoint changes in the
    # order (bidir, category, decimal, mirrored, east_asian_width,
    # numeric), normalization changes).
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000885
Stefan Behnelfaa29482019-06-01 21:49:03 +0200886
DATA_DIR = os.path.join('Tools', 'unicode', 'data')

def open_data(template, version):
    """Open a UCD data file for *version*, downloading it on first use.

    *template* is a file-name pattern with one '%s' slot for the version
    suffix.  Files are cached under DATA_DIR; .txt files are opened as
    UTF-8 text, anything else (Unihan.zip) as binary.
    """
    suffix = '-' + version
    local = os.path.join(DATA_DIR, template % (suffix,))
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = ('http://www.unicode.org/Public/3.2-Update/'+template) % (suffix,)
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        os.makedirs(DATA_DIR, exist_ok=True)
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    # Unihan.zip
    return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000905
Stefan Behnelfaa29482019-06-01 21:49:03 +0200906
def expand_range(char_range: str) -> Iterator[int]:
    '''
    Parses ranges of code points, as described in UAX #44:
      https://www.unicode.org/reports/tr44/#Code_Point_Ranges

    Yields every code point in the range; a bare hex value such as
    "0041" denotes the one-element range it names.
    '''
    if '..' in char_range:
        lo_hex, _, hi_hex = char_range.partition('..')
        first = int(lo_hex, 16)
        last = int(hi_hex, 16)
    else:
        first = last = int(char_range, 16)
    yield from range(first, last + 1)
918
919
class UcdFile:
    '''
    A file in the standard format of the UCD.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    Note that, as described there, the Unihan data files have their
    own separate format.
    '''

    def __init__(self, template: str, version: str) -> None:
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        # Yield one list of ';'-separated fields per non-empty data line,
        # with '#' comments and surrounding whitespace stripped.
        with open_data(self.template, self.version) as file:
            for raw in file:
                data = raw.split('#', 1)[0].strip()
                if data:
                    yield [field.strip() for field in data.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        yield from self.records()

    def expanded(self) -> Iterator[Tuple[int, List[str]]]:
        # Like records(), but with the leading code-point range expanded
        # into individual (codepoint, remaining-fields) pairs.
        for char_range, *rest in self.records():
            for char in expand_range(char_range):
                yield char, rest
950
Greg Priceef2af1a2019-08-12 22:20:56 -0700951
@dataclasses.dataclass
class UcdRecord:
    """All the UCD data this script uses for a single code point.

    NOTE: field order is significant.  from_row() fills the record
    positionally from a UnicodeData.txt row, and merge_old_version()
    identifies fields by their index in dataclasses.fields(UcdRecord),
    so reordering fields silently breaks both.
    """

    # 15 fields from UnicodeData.txt . See:
    # https://www.unicode.org/reports/tr44/#UnicodeData.txt
    codepoint: str
    name: str
    general_category: str
    canonical_combining_class: str
    bidi_class: str
    decomposition_type: str
    decomposition_mapping: str
    numeric_type: str
    numeric_value: str
    bidi_mirrored: str
    unicode_1_name: str  # obsolete
    iso_comment: str  # obsolete
    simple_uppercase_mapping: str
    simple_lowercase_mapping: str
    simple_titlecase_mapping: str

    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
    # None until filled in from EastAsianWidth.txt by UnicodeData.__init__.
    east_asian_width: Optional[str]

    # Binary properties, as a set of those that are true.
    # Taken from multiple files:
    # https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
    # https://www.unicode.org/reports/tr44/#LineBreak.txt
    binary_properties: Set[str]

    # The Quick_Check properties related to normalization:
    # https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
    # We store them as a bitmask.
    quick_check: int
985
986
def from_row(row: List[str]) -> UcdRecord:
    """Build a UcdRecord from one UnicodeData.txt row.

    The fields sourced from other UCD files get their initial values:
    east_asian_width=None, binary_properties=set(), quick_check=0.
    """
    defaults = (None, set(), 0)
    return UcdRecord(*row, *defaults)
989
990
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000991# --------------------------------------------------------------------
992# the following support code is taken from the unidb utilities
993# Copyright (c) 1999-2000 by Secret Labs AB
994
995# load a unicode-data file from disk
996
class UnicodeData:
    """Parsed UCD data for one Unicode *version*.

    Loads UnicodeData.txt plus the auxiliary UCD files (name aliases,
    named sequences, composition exclusions, East Asian widths, derived
    core properties, line breaks, normalization quick checks, Unihan
    numeric values, special casing and case folding) and exposes the
    results as attributes.  The *_DATA/*_PROPS names used below are
    module-level file-name templates defined earlier in this file.
    """
    # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned

    def __init__(self, version, cjk_check=True):
        # delta records appended later by merge_old_version()
        self.changed = []
        table = [None] * 0x110000
        for s in UcdFile(UNICODE_DATA, version):
            char = int(s[0], 16)
            table[char] = from_row(s)

        cjk_ranges_found = []

        # expand first-last ranges
        field = None
        for i in range(0, 0x110000):
            # The file UnicodeData.txt has its own distinct way of
            # expressing ranges. See:
            # https://www.unicode.org/reports/tr44/#Code_Point_Ranges
            s = table[i]
            if s:
                if s.name[-6:] == "First>":
                    # start of a <..., First>/<..., Last> pair: remember the
                    # shared 15-tuple of fields and blank the synthetic name
                    s.name = ""
                    field = dataclasses.astuple(s)[:15]
                elif s.name[-5:] == "Last>":
                    if s.name.startswith("<CJK Ideograph"):
                        cjk_ranges_found.append((field[0],
                                                 s.codepoint))
                    s.name = ""
                    field = None
            elif field:
                # inside an open range: clone the First> record for this
                # code point (with its own codepoint string)
                table[i] = from_row(('%X' % i,) + field[1:])
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            for char, name, abbrev in UcdFile(NAME_ALIASES, version):
                char = int(char, 16)
                self.aliases.append((name, char))
                # also store the name in the PUA 1
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 1, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            for name, chars in UcdFile(NAMED_SEQUENCES, version):
                chars = tuple(int(char, 16) for char in chars.split())
                # check that the structure defined in makeunicodename is OK
                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                    "the NamedSequence struct and in unicodedata_lookup")
                self.named_sequences.append((name, chars))
                # also store these in the PUA 1
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        # composition exclusions: codepoint -> 1 (used as a set)
        self.exclusions = {}
        for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
            char = int(char, 16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
            widths[char] = width

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].east_asian_width = widths[i]

        for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
            if table[char]:
                # Some properties (e.g. Default_Ignorable_Code_Point)
                # apply to unassigned code points; ignore them
                table[char].binary_properties.add(p)

        # only mandatory line breaks are recorded, as the pseudo binary
        # property 'Line_Break'
        for char_range, value in UcdFile(LINE_BREAK, version):
            if value not in MANDATORY_LINE_BREAKS:
                continue
            for char in expand_range(char_range):
                table[char].binary_properties.add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
            if len(s) < 2 or s[1] not in qc_order:
                continue
            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
            quickcheck_shift = qc_order.index(s[1])*2
            quickcheck <<= quickcheck_shift
            for char in expand_range(s[0]):
                # each property occupies its own 2-bit slot in the mask
                assert not (quickchecks[char]>>quickcheck_shift)&3
                quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].quick_check = quickchecks[i]

        # NOTE: 'zip' here shadows the builtin for the rest of this method.
        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i].numeric_value = value

        # codepoint -> (lower, title, upper) lists of codepoints
        sc = self.special_casing = {}
        for data in UcdFile(SPECIAL_CASING, version):
            if data[4]:
                # We ignore all conditionals (since they depend on
                # languages) except for one, which is hardcoded. See
                # handle_capital_sigma in unicodeobject.c.
                continue
            c = int(data[0], 16)
            lower = [int(char, 16) for char in data[1].split()]
            title = [int(char, 16) for char in data[2].split()]
            upper = [int(char, 16) for char in data[3].split()]
            sc[c] = (lower, title, upper)

        # codepoint -> full case folding (status C or F only)
        cf = self.case_folding = {}
        if version != '3.2.0':
            for data in UcdFile(CASE_FOLDING, version):
                if data[1] in "CF":
                    c = int(data[0], 16)
                    cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001160
Stefan Behnelfaa29482019-06-01 21:49:03 +02001161
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001162# hash table tools
1163
1164# this is a straight-forward reimplementation of Python's built-in
1165# dictionary type, using a static data structure, and a custom string
1166# hash algorithm.
1167
def myhash(s, magic):
    """Case-insensitive multiplicative string hash with a 24-bit result.

    Each character of the upper-cased string is folded in by multiplying
    the accumulator by *magic*; whenever the accumulator grows past 24
    bits, the overflowing byte is XOR-ed back in and masked off.
    """
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        overflow = h & 0xff000000
        if overflow:
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
1176
Stefan Behnelfaa29482019-06-01 21:49:03 +02001177
# Candidate hash-table sizes: (size, poly) pairs.  Hash.__init__ picks the
# first power-of-two size larger than the data set; the companion value is
# added to the size to form the polynomial used to scramble the probe
# sequence on collisions.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1184
Stefan Behnelfaa29482019-06-01 21:49:03 +02001185
class Hash:
    """A static open-addressing hash table, dumpable as C data.

    Builds the table from a (key, value) list using myhash() and a
    polynomial-scrambled probe sequence; dump() emits the table plus the
    #defines (magic, size, poly) that the C lookup code needs to repeat
    the same probing.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the data,
        # with its probe polynomial from SIZES
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0  # collision counter

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: derive a probe increment from the hash and walk
            # the table, rescrambling the increment with the polynomial
            # whenever it overflows the mask
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots become 0 so the array holds plain integers
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1249
Stefan Behnelfaa29482019-06-01 21:49:03 +02001250
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001251# stuff to deal with arrays of unsigned integers
1252
class Array:
    """A named sequence of unsigned integers, dumpable as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the data to *file* as a static C array declaration."""
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        # pick the narrowest C type that holds every element
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static const ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for item in self.data:
                piece = str(item) + ", "
                # wrap lines at 78 columns, trimming trailing blanks
                if len(line) + len(piece) > 78:
                    file.write(line.rstrip() + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line.rstrip() + "\n")
        file.write("};\n\n")
1284
Stefan Behnelfaa29482019-06-01 21:49:03 +02001285
def getsize(data):
    """Return the smallest C integer size (1, 2 or 4 bytes) that can
    hold every value in *data*."""
    largest = max(data)
    if largest < 0x100:
        return 1
    if largest < 0x10000:
        return 2
    return 4
1295
Stefan Behnelfaa29482019-06-01 21:49:03 +02001296
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    # try every chunk size 2**shift and keep the cheapest decomposition
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}  # chunk tuple -> its offset in t2 (deduplicates chunks)
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001357
Stefan Behnelfaa29482019-06-01 21:49:03 +02001358
if __name__ == "__main__":
    # Regenerate all database headers; the argument is presumably the
    # trace level passed through to splitbins/dump -- confirm against
    # the maketables signature earlier in this file.
    maketables(1)