blob: 810b285de2c91e3ee50591d99347077f0ad8e5fe [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
28
import dataclasses
import os
import sys
import zipfile

from functools import partial
from textwrap import dedent
from typing import Iterator, List, Optional, Set, Tuple
Fredrik Lundhf367cac2000-09-24 23:18:31 +000037
SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
#  * Doc/library/stdtypes.rst, and
#  * Doc/library/unicodedata.rst
#  * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "13.0.0"

# Filename templates for the UCD input files; "%s" receives either ""
# (current version) or "-<version>" for the old-version delta tables.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use this ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Older UCD versions for which delta tables are generated (ucd_3_2_0 etc.)
old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DBF'),
    ('4E00', '9FFC'),
    ('20000', '2A6DD'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
    ('30000', '3134A'),
]
112
Stefan Behnelfaa29482019-06-01 21:49:03 +0200113
def maketables(trace=0):
    """Read the Unicode database and regenerate all generated headers.

    Loads the current UCD (UNIDATA_VERSION) plus each release listed in
    old_versions (merged in as delta records), then writes
    Modules/unicodename_db.h, Modules/unicodedata_db.h and
    Objects/unicodetype_db.h.  trace is passed through to splitbins()
    to print table-compression statistics.
    """

    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        # cjk_check=False: old releases predate the current CJK ranges.
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000131
Stefan Behnelfaa29482019-06-01 21:49:03 +0200132
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000133# --------------------------------------------------------------------
134# unicode character properties
135
def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h.

    Emits the unique database-record table, the decomposition data,
    the NFC first/last reindex tables and pair table, and per-old-version
    change/normalization delta tables.  unicode is a UnicodeData instance;
    trace enables splitbins() statistics.
    """

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record.general_category)
            combining = int(record.canonical_combining_class)
            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
            mirrored = record.bidi_mirrored == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
            normalizationquickcheck = record.quick_check
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record.decomposition_type:
                decomp = record.decomposition_type.split()
                # nfd_nfkd in unicodedata.c uses a fixed-size stack buffer.
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs prefix index (low byte) and length
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]].canonical_combining_class == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Compress the sparse NFC first/last maps into dense indexes plus
    # contiguous (start, end) ranges.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("    {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")
384
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000385
386# --------------------------------------------------------------------
387# unicode character type tables
388
def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h.

    Emits the unique character-type records (case mappings, digit values,
    flag bits), the extended-case array, and the generated C functions
    _PyUnicode_ToNumeric, _PyUnicode_IsWhitespace and
    _PyUnicode_IsLinebreak.  unicode is a UnicodeData instance; trace
    enables splitbins() statistics.
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record.general_category
            bidirectional = record.bidi_class
            properties = record.binary_properties
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record.simple_uppercase_mapping:
                upper = int(record.simple_uppercase_mapping, 16)
            else:
                upper = char
            if record.simple_lowercase_mapping:
                lower = int(record.simple_lowercase_mapping, 16)
            else:
                lower = char
            if record.simple_titlecase_mapping:
                title = int(record.simple_titlecase_mapping, 16)
            else:
                title = upper
            # A non-trivial case folding is treated like special casing.
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    # store deltas from the character, not absolute values
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record.decomposition_mapping:
                flags |= DECIMAL_MASK
                decimal = int(record.decomposition_mapping)
            digit = 0
            if record.numeric_type:
                flags |= DIGIT_MASK
                digit = int(record.numeric_type)
            if record.numeric_value:
                flags |= NUMERIC_MASK
                numeric.setdefault(record.numeric_value, []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split decomposition index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()
597
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000598
599# --------------------------------------------------------------------
600# unicode name database
601
602def makeunicodename(unicode, trace):
603
604 FILE = "Modules/unicodename_db.h"
605
Collin Winter6afaeb72007-08-03 17:06:41 +0000606 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000607
608 # collect names
609 names = [None] * len(unicode.chars)
610
611 for char in unicode.chars:
612 record = unicode.table[char]
613 if record:
Greg Pricea65678c2019-09-12 02:23:43 -0700614 name = record.name.strip()
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000615 if name and name[0] != "<":
616 names[char] = name + chr(0)
617
Jon Dufresne39726282017-05-18 07:35:54 -0700618 print(len([n for n in names if n is not None]), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000619
620 # collect unique words from names (note that we differ between
621 # words inside a sentence, and words ending a sentence. the
622 # latter includes the trailing null byte.
623
624 words = {}
625 n = b = 0
626 for char in unicode.chars:
627 name = names[char]
628 if name:
629 w = name.split()
630 b = b + len(name)
631 n = n + len(w)
632 for w in w:
633 l = words.get(w)
634 if l:
635 l.append(None)
636 else:
637 words[w] = [len(words)]
638
Collin Winter6afaeb72007-08-03 17:06:41 +0000639 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000640
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000641 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000642
Martin v. Löwis97225da2002-11-24 23:05:09 +0000643 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000644 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000645 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000646 return -len(alist), aword
647 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000648
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000649 # figure out how many phrasebook escapes we need
650 escapes = 0
651 while escapes * 256 < len(wordlist):
652 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000653 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000654
655 short = 256 - escapes
656
657 assert short > 0
658
Collin Winter6afaeb72007-08-03 17:06:41 +0000659 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000660
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000661 # statistics
662 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000663 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000664 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000665 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000666
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000667 # pick the most commonly used words, and sort the rest on falling
668 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000669
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000670 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000671 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000672 wordlist.extend(wordtail)
673
674 # generate lexicon from words
675
676 lexicon_offset = [0]
677 lexicon = ""
678 words = {}
679
680 # build a lexicon string
681 offset = 0
682 for w, x in wordlist:
683 # encoding: bit 7 indicates last character in word (chr(128)
684 # indicates the last character in an entire string)
685 ww = w[:-1] + chr(ord(w[-1])+128)
686 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000687 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000688 if o < 0:
689 o = offset
690 lexicon = lexicon + ww
691 offset = offset + len(w)
692 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000693 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000694
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000695 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000696
697 # generate phrasebook from names and lexicon
698 phrasebook = [0]
699 phrasebook_offset = [0] * len(unicode.chars)
700 for char in unicode.chars:
701 name = names[char]
702 if name:
703 w = name.split()
704 phrasebook_offset[char] = len(phrasebook)
705 for w in w:
706 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000707 if i < short:
708 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000709 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000710 # store as two bytes
711 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000712 phrasebook.append(i&255)
713
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000714 assert getsize(phrasebook) == 1
715
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000716 #
717 # unicode name hash table
718
719 # extract names
720 data = []
721 for char in unicode.chars:
722 record = unicode.table[char]
723 if record:
Greg Pricea65678c2019-09-12 02:23:43 -0700724 name = record.name.strip()
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000725 if name and name[0] != "<":
726 data.append((name, char))
727
728 # the magic number 47 was chosen to minimize the number of
729 # collisions on the current data set. if you like, change it
730 # and see what happens...
731
732 codehash = Hash("code", data, 47)
733
Collin Winter6afaeb72007-08-03 17:06:41 +0000734 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000735
Stefan Behnelfaa29482019-06-01 21:49:03 +0200736 with open(FILE, "w") as fp:
737 fprint = partial(print, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000738
Stefan Behnelfaa29482019-06-01 21:49:03 +0200739 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
740 fprint()
741 fprint("#define NAME_MAXLEN", 256)
742 fprint()
743 fprint("/* lexicon */")
744 Array("lexicon", lexicon).dump(fp, trace)
745 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000746
Stefan Behnelfaa29482019-06-01 21:49:03 +0200747 # split decomposition index table
748 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000749
Stefan Behnelfaa29482019-06-01 21:49:03 +0200750 fprint("/* code->name phrasebook */")
751 fprint("#define phrasebook_shift", shift)
752 fprint("#define phrasebook_short", short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000753
Stefan Behnelfaa29482019-06-01 21:49:03 +0200754 Array("phrasebook", phrasebook).dump(fp, trace)
755 Array("phrasebook_offset1", offset1).dump(fp, trace)
756 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000757
Stefan Behnelfaa29482019-06-01 21:49:03 +0200758 fprint("/* name->code dictionary */")
759 codehash.dump(fp, trace)
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300760
Stefan Behnelfaa29482019-06-01 21:49:03 +0200761 fprint()
762 fprint('static const unsigned int aliases_start = %#x;' %
763 NAME_ALIASES_START)
764 fprint('static const unsigned int aliases_end = %#x;' %
765 (NAME_ALIASES_START + len(unicode.aliases)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300766
Stefan Behnelfaa29482019-06-01 21:49:03 +0200767 fprint('static const unsigned int name_aliases[] = {')
768 for name, codepoint in unicode.aliases:
769 fprint(' 0x%04X,' % codepoint)
770 fprint('};')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300771
Stefan Behnelfaa29482019-06-01 21:49:03 +0200772 # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
773 # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
774 # sequences or sequences with non-BMP chars are added.
775 # unicodedata_lookup should be adapted too.
776 fprint(dedent("""
777 typedef struct NamedSequence {
778 int seqlen;
779 Py_UCS2 seq[4];
780 } named_sequence;
781 """))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300782
Stefan Behnelfaa29482019-06-01 21:49:03 +0200783 fprint('static const unsigned int named_sequences_start = %#x;' %
784 NAMED_SEQUENCES_START)
785 fprint('static const unsigned int named_sequences_end = %#x;' %
786 (NAMED_SEQUENCES_START + len(unicode.named_sequences)))
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300787
Stefan Behnelfaa29482019-06-01 21:49:03 +0200788 fprint('static const named_sequence named_sequences[] = {')
789 for name, sequence in unicode.named_sequences:
790 seq_str = ', '.join('0x%04X' % cp for cp in sequence)
791 fprint(' {%d, {%s}},' % (len(sequence), seq_str))
792 fprint('};')
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000793
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000794
def merge_old_version(version, new, old):
    """Record the deltas between an older UCD *old* and the current *new*.

    Appends a (version, changes, normalization_changes) entry to
    new.changed, where changes zips per-codepoint delta arrays for bidi
    class, category, decimal digit, mirrored flag, east asian width and
    numeric value, and normalization_changes is a list of
    (codepoint, old decomposition) pairs.

    The index *k* below is each field's position in UcdRecord's
    dataclass declaration order.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k, field in enumerate(dataclasses.fields(UcdRecord)):
                value = getattr(old.table[i], field.name)
                new_value = getattr(new.table[i], field.name)
                if value != new_value:
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        # change to general category
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        # change to bidi class
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # change to decomposition
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # change to decimal digit value
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # change to numeric value
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # change to bidi mirrored; stored as '1'/'0'
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                       normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000886
Stefan Behnelfaa29482019-06-01 21:49:03 +0200887
Greg Price3e4498d2019-08-14 18:18:53 -0700888DATA_DIR = os.path.join('Tools', 'unicode', 'data')
889
def open_data(template, version):
    """Open a local copy of a UCD data file, downloading it if absent.

    *template* contains one '%s' slot which receives '-<version>' for
    the local filename.  Returns a text-mode file object (UTF-8) for
    .txt files, otherwise a binary one (Unihan.zip).
    """
    suffix = '-' + version
    path = os.path.join(DATA_DIR, template % (suffix,))
    if not os.path.exists(path):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = ('https://www.unicode.org/Public/3.2-Update/' + template) % (suffix,)
        else:
            url = ('https://www.unicode.org/Public/%s/ucd/' + template) % (version, '')
        os.makedirs(DATA_DIR, exist_ok=True)
        urllib.request.urlretrieve(url, filename=path)
    if path.endswith('.txt'):
        return open(path, encoding='utf-8')
    # Unihan.zip is binary
    return open(path, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000906
Stefan Behnelfaa29482019-06-01 21:49:03 +0200907
def expand_range(char_range: str) -> Iterator[int]:
    '''
    Parses ranges of code points, as described in UAX #44:
      https://www.unicode.org/reports/tr44/#Code_Point_Ranges
    Yields every code point in the (inclusive) range.
    '''
    if '..' in char_range:
        lo, hi = (int(bound, 16) for bound in char_range.split('..'))
    else:
        # a single hex code point stands for itself
        lo = hi = int(char_range, 16)
    yield from range(lo, hi + 1)
919
920
class UcdFile:
    '''
    A file in the standard format of the UCD.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    Note that, as described there, the Unihan data files have their
    own separate format.
    '''

    def __init__(self, template: str, version: str) -> None:
        # template/version are resolved to an actual file by open_data
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        '''Yield each data line as a list of stripped ';'-separated fields.'''
        with open_data(self.template, self.version) as file:
            for raw_line in file:
                # strip comments ('#' to end of line) and whitespace;
                # skip lines with no content left
                content = raw_line.split('#', 1)[0].strip()
                if content:
                    yield [field.strip() for field in content.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        return self.records()

    def expanded(self) -> Iterator[Tuple[int, List[str]]]:
        '''Like records(), but expands a leading code-point range field
        into one (codepoint, remaining-fields) pair per code point.'''
        for char_range, *rest in self.records():
            for char in expand_range(char_range):
                yield char, rest
Greg Priceef2af1a2019-08-12 22:20:56 -0700952
@dataclasses.dataclass
class UcdRecord:
    """All the properties this script tracks for a single code point.

    Built by from_row() from a UnicodeData.txt row, then enriched with
    the auxiliary-file data in UnicodeData.__init__.
    """
    # 15 fields from UnicodeData.txt . See:
    #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
    #
    # NOTE(review): the names below don't line up one-to-one with the
    # UAX #44 field semantics: decomposition_type holds the *combined*
    # decomposition field, decomposition_mapping actually receives the
    # decimal-digit value (see merge_old_version, k == 6), numeric_type
    # the digit value, and numeric_value the numeric value (patched
    # from Unihan in UnicodeData.__init__) — confirm before renaming.
    codepoint: str
    name: str
    general_category: str
    canonical_combining_class: str
    bidi_class: str
    decomposition_type: str
    decomposition_mapping: str
    numeric_type: str
    numeric_value: str
    bidi_mirrored: str
    unicode_1_name: str  # obsolete
    iso_comment: str  # obsolete
    simple_uppercase_mapping: str
    simple_lowercase_mapping: str
    simple_titlecase_mapping: str

    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
    east_asian_width: Optional[str]

    # Binary properties, as a set of those that are true.
    # Taken from multiple files:
    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
    binary_properties: Set[str]

    # The Quick_Check properties related to normalization:
    #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
    # We store them as a bitmask.
    quick_check: int
986
987
def from_row(row: List[str]) -> UcdRecord:
    """Build a UcdRecord from one UnicodeData.txt row, with the
    auxiliary fields (east_asian_width, binary_properties, quick_check)
    initialized to their empty defaults."""
    defaults = (None, set(), 0)
    return UcdRecord(*row, *defaults)
990
991
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000992# --------------------------------------------------------------------
993# the following support code is taken from the unidb utilities
994# Copyright (c) 1999-2000 by Secret Labs AB
995
996# load a unicode-data file from disk
997
class UnicodeData:
    """Parsed contents of one version of the Unicode Character Database.

    Loads UnicodeData.txt plus the auxiliary files (east asian width,
    derived core properties, line break, normalization quickchecks,
    Unihan numeric values, name aliases, named sequences, special
    casing, case folding) into per-codepoint UcdRecord entries.
    """
    # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned

    def __init__(self, version, cjk_check=True):
        """Load UCD *version*.  When cjk_check is true, verify that the
        CJK ideograph ranges found match the module-level cjk_ranges."""
        self.changed = []
        table = [None] * 0x110000
        for s in UcdFile(UNICODE_DATA, version):
            char = int(s[0], 16)
            table[char] = from_row(s)

        cjk_ranges_found = []

        # expand first-last ranges
        field = None
        for i in range(0, 0x110000):
            # The file UnicodeData.txt has its own distinct way of
            # expressing ranges.  See:
            #   https://www.unicode.org/reports/tr44/#Code_Point_Ranges
            s = table[i]
            if s:
                if s.name[-6:] == "First>":
                    # remember the range's fields; replicate them below
                    s.name = ""
                    field = dataclasses.astuple(s)[:15]
                elif s.name[-5:] == "Last>":
                    if s.name.startswith("<CJK Ideograph"):
                        cjk_ranges_found.append((field[0],
                                                 s.codepoint))
                    s.name = ""
                    field = None
            elif field:
                # unlisted code point inside a First>..Last> range:
                # synthesize a record from the range's fields
                table[i] = from_row(('%X' % i,) + field[1:])
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            for char, name, abbrev in UcdFile(NAME_ALIASES, version):
                char = int(char, 16)
                self.aliases.append((name, char))
                # also store the name in the PUA 1
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 1, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            for name, chars in UcdFile(NAMED_SEQUENCES, version):
                chars = tuple(int(char, 16) for char in chars.split())
                # check that the structure defined in makeunicodename is OK
                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                    "the NamedSequence struct and in unicodedata_lookup")
                self.named_sequences.append((name, chars))
                # also store these in the PUA 1
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        # composition exclusions: set of excluded code points
        self.exclusions = {}
        for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
            char = int(char, 16)
            self.exclusions[char] = 1

        # east asian width, applied to every assigned code point
        widths = [None] * 0x110000
        for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
            widths[char] = width

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].east_asian_width = widths[i]

        for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
            if table[char]:
                # Some properties (e.g. Default_Ignorable_Code_Point)
                # apply to unassigned code points; ignore them
                table[char].binary_properties.add(p)

        # mark only the mandatory line-break classes as 'Line_Break'
        for char_range, value in UcdFile(LINE_BREAK, version):
            if value not in MANDATORY_LINE_BREAKS:
                continue
            for char in expand_range(char_range):
                table[char].binary_properties.add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
            if len(s) < 2 or s[1] not in qc_order:
                continue
            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
            quickcheck_shift = qc_order.index(s[1])*2
            quickcheck <<= quickcheck_shift
            for char in expand_range(s[0]):
                # each property must be set at most once per code point
                assert not (quickchecks[char]>>quickcheck_shift)&3
                quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].quick_check = quickchecks[i]

        # Unihan numeric values (kAccountingNumeric etc.)
        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i].numeric_value = value

        # unconditional special casing: codepoint -> (lower, title, upper)
        sc = self.special_casing = {}
        for data in UcdFile(SPECIAL_CASING, version):
            if data[4]:
                # We ignore all conditionals (since they depend on
                # languages) except for one, which is hardcoded. See
                # handle_capital_sigma in unicodeobject.c.
                continue
            c = int(data[0], 16)
            lower = [int(char, 16) for char in data[1].split()]
            title = [int(char, 16) for char in data[2].split()]
            upper = [int(char, 16) for char in data[3].split()]
            sc[c] = (lower, title, upper)

        # case folding (Common and Full entries only)
        cf = self.case_folding = {}
        if version != '3.2.0':
            for data in UcdFile(CASE_FOLDING, version):
                if data[1] in "CF":
                    c = int(data[0], 16)
                    cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001161
Stefan Behnelfaa29482019-06-01 21:49:03 +02001162
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001163# hash table tools
1164
1165# this is a straight-forward reimplementation of Python's built-in
1166# dictionary type, using a static data structure, and a custom string
1167# hash algorithm.
1168
def myhash(s, magic):
    """Case-insensitive 24-bit string hash for the static name table.

    *magic* is the multiplier; any overflow past 24 bits is folded
    back into the low bits, keeping the result within 0x00ffffff.
    """
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        overflow = h & 0xff000000
        if overflow:
            # fold the top byte back into the low 24 bits
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
1177
Stefan Behnelfaa29482019-06-01 21:49:03 +02001178
# (table size, polynomial) candidates for the static hash table.
# Sizes are powers of two; the paired polynomial is used by
# Hash.__init__ to rehash the probe increment on wrap-around.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1185
Stefan Behnelfaa29482019-06-01 21:49:03 +02001186
class Hash:
    """A static open-addressing hash table, precomputed at build time.

    Keys from *data* are hashed case-insensitively with myhash and
    *magic*; the resulting table is emitted as a C array together with
    the #define'd parameters the C-side lookup needs.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the data
        # set, with its probe polynomial taken from SIZES
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0  # total collision count (for statistics)

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: probe with a secondary increment, rehashing
            # the increment via the polynomial whenever it overflows
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots become 0 so the table can be a C integer array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        """Write the table and its lookup parameters to *file* as C."""
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1250
Stefan Behnelfaa29482019-06-01 21:49:03 +02001251
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001252# stuff to deal with arrays of unsigned integers
1253
class Array:
    """A named sequence of unsigned integers, dumpable as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as a static C array declaration,
        choosing the narrowest unsigned element type that fits."""
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        ctype = {1: "unsigned char", 2: "unsigned short"}.get(size, "unsigned int")
        file.write("static const " + ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for item in self.data:
                piece = str(item) + ", "
                # wrap before exceeding 78 columns
                if len(line) + len(piece) > 78:
                    file.write(line.rstrip() + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line.rstrip() + "\n")
        file.write("};\n\n")
1285
Stefan Behnelfaa29482019-06-01 21:49:03 +02001286
def getsize(data):
    """Return the smallest C element size (1, 2 or 4 bytes) that can
    hold every value in *data*."""
    biggest = max(data)
    if biggest >= 65536:
        return 4
    if biggest >= 256:
        return 2
    return 1
1296
Stefan Behnelfaa29482019-06-01 21:49:03 +02001297
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    # try every bin size 2**shift and keep the cheapest decomposition
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}   # bin contents -> index in t2 (shared bins stored once)
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001358
Stefan Behnelfaa29482019-06-01 21:49:03 +02001359
if __name__ == "__main__":
    # Regenerate all database headers; the argument is presumably the
    # trace level passed through to the table writers — confirm against
    # maketables' definition (not in this chunk).
    maketables(1)