blob: d9770979e1fa6b1ea19d73cbf8fc229a5742a0a5 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl    created (based on bits and pieces from unidb)
# 2000-09-25 fl    merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl    added character type table
# 2000-09-26 fl    added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl    expand first/last ranges
# 2001-01-19 fl    added character name tables (2.1)
# 2001-01-21 fl    added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd    use string methods
# 2002-10-18 mvl   update to Unicode 3.2
# 2002-10-22 mvl   generate NFC tables
# 2002-11-24 mvl   expand all ranges, sort names version-independently
# 2002-11-25 mvl   add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl   update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb    add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio  add support for name aliases and named sequences
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
28
import os
import sys
import zipfile

from textwrap import dedent
from operator import itemgetter

# Name/version stamp written into the generated C headers.
SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
UNIDATA_VERSION = "6.0.0"
# Input file name templates; "%s" is filled with "" (current version)
# or "-<version>" for an older UCD release.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0100

# older UCD releases for which delta tables are generated
old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NODELTA_MASK = 0x800
NUMERIC_MASK = 0x1000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FCB'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D')
]
98
def maketables(trace=0):
    """Regenerate all generated headers from the Unicode Character Database.

    Reads the current UCD (UNIDATA_VERSION), then every release listed in
    old_versions (merging per-version deltas into the current data), and
    finally writes the name, property and type tables.
    """

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    # fold each old release into the current database as a delta table
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
118# --------------------------------------------------------------------
119# unicode character properties
120
121def makeunicodedata(unicode, trace):
122
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000123 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000124 table = [dummy]
125 cache = {0: dummy}
126 index = [0] * len(unicode.chars)
127
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000128 FILE = "Modules/unicodedata_db.h"
129
Collin Winter6afaeb72007-08-03 17:06:41 +0000130 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000131
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000132 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000133
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000134 for char in unicode.chars:
135 record = unicode.table[char]
136 if record:
137 # extract database properties
138 category = CATEGORY_NAMES.index(record[2])
139 combining = int(record[3])
140 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
141 mirrored = record[9] == "Y"
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000142 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000143 normalizationquickcheck = record[17]
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000144 item = (
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000145 category, combining, bidirectional, mirrored, eastasianwidth,
146 normalizationquickcheck
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000147 )
148 # add entry to index and item tables
149 i = cache.get(item)
150 if i is None:
151 cache[item] = i = len(table)
152 table.append(item)
153 index[char] = i
154
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000155 # 2) decomposition data
156
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000157 decomp_data = [0]
158 decomp_prefix = [""]
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000159 decomp_index = [0] * len(unicode.chars)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000160 decomp_size = 0
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000161
Martin v. Löwis677bde22002-11-23 22:08:15 +0000162 comp_pairs = []
163 comp_first = [None] * len(unicode.chars)
164 comp_last = [None] * len(unicode.chars)
165
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000166 for char in unicode.chars:
167 record = unicode.table[char]
168 if record:
169 if record[5]:
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000170 decomp = record[5].split()
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000171 if len(decomp) > 19:
Collin Wintera817e582007-08-22 23:05:06 +0000172 raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000173 # prefix
174 if decomp[0][0] == "<":
175 prefix = decomp.pop(0)
176 else:
177 prefix = ""
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000178 try:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000179 i = decomp_prefix.index(prefix)
180 except ValueError:
181 i = len(decomp_prefix)
182 decomp_prefix.append(prefix)
183 prefix = i
184 assert prefix < 256
185 # content
Georg Brandlbf82e372008-05-16 17:02:34 +0000186 decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
Martin v. Löwis677bde22002-11-23 22:08:15 +0000187 # Collect NFC pairs
188 if not prefix and len(decomp) == 3 and \
189 char not in unicode.exclusions and \
190 unicode.table[decomp[1]][3] == "0":
191 p, l, r = decomp
192 comp_first[l] = 1
193 comp_last[r] = 1
194 comp_pairs.append((l,r,char))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000195 try:
196 i = decomp_data.index(decomp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000197 except ValueError:
198 i = len(decomp_data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000199 decomp_data.extend(decomp)
200 decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000201 else:
202 i = 0
203 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000204
Martin v. Löwis677bde22002-11-23 22:08:15 +0000205 f = l = 0
206 comp_first_ranges = []
207 comp_last_ranges = []
208 prev_f = prev_l = None
209 for i in unicode.chars:
210 if comp_first[i] is not None:
211 comp_first[i] = f
212 f += 1
213 if prev_f is None:
214 prev_f = (i,i)
215 elif prev_f[1]+1 == i:
216 prev_f = prev_f[0],i
217 else:
218 comp_first_ranges.append(prev_f)
219 prev_f = (i,i)
220 if comp_last[i] is not None:
221 comp_last[i] = l
222 l += 1
223 if prev_l is None:
224 prev_l = (i,i)
225 elif prev_l[1]+1 == i:
226 prev_l = prev_l[0],i
227 else:
228 comp_last_ranges.append(prev_l)
229 prev_l = (i,i)
230 comp_first_ranges.append(prev_f)
231 comp_last_ranges.append(prev_l)
232 total_first = f
233 total_last = l
234
235 comp_data = [0]*(total_first*total_last)
236 for f,l,char in comp_pairs:
237 f = comp_first[f]
238 l = comp_last[l]
239 comp_data[f*total_last+l] = char
240
Collin Winter6afaeb72007-08-03 17:06:41 +0000241 print(len(table), "unique properties")
242 print(len(decomp_prefix), "unique decomposition prefixes")
243 print(len(decomp_data), "unique decomposition entries:", end=' ')
244 print(decomp_size, "bytes")
245 print(total_first, "first characters in NFC")
246 print(total_last, "last characters in NFC")
247 print(len(comp_pairs), "NFC pairs")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000248
Collin Winter6afaeb72007-08-03 17:06:41 +0000249 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000250
Fred Drake9c685052000-10-26 03:56:46 +0000251 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000252 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
253 print(file=fp)
254 print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
255 print("/* a list of unique database records */", file=fp)
256 print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000257 for item in table:
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000258 print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
Collin Winter6afaeb72007-08-03 17:06:41 +0000259 print("};", file=fp)
260 print(file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000261
Collin Winter6afaeb72007-08-03 17:06:41 +0000262 print("/* Reindexing of NFC first characters. */", file=fp)
263 print("#define TOTAL_FIRST",total_first, file=fp)
264 print("#define TOTAL_LAST",total_last, file=fp)
265 print("struct reindex{int start;short count,index;};", file=fp)
Martin v. Löwis59683e82008-06-13 07:50:45 +0000266 print("static struct reindex nfc_first[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000267 for start,end in comp_first_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000268 print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
269 print(" {0,0,0}", file=fp)
270 print("};\n", file=fp)
Martin v. Löwis59683e82008-06-13 07:50:45 +0000271 print("static struct reindex nfc_last[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000272 for start,end in comp_last_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000273 print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
274 print(" {0,0,0}", file=fp)
275 print("};\n", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000276
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000277 # FIXME: <fl> the following tables could be made static, and
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000278 # the support code moved into unicodedatabase.c
279
Collin Winter6afaeb72007-08-03 17:06:41 +0000280 print("/* string literals */", file=fp)
281 print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000282 for name in CATEGORY_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000283 print(" \"%s\"," % name, file=fp)
284 print(" NULL", file=fp)
285 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000286
Collin Winter6afaeb72007-08-03 17:06:41 +0000287 print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000288 for name in BIDIRECTIONAL_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000289 print(" \"%s\"," % name, file=fp)
290 print(" NULL", file=fp)
291 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000292
Collin Winter6afaeb72007-08-03 17:06:41 +0000293 print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000294 for name in EASTASIANWIDTH_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000295 print(" \"%s\"," % name, file=fp)
296 print(" NULL", file=fp)
297 print("};", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000298
Collin Winter6afaeb72007-08-03 17:06:41 +0000299 print("static const char *decomp_prefix[] = {", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000300 for name in decomp_prefix:
Collin Winter6afaeb72007-08-03 17:06:41 +0000301 print(" \"%s\"," % name, file=fp)
302 print(" NULL", file=fp)
303 print("};", file=fp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000304
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000305 # split record index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000306 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000307
Collin Winter6afaeb72007-08-03 17:06:41 +0000308 print("/* index tables for the database records */", file=fp)
309 print("#define SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000310 Array("index1", index1).dump(fp, trace)
311 Array("index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000312
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000313 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000314 index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000315
Collin Winter6afaeb72007-08-03 17:06:41 +0000316 print("/* decomposition data */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000317 Array("decomp_data", decomp_data).dump(fp, trace)
318
Collin Winter6afaeb72007-08-03 17:06:41 +0000319 print("/* index tables for the decomposition data */", file=fp)
320 print("#define DECOMP_SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000321 Array("decomp_index1", index1).dump(fp, trace)
322 Array("decomp_index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000323
Martin v. Löwis677bde22002-11-23 22:08:15 +0000324 index, index2, shift = splitbins(comp_data, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000325 print("/* NFC pairs */", file=fp)
326 print("#define COMP_SHIFT", shift, file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000327 Array("comp_index", index).dump(fp, trace)
328 Array("comp_data", index2).dump(fp, trace)
329
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000330 # Generate delta tables for old versions
331 for version, table, normalization in unicode.changed:
332 cversion = version.replace(".","_")
333 records = [table[0]]
334 cache = {table[0]:0}
335 index = [0] * len(table)
336 for i, record in enumerate(table):
337 try:
338 index[i] = cache[record]
339 except KeyError:
340 index[i] = cache[record] = len(records)
341 records.append(record)
342 index1, index2, shift = splitbins(index, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000343 print("static const change_record change_records_%s[] = {" % cversion, file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000344 for record in records:
Collin Winter6afaeb72007-08-03 17:06:41 +0000345 print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
346 print("};", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000347 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
348 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000349 print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
350 print("{", file=fp)
351 print("\tint index;", file=fp)
352 print("\tif (n >= 0x110000) index = 0;", file=fp)
353 print("\telse {", file=fp)
354 print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
355 print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
356 (cversion, shift, ((1<<shift)-1)), file=fp)
357 print("\t}", file=fp)
358 print("\treturn change_records_%s+index;" % cversion, file=fp)
359 print("}\n", file=fp)
360 print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
361 print("{", file=fp)
362 print("\tswitch(n) {", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000363 for k, v in normalization:
Collin Winter6afaeb72007-08-03 17:06:41 +0000364 print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
365 print("\tdefault: return 0;", file=fp)
366 print("\t}\n}\n", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000367
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000368 fp.close()
369
# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Emits the deduplicated per-character type records (case mappings,
    digit values and flag bits), their split index tables, and the
    generated C switch functions _PyUnicode_ToNumeric(),
    _PyUnicode_IsWhitespace() and _PyUnicode_IsLinebreak().
    `unicode` is a UnicodeData instance; `trace` is passed to splitbins.
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types; record 0 is shared by unassigned code points
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric value (string) -> [code points]
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas, stored as unsigned 16-bit values
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    # all cases fall through to a single "return 1;"
    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
551
552# --------------------------------------------------------------------
553# unicode name database
554
555def makeunicodename(unicode, trace):
556
557 FILE = "Modules/unicodename_db.h"
558
Collin Winter6afaeb72007-08-03 17:06:41 +0000559 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000560
561 # collect names
562 names = [None] * len(unicode.chars)
563
564 for char in unicode.chars:
565 record = unicode.table[char]
566 if record:
567 name = record[1].strip()
568 if name and name[0] != "<":
569 names[char] = name + chr(0)
570
Georg Brandl559e5d72008-06-11 18:37:52 +0000571 print(len(list(n for n in names if n is not None)), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000572
573 # collect unique words from names (note that we differ between
574 # words inside a sentence, and words ending a sentence. the
575 # latter includes the trailing null byte.
576
577 words = {}
578 n = b = 0
579 for char in unicode.chars:
580 name = names[char]
581 if name:
582 w = name.split()
583 b = b + len(name)
584 n = n + len(w)
585 for w in w:
586 l = words.get(w)
587 if l:
588 l.append(None)
589 else:
590 words[w] = [len(words)]
591
Collin Winter6afaeb72007-08-03 17:06:41 +0000592 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000593
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000594 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000595
Martin v. Löwis97225da2002-11-24 23:05:09 +0000596 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000597 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000598 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000599 return -len(alist), aword
600 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000601
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000602 # figure out how many phrasebook escapes we need
603 escapes = 0
604 while escapes * 256 < len(wordlist):
605 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000606 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000607
608 short = 256 - escapes
609
610 assert short > 0
611
Collin Winter6afaeb72007-08-03 17:06:41 +0000612 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000613
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000614 # statistics
615 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000616 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000617 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000618 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000619
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000620 # pick the most commonly used words, and sort the rest on falling
621 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000622
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000623 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000624 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000625 wordlist.extend(wordtail)
626
627 # generate lexicon from words
628
629 lexicon_offset = [0]
630 lexicon = ""
631 words = {}
632
633 # build a lexicon string
634 offset = 0
635 for w, x in wordlist:
636 # encoding: bit 7 indicates last character in word (chr(128)
637 # indicates the last character in an entire string)
638 ww = w[:-1] + chr(ord(w[-1])+128)
639 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000640 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000641 if o < 0:
642 o = offset
643 lexicon = lexicon + ww
644 offset = offset + len(w)
645 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000646 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000647
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000648 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000649
650 # generate phrasebook from names and lexicon
651 phrasebook = [0]
652 phrasebook_offset = [0] * len(unicode.chars)
653 for char in unicode.chars:
654 name = names[char]
655 if name:
656 w = name.split()
657 phrasebook_offset[char] = len(phrasebook)
658 for w in w:
659 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000660 if i < short:
661 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000662 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000663 # store as two bytes
664 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000665 phrasebook.append(i&255)
666
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000667 assert getsize(phrasebook) == 1
668
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000669 #
670 # unicode name hash table
671
672 # extract names
673 data = []
674 for char in unicode.chars:
675 record = unicode.table[char]
676 if record:
677 name = record[1].strip()
678 if name and name[0] != "<":
679 data.append((name, char))
680
681 # the magic number 47 was chosen to minimize the number of
682 # collisions on the current data set. if you like, change it
683 # and see what happens...
684
685 codehash = Hash("code", data, 47)
686
Collin Winter6afaeb72007-08-03 17:06:41 +0000687 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000688
689 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000690 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
691 print(file=fp)
692 print("#define NAME_MAXLEN", 256, file=fp)
693 print(file=fp)
694 print("/* lexicon */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000695 Array("lexicon", lexicon).dump(fp, trace)
696 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000697
698 # split decomposition index table
699 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
700
Collin Winter6afaeb72007-08-03 17:06:41 +0000701 print("/* code->name phrasebook */", file=fp)
702 print("#define phrasebook_shift", shift, file=fp)
703 print("#define phrasebook_short", short, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000704
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000705 Array("phrasebook", phrasebook).dump(fp, trace)
706 Array("phrasebook_offset1", offset1).dump(fp, trace)
707 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000708
Collin Winter6afaeb72007-08-03 17:06:41 +0000709 print("/* name->code dictionary */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000710 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000711
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300712 print(file=fp)
713 print('static const unsigned int aliases_start = %#x;' %
714 NAME_ALIASES_START, file=fp)
715 print('static const unsigned int aliases_end = %#x;' %
716 (NAME_ALIASES_START + len(unicode.aliases)), file=fp)
717
718 print('static const unsigned int name_aliases[] = {', file=fp)
719 for name, codepoint in unicode.aliases:
720 print(' 0x%04X,' % codepoint, file=fp)
721 print('};', file=fp)
722
723 # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
724 # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
725 # sequences or sequences with non-BMP chars are added.
726 # unicodedata_lookup should be adapted too.
727 print(dedent("""
728 typedef struct NamedSequence {
729 int seqlen;
730 Py_UCS2 seq[4];
731 } named_sequence;
732 """), file=fp)
733
734 print('static const unsigned int named_sequences_start = %#x;' %
735 NAMED_SEQUENCES_START, file=fp)
736 print('static const unsigned int named_sequences_end = %#x;' %
737 (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)
738
739 print('static const named_sequence named_sequences[] = {', file=fp)
740 for name, sequence in unicode.named_sequences:
741 seq_str = ', '.join('0x%04X' % cp for cp in sequence)
742 print(' {%d, {%s}},' % (len(sequence), seq_str), file=fp)
743 print('};', file=fp)
744
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000745 fp.close()
746
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000747
def merge_old_version(version, new, old):
    """Compute delta records between two UnicodeData snapshots.

    Compares *old* (an earlier UCD revision) against *new* field by field
    and appends a (version, zipped-change-lists, normalization_changes)
    tuple to new.changed.  The field indices k below follow the record
    layout documented on the UnicodeData class.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        # general category changed
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        # bidirectional class changed
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # decomposition mapping changed
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # decimal digit value changed
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # numeric value changed
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag changed
                        # NOTE(review): stored as the characters '1'/'0'
                        # while untouched slots hold the int 0xFF — the
                        # consumer apparently accepts both; confirm.
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        # any other field changing is unexpected; fail loudly
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000838
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000839def open_data(template, version):
840 local = template % ('-'+version,)
841 if not os.path.exists(local):
842 import urllib.request
843 if version == '3.2.0':
844 # irregular url structure
845 url = 'http://www.unicode.org/Public/3.2-Update/' + local
846 else:
847 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
848 urllib.request.urlretrieve(url, filename=local)
849 if local.endswith('.txt'):
850 return open(local, encoding='utf-8')
851 else:
852 # Unihan.zip
853 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000854
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000855# --------------------------------------------------------------------
856# the following support code is taken from the unidb utilities
857# Copyright (c) 1999-2000 by Secret Labs AB
858
859# load a unicode-data file from disk
860
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000861class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000862 # Record structure:
863 # [ID, name, category, combining, bidi, decomp, (6)
864 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
865 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
866 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000867
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000868 def __init__(self, version,
869 linebreakprops=False,
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000870 expand=1,
871 cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000872 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000873 table = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300874 with open_data(UNICODE_DATA, version) as file:
875 while 1:
876 s = file.readline()
877 if not s:
878 break
879 s = s.strip().split(";")
880 char = int(s[0], 16)
881 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000882
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000883 cjk_ranges_found = []
884
Martin v. Löwis97225da2002-11-24 23:05:09 +0000885 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000886 if expand:
887 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000888 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000889 s = table[i]
890 if s:
891 if s[1][-6:] == "First>":
892 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000893 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000894 elif s[1][-5:] == "Last>":
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000895 if s[1].startswith("<CJK Ideograph"):
896 cjk_ranges_found.append((field[0],
897 s[0]))
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000898 s[1] = ""
899 field = None
900 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000901 f2 = field[:]
902 f2[0] = "%X" % i
903 table[i] = f2
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000904 if cjk_check and cjk_ranges != cjk_ranges_found:
905 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000906
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000907 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000908 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000909 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000910 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000911
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300912 # check for name aliases and named sequences, see #12753
913 # aliases and named sequences are not in 3.2.0
914 if version != '3.2.0':
915 self.aliases = []
916 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
917 # in order to take advantage of the compression and lookup
918 # algorithms used for the other characters
919 pua_index = NAME_ALIASES_START
920 with open_data(NAME_ALIASES, version) as file:
921 for s in file:
922 s = s.strip()
923 if not s or s.startswith('#'):
924 continue
925 char, name = s.split(';')
926 char = int(char, 16)
927 self.aliases.append((name, char))
928 # also store the name in the PUA 1
929 self.table[pua_index][1] = name
930 pua_index += 1
931 assert pua_index - NAME_ALIASES_START == len(self.aliases)
932
933 self.named_sequences = []
934 # store named seqences in the PUA 1, in range U+F0100..,
935 # in order to take advantage of the compression and lookup
936 # algorithms used for the other characters.
937
938 pua_index = NAMED_SEQUENCES_START
939 with open_data(NAMED_SEQUENCES, version) as file:
940 for s in file:
941 s = s.strip()
942 if not s or s.startswith('#'):
943 continue
944 name, chars = s.split(';')
945 chars = tuple(int(char, 16) for char in chars.split())
946 # check that the structure defined in makeunicodename is OK
947 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
948 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
949 "the NamedSequence struct and in unicodedata_lookup")
950 self.named_sequences.append((name, chars))
951 # also store these in the PUA 1
952 self.table[pua_index][1] = name
953 pua_index += 1
954 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
955
Martin v. Löwis677bde22002-11-23 22:08:15 +0000956 self.exclusions = {}
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300957 with open_data(COMPOSITION_EXCLUSIONS, version) as file:
958 for s in file:
959 s = s.strip()
960 if not s:
961 continue
962 if s[0] == '#':
963 continue
964 char = int(s.split()[0],16)
965 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000966
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000967 widths = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300968 with open_data(EASTASIAN_WIDTH, version) as file:
969 for s in file:
970 s = s.strip()
971 if not s:
972 continue
973 if s[0] == '#':
974 continue
975 s = s.split()[0].split(';')
976 if '..' in s[0]:
977 first, last = [int(c, 16) for c in s[0].split('..')]
978 chars = list(range(first, last+1))
979 else:
980 chars = [int(s[0], 16)]
981 for char in chars:
982 widths[char] = s[1]
983
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000984 for i in range(0, 0x110000):
985 if table[i] is not None:
986 table[i].append(widths[i])
987
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000988 for i in range(0, 0x110000):
989 if table[i] is not None:
990 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000991
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300992 with open_data(DERIVED_CORE_PROPERTIES, version) as file:
993 for s in file:
994 s = s.split('#', 1)[0].strip()
995 if not s:
996 continue
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000997
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300998 r, p = s.split(";")
999 r = r.strip()
1000 p = p.strip()
1001 if ".." in r:
1002 first, last = [int(c, 16) for c in r.split('..')]
1003 chars = list(range(first, last+1))
1004 else:
1005 chars = [int(r, 16)]
1006 for char in chars:
1007 if table[char]:
1008 # Some properties (e.g. Default_Ignorable_Code_Point)
1009 # apply to unassigned code points; ignore them
1010 table[char][-1].add(p)
1011
1012 with open_data(LINE_BREAK, version) as file:
1013 for s in file:
1014 s = s.partition('#')[0]
1015 s = [i.strip() for i in s.split(';')]
1016 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1017 continue
1018 if '..' not in s[0]:
1019 first = last = int(s[0], 16)
1020 else:
1021 first, last = [int(c, 16) for c in s[0].split('..')]
1022 for char in range(first, last+1):
1023 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001024
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001025 # We only want the quickcheck properties
1026 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1027 # Yes is the default, hence only N and M occur
1028 # In 3.2.0, the format was different (NF?_NO)
1029 # The parsing will incorrectly determine these as
1030 # "yes", however, unicodedata.c will not perform quickchecks
1031 # for older versions, and no delta records will be created.
1032 quickchecks = [0] * 0x110000
1033 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001034 with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
1035 for s in file:
1036 if '#' in s:
1037 s = s[:s.index('#')]
1038 s = [i.strip() for i in s.split(';')]
1039 if len(s) < 2 or s[1] not in qc_order:
1040 continue
1041 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1042 quickcheck_shift = qc_order.index(s[1])*2
1043 quickcheck <<= quickcheck_shift
1044 if '..' not in s[0]:
1045 first = last = int(s[0], 16)
1046 else:
1047 first, last = [int(c, 16) for c in s[0].split('..')]
1048 for char in range(first, last+1):
1049 assert not (quickchecks[char]>>quickcheck_shift)&3
1050 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001051 for i in range(0, 0x110000):
1052 if table[i] is not None:
1053 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001054
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001055 with open_data(UNIHAN, version) as file:
1056 zip = zipfile.ZipFile(file)
1057 if version == '3.2.0':
1058 data = zip.open('Unihan-3.2.0.txt').read()
1059 else:
1060 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001061 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001062 if not line.startswith('U+'):
1063 continue
1064 code, tag, value = line.split(None, 3)[:3]
1065 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1066 'kOtherNumeric'):
1067 continue
1068 value = value.strip().replace(',', '')
1069 i = int(code[2:], 16)
1070 # Patch the numeric field
1071 if table[i] is not None:
1072 table[i][8] = value
1073
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001074 def uselatin1(self):
1075 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001076 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001077
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001078# hash table tools
1079
1080# this is a straight-forward reimplementation of Python's built-in
1081# dictionary type, using a static data structure, and a custom string
1082# hash algorithm.
1083
def myhash(s, magic):
    """Case-insensitive 24-bit string hash with multiplier *magic*.

    Mirrors the hash used on the C side for the static name table.
    """
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        overflow = h & 0xff000000
        if overflow:
            # fold the overflowing byte back into the low 24 bits
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
1092
# candidate hash-table sizes: (size, poly) pairs where size is a power
# of two and poly is the polynomial Hash uses to fold the probe
# increment when it overflows the table mask
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1099
class Hash:
    """A precomputed static hash table mapping string keys to int values.

    Reimplements the probing scheme of Python's built-in dictionary over
    a fixed-size table, using the custom string hash myhash(); the table
    and its magic/size/poly parameters are dumped as C data.
    """

    def __init__(self, name, data, magic):
        """Build the table for *data*, a list of (key, value) pairs;
        *magic* is the multiplier passed to myhash()."""
        # turn a (key, value) list into a static hash table structure

        # determine table size: first entry in SIZES strictly larger
        # than the data set, together with its probe polynomial
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0   # collision counter

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: open addressing with a doubling increment,
            # folded with the polynomial when it overflows the mask
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots are encoded as 0 in the emitted C array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        """Write the table plus its #define parameters to *file* as C."""
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1163
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001164# stuff to deal with arrays of unsigned integers
1165
class Array:
    """A named sequence of unsigned integers, dumpable as a static C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as C source, using the narrowest
        element type that fits; with trace, report the size on stderr."""
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static %s %s[] = {\n" % (ctype, self.name))
        if self.data:
            line = "    "
            for item in self.data:
                text = str(item) + ", "
                if len(line) + len(text) > 78:
                    # current line full: flush it and start a fresh one
                    file.write(line + "\n")
                    line = "    " + text
                else:
                    line = line + text
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1197
def getsize(data):
    """Return the smallest C element size (1, 2 or 4 bytes) that can
    hold every value in *data*.  An empty array fits in unsigned chars."""
    if not data:
        # max() raises ValueError on an empty sequence; any element
        # type works for an empty array, so report the smallest
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
    if trace:
        def dump(t1, t2, shift, nbytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, nbytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t),
              "bytes", file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    # "best_size"/"chunk" renamed from "bytes"/"bin": don't shadow builtins
    best_size = sys.maxsize # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            chunk = t[i:i+size]
            index = bincache.get(chunk)
            if index is None:
                # first occurrence of this bin: append it to t2
                index = len(t2)
                bincache[chunk] = index
                t2.extend(chunk)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < best_size:
            best = t1, t2, shift
            best_size = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, best_size)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001268
if __name__ == "__main__":
    # regenerate the database headers
    # NOTE(review): the argument is presumably a trace level threaded
    # through to dump()/splitbins() — confirm against maketables()
    maketables(1)