#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import re
import sys

SCRIPT = sys.argv[0]
VERSION = "2.5"

# The Unicode Database
UNIDATA_VERSION = "4.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
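
# Illustrative note (not part of the original script): a type record's flags
# field is simply the bitwise OR of the masks above.  For example, an
# uppercase letter that is also a valid identifier start would be encoded as
#
#   ALPHA_MASK | UPPER_MASK | XID_START_MASK | XID_CONTINUE_MASK   # == 0x381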

def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          DERIVED_CORE_PROPERTIES % version)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  DERIVED_CORE_PROPERTIES % ("-"+version))
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
            (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title
            if record[12]:
                upper = int(record[12], 16) - char
                assert -32768 <= upper <= 32767
                upper = upper & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
                assert -32768 <= lower <= 32767
                lower = lower & 0xffff
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
                assert -32768 <= title <= 32767
                title = title & 0xffff
            else:
                title = 0
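            # Note (an assumption, not part of the original script): the
            # deltas above are stored as 16-bit two's-complement values, so a
            # consumer recovers e.g. the uppercase mapping by adding the delta
            # back as a signed 16-bit quantity, roughly ch + (short)upper.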
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def CmpToKey(mycmp):
    'Convert a cmp= function into a key= function'
    class K(object):
        def __init__(self, obj, *args):
            self.obj = obj
        def __lt__(self, other):
            return mycmp(self.obj, other.obj) == -1
    return K

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len([n for n in names if n is not None]), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte.)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def cmpwords(a,b):
        aword, alist = a
        bword, blist = b
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(key=CmpToKey(cmpwords))

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1
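
    # Worked example (illustrative numbers, not from the original script): if
    # short were 254, a word index of 300 would be stored as the two bytes
    # (300>>8)+254 == 255 and 300&255 == 44, and decoded again as
    # ((255-254)<<8) + 44 == 300.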

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, numeric_changes)),
                        normalization_changes))


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

import sys

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, filename, exclusions, eastasianwidth,
                 derivedprops, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        for s in open(derivedprops):
            s = s.split('#', 1)[0].strip()
            if not s:
                continue

            r, p = s.split(";")
            r = r.strip()
            p = p.strip()
            if ".." in r:
                first, last = [int(c, 16) for c in r.split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(r, 16)]
            for char in chars:
                if table[char]:
                    # Some properties (e.g. Default_Ignorable_Code_Point)
                    # apply to unassigned code points; ignore them
                    table[char][-1].add(p)

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.
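
# Illustrative sketch (an assumption, not part of the original script): a
# consumer of the generated table would retrace the same probe sequence used
# during construction below, along the lines of
#
#   def lookup(table, key, magic, poly, mask):
#       h = myhash(key, magic)
#       i = (~h) & mask
#       incr = ((h ^ (h >> 3)) & mask) or mask
#       while table[i] and not matches(table[i], key):   # matches() is hypothetical
#           i = (i + incr) & mask
#           incr = incr << 1
#           if incr > mask:
#               incr = incr ^ poly
#       return table[i]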

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
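    # Illustrative example (not part of the original script): for a small
    # table the documented invariant can be checked directly, e.g.
    #
    #   t1, t2, shift = splitbins([0, 0, 0, 0, 5, 5, 5, 5])
    #   mask = (1 << shift) - 1
    #   assert [t2[(t1[i >> shift] << shift) + (i & mask)]
    #           for i in range(8)] == [0, 0, 0, 0, 5, 5, 5, 5]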

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), \
            "bytes", file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)