blob: f080ca2da3373a376a6cb0eebf580c6034bb3373 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Fredrik Lundhcfcea492000-09-25 08:07:06 +000023#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000024# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000025#
26
27import sys
28
29SCRIPT = sys.argv[0]
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000030VERSION = "2.5"
Fredrik Lundhf367cac2000-09-24 23:18:31 +000031
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000032# The Unicode Database
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000033UNIDATA_VERSION = "4.1.0"
34UNICODE_DATA = "UnicodeData%s.txt"
35COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
36EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
Martin v. Löwis13c3e382007-08-14 22:37:03 +000037DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000038
39old_versions = ["3.2.0"]
Fredrik Lundhf367cac2000-09-24 23:18:31 +000040
41CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
42 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
43 "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
44 "So" ]
45
46BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
47 "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
48 "ON" ]
49
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +000050EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
51
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000052# note: should match definitions in Objects/unicodectype.c
Fredrik Lundhe9133f72000-09-25 17:59:57 +000053ALPHA_MASK = 0x01
54DECIMAL_MASK = 0x02
55DIGIT_MASK = 0x04
56LOWER_MASK = 0x08
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000057LINEBREAK_MASK = 0x10
Fredrik Lundhe9133f72000-09-25 17:59:57 +000058SPACE_MASK = 0x20
59TITLE_MASK = 0x40
60UPPER_MASK = 0x80
Martin v. Löwis13c3e382007-08-14 22:37:03 +000061XID_START_MASK = 0x100
62XID_CONTINUE_MASK = 0x200
Fredrik Lundhe9133f72000-09-25 17:59:57 +000063
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000064def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000065
Collin Winter6afaeb72007-08-03 17:06:41 +000066 print("--- Reading", UNICODE_DATA % "", "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000067
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000068 version = ""
69 unicode = UnicodeData(UNICODE_DATA % version,
70 COMPOSITION_EXCLUSIONS % version,
Martin v. Löwis13c3e382007-08-14 22:37:03 +000071 EASTASIAN_WIDTH % version,
72 DERIVED_CORE_PROPERTIES % version)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000073
Martin v. Löwis13c3e382007-08-14 22:37:03 +000074 print(len(list(filter(None, unicode.table))), "characters")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000075
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000076 for version in old_versions:
Collin Winter6afaeb72007-08-03 17:06:41 +000077 print("--- Reading", UNICODE_DATA % ("-"+version), "...")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078 old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
79 COMPOSITION_EXCLUSIONS % ("-"+version),
Martin v. Löwis13c3e382007-08-14 22:37:03 +000080 EASTASIAN_WIDTH % ("-"+version),
81 DERIVED_CORE_PROPERTIES % ("-"+version))
82 print(len(list(filter(None, old_unicode.table))), "characters")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000083 merge_old_version(version, unicode, old_unicode)
84
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000085 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000086 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000087 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000088
89# --------------------------------------------------------------------
90# unicode character properties
91
92def makeunicodedata(unicode, trace):
93
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +000094 dummy = (0, 0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000095 table = [dummy]
96 cache = {0: dummy}
97 index = [0] * len(unicode.chars)
98
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000099 FILE = "Modules/unicodedata_db.h"
100
Collin Winter6afaeb72007-08-03 17:06:41 +0000101 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000102
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000103 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000104
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000105 for char in unicode.chars:
106 record = unicode.table[char]
107 if record:
108 # extract database properties
109 category = CATEGORY_NAMES.index(record[2])
110 combining = int(record[3])
111 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
112 mirrored = record[9] == "Y"
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000113 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000114 item = (
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000115 category, combining, bidirectional, mirrored, eastasianwidth
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000116 )
117 # add entry to index and item tables
118 i = cache.get(item)
119 if i is None:
120 cache[item] = i = len(table)
121 table.append(item)
122 index[char] = i
123
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000124 # 2) decomposition data
125
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000126 decomp_data = [0]
127 decomp_prefix = [""]
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000128 decomp_index = [0] * len(unicode.chars)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000129 decomp_size = 0
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000130
Martin v. Löwis677bde22002-11-23 22:08:15 +0000131 comp_pairs = []
132 comp_first = [None] * len(unicode.chars)
133 comp_last = [None] * len(unicode.chars)
134
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000135 for char in unicode.chars:
136 record = unicode.table[char]
137 if record:
138 if record[5]:
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000139 decomp = record[5].split()
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000140 if len(decomp) > 19:
Collin Wintera817e582007-08-22 23:05:06 +0000141 raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000142 # prefix
143 if decomp[0][0] == "<":
144 prefix = decomp.pop(0)
145 else:
146 prefix = ""
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000147 try:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000148 i = decomp_prefix.index(prefix)
149 except ValueError:
150 i = len(decomp_prefix)
151 decomp_prefix.append(prefix)
152 prefix = i
153 assert prefix < 256
154 # content
155 decomp = [prefix + (len(decomp)<<8)] +\
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000156 list(map(lambda s: int(s, 16), decomp))
Martin v. Löwis677bde22002-11-23 22:08:15 +0000157 # Collect NFC pairs
158 if not prefix and len(decomp) == 3 and \
159 char not in unicode.exclusions and \
160 unicode.table[decomp[1]][3] == "0":
161 p, l, r = decomp
162 comp_first[l] = 1
163 comp_last[r] = 1
164 comp_pairs.append((l,r,char))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000165 try:
166 i = decomp_data.index(decomp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000167 except ValueError:
168 i = len(decomp_data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000169 decomp_data.extend(decomp)
170 decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000171 else:
172 i = 0
173 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000174
Martin v. Löwis677bde22002-11-23 22:08:15 +0000175 f = l = 0
176 comp_first_ranges = []
177 comp_last_ranges = []
178 prev_f = prev_l = None
179 for i in unicode.chars:
180 if comp_first[i] is not None:
181 comp_first[i] = f
182 f += 1
183 if prev_f is None:
184 prev_f = (i,i)
185 elif prev_f[1]+1 == i:
186 prev_f = prev_f[0],i
187 else:
188 comp_first_ranges.append(prev_f)
189 prev_f = (i,i)
190 if comp_last[i] is not None:
191 comp_last[i] = l
192 l += 1
193 if prev_l is None:
194 prev_l = (i,i)
195 elif prev_l[1]+1 == i:
196 prev_l = prev_l[0],i
197 else:
198 comp_last_ranges.append(prev_l)
199 prev_l = (i,i)
200 comp_first_ranges.append(prev_f)
201 comp_last_ranges.append(prev_l)
202 total_first = f
203 total_last = l
204
205 comp_data = [0]*(total_first*total_last)
206 for f,l,char in comp_pairs:
207 f = comp_first[f]
208 l = comp_last[l]
209 comp_data[f*total_last+l] = char
210
Collin Winter6afaeb72007-08-03 17:06:41 +0000211 print(len(table), "unique properties")
212 print(len(decomp_prefix), "unique decomposition prefixes")
213 print(len(decomp_data), "unique decomposition entries:", end=' ')
214 print(decomp_size, "bytes")
215 print(total_first, "first characters in NFC")
216 print(total_last, "last characters in NFC")
217 print(len(comp_pairs), "NFC pairs")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000218
Collin Winter6afaeb72007-08-03 17:06:41 +0000219 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000220
Fred Drake9c685052000-10-26 03:56:46 +0000221 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000222 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
223 print(file=fp)
224 print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
225 print("/* a list of unique database records */", file=fp)
226 print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000227 for item in table:
Collin Winter6afaeb72007-08-03 17:06:41 +0000228 print(" {%d, %d, %d, %d, %d}," % item, file=fp)
229 print("};", file=fp)
230 print(file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000231
Collin Winter6afaeb72007-08-03 17:06:41 +0000232 print("/* Reindexing of NFC first characters. */", file=fp)
233 print("#define TOTAL_FIRST",total_first, file=fp)
234 print("#define TOTAL_LAST",total_last, file=fp)
235 print("struct reindex{int start;short count,index;};", file=fp)
236 print("struct reindex nfc_first[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000237 for start,end in comp_first_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000238 print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
239 print(" {0,0,0}", file=fp)
240 print("};\n", file=fp)
241 print("struct reindex nfc_last[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000242 for start,end in comp_last_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000243 print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
244 print(" {0,0,0}", file=fp)
245 print("};\n", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000246
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000247 # FIXME: <fl> the following tables could be made static, and
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000248 # the support code moved into unicodedatabase.c
249
Collin Winter6afaeb72007-08-03 17:06:41 +0000250 print("/* string literals */", file=fp)
251 print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000252 for name in CATEGORY_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000253 print(" \"%s\"," % name, file=fp)
254 print(" NULL", file=fp)
255 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000256
Collin Winter6afaeb72007-08-03 17:06:41 +0000257 print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000258 for name in BIDIRECTIONAL_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000259 print(" \"%s\"," % name, file=fp)
260 print(" NULL", file=fp)
261 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000262
Collin Winter6afaeb72007-08-03 17:06:41 +0000263 print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000264 for name in EASTASIANWIDTH_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000265 print(" \"%s\"," % name, file=fp)
266 print(" NULL", file=fp)
267 print("};", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000268
Collin Winter6afaeb72007-08-03 17:06:41 +0000269 print("static const char *decomp_prefix[] = {", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000270 for name in decomp_prefix:
Collin Winter6afaeb72007-08-03 17:06:41 +0000271 print(" \"%s\"," % name, file=fp)
272 print(" NULL", file=fp)
273 print("};", file=fp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000274
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000275 # split record index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000276 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000277
Collin Winter6afaeb72007-08-03 17:06:41 +0000278 print("/* index tables for the database records */", file=fp)
279 print("#define SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000280 Array("index1", index1).dump(fp, trace)
281 Array("index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000282
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000283 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000284 index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000285
Collin Winter6afaeb72007-08-03 17:06:41 +0000286 print("/* decomposition data */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000287 Array("decomp_data", decomp_data).dump(fp, trace)
288
Collin Winter6afaeb72007-08-03 17:06:41 +0000289 print("/* index tables for the decomposition data */", file=fp)
290 print("#define DECOMP_SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000291 Array("decomp_index1", index1).dump(fp, trace)
292 Array("decomp_index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000293
Martin v. Löwis677bde22002-11-23 22:08:15 +0000294 index, index2, shift = splitbins(comp_data, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000295 print("/* NFC pairs */", file=fp)
296 print("#define COMP_SHIFT", shift, file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000297 Array("comp_index", index).dump(fp, trace)
298 Array("comp_data", index2).dump(fp, trace)
299
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000300 # Generate delta tables for old versions
301 for version, table, normalization in unicode.changed:
302 cversion = version.replace(".","_")
303 records = [table[0]]
304 cache = {table[0]:0}
305 index = [0] * len(table)
306 for i, record in enumerate(table):
307 try:
308 index[i] = cache[record]
309 except KeyError:
310 index[i] = cache[record] = len(records)
311 records.append(record)
312 index1, index2, shift = splitbins(index, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000313 print("static const change_record change_records_%s[] = {" % cversion, file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000314 for record in records:
Collin Winter6afaeb72007-08-03 17:06:41 +0000315 print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
316 print("};", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000317 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
318 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000319 print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
320 print("{", file=fp)
321 print("\tint index;", file=fp)
322 print("\tif (n >= 0x110000) index = 0;", file=fp)
323 print("\telse {", file=fp)
324 print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
325 print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
326 (cversion, shift, ((1<<shift)-1)), file=fp)
327 print("\t}", file=fp)
328 print("\treturn change_records_%s+index;" % cversion, file=fp)
329 print("}\n", file=fp)
330 print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
331 print("{", file=fp)
332 print("\tswitch(n) {", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000333 for k, v in normalization:
Collin Winter6afaeb72007-08-03 17:06:41 +0000334 print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
335 print("\tdefault: return 0;", file=fp)
336 print("\t}\n}\n", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000337
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000338 fp.close()
339
340# --------------------------------------------------------------------
341# unicode character type tables
342
343def makeunicodetype(unicode, trace):
344
345 FILE = "Objects/unicodetype_db.h"
346
Collin Winter6afaeb72007-08-03 17:06:41 +0000347 print("--- Preparing", FILE, "...")
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000348
349 # extract unicode types
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000350 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000351 table = [dummy]
352 cache = {0: dummy}
353 index = [0] * len(unicode.chars)
354
355 for char in unicode.chars:
356 record = unicode.table[char]
357 if record:
358 # extract database properties
359 category = record[2]
360 bidirectional = record[4]
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000361 properties = record[16]
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000362 flags = 0
363 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
364 flags |= ALPHA_MASK
365 if category == "Ll":
366 flags |= LOWER_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000367 if category == "Zl" or bidirectional == "B":
368 flags |= LINEBREAK_MASK
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000369 if category == "Zs" or bidirectional in ("WS", "B", "S"):
370 flags |= SPACE_MASK
Fredrik Lundh375732c2000-09-25 23:03:34 +0000371 if category == "Lt":
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000372 flags |= TITLE_MASK
373 if category == "Lu":
374 flags |= UPPER_MASK
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000375 if "XID_Start" in properties:
376 flags |= XID_START_MASK
377 if "XID_Continue" in properties:
378 flags |= XID_CONTINUE_MASK
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000379 # use delta predictor for upper/lower/title
380 if record[12]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000381 upper = int(record[12], 16) - char
382 assert -32768 <= upper <= 32767
383 upper = upper & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000384 else:
385 upper = 0
386 if record[13]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000387 lower = int(record[13], 16) - char
388 assert -32768 <= lower <= 32767
389 lower = lower & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000390 else:
391 lower = 0
392 if record[14]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000393 title = int(record[14], 16) - char
394 assert -32768 <= lower <= 32767
395 title = title & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000396 else:
397 title = 0
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000398 # decimal digit, integer digit
399 decimal = 0
400 if record[6]:
401 flags |= DECIMAL_MASK
402 decimal = int(record[6])
403 digit = 0
404 if record[7]:
405 flags |= DIGIT_MASK
406 digit = int(record[7])
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000407 item = (
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000408 upper, lower, title, decimal, digit, flags
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000409 )
410 # add entry to index and item tables
411 i = cache.get(item)
412 if i is None:
413 cache[item] = i = len(table)
414 table.append(item)
415 index[char] = i
416
Collin Winter6afaeb72007-08-03 17:06:41 +0000417 print(len(table), "unique character type entries")
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000418
Collin Winter6afaeb72007-08-03 17:06:41 +0000419 print("--- Writing", FILE, "...")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000420
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000421 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000422 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
423 print(file=fp)
424 print("/* a list of unique character type descriptors */", file=fp)
425 print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000426 for item in table:
Collin Winter6afaeb72007-08-03 17:06:41 +0000427 print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
428 print("};", file=fp)
429 print(file=fp)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000430
431 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000432 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000433
Collin Winter6afaeb72007-08-03 17:06:41 +0000434 print("/* type indexes */", file=fp)
435 print("#define SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000436 Array("index1", index1).dump(fp, trace)
437 Array("index2", index2).dump(fp, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000438
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000439 fp.close()
440
441# --------------------------------------------------------------------
442# unicode name database
443
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000444def CmpToKey(mycmp):
445 'Convert a cmp= function into a key= function'
446 class K(object):
447 def __init__(self, obj, *args):
448 self.obj = obj
449 def __lt__(self, other):
450 return mycmp(self.obj, other.obj) == -1
451 return K
452
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000453def makeunicodename(unicode, trace):
454
455 FILE = "Modules/unicodename_db.h"
456
Collin Winter6afaeb72007-08-03 17:06:41 +0000457 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000458
459 # collect names
460 names = [None] * len(unicode.chars)
461
462 for char in unicode.chars:
463 record = unicode.table[char]
464 if record:
465 name = record[1].strip()
466 if name and name[0] != "<":
467 names[char] = name + chr(0)
468
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000469 print(len(list(filter(lambda n: n is not None, names))), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000470
471 # collect unique words from names (note that we differ between
472 # words inside a sentence, and words ending a sentence. the
473 # latter includes the trailing null byte.
474
475 words = {}
476 n = b = 0
477 for char in unicode.chars:
478 name = names[char]
479 if name:
480 w = name.split()
481 b = b + len(name)
482 n = n + len(w)
483 for w in w:
484 l = words.get(w)
485 if l:
486 l.append(None)
487 else:
488 words[w] = [len(words)]
489
Collin Winter6afaeb72007-08-03 17:06:41 +0000490 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000491
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000492 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000493
Martin v. Löwis97225da2002-11-24 23:05:09 +0000494 # sort on falling frequency, then by name
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000495 def cmpwords(a,b):
496 aword, alist = a
497 bword, blist = b
Martin v. Löwis97225da2002-11-24 23:05:09 +0000498 r = -cmp(len(alist),len(blist))
499 if r:
500 return r
501 return cmp(aword, bword)
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000502 wordlist.sort(key=CmpToKey(cmpwords))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000503
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000504 # figure out how many phrasebook escapes we need
505 escapes = 0
506 while escapes * 256 < len(wordlist):
507 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000508 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000509
510 short = 256 - escapes
511
512 assert short > 0
513
Collin Winter6afaeb72007-08-03 17:06:41 +0000514 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000515
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000516 # statistics
517 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000518 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000519 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000520 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000521
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000522 # pick the most commonly used words, and sort the rest on falling
523 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000524
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000525 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000526 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000527 wordlist.extend(wordtail)
528
529 # generate lexicon from words
530
531 lexicon_offset = [0]
532 lexicon = ""
533 words = {}
534
535 # build a lexicon string
536 offset = 0
537 for w, x in wordlist:
538 # encoding: bit 7 indicates last character in word (chr(128)
539 # indicates the last character in an entire string)
540 ww = w[:-1] + chr(ord(w[-1])+128)
541 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000542 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000543 if o < 0:
544 o = offset
545 lexicon = lexicon + ww
546 offset = offset + len(w)
547 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000548 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000549
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000550 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000551
552 # generate phrasebook from names and lexicon
553 phrasebook = [0]
554 phrasebook_offset = [0] * len(unicode.chars)
555 for char in unicode.chars:
556 name = names[char]
557 if name:
558 w = name.split()
559 phrasebook_offset[char] = len(phrasebook)
560 for w in w:
561 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000562 if i < short:
563 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000564 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000565 # store as two bytes
566 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000567 phrasebook.append(i&255)
568
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000569 assert getsize(phrasebook) == 1
570
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000571 #
572 # unicode name hash table
573
574 # extract names
575 data = []
576 for char in unicode.chars:
577 record = unicode.table[char]
578 if record:
579 name = record[1].strip()
580 if name and name[0] != "<":
581 data.append((name, char))
582
583 # the magic number 47 was chosen to minimize the number of
584 # collisions on the current data set. if you like, change it
585 # and see what happens...
586
587 codehash = Hash("code", data, 47)
588
Collin Winter6afaeb72007-08-03 17:06:41 +0000589 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000590
591 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000592 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
593 print(file=fp)
594 print("#define NAME_MAXLEN", 256, file=fp)
595 print(file=fp)
596 print("/* lexicon */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000597 Array("lexicon", lexicon).dump(fp, trace)
598 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000599
600 # split decomposition index table
601 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
602
Collin Winter6afaeb72007-08-03 17:06:41 +0000603 print("/* code->name phrasebook */", file=fp)
604 print("#define phrasebook_shift", shift, file=fp)
605 print("#define phrasebook_short", short, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000606
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000607 Array("phrasebook", phrasebook).dump(fp, trace)
608 Array("phrasebook_offset1", offset1).dump(fp, trace)
609 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000610
Collin Winter6afaeb72007-08-03 17:06:41 +0000611 print("/* name->code dictionary */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000612 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000613
614 fp.close()
615
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000616
617def merge_old_version(version, new, old):
618 # Changes to exclusion file not implemented yet
619 if old.exclusions != new.exclusions:
Collin Wintera817e582007-08-22 23:05:06 +0000620 raise NotImplementedError("exclusions differ")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000621
622 # In these change records, 0xFF means "no change"
623 bidir_changes = [0xFF]*0x110000
624 category_changes = [0xFF]*0x110000
625 decimal_changes = [0xFF]*0x110000
626 # In numeric data, 0 means "no change",
627 # -1 means "did not have a numeric value
628 numeric_changes = [0] * 0x110000
629 # normalization_changes is a list of key-value pairs
630 normalization_changes = []
631 for i in range(0x110000):
632 if new.table[i] is None:
633 # Characters unassigned in the new version ought to
634 # be unassigned in the old one
635 assert old.table[i] is None
636 continue
637 # check characters unassigned in the old version
638 if old.table[i] is None:
639 # category 0 is "unassigned"
640 category_changes[i] = 0
641 continue
642 # check characters that differ
643 if old.table[i] != new.table[i]:
644 for k in range(len(old.table[i])):
645 if old.table[i][k] != new.table[i][k]:
646 value = old.table[i][k]
647 if k == 2:
648 #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
649 category_changes[i] = CATEGORY_NAMES.index(value)
650 elif k == 4:
651 #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
652 bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
653 elif k == 5:
654 #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
655 # We assume that all normalization changes are in 1:1 mappings
656 assert " " not in value
657 normalization_changes.append((i, value))
658 elif k == 6:
659 #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
660 # we only support changes where the old value is a single digit
661 assert value in "0123456789"
662 decimal_changes[i] = int(value)
663 elif k == 8:
664 # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
665 # Since 0 encodes "no change", the old value is better not 0
666 assert value != "0" and value != "-1"
667 if not value:
668 numeric_changes[i] = -1
669 else:
670 assert re.match("^[0-9]+$", value)
671 numeric_changes[i] = int(value)
672 elif k == 11:
673 # change to ISO comment, ignore
674 pass
675 elif k == 12:
676 # change to simple uppercase mapping; ignore
677 pass
678 elif k == 13:
679 # change to simple lowercase mapping; ignore
680 pass
681 elif k == 14:
682 # change to simple titlecase mapping; ignore
683 pass
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000684 elif k == 16:
685 # derived property changes; not yet
686 pass
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000687 else:
688 class Difference(Exception):pass
Collin Wintera817e582007-08-22 23:05:06 +0000689 raise Difference(hex(i), k, old.table[i], new.table[i])
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000690 new.changed.append((version, list(zip(bidir_changes, category_changes,
691 decimal_changes, numeric_changes)),
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000692 normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000693
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000694
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000695# --------------------------------------------------------------------
696# the following support code is taken from the unidb utilities
697# Copyright (c) 1999-2000 by Secret Labs AB
698
699# load a unicode-data file from disk
700
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000701import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000702
703class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000704 # Record structure:
705 # [ID, name, category, combining, bidi, decomp, (6)
706 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
707 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
708 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000709
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000710 def __init__(self, filename, exclusions, eastasianwidth,
711 derivedprops, expand=1):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000712 self.changed = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000713 file = open(filename)
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000714 table = [None] * 0x110000
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000715 while 1:
716 s = file.readline()
717 if not s:
718 break
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000719 s = s.strip().split(";")
720 char = int(s[0], 16)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000721 table[char] = s
722
Martin v. Löwis97225da2002-11-24 23:05:09 +0000723 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000724 if expand:
725 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000726 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000727 s = table[i]
728 if s:
729 if s[1][-6:] == "First>":
730 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000731 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000732 elif s[1][-5:] == "Last>":
733 s[1] = ""
734 field = None
735 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000736 f2 = field[:]
737 f2[0] = "%X" % i
738 table[i] = f2
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000739
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000740 # public attributes
741 self.filename = filename
742 self.table = table
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000743 self.chars = range(0x110000) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000744
Martin v. Löwis677bde22002-11-23 22:08:15 +0000745 file = open(exclusions)
746 self.exclusions = {}
747 for s in file:
748 s = s.strip()
749 if not s:
750 continue
751 if s[0] == '#':
752 continue
753 char = int(s.split()[0],16)
754 self.exclusions[char] = 1
755
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000756 widths = [None] * 0x110000
757 for s in open(eastasianwidth):
758 s = s.strip()
759 if not s:
760 continue
761 if s[0] == '#':
762 continue
763 s = s.split()[0].split(';')
764 if '..' in s[0]:
765 first, last = [int(c, 16) for c in s[0].split('..')]
766 chars = range(first, last+1)
767 else:
768 chars = [int(s[0], 16)]
769 for char in chars:
770 widths[char] = s[1]
771 for i in range(0, 0x110000):
772 if table[i] is not None:
773 table[i].append(widths[i])
774
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000775 for i in range(0, 0x110000):
776 if table[i] is not None:
777 table[i].append(set())
778 for s in open(derivedprops):
779 s = s.split('#', 1)[0].strip()
780 if not s:
781 continue
782
783 r, p = s.split(";")
784 r = r.strip()
785 p = p.strip()
786 if ".." in r:
787 first, last = [int(c, 16) for c in r.split('..')]
788 chars = range(first, last+1)
789 else:
790 chars = [int(r, 16)]
791 for char in chars:
792 if table[char]:
793 # Some properties (e.g. Default_Ignorable_Code_Point)
794 # apply to unassigned code points; ignore them
795 table[char][-1].add(p)
796
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000797 def uselatin1(self):
798 # restrict character range to ISO Latin 1
799 self.chars = range(256)
800
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000801# hash table tools
802
803# this is a straight-forward reimplementation of Python's built-in
804# dictionary type, using a static data structure, and a custom string
805# hash algorithm.
806
807def myhash(s, magic):
808 h = 0
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000809 for c in map(ord, s.upper()):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000810 h = (h * magic) + c
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000811 ix = h & 0xff000000
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000812 if ix:
813 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
814 return h
815
816SIZES = [
817 (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
818 (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
819 (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
820 (2097152,5), (4194304,3), (8388608,33), (16777216,27)
821]
822
823class Hash:
824 def __init__(self, name, data, magic):
825 # turn a (key, value) list into a static hash table structure
826
827 # determine table size
828 for size, poly in SIZES:
829 if size > len(data):
830 poly = size + poly
831 break
832 else:
Collin Wintera817e582007-08-22 23:05:06 +0000833 raise AssertionError("ran out of polynominals")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000834
Collin Winter6afaeb72007-08-03 17:06:41 +0000835 print(size, "slots in hash table")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000836
837 table = [None] * size
838
839 mask = size-1
840
841 n = 0
842
843 hash = myhash
844
845 # initialize hash table
846 for key, value in data:
847 h = hash(key, magic)
848 i = (~h) & mask
849 v = table[i]
850 if v is None:
851 table[i] = value
852 continue
853 incr = (h ^ (h >> 3)) & mask;
854 if not incr:
855 incr = mask
856 while 1:
857 n = n + 1
858 i = (i + incr) & mask
859 v = table[i]
860 if v is None:
861 table[i] = value
862 break
863 incr = incr << 1
864 if incr > mask:
865 incr = incr ^ poly
866
Collin Winter6afaeb72007-08-03 17:06:41 +0000867 print(n, "collisions")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000868 self.collisions = n
869
870 for i in range(len(table)):
871 if table[i] is None:
872 table[i] = 0
873
874 self.data = Array(name + "_hash", table)
875 self.magic = magic
876 self.name = name
877 self.size = size
878 self.poly = poly
879
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000880 def dump(self, file, trace):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000881 # write data to file, as a C array
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000882 self.data.dump(file, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000883 file.write("#define %s_magic %d\n" % (self.name, self.magic))
884 file.write("#define %s_size %d\n" % (self.name, self.size))
885 file.write("#define %s_poly %d\n" % (self.name, self.poly))
886
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000887# stuff to deal with arrays of unsigned integers
888
889class Array:
890
891 def __init__(self, name, data):
892 self.name = name
893 self.data = data
894
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000895 def dump(self, file, trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000896 # write data to file, as a C array
897 size = getsize(self.data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000898 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +0000899 print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000900 file.write("static ")
901 if size == 1:
902 file.write("unsigned char")
903 elif size == 2:
904 file.write("unsigned short")
905 else:
906 file.write("unsigned int")
907 file.write(" " + self.name + "[] = {\n")
908 if self.data:
909 s = " "
910 for item in self.data:
911 i = str(item) + ", "
912 if len(s) + len(i) > 78:
913 file.write(s + "\n")
914 s = " " + i
915 else:
916 s = s + i
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000917 if s.strip():
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000918 file.write(s + "\n")
919 file.write("};\n\n")
920
921def getsize(data):
922 # return smallest possible integer size for the given array
923 maxdata = max(data)
924 if maxdata < 256:
925 return 1
926 elif maxdata < 65536:
927 return 2
928 else:
929 return 4
930
Tim Peters21013482000-09-25 07:13:41 +0000931def splitbins(t, trace=0):
932 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
933
934 t is a sequence of ints. This function can be useful to save space if
935 many of the ints are the same. t1 and t2 are lists of ints, and shift
936 is an int, chosen to minimize the combined size of t1 and t2 (in C
937 code), and where for each i in range(len(t)),
938 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
939 where mask is a bitmask isolating the last "shift" bits.
940
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000941 If optional arg trace is non-zero (default zero), progress info
942 is printed to sys.stderr. The higher the value, the more info
943 you'll get.
Tim Peters21013482000-09-25 07:13:41 +0000944 """
945
946 import sys
947 if trace:
948 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +0000949 print("%d+%d bins at shift %d; %d bytes" % (
950 len(t1), len(t2), shift, bytes), file=sys.stderr)
951 print("Size of original table:", len(t)*getsize(t), \
952 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +0000953 n = len(t)-1 # last valid index
954 maxshift = 0 # the most we can shift n and still have something left
955 if n > 0:
956 while n >> 1:
957 n >>= 1
958 maxshift += 1
959 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +0000960 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +0000961 t = tuple(t) # so slices can be dict keys
962 for shift in range(maxshift + 1):
963 t1 = []
964 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000965 size = 2**shift
966 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +0000967 for i in range(0, len(t), size):
968 bin = t[i:i+size]
969 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000970 if index is None:
Tim Peters21013482000-09-25 07:13:41 +0000971 index = len(t2)
972 bincache[bin] = index
973 t2.extend(bin)
974 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000975 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +0000976 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000977 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +0000978 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000979 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +0000980 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000981 bytes = b
Tim Peters21013482000-09-25 07:13:41 +0000982 t1, t2, shift = best
983 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +0000984 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +0000985 dump(t1, t2, shift, bytes)
986 if __debug__:
987 # exhaustively verify that the decomposition is correct
988 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +0000989 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +0000990 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
991 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000992
993if __name__ == "__main__":
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000994 maketables(1)