blob: 0aabdf738c127ecac8b699126a1ac5a9c325469a [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
26
import re
import sys
28
SCRIPT = sys.argv[0]   # path of this generator (embedded in the output headers)
VERSION = "2.5"        # version of this script, not of Unicode

# The Unicode Database
UNIDATA_VERSION = "4.1.0"
# Input file name templates; "%s" is filled with "" for the current UCD
# or "-<version>" for an archived snapshot (see maketables()).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

# older UCD versions for which delta (change) records are generated
old_versions = ["3.2.0"]

# General-category abbreviations; list position is the numeric code
# written into the C tables, so the order must never change.
# NOTE(review): "Cn" appears at index 0 and 17 — presumably to keep
# historical index values stable; confirm against unicodedata_db.h.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class abbreviations; index 0 (empty string) is "unknown".
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East Asian Width property values (second field of EastAsianWidth.txt)
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Character-type flag bits.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
def maketables(trace=0):
    """Read the UCD input files and regenerate all derived C headers.

    Loads the current Unicode database, then each version listed in
    old_versions (merging delta records into the current data), and
    finally writes the name, data and type headers.
    """

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version)

    # fix: len(filter(...)) fails on Python 3 (filter returns an iterator)
    print(sum(1 for rec in unicode.table if rec), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version))
        print(sum(1 for rec in old_unicode.table if rec), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000083
84# --------------------------------------------------------------------
85# unicode character properties
86
def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h: per-character property records,
    decomposition data, NFC composition tables, and delta tables for the
    old UCD versions recorded in unicode.changed.

    unicode: a UnicodeData instance; trace: passed through to splitbins/Array.
    """

    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    # fix: Python 2 raise syntax replaced
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs prefix index + length, rest are
                # the decomposed codepoints.
                # fix: list + map(...) is a TypeError on Python 3
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs (canonical two-char decompositions that
                # are not excluded and whose first char is a starter)
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints, so
                # index(decomp) never matches a list and always raises —
                # entries are never shared. Kept as-is (output is still
                # correct, only larger); confirm before "fixing".
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # collapse the first/last NFC character sets into contiguous ranges,
    # renumbering each member with a dense index
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # dense 2D table: (first-index, last-index) -> composed character
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
334
335# --------------------------------------------------------------------
336# unicode character type tables
337
def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h: the character-type records
    (case deltas, decimal/digit values, flag bits) plus their two-level
    index tables.

    unicode: a UnicodeData instance; trace: passed through to splitbins/Array.
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title, stored as a
            # 16-bit two's-complement offset from the character itself
            if record[12]:
                upper = int(record[12], 16) - char
                assert -32768 <= upper <= 32767
                upper = upper & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
                assert -32768 <= lower <= 32767
                lower = lower & 0xffff
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
                # fix: copy-paste bug — this assert checked `lower`
                assert -32768 <= title <= 32767
                title = title & 0xffff
            else:
                title = 0
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()
430
431# --------------------------------------------------------------------
432# unicode name database
433
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h: the compressed lexicon and
    phrasebook used for code->name lookup, and the name->code hash table.

    unicode: a UnicodeData instance; trace: passed through to splitbins/Array.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names (skip control/<...> placeholder names)
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    # fix: len(filter(...)) fails on Python 3 (filter returns an iterator)
    print(sum(1 for n in names if n is not None), "distinct names")

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    # sort on falling frequency, then by name
    # fix: Python 2 tuple-parameter cmp function and cmp-sort replaced
    # by an equivalent (and stable) key-based sort on a materialized list
    wordlist = sorted(words.items(), key=lambda item: (-len(item[1]), item[0]))

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    # fix: cmp-based sort replaced by the equivalent key (falling length)
    wordtail.sort(key=lambda item: -len(item[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    # fix: materialize map() so Array() gets a real sequence on Python 3
    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    fp.close()
594
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000595
def merge_old_version(version, new, old):
    """Compute per-character deltas between an old UCD (`old`) and the
    current one (`new`) and append them to new.changed as
    (version, [(bidir, category, decimal, numeric), ...], normalization_changes).
    """

    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        # fix: Python 2 raise syntax replaced
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # Since 0 encodes "no change", the old value is better not 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        class Difference(Exception):pass
                        # fix: Python 2 raise syntax replaced
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    # fix: materialize zip() — makeunicodedata() indexes and re-iterates
    # this table, which a one-shot Python 3 zip iterator cannot support
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, numeric_changes)),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000669
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000670
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000671# --------------------------------------------------------------------
672# the following support code is taken from the unidb utilities
673# Copyright (c) 1999-2000 by Secret Labs AB
674
675# load a unicode-data file from disk
676
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000677import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000678
class UnicodeData:
    """In-memory UCD snapshot loaded from the three input files.

    Attributes:
      table      -- list indexed by codepoint; each entry is the list of
                    UnicodeData.txt fields (or None), with the east asian
                    width appended as an extra trailing field
      chars      -- iterable of all codepoints considered
      exclusions -- dict of composition-exclusion codepoints
      changed    -- delta records, filled in later by merge_old_version()
    """

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        self.changed = []
        table = [None] * 0x110000
        # fix: use context managers — the original leaked all three
        # input file objects
        with open(filename) as file:
            for s in file:
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        # expand first-last ranges: codepoints between a <...First> and
        # <...Last> entry are absent from the file, so fill the gaps with
        # copies of the First record
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        self.exclusions = {}
        with open(exclusions) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
                self.exclusions[char] = 1

        widths = [None] * 0x110000
        with open(eastasianwidth) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = range(first, last+1)
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]
        # append the width as a trailing field of every assigned record
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
748
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000749# hash table tools
750
751# this is a straight-forward reimplementation of Python's built-in
752# dictionary type, using a static data structure, and a custom string
753# hash algorithm.
754
def myhash(s, magic):
    """Case-insensitive string hash, folded to 24 bits.

    Multiplies the running hash by *magic* per character (uppercased);
    whenever bits spill into the top byte, they are XOR-folded back
    into the low 24 bits.
    """
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        overflow = h & 0xff000000
        if overflow:
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
763
# candidate hash table sizes as (size, poly) pairs.  Sizes are powers of
# two; poly is a size-specific constant used by the Hash class below to
# fold the probe increment back into range (incr ^ poly) whenever it
# overflows the table mask during collision resolution.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
770
class Hash:
    """Build a static open-addressing hash table from (key, value) pairs.

    Keys are hashed with myhash(); collisions are resolved by probing
    with a secondary increment derived from the hash, folded with a
    size-specific polynomial (from SIZES) whenever it overflows the
    table mask.  The resulting table can be written out as C source
    with dump().
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: smallest power of two with more slots
        # than there are entries, paired with its probe polynomial
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            # Python 3 raise syntax (was the py2-only "raise X, value",
            # a SyntaxError under the print()-style Python used here)
            raise AssertionError("ran out of polynominals")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0  # collision counter

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # secondary probe increment (never zero)
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots become 0 so the C array holds only integers
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, plus the #defines the C
        # lookup code needs to reproduce the probing sequence
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
834
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000835# stuff to deal with arrays of unsigned integers
836
class Array:
    """A named array of unsigned integers, dumpable as C source."""

    def __init__(self, name, data):
        # name becomes the C identifier; data is the sequence of ints
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as a static C array declaration."""
        itemsize = getsize(self.data)
        if trace:
            print(self.name+":", itemsize*len(self.data), "bytes", file=sys.stderr)
        # pick the narrowest C type that holds every element
        if itemsize == 1:
            ctype = "unsigned char"
        elif itemsize == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static " + ctype + " " + self.name + "[] = {\n")
        if self.data:
            line = " "
            for value in self.data:
                piece = str(value) + ", "
                # wrap the output at roughly 78 columns
                if len(line) + len(piece) > 78:
                    file.write(line + "\n")
                    line = " " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
868
def getsize(data):
    """Return the smallest C integer size (in bytes: 1, 2 or 4) that can
    hold every value in the given array."""
    largest = max(data)
    if largest < 256:
        return 1
    if largest < 65536:
        return 2
    return 4
878
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    # sys is imported at module level; the old function-local
    # "import sys" was redundant
    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t),
              "bytes", file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    # smallest total size so far; sys.maxsize replaces the Python 2-only
    # sys.maxint, which does not exist in Python 3
    bytes = sys.maxsize
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}   # maps each distinct bin to its offset in t2
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)    # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000940
if __name__ == "__main__":
    # regenerate the database files, with tracing enabled
    maketables(1)