#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
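# usage: run this script from the toplevel source directory, with the UCD
# data files (UnicodeData.txt, CompositionExclusions.txt, EastAsianWidth.txt,
# DerivedCoreProperties.txt, plus the "-3.2.0" variants for the versions in
# old_versions) in the current directory; the generated headers are written
# into Modules/ and Objects/.
#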

import re
import sys

SCRIPT = sys.argv[0]
VERSION = "2.5"

# The Unicode Database
UNIDATA_VERSION = "4.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
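
# illustrative example: the type record for "A" (U+0041, category Lu,
# XID_Start, XID_Continue) ends up with flags
# ALPHA_MASK | UPPER_MASK | XID_START_MASK | XID_CONTINUE_MASK | PRINTABLE_MASK,
# i.e. 0x01 | 0x80 | 0x100 | 0x200 | 0x400 == 0x781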

def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          DERIVED_CORE_PROPERTIES % version)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  DERIVED_CORE_PROPERTIES % ("-"+version))
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
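                # the first element of "decomp" packs the prefix index into
                # the low 8 bits and the decomposition length into the next
                # 8 bits; the remaining elements are the decomposed code points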
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title
            if record[12]:
                upper = int(record[12], 16) - char
                assert -32768 <= upper <= 32767
                upper = upper & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
                assert -32768 <= lower <= 32767
                lower = lower & 0xffff
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
397 assert -32768 <= lower <= 32767
                title = title & 0xffff
            else:
                title = 0
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split char type index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def CmpToKey(mycmp):
    'Convert a cmp= function into a key= function'
    class K(object):
        def __init__(self, obj, *args):
            self.obj = obj
        def __lt__(self, other):
            return mycmp(self.obj, other.obj) == -1
    return K

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def cmpwords(a,b):
        aword, alist = a
        bword, blist = b
        r = -((len(alist) > len(blist)) - (len(alist) < len(blist)))
        if r:
            return r
        return (aword > bword) - (aword < bword)
    wordlist.sort(key=CmpToKey(cmpwords))

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes
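
    # phrasebook encoding (see below): a word index below "short" is stored
    # as a single byte; a larger index is stored as an escape byte holding
    # (index >> 8) + short, followed by a byte holding the low 8 bits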

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, numeric_changes)),
                       normalization_changes))


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

import sys

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)
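    #
    # a raw UnicodeData.txt line looks roughly like
    #   0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
    # fields 15 (ea-width) and 16 (derived-props set) are appended to each
    # record by the constructor below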

    def __init__(self, filename, exclusions, eastasianwidth,
                 derivedprops, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        for s in open(derivedprops):
            s = s.split('#', 1)[0].strip()
            if not s:
                continue

            r, p = s.split(";")
            r = r.strip()
            p = p.strip()
            if ".." in r:
                first, last = [int(c, 16) for c in r.split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(r, 16)]
            for char in chars:
                if table[char]:
                    # Some properties (e.g. Default_Ignorable_Code_Point)
                    # apply to unassigned code points; ignore them
                    table[char][-1].add(p)

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.
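#
# collisions are resolved by open addressing: the probe increment is
# derived from the hash value, doubled after each miss, and folded back
# into the table range with the per-size polynomial from SIZES (see
# Hash.__init__ below)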

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.
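
    Illustrative example: with shift=1, t=[5, 5, 5, 5, 5, 5, 6, 7] splits
    into t2=[5, 5, 6, 7] and t1=[0, 0, 0, 1]; the three identical (5, 5)
    bins share a single entry in t2, and
    t[6] == t2[(t1[6 >> 1] << 1) + (6 & 1)] == t2[2] == 6.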

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), \
            "bytes", file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)