#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add NONPRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import re
import sys

SCRIPT = sys.argv[0]
VERSION = "2.5"

# The Unicode Database
UNIDATA_VERSION = "4.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
NONPRINTABLE_MASK = 0x400

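# Illustration only (an assumption about the consumer, not used by this
# script): the "flags" field of a type record built in makeunicodetype()
# below is a bitwise OR of the masks above, so an uppercase letter carries
# at least ALPHA_MASK | UPPER_MASK, and the C code in Objects/unicodectype.c
# is expected to test individual bits, along the lines of
# (flags & UPPER_MASK) != 0.
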
def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          DERIVED_CORE_PROPERTIES % version)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  DERIVED_CORE_PROPERTIES % ("-"+version))
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("    { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("    {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("    { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("    {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            if category[0] == "C":
                flags |= NONPRINTABLE_MASK
            if category[0] == "Z" and char != ord(" "):
                flags |= NONPRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title
            if record[12]:
                upper = int(record[12], 16) - char
                assert -32768 <= upper <= 32767
                upper = upper & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
                assert -32768 <= lower <= 32767
                lower = lower & 0xffff
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
399 assert -32768 <= lower <= 32767
                title = title & 0xffff
            else:
                title = 0
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def CmpToKey(mycmp):
    'Convert a cmp= function into a key= function'
    class K(object):
        def __init__(self, obj, *args):
            self.obj = obj
        def __lt__(self, other):
            return mycmp(self.obj, other.obj) == -1
    return K

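# Decoding sketch (illustration of the format produced by makeunicodename()
# below; this code is not used by the script): a phrasebook entry is a
# sequence of lexicon word indexes.  Indexes below "short" fit in one byte;
# larger ones are written as an escape byte plus a low byte, so a reader
# would do roughly
#
#   i = phrasebook[pos]
#   if i >= short:
#       i = ((i - short) << 8) + phrasebook[pos + 1]
#
# and then copy word i out of "lexicon", stopping at the byte that has
# bit 7 set (the end-of-word marker added when the lexicon is built).
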
def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte.)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def cmpwords(a,b):
        aword, alist = a
        bword, blist = b
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(key=CmpToKey(cmpwords))

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)
533
534 # generate lexicon from words
535
536 lexicon_offset = [0]
537 lexicon = ""
538 words = {}
539
540 # build a lexicon string
541 offset = 0
542 for w, x in wordlist:
543 # encoding: bit 7 indicates last character in word (chr(128)
544 # indicates the last character in an entire string)
545 ww = w[:-1] + chr(ord(w[-1])+128)
546 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000547 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000548 if o < 0:
549 o = offset
550 lexicon = lexicon + ww
551 offset = offset + len(w)
552 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000553 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000554
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000555 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000556
557 # generate phrasebook from names and lexicon
558 phrasebook = [0]
559 phrasebook_offset = [0] * len(unicode.chars)
560 for char in unicode.chars:
561 name = names[char]
562 if name:
563 w = name.split()
564 phrasebook_offset[char] = len(phrasebook)
565 for w in w:
566 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000567 if i < short:
568 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000569 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000570 # store as two bytes
571 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000572 phrasebook.append(i&255)
573
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000574 assert getsize(phrasebook) == 1
575
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000576 #
577 # unicode name hash table
578
579 # extract names
580 data = []
581 for char in unicode.chars:
582 record = unicode.table[char]
583 if record:
584 name = record[1].strip()
585 if name and name[0] != "<":
586 data.append((name, char))
587
588 # the magic number 47 was chosen to minimize the number of
589 # collisions on the current data set. if you like, change it
590 # and see what happens...
591
592 codehash = Hash("code", data, 47)
593
Collin Winter6afaeb72007-08-03 17:06:41 +0000594 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000595
596 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000597 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
598 print(file=fp)
599 print("#define NAME_MAXLEN", 256, file=fp)
600 print(file=fp)
601 print("/* lexicon */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000602 Array("lexicon", lexicon).dump(fp, trace)
603 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000604
605 # split decomposition index table
606 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
607
Collin Winter6afaeb72007-08-03 17:06:41 +0000608 print("/* code->name phrasebook */", file=fp)
609 print("#define phrasebook_shift", shift, file=fp)
610 print("#define phrasebook_short", short, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000611
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000612 Array("phrasebook", phrasebook).dump(fp, trace)
613 Array("phrasebook_offset1", offset1).dump(fp, trace)
614 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000615
Collin Winter6afaeb72007-08-03 17:06:41 +0000616 print("/* name->code dictionary */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000617 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000618
619 fp.close()
620
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000621
def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value".
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, numeric_changes)),
                        normalization_changes))


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

import sys

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, filename, exclusions, eastasianwidth,
                 derivedprops, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        for s in open(derivedprops):
            s = s.split('#', 1)[0].strip()
            if not s:
                continue

            r, p = s.split(";")
            r = r.strip()
            p = p.strip()
            if ".." in r:
                first, last = [int(c, 16) for c in r.split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(r, 16)]
            for char in chars:
                if table[char]:
                    # Some properties (e.g. Default_Ignorable_Code_Point)
                    # apply to unassigned code points; ignore them
                    table[char][-1].add(p)

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

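# Usage sketch (an assumption about the consumer, not exercised here): a
# name->code lookup against the generated table recomputes the same hash
# and follows the same probe sequence as Hash.__init__ below, roughly
#
#   h = myhash(name, code_magic)
#   i = (~h) & (code_size - 1)
#   # ...then step by incr = (h ^ (h >> 3)) & mask, doubling and folding
#   # with code_poly exactly as in the collision loop below
#
# where code_magic, code_size and code_poly are the #defines written by
# Hash.dump(), and empty slots are stored as 0.
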
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), \
            "bytes", file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)    # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

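# Usage sketch (illustration only): a value is recovered from the two tables
# returned by splitbins() with
#
#   t1, t2, shift = splitbins(data)
#   mask = (1 << shift) - 1
#   value = t2[(t1[i >> shift] << shift) + (i & mask)]
#
# which is the same lookup the generated C code performs, e.g. in the
# get_change_*() functions emitted by makeunicodedata().
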
if __name__ == "__main__":
    maketables(1)