blob: 930a0df1485db64daa149ca991c2aeb2bda59bb7 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Georg Brandld52429f2008-07-04 15:55:02 +000023# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
Fredrik Lundhcfcea492000-09-25 08:07:06 +000024#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000025# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000026#
27
28import sys
29
30SCRIPT = sys.argv[0]
Martin v. Löwis93cbca32008-09-10 14:08:48 +000031VERSION = "2.6"
Fredrik Lundhf367cac2000-09-24 23:18:31 +000032
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000033# The Unicode Database
Martin v. Löwis93cbca32008-09-10 14:08:48 +000034UNIDATA_VERSION = "5.1.0"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000035UNICODE_DATA = "UnicodeData%s.txt"
36COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
37EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
Martin v. Löwis13c3e382007-08-14 22:37:03 +000038DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000039
40old_versions = ["3.2.0"]
Fredrik Lundhf367cac2000-09-24 23:18:31 +000041
42CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
43 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
44 "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
45 "So" ]
46
47BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
48 "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
49 "ON" ]
50
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +000051EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
52
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000053# note: should match definitions in Objects/unicodectype.c
Fredrik Lundhe9133f72000-09-25 17:59:57 +000054ALPHA_MASK = 0x01
55DECIMAL_MASK = 0x02
56DIGIT_MASK = 0x04
57LOWER_MASK = 0x08
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000058LINEBREAK_MASK = 0x10
Fredrik Lundhe9133f72000-09-25 17:59:57 +000059SPACE_MASK = 0x20
60TITLE_MASK = 0x40
61UPPER_MASK = 0x80
Martin v. Löwis13c3e382007-08-14 22:37:03 +000062XID_START_MASK = 0x100
63XID_CONTINUE_MASK = 0x200
Georg Brandld52429f2008-07-04 15:55:02 +000064PRINTABLE_MASK = 0x400
Martin v. Löwis93cbca32008-09-10 14:08:48 +000065NODELTA_MASK = 0x800
Fredrik Lundhe9133f72000-09-25 17:59:57 +000066
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000067def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000068
Collin Winter6afaeb72007-08-03 17:06:41 +000069 print("--- Reading", UNICODE_DATA % "", "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000070
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000071 version = ""
72 unicode = UnicodeData(UNICODE_DATA % version,
73 COMPOSITION_EXCLUSIONS % version,
Martin v. Löwis13c3e382007-08-14 22:37:03 +000074 EASTASIAN_WIDTH % version,
75 DERIVED_CORE_PROPERTIES % version)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000076
Georg Brandl559e5d72008-06-11 18:37:52 +000077 print(len(list(filter(None, unicode.table))), "characters")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000078
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000079 for version in old_versions:
Collin Winter6afaeb72007-08-03 17:06:41 +000080 print("--- Reading", UNICODE_DATA % ("-"+version), "...")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000081 old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
82 COMPOSITION_EXCLUSIONS % ("-"+version),
Martin v. Löwis13c3e382007-08-14 22:37:03 +000083 EASTASIAN_WIDTH % ("-"+version),
84 DERIVED_CORE_PROPERTIES % ("-"+version))
Georg Brandl559e5d72008-06-11 18:37:52 +000085 print(len(list(filter(None, old_unicode.table))), "characters")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000086 merge_old_version(version, unicode, old_unicode)
87
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000088 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000089 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000090 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000091
92# --------------------------------------------------------------------
93# unicode character properties
94
95def makeunicodedata(unicode, trace):
96
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +000097 dummy = (0, 0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000098 table = [dummy]
99 cache = {0: dummy}
100 index = [0] * len(unicode.chars)
101
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000102 FILE = "Modules/unicodedata_db.h"
103
Collin Winter6afaeb72007-08-03 17:06:41 +0000104 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000105
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000106 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000107
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000108 for char in unicode.chars:
109 record = unicode.table[char]
110 if record:
111 # extract database properties
112 category = CATEGORY_NAMES.index(record[2])
113 combining = int(record[3])
114 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
115 mirrored = record[9] == "Y"
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000116 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000117 item = (
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000118 category, combining, bidirectional, mirrored, eastasianwidth
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000119 )
120 # add entry to index and item tables
121 i = cache.get(item)
122 if i is None:
123 cache[item] = i = len(table)
124 table.append(item)
125 index[char] = i
126
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000127 # 2) decomposition data
128
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000129 decomp_data = [0]
130 decomp_prefix = [""]
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000131 decomp_index = [0] * len(unicode.chars)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000132 decomp_size = 0
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000133
Martin v. Löwis677bde22002-11-23 22:08:15 +0000134 comp_pairs = []
135 comp_first = [None] * len(unicode.chars)
136 comp_last = [None] * len(unicode.chars)
137
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000138 for char in unicode.chars:
139 record = unicode.table[char]
140 if record:
141 if record[5]:
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000142 decomp = record[5].split()
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000143 if len(decomp) > 19:
Collin Wintera817e582007-08-22 23:05:06 +0000144 raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000145 # prefix
146 if decomp[0][0] == "<":
147 prefix = decomp.pop(0)
148 else:
149 prefix = ""
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000150 try:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000151 i = decomp_prefix.index(prefix)
152 except ValueError:
153 i = len(decomp_prefix)
154 decomp_prefix.append(prefix)
155 prefix = i
156 assert prefix < 256
157 # content
Georg Brandlbf82e372008-05-16 17:02:34 +0000158 decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
Martin v. Löwis677bde22002-11-23 22:08:15 +0000159 # Collect NFC pairs
160 if not prefix and len(decomp) == 3 and \
161 char not in unicode.exclusions and \
162 unicode.table[decomp[1]][3] == "0":
163 p, l, r = decomp
164 comp_first[l] = 1
165 comp_last[r] = 1
166 comp_pairs.append((l,r,char))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000167 try:
168 i = decomp_data.index(decomp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000169 except ValueError:
170 i = len(decomp_data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000171 decomp_data.extend(decomp)
172 decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000173 else:
174 i = 0
175 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000176
Martin v. Löwis677bde22002-11-23 22:08:15 +0000177 f = l = 0
178 comp_first_ranges = []
179 comp_last_ranges = []
180 prev_f = prev_l = None
181 for i in unicode.chars:
182 if comp_first[i] is not None:
183 comp_first[i] = f
184 f += 1
185 if prev_f is None:
186 prev_f = (i,i)
187 elif prev_f[1]+1 == i:
188 prev_f = prev_f[0],i
189 else:
190 comp_first_ranges.append(prev_f)
191 prev_f = (i,i)
192 if comp_last[i] is not None:
193 comp_last[i] = l
194 l += 1
195 if prev_l is None:
196 prev_l = (i,i)
197 elif prev_l[1]+1 == i:
198 prev_l = prev_l[0],i
199 else:
200 comp_last_ranges.append(prev_l)
201 prev_l = (i,i)
202 comp_first_ranges.append(prev_f)
203 comp_last_ranges.append(prev_l)
204 total_first = f
205 total_last = l
206
207 comp_data = [0]*(total_first*total_last)
208 for f,l,char in comp_pairs:
209 f = comp_first[f]
210 l = comp_last[l]
211 comp_data[f*total_last+l] = char
212
Collin Winter6afaeb72007-08-03 17:06:41 +0000213 print(len(table), "unique properties")
214 print(len(decomp_prefix), "unique decomposition prefixes")
215 print(len(decomp_data), "unique decomposition entries:", end=' ')
216 print(decomp_size, "bytes")
217 print(total_first, "first characters in NFC")
218 print(total_last, "last characters in NFC")
219 print(len(comp_pairs), "NFC pairs")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000220
Collin Winter6afaeb72007-08-03 17:06:41 +0000221 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000222
Fred Drake9c685052000-10-26 03:56:46 +0000223 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000224 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
225 print(file=fp)
226 print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
227 print("/* a list of unique database records */", file=fp)
228 print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000229 for item in table:
Collin Winter6afaeb72007-08-03 17:06:41 +0000230 print(" {%d, %d, %d, %d, %d}," % item, file=fp)
231 print("};", file=fp)
232 print(file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000233
Collin Winter6afaeb72007-08-03 17:06:41 +0000234 print("/* Reindexing of NFC first characters. */", file=fp)
235 print("#define TOTAL_FIRST",total_first, file=fp)
236 print("#define TOTAL_LAST",total_last, file=fp)
237 print("struct reindex{int start;short count,index;};", file=fp)
Martin v. Löwis59683e82008-06-13 07:50:45 +0000238 print("static struct reindex nfc_first[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000239 for start,end in comp_first_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000240 print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
241 print(" {0,0,0}", file=fp)
242 print("};\n", file=fp)
Martin v. Löwis59683e82008-06-13 07:50:45 +0000243 print("static struct reindex nfc_last[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000244 for start,end in comp_last_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000245 print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
246 print(" {0,0,0}", file=fp)
247 print("};\n", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000248
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000249 # FIXME: <fl> the following tables could be made static, and
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000250 # the support code moved into unicodedatabase.c
251
Collin Winter6afaeb72007-08-03 17:06:41 +0000252 print("/* string literals */", file=fp)
253 print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000254 for name in CATEGORY_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000255 print(" \"%s\"," % name, file=fp)
256 print(" NULL", file=fp)
257 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000258
Collin Winter6afaeb72007-08-03 17:06:41 +0000259 print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000260 for name in BIDIRECTIONAL_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000261 print(" \"%s\"," % name, file=fp)
262 print(" NULL", file=fp)
263 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000264
Collin Winter6afaeb72007-08-03 17:06:41 +0000265 print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000266 for name in EASTASIANWIDTH_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000267 print(" \"%s\"," % name, file=fp)
268 print(" NULL", file=fp)
269 print("};", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000270
Collin Winter6afaeb72007-08-03 17:06:41 +0000271 print("static const char *decomp_prefix[] = {", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000272 for name in decomp_prefix:
Collin Winter6afaeb72007-08-03 17:06:41 +0000273 print(" \"%s\"," % name, file=fp)
274 print(" NULL", file=fp)
275 print("};", file=fp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000276
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000277 # split record index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000278 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000279
Collin Winter6afaeb72007-08-03 17:06:41 +0000280 print("/* index tables for the database records */", file=fp)
281 print("#define SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000282 Array("index1", index1).dump(fp, trace)
283 Array("index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000284
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000285 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000286 index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000287
Collin Winter6afaeb72007-08-03 17:06:41 +0000288 print("/* decomposition data */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000289 Array("decomp_data", decomp_data).dump(fp, trace)
290
Collin Winter6afaeb72007-08-03 17:06:41 +0000291 print("/* index tables for the decomposition data */", file=fp)
292 print("#define DECOMP_SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000293 Array("decomp_index1", index1).dump(fp, trace)
294 Array("decomp_index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000295
Martin v. Löwis677bde22002-11-23 22:08:15 +0000296 index, index2, shift = splitbins(comp_data, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000297 print("/* NFC pairs */", file=fp)
298 print("#define COMP_SHIFT", shift, file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000299 Array("comp_index", index).dump(fp, trace)
300 Array("comp_data", index2).dump(fp, trace)
301
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000302 # Generate delta tables for old versions
303 for version, table, normalization in unicode.changed:
304 cversion = version.replace(".","_")
305 records = [table[0]]
306 cache = {table[0]:0}
307 index = [0] * len(table)
308 for i, record in enumerate(table):
309 try:
310 index[i] = cache[record]
311 except KeyError:
312 index[i] = cache[record] = len(records)
313 records.append(record)
314 index1, index2, shift = splitbins(index, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000315 print("static const change_record change_records_%s[] = {" % cversion, file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000316 for record in records:
Collin Winter6afaeb72007-08-03 17:06:41 +0000317 print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
318 print("};", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000319 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
320 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000321 print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
322 print("{", file=fp)
323 print("\tint index;", file=fp)
324 print("\tif (n >= 0x110000) index = 0;", file=fp)
325 print("\telse {", file=fp)
326 print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
327 print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
328 (cversion, shift, ((1<<shift)-1)), file=fp)
329 print("\t}", file=fp)
330 print("\treturn change_records_%s+index;" % cversion, file=fp)
331 print("}\n", file=fp)
332 print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
333 print("{", file=fp)
334 print("\tswitch(n) {", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000335 for k, v in normalization:
Collin Winter6afaeb72007-08-03 17:06:41 +0000336 print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
337 print("\tdefault: return 0;", file=fp)
338 print("\t}\n}\n", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000339
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000340 fp.close()
341
342# --------------------------------------------------------------------
343# unicode character type tables
344
345def makeunicodetype(unicode, trace):
346
347 FILE = "Objects/unicodetype_db.h"
348
Collin Winter6afaeb72007-08-03 17:06:41 +0000349 print("--- Preparing", FILE, "...")
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000350
351 # extract unicode types
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000352 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000353 table = [dummy]
354 cache = {0: dummy}
355 index = [0] * len(unicode.chars)
356
357 for char in unicode.chars:
358 record = unicode.table[char]
359 if record:
360 # extract database properties
361 category = record[2]
362 bidirectional = record[4]
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000363 properties = record[16]
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000364 flags = 0
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000365 delta = True
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000366 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
367 flags |= ALPHA_MASK
368 if category == "Ll":
369 flags |= LOWER_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000370 if category == "Zl" or bidirectional == "B":
371 flags |= LINEBREAK_MASK
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000372 if category == "Zs" or bidirectional in ("WS", "B", "S"):
373 flags |= SPACE_MASK
Fredrik Lundh375732c2000-09-25 23:03:34 +0000374 if category == "Lt":
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000375 flags |= TITLE_MASK
376 if category == "Lu":
377 flags |= UPPER_MASK
Benjamin Peterson09832742009-03-26 17:15:46 +0000378 if char == ord(" ") or category[0] not in ("C", "Z"):
Georg Brandld52429f2008-07-04 15:55:02 +0000379 flags |= PRINTABLE_MASK
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000380 if "XID_Start" in properties:
381 flags |= XID_START_MASK
382 if "XID_Continue" in properties:
383 flags |= XID_CONTINUE_MASK
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000384 # use delta predictor for upper/lower/title if it fits
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000385 if record[12]:
Walter Dörwald1b08b302009-04-25 14:13:56 +0000386 upper = int(record[12], 16)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000387 else:
Walter Dörwald1b08b302009-04-25 14:13:56 +0000388 upper = char
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000389 if record[13]:
Walter Dörwald1b08b302009-04-25 14:13:56 +0000390 lower = int(record[13], 16)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000391 else:
Walter Dörwald1b08b302009-04-25 14:13:56 +0000392 lower = char
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000393 if record[14]:
Walter Dörwald1b08b302009-04-25 14:13:56 +0000394 title = int(record[14], 16)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000395 else:
Walter Dörwald1b08b302009-04-25 14:13:56 +0000396 # UCD.html says that a missing title char means that
397 # it defaults to the uppercase character, not to the
398 # character itself. Apparently, in the current UCD (5.x)
399 # this feature is never used
400 title = upper
401 upper_d = upper - char
402 lower_d = lower - char
403 title_d = title - char
404 if -32768 <= upper_d <= 32767 and \
405 -32768 <= lower_d <= 32767 and \
406 -32768 <= title_d <= 32767:
407 # use deltas
408 upper = upper_d & 0xffff
409 lower = lower_d & 0xffff
410 title = title_d & 0xffff
411 else:
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000412 flags |= NODELTA_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000413 # decimal digit, integer digit
414 decimal = 0
415 if record[6]:
416 flags |= DECIMAL_MASK
417 decimal = int(record[6])
418 digit = 0
419 if record[7]:
420 flags |= DIGIT_MASK
421 digit = int(record[7])
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000422 item = (
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000423 upper, lower, title, decimal, digit, flags
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000424 )
425 # add entry to index and item tables
426 i = cache.get(item)
427 if i is None:
428 cache[item] = i = len(table)
429 table.append(item)
430 index[char] = i
431
Collin Winter6afaeb72007-08-03 17:06:41 +0000432 print(len(table), "unique character type entries")
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000433
Collin Winter6afaeb72007-08-03 17:06:41 +0000434 print("--- Writing", FILE, "...")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000435
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000436 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000437 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
438 print(file=fp)
439 print("/* a list of unique character type descriptors */", file=fp)
440 print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000441 for item in table:
Collin Winter6afaeb72007-08-03 17:06:41 +0000442 print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
443 print("};", file=fp)
444 print(file=fp)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000445
446 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000447 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000448
Collin Winter6afaeb72007-08-03 17:06:41 +0000449 print("/* type indexes */", file=fp)
450 print("#define SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000451 Array("index1", index1).dump(fp, trace)
452 Array("index2", index2).dump(fp, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000453
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000454 fp.close()
455
456# --------------------------------------------------------------------
457# unicode name database
458
459def makeunicodename(unicode, trace):
460
461 FILE = "Modules/unicodename_db.h"
462
Collin Winter6afaeb72007-08-03 17:06:41 +0000463 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000464
465 # collect names
466 names = [None] * len(unicode.chars)
467
468 for char in unicode.chars:
469 record = unicode.table[char]
470 if record:
471 name = record[1].strip()
472 if name and name[0] != "<":
473 names[char] = name + chr(0)
474
Georg Brandl559e5d72008-06-11 18:37:52 +0000475 print(len(list(n for n in names if n is not None)), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000476
477 # collect unique words from names (note that we differ between
478 # words inside a sentence, and words ending a sentence. the
479 # latter includes the trailing null byte.
480
481 words = {}
482 n = b = 0
483 for char in unicode.chars:
484 name = names[char]
485 if name:
486 w = name.split()
487 b = b + len(name)
488 n = n + len(w)
489 for w in w:
490 l = words.get(w)
491 if l:
492 l.append(None)
493 else:
494 words[w] = [len(words)]
495
Collin Winter6afaeb72007-08-03 17:06:41 +0000496 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000497
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000498 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000499
Martin v. Löwis97225da2002-11-24 23:05:09 +0000500 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000501 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000502 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000503 return -len(alist), aword
504 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000505
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000506 # figure out how many phrasebook escapes we need
507 escapes = 0
508 while escapes * 256 < len(wordlist):
509 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000510 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000511
512 short = 256 - escapes
513
514 assert short > 0
515
Collin Winter6afaeb72007-08-03 17:06:41 +0000516 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000517
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000518 # statistics
519 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000520 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000521 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000522 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000523
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000524 # pick the most commonly used words, and sort the rest on falling
525 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000526
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000527 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000528 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000529 wordlist.extend(wordtail)
530
531 # generate lexicon from words
532
533 lexicon_offset = [0]
534 lexicon = ""
535 words = {}
536
537 # build a lexicon string
538 offset = 0
539 for w, x in wordlist:
540 # encoding: bit 7 indicates last character in word (chr(128)
541 # indicates the last character in an entire string)
542 ww = w[:-1] + chr(ord(w[-1])+128)
543 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000544 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000545 if o < 0:
546 o = offset
547 lexicon = lexicon + ww
548 offset = offset + len(w)
549 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000550 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000551
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000552 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000553
554 # generate phrasebook from names and lexicon
555 phrasebook = [0]
556 phrasebook_offset = [0] * len(unicode.chars)
557 for char in unicode.chars:
558 name = names[char]
559 if name:
560 w = name.split()
561 phrasebook_offset[char] = len(phrasebook)
562 for w in w:
563 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000564 if i < short:
565 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000566 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000567 # store as two bytes
568 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000569 phrasebook.append(i&255)
570
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000571 assert getsize(phrasebook) == 1
572
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000573 #
574 # unicode name hash table
575
576 # extract names
577 data = []
578 for char in unicode.chars:
579 record = unicode.table[char]
580 if record:
581 name = record[1].strip()
582 if name and name[0] != "<":
583 data.append((name, char))
584
585 # the magic number 47 was chosen to minimize the number of
586 # collisions on the current data set. if you like, change it
587 # and see what happens...
588
589 codehash = Hash("code", data, 47)
590
Collin Winter6afaeb72007-08-03 17:06:41 +0000591 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000592
593 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000594 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
595 print(file=fp)
596 print("#define NAME_MAXLEN", 256, file=fp)
597 print(file=fp)
598 print("/* lexicon */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000599 Array("lexicon", lexicon).dump(fp, trace)
600 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000601
602 # split decomposition index table
603 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
604
Collin Winter6afaeb72007-08-03 17:06:41 +0000605 print("/* code->name phrasebook */", file=fp)
606 print("#define phrasebook_shift", shift, file=fp)
607 print("#define phrasebook_short", short, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000608
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000609 Array("phrasebook", phrasebook).dump(fp, trace)
610 Array("phrasebook_offset1", offset1).dump(fp, trace)
611 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000612
Collin Winter6afaeb72007-08-03 17:06:41 +0000613 print("/* name->code dictionary */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000614 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000615
616 fp.close()
617
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000618
619def merge_old_version(version, new, old):
620 # Changes to exclusion file not implemented yet
621 if old.exclusions != new.exclusions:
Collin Wintera817e582007-08-22 23:05:06 +0000622 raise NotImplementedError("exclusions differ")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000623
624 # In these change records, 0xFF means "no change"
625 bidir_changes = [0xFF]*0x110000
626 category_changes = [0xFF]*0x110000
627 decimal_changes = [0xFF]*0x110000
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000628 mirrored_changes = [0xFF]*0x110000
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000629 # In numeric data, 0 means "no change",
630 # -1 means "did not have a numeric value
631 numeric_changes = [0] * 0x110000
632 # normalization_changes is a list of key-value pairs
633 normalization_changes = []
634 for i in range(0x110000):
635 if new.table[i] is None:
636 # Characters unassigned in the new version ought to
637 # be unassigned in the old one
638 assert old.table[i] is None
639 continue
640 # check characters unassigned in the old version
641 if old.table[i] is None:
642 # category 0 is "unassigned"
643 category_changes[i] = 0
644 continue
645 # check characters that differ
646 if old.table[i] != new.table[i]:
647 for k in range(len(old.table[i])):
648 if old.table[i][k] != new.table[i][k]:
649 value = old.table[i][k]
650 if k == 2:
651 #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
652 category_changes[i] = CATEGORY_NAMES.index(value)
653 elif k == 4:
654 #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
655 bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
656 elif k == 5:
657 #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
658 # We assume that all normalization changes are in 1:1 mappings
659 assert " " not in value
660 normalization_changes.append((i, value))
661 elif k == 6:
662 #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
663 # we only support changes where the old value is a single digit
664 assert value in "0123456789"
665 decimal_changes[i] = int(value)
666 elif k == 8:
667 # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
668 # Since 0 encodes "no change", the old value is better not 0
669 assert value != "0" and value != "-1"
670 if not value:
671 numeric_changes[i] = -1
672 else:
673 assert re.match("^[0-9]+$", value)
674 numeric_changes[i] = int(value)
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000675 elif k == 9:
676 if value == 'Y':
677 mirrored_changes[i] = '1'
678 else:
679 mirrored_changes[i] = '0'
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000680 elif k == 11:
681 # change to ISO comment, ignore
682 pass
683 elif k == 12:
684 # change to simple uppercase mapping; ignore
685 pass
686 elif k == 13:
687 # change to simple lowercase mapping; ignore
688 pass
689 elif k == 14:
690 # change to simple titlecase mapping; ignore
691 pass
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000692 elif k == 16:
693 # derived property changes; not yet
694 pass
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000695 else:
696 class Difference(Exception):pass
Collin Wintera817e582007-08-22 23:05:06 +0000697 raise Difference(hex(i), k, old.table[i], new.table[i])
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000698 new.changed.append((version, list(zip(bidir_changes, category_changes,
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000699 decimal_changes, mirrored_changes,
700 numeric_changes)),
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000701 normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000702
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000703
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000704# --------------------------------------------------------------------
705# the following support code is taken from the unidb utilities
706# Copyright (c) 1999-2000 by Secret Labs AB
707
708# load a unicode-data file from disk
709
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000710import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000711
712class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000713 # Record structure:
714 # [ID, name, category, combining, bidi, decomp, (6)
715 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
716 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
717 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000718
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000719 def __init__(self, filename, exclusions, eastasianwidth,
720 derivedprops, expand=1):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000721 self.changed = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000722 file = open(filename)
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000723 table = [None] * 0x110000
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000724 while 1:
725 s = file.readline()
726 if not s:
727 break
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000728 s = s.strip().split(";")
729 char = int(s[0], 16)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000730 table[char] = s
731
Martin v. Löwis97225da2002-11-24 23:05:09 +0000732 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000733 if expand:
734 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000735 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000736 s = table[i]
737 if s:
738 if s[1][-6:] == "First>":
739 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000740 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000741 elif s[1][-5:] == "Last>":
742 s[1] = ""
743 field = None
744 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000745 f2 = field[:]
746 f2[0] = "%X" % i
747 table[i] = f2
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000748
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000749 # public attributes
750 self.filename = filename
751 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000752 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000753
Martin v. Löwis677bde22002-11-23 22:08:15 +0000754 file = open(exclusions)
755 self.exclusions = {}
756 for s in file:
757 s = s.strip()
758 if not s:
759 continue
760 if s[0] == '#':
761 continue
762 char = int(s.split()[0],16)
763 self.exclusions[char] = 1
764
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000765 widths = [None] * 0x110000
766 for s in open(eastasianwidth):
767 s = s.strip()
768 if not s:
769 continue
770 if s[0] == '#':
771 continue
772 s = s.split()[0].split(';')
773 if '..' in s[0]:
774 first, last = [int(c, 16) for c in s[0].split('..')]
Georg Brandlbf82e372008-05-16 17:02:34 +0000775 chars = list(range(first, last+1))
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000776 else:
777 chars = [int(s[0], 16)]
778 for char in chars:
779 widths[char] = s[1]
780 for i in range(0, 0x110000):
781 if table[i] is not None:
782 table[i].append(widths[i])
783
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000784 for i in range(0, 0x110000):
785 if table[i] is not None:
786 table[i].append(set())
787 for s in open(derivedprops):
788 s = s.split('#', 1)[0].strip()
789 if not s:
790 continue
791
792 r, p = s.split(";")
793 r = r.strip()
794 p = p.strip()
795 if ".." in r:
796 first, last = [int(c, 16) for c in r.split('..')]
Georg Brandlbf82e372008-05-16 17:02:34 +0000797 chars = list(range(first, last+1))
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000798 else:
799 chars = [int(r, 16)]
800 for char in chars:
801 if table[char]:
802 # Some properties (e.g. Default_Ignorable_Code_Point)
803 # apply to unassigned code points; ignore them
804 table[char][-1].add(p)
805
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000806 def uselatin1(self):
807 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +0000808 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000809
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000810# hash table tools
811
812# this is a straight-forward reimplementation of Python's built-in
813# dictionary type, using a static data structure, and a custom string
814# hash algorithm.
815
816def myhash(s, magic):
817 h = 0
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000818 for c in map(ord, s.upper()):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000819 h = (h * magic) + c
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000820 ix = h & 0xff000000
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000821 if ix:
822 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
823 return h
824
825SIZES = [
826 (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
827 (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
828 (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
829 (2097152,5), (4194304,3), (8388608,33), (16777216,27)
830]
831
832class Hash:
833 def __init__(self, name, data, magic):
834 # turn a (key, value) list into a static hash table structure
835
836 # determine table size
837 for size, poly in SIZES:
838 if size > len(data):
839 poly = size + poly
840 break
841 else:
Collin Wintera817e582007-08-22 23:05:06 +0000842 raise AssertionError("ran out of polynominals")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000843
Collin Winter6afaeb72007-08-03 17:06:41 +0000844 print(size, "slots in hash table")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000845
846 table = [None] * size
847
848 mask = size-1
849
850 n = 0
851
852 hash = myhash
853
854 # initialize hash table
855 for key, value in data:
856 h = hash(key, magic)
857 i = (~h) & mask
858 v = table[i]
859 if v is None:
860 table[i] = value
861 continue
862 incr = (h ^ (h >> 3)) & mask;
863 if not incr:
864 incr = mask
865 while 1:
866 n = n + 1
867 i = (i + incr) & mask
868 v = table[i]
869 if v is None:
870 table[i] = value
871 break
872 incr = incr << 1
873 if incr > mask:
874 incr = incr ^ poly
875
Collin Winter6afaeb72007-08-03 17:06:41 +0000876 print(n, "collisions")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000877 self.collisions = n
878
879 for i in range(len(table)):
880 if table[i] is None:
881 table[i] = 0
882
883 self.data = Array(name + "_hash", table)
884 self.magic = magic
885 self.name = name
886 self.size = size
887 self.poly = poly
888
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000889 def dump(self, file, trace):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000890 # write data to file, as a C array
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000891 self.data.dump(file, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000892 file.write("#define %s_magic %d\n" % (self.name, self.magic))
893 file.write("#define %s_size %d\n" % (self.name, self.size))
894 file.write("#define %s_poly %d\n" % (self.name, self.poly))
895
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000896# stuff to deal with arrays of unsigned integers
897
898class Array:
899
900 def __init__(self, name, data):
901 self.name = name
902 self.data = data
903
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000904 def dump(self, file, trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000905 # write data to file, as a C array
906 size = getsize(self.data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000907 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +0000908 print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000909 file.write("static ")
910 if size == 1:
911 file.write("unsigned char")
912 elif size == 2:
913 file.write("unsigned short")
914 else:
915 file.write("unsigned int")
916 file.write(" " + self.name + "[] = {\n")
917 if self.data:
918 s = " "
919 for item in self.data:
920 i = str(item) + ", "
921 if len(s) + len(i) > 78:
922 file.write(s + "\n")
923 s = " " + i
924 else:
925 s = s + i
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000926 if s.strip():
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000927 file.write(s + "\n")
928 file.write("};\n\n")
929
930def getsize(data):
931 # return smallest possible integer size for the given array
932 maxdata = max(data)
933 if maxdata < 256:
934 return 1
935 elif maxdata < 65536:
936 return 2
937 else:
938 return 4
939
Tim Peters21013482000-09-25 07:13:41 +0000940def splitbins(t, trace=0):
941 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
942
943 t is a sequence of ints. This function can be useful to save space if
944 many of the ints are the same. t1 and t2 are lists of ints, and shift
945 is an int, chosen to minimize the combined size of t1 and t2 (in C
946 code), and where for each i in range(len(t)),
947 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
948 where mask is a bitmask isolating the last "shift" bits.
949
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000950 If optional arg trace is non-zero (default zero), progress info
951 is printed to sys.stderr. The higher the value, the more info
952 you'll get.
Tim Peters21013482000-09-25 07:13:41 +0000953 """
954
955 import sys
956 if trace:
957 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +0000958 print("%d+%d bins at shift %d; %d bytes" % (
959 len(t1), len(t2), shift, bytes), file=sys.stderr)
960 print("Size of original table:", len(t)*getsize(t), \
961 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +0000962 n = len(t)-1 # last valid index
963 maxshift = 0 # the most we can shift n and still have something left
964 if n > 0:
965 while n >> 1:
966 n >>= 1
967 maxshift += 1
968 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +0000969 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +0000970 t = tuple(t) # so slices can be dict keys
971 for shift in range(maxshift + 1):
972 t1 = []
973 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000974 size = 2**shift
975 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +0000976 for i in range(0, len(t), size):
977 bin = t[i:i+size]
978 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000979 if index is None:
Tim Peters21013482000-09-25 07:13:41 +0000980 index = len(t2)
981 bincache[bin] = index
982 t2.extend(bin)
983 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000984 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +0000985 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000986 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +0000987 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000988 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +0000989 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000990 bytes = b
Tim Peters21013482000-09-25 07:13:41 +0000991 t1, t2, shift = best
992 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +0000993 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +0000994 dump(t1, t2, shift, bytes)
995 if __debug__:
996 # exhaustively verify that the decomposition is correct
997 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +0000998 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +0000999 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1000 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001001
1002if __name__ == "__main__":
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001003 maketables(1)