blob: 0b32412b4ea176e14a70fca83d130aa56a38bf33 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Georg Brandld52429f2008-07-04 15:55:02 +000023# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
Fredrik Lundhcfcea492000-09-25 08:07:06 +000024#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000025# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000026#
27
28import sys
29
30SCRIPT = sys.argv[0]
Martin v. Löwis93cbca32008-09-10 14:08:48 +000031VERSION = "2.6"
Fredrik Lundhf367cac2000-09-24 23:18:31 +000032
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000033# The Unicode Database
Martin v. Löwis93cbca32008-09-10 14:08:48 +000034UNIDATA_VERSION = "5.1.0"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000035UNICODE_DATA = "UnicodeData%s.txt"
36COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
37EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
Martin v. Löwis13c3e382007-08-14 22:37:03 +000038DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000039
40old_versions = ["3.2.0"]
Fredrik Lundhf367cac2000-09-24 23:18:31 +000041
42CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
43 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
44 "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
45 "So" ]
46
47BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
48 "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
49 "ON" ]
50
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +000051EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
52
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000053# note: should match definitions in Objects/unicodectype.c
Fredrik Lundhe9133f72000-09-25 17:59:57 +000054ALPHA_MASK = 0x01
55DECIMAL_MASK = 0x02
56DIGIT_MASK = 0x04
57LOWER_MASK = 0x08
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000058LINEBREAK_MASK = 0x10
Fredrik Lundhe9133f72000-09-25 17:59:57 +000059SPACE_MASK = 0x20
60TITLE_MASK = 0x40
61UPPER_MASK = 0x80
Martin v. Löwis13c3e382007-08-14 22:37:03 +000062XID_START_MASK = 0x100
63XID_CONTINUE_MASK = 0x200
Georg Brandld52429f2008-07-04 15:55:02 +000064PRINTABLE_MASK = 0x400
Martin v. Löwis93cbca32008-09-10 14:08:48 +000065NODELTA_MASK = 0x800
Fredrik Lundhe9133f72000-09-25 17:59:57 +000066
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000067def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000068
Collin Winter6afaeb72007-08-03 17:06:41 +000069 print("--- Reading", UNICODE_DATA % "", "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000070
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000071 version = ""
72 unicode = UnicodeData(UNICODE_DATA % version,
73 COMPOSITION_EXCLUSIONS % version,
Martin v. Löwis13c3e382007-08-14 22:37:03 +000074 EASTASIAN_WIDTH % version,
75 DERIVED_CORE_PROPERTIES % version)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000076
Georg Brandl559e5d72008-06-11 18:37:52 +000077 print(len(list(filter(None, unicode.table))), "characters")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000078
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000079 for version in old_versions:
Collin Winter6afaeb72007-08-03 17:06:41 +000080 print("--- Reading", UNICODE_DATA % ("-"+version), "...")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000081 old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
82 COMPOSITION_EXCLUSIONS % ("-"+version),
Martin v. Löwis13c3e382007-08-14 22:37:03 +000083 EASTASIAN_WIDTH % ("-"+version),
84 DERIVED_CORE_PROPERTIES % ("-"+version))
Georg Brandl559e5d72008-06-11 18:37:52 +000085 print(len(list(filter(None, old_unicode.table))), "characters")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000086 merge_old_version(version, unicode, old_unicode)
87
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000088 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000089 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000090 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000091
92# --------------------------------------------------------------------
93# unicode character properties
94
95def makeunicodedata(unicode, trace):
96
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +000097 dummy = (0, 0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000098 table = [dummy]
99 cache = {0: dummy}
100 index = [0] * len(unicode.chars)
101
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000102 FILE = "Modules/unicodedata_db.h"
103
Collin Winter6afaeb72007-08-03 17:06:41 +0000104 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000105
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000106 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000107
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000108 for char in unicode.chars:
109 record = unicode.table[char]
110 if record:
111 # extract database properties
112 category = CATEGORY_NAMES.index(record[2])
113 combining = int(record[3])
114 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
115 mirrored = record[9] == "Y"
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000116 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000117 item = (
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000118 category, combining, bidirectional, mirrored, eastasianwidth
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000119 )
120 # add entry to index and item tables
121 i = cache.get(item)
122 if i is None:
123 cache[item] = i = len(table)
124 table.append(item)
125 index[char] = i
126
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000127 # 2) decomposition data
128
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000129 decomp_data = [0]
130 decomp_prefix = [""]
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000131 decomp_index = [0] * len(unicode.chars)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000132 decomp_size = 0
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000133
Martin v. Löwis677bde22002-11-23 22:08:15 +0000134 comp_pairs = []
135 comp_first = [None] * len(unicode.chars)
136 comp_last = [None] * len(unicode.chars)
137
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000138 for char in unicode.chars:
139 record = unicode.table[char]
140 if record:
141 if record[5]:
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000142 decomp = record[5].split()
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000143 if len(decomp) > 19:
Collin Wintera817e582007-08-22 23:05:06 +0000144 raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000145 # prefix
146 if decomp[0][0] == "<":
147 prefix = decomp.pop(0)
148 else:
149 prefix = ""
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000150 try:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000151 i = decomp_prefix.index(prefix)
152 except ValueError:
153 i = len(decomp_prefix)
154 decomp_prefix.append(prefix)
155 prefix = i
156 assert prefix < 256
157 # content
Georg Brandlbf82e372008-05-16 17:02:34 +0000158 decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
Martin v. Löwis677bde22002-11-23 22:08:15 +0000159 # Collect NFC pairs
160 if not prefix and len(decomp) == 3 and \
161 char not in unicode.exclusions and \
162 unicode.table[decomp[1]][3] == "0":
163 p, l, r = decomp
164 comp_first[l] = 1
165 comp_last[r] = 1
166 comp_pairs.append((l,r,char))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000167 try:
168 i = decomp_data.index(decomp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000169 except ValueError:
170 i = len(decomp_data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000171 decomp_data.extend(decomp)
172 decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000173 else:
174 i = 0
175 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000176
Martin v. Löwis677bde22002-11-23 22:08:15 +0000177 f = l = 0
178 comp_first_ranges = []
179 comp_last_ranges = []
180 prev_f = prev_l = None
181 for i in unicode.chars:
182 if comp_first[i] is not None:
183 comp_first[i] = f
184 f += 1
185 if prev_f is None:
186 prev_f = (i,i)
187 elif prev_f[1]+1 == i:
188 prev_f = prev_f[0],i
189 else:
190 comp_first_ranges.append(prev_f)
191 prev_f = (i,i)
192 if comp_last[i] is not None:
193 comp_last[i] = l
194 l += 1
195 if prev_l is None:
196 prev_l = (i,i)
197 elif prev_l[1]+1 == i:
198 prev_l = prev_l[0],i
199 else:
200 comp_last_ranges.append(prev_l)
201 prev_l = (i,i)
202 comp_first_ranges.append(prev_f)
203 comp_last_ranges.append(prev_l)
204 total_first = f
205 total_last = l
206
207 comp_data = [0]*(total_first*total_last)
208 for f,l,char in comp_pairs:
209 f = comp_first[f]
210 l = comp_last[l]
211 comp_data[f*total_last+l] = char
212
Collin Winter6afaeb72007-08-03 17:06:41 +0000213 print(len(table), "unique properties")
214 print(len(decomp_prefix), "unique decomposition prefixes")
215 print(len(decomp_data), "unique decomposition entries:", end=' ')
216 print(decomp_size, "bytes")
217 print(total_first, "first characters in NFC")
218 print(total_last, "last characters in NFC")
219 print(len(comp_pairs), "NFC pairs")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000220
Collin Winter6afaeb72007-08-03 17:06:41 +0000221 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000222
Fred Drake9c685052000-10-26 03:56:46 +0000223 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000224 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
225 print(file=fp)
226 print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
227 print("/* a list of unique database records */", file=fp)
228 print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000229 for item in table:
Collin Winter6afaeb72007-08-03 17:06:41 +0000230 print(" {%d, %d, %d, %d, %d}," % item, file=fp)
231 print("};", file=fp)
232 print(file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000233
Collin Winter6afaeb72007-08-03 17:06:41 +0000234 print("/* Reindexing of NFC first characters. */", file=fp)
235 print("#define TOTAL_FIRST",total_first, file=fp)
236 print("#define TOTAL_LAST",total_last, file=fp)
237 print("struct reindex{int start;short count,index;};", file=fp)
Martin v. Löwis59683e82008-06-13 07:50:45 +0000238 print("static struct reindex nfc_first[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000239 for start,end in comp_first_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000240 print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
241 print(" {0,0,0}", file=fp)
242 print("};\n", file=fp)
Martin v. Löwis59683e82008-06-13 07:50:45 +0000243 print("static struct reindex nfc_last[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000244 for start,end in comp_last_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000245 print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
246 print(" {0,0,0}", file=fp)
247 print("};\n", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000248
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000249 # FIXME: <fl> the following tables could be made static, and
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000250 # the support code moved into unicodedatabase.c
251
Collin Winter6afaeb72007-08-03 17:06:41 +0000252 print("/* string literals */", file=fp)
253 print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000254 for name in CATEGORY_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000255 print(" \"%s\"," % name, file=fp)
256 print(" NULL", file=fp)
257 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000258
Collin Winter6afaeb72007-08-03 17:06:41 +0000259 print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000260 for name in BIDIRECTIONAL_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000261 print(" \"%s\"," % name, file=fp)
262 print(" NULL", file=fp)
263 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000264
Collin Winter6afaeb72007-08-03 17:06:41 +0000265 print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000266 for name in EASTASIANWIDTH_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000267 print(" \"%s\"," % name, file=fp)
268 print(" NULL", file=fp)
269 print("};", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000270
Collin Winter6afaeb72007-08-03 17:06:41 +0000271 print("static const char *decomp_prefix[] = {", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000272 for name in decomp_prefix:
Collin Winter6afaeb72007-08-03 17:06:41 +0000273 print(" \"%s\"," % name, file=fp)
274 print(" NULL", file=fp)
275 print("};", file=fp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000276
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000277 # split record index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000278 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000279
Collin Winter6afaeb72007-08-03 17:06:41 +0000280 print("/* index tables for the database records */", file=fp)
281 print("#define SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000282 Array("index1", index1).dump(fp, trace)
283 Array("index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000284
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000285 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000286 index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000287
Collin Winter6afaeb72007-08-03 17:06:41 +0000288 print("/* decomposition data */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000289 Array("decomp_data", decomp_data).dump(fp, trace)
290
Collin Winter6afaeb72007-08-03 17:06:41 +0000291 print("/* index tables for the decomposition data */", file=fp)
292 print("#define DECOMP_SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000293 Array("decomp_index1", index1).dump(fp, trace)
294 Array("decomp_index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000295
Martin v. Löwis677bde22002-11-23 22:08:15 +0000296 index, index2, shift = splitbins(comp_data, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000297 print("/* NFC pairs */", file=fp)
298 print("#define COMP_SHIFT", shift, file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000299 Array("comp_index", index).dump(fp, trace)
300 Array("comp_data", index2).dump(fp, trace)
301
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000302 # Generate delta tables for old versions
303 for version, table, normalization in unicode.changed:
304 cversion = version.replace(".","_")
305 records = [table[0]]
306 cache = {table[0]:0}
307 index = [0] * len(table)
308 for i, record in enumerate(table):
309 try:
310 index[i] = cache[record]
311 except KeyError:
312 index[i] = cache[record] = len(records)
313 records.append(record)
314 index1, index2, shift = splitbins(index, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000315 print("static const change_record change_records_%s[] = {" % cversion, file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000316 for record in records:
Collin Winter6afaeb72007-08-03 17:06:41 +0000317 print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
318 print("};", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000319 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
320 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000321 print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
322 print("{", file=fp)
323 print("\tint index;", file=fp)
324 print("\tif (n >= 0x110000) index = 0;", file=fp)
325 print("\telse {", file=fp)
326 print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
327 print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
328 (cversion, shift, ((1<<shift)-1)), file=fp)
329 print("\t}", file=fp)
330 print("\treturn change_records_%s+index;" % cversion, file=fp)
331 print("}\n", file=fp)
332 print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
333 print("{", file=fp)
334 print("\tswitch(n) {", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000335 for k, v in normalization:
Collin Winter6afaeb72007-08-03 17:06:41 +0000336 print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
337 print("\tdefault: return 0;", file=fp)
338 print("\t}\n}\n", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000339
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000340 fp.close()
341
342# --------------------------------------------------------------------
343# unicode character type tables
344
345def makeunicodetype(unicode, trace):
346
347 FILE = "Objects/unicodetype_db.h"
348
Collin Winter6afaeb72007-08-03 17:06:41 +0000349 print("--- Preparing", FILE, "...")
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000350
351 # extract unicode types
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000352 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000353 table = [dummy]
354 cache = {0: dummy}
355 index = [0] * len(unicode.chars)
356
357 for char in unicode.chars:
358 record = unicode.table[char]
359 if record:
360 # extract database properties
361 category = record[2]
362 bidirectional = record[4]
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000363 properties = record[16]
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000364 flags = 0
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000365 delta = True
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000366 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
367 flags |= ALPHA_MASK
368 if category == "Ll":
369 flags |= LOWER_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000370 if category == "Zl" or bidirectional == "B":
371 flags |= LINEBREAK_MASK
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000372 if category == "Zs" or bidirectional in ("WS", "B", "S"):
373 flags |= SPACE_MASK
Fredrik Lundh375732c2000-09-25 23:03:34 +0000374 if category == "Lt":
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000375 flags |= TITLE_MASK
376 if category == "Lu":
377 flags |= UPPER_MASK
Georg Brandld52429f2008-07-04 15:55:02 +0000378 if char == " " or category[0] not in ("C", "Z"):
379 flags |= PRINTABLE_MASK
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000380 if "XID_Start" in properties:
381 flags |= XID_START_MASK
382 if "XID_Continue" in properties:
383 flags |= XID_CONTINUE_MASK
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000384 # use delta predictor for upper/lower/title if it fits
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000385 if record[12]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000386 upper = int(record[12], 16) - char
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000387 if -32768 <= upper <= 32767 and delta:
388 upper = upper & 0xffff
389 else:
390 upper += char
391 delta = False
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000392 else:
393 upper = 0
394 if record[13]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000395 lower = int(record[13], 16) - char
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000396 if -32768 <= lower <= 32767 and delta:
397 lower = lower & 0xffff
398 else:
399 lower += char
400 delta = False
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000401 else:
402 lower = 0
403 if record[14]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000404 title = int(record[14], 16) - char
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000405 if -32768 <= lower <= 32767 and delta:
406 title = title & 0xffff
407 else:
408 title += char
409 delta = False
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000410 else:
411 title = 0
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000412 if not delta:
413 flags |= NODELTA_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000414 # decimal digit, integer digit
415 decimal = 0
416 if record[6]:
417 flags |= DECIMAL_MASK
418 decimal = int(record[6])
419 digit = 0
420 if record[7]:
421 flags |= DIGIT_MASK
422 digit = int(record[7])
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000423 item = (
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000424 upper, lower, title, decimal, digit, flags
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000425 )
426 # add entry to index and item tables
427 i = cache.get(item)
428 if i is None:
429 cache[item] = i = len(table)
430 table.append(item)
431 index[char] = i
432
Collin Winter6afaeb72007-08-03 17:06:41 +0000433 print(len(table), "unique character type entries")
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000434
Collin Winter6afaeb72007-08-03 17:06:41 +0000435 print("--- Writing", FILE, "...")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000436
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000437 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000438 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
439 print(file=fp)
440 print("/* a list of unique character type descriptors */", file=fp)
441 print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000442 for item in table:
Collin Winter6afaeb72007-08-03 17:06:41 +0000443 print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
444 print("};", file=fp)
445 print(file=fp)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000446
447 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000448 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000449
Collin Winter6afaeb72007-08-03 17:06:41 +0000450 print("/* type indexes */", file=fp)
451 print("#define SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000452 Array("index1", index1).dump(fp, trace)
453 Array("index2", index2).dump(fp, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000454
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000455 fp.close()
456
457# --------------------------------------------------------------------
458# unicode name database
459
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000460def CmpToKey(mycmp):
461 'Convert a cmp= function into a key= function'
462 class K(object):
463 def __init__(self, obj, *args):
464 self.obj = obj
465 def __lt__(self, other):
466 return mycmp(self.obj, other.obj) == -1
467 return K
468
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000469def makeunicodename(unicode, trace):
470
471 FILE = "Modules/unicodename_db.h"
472
Collin Winter6afaeb72007-08-03 17:06:41 +0000473 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000474
475 # collect names
476 names = [None] * len(unicode.chars)
477
478 for char in unicode.chars:
479 record = unicode.table[char]
480 if record:
481 name = record[1].strip()
482 if name and name[0] != "<":
483 names[char] = name + chr(0)
484
Georg Brandl559e5d72008-06-11 18:37:52 +0000485 print(len(list(n for n in names if n is not None)), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000486
487 # collect unique words from names (note that we differ between
488 # words inside a sentence, and words ending a sentence. the
489 # latter includes the trailing null byte.
490
491 words = {}
492 n = b = 0
493 for char in unicode.chars:
494 name = names[char]
495 if name:
496 w = name.split()
497 b = b + len(name)
498 n = n + len(w)
499 for w in w:
500 l = words.get(w)
501 if l:
502 l.append(None)
503 else:
504 words[w] = [len(words)]
505
Collin Winter6afaeb72007-08-03 17:06:41 +0000506 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000507
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000508 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000509
Martin v. Löwis97225da2002-11-24 23:05:09 +0000510 # sort on falling frequency, then by name
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000511 def cmpwords(a,b):
512 aword, alist = a
513 bword, blist = b
Martin v. Löwis97225da2002-11-24 23:05:09 +0000514 r = -cmp(len(alist),len(blist))
515 if r:
516 return r
517 return cmp(aword, bword)
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000518 wordlist.sort(key=CmpToKey(cmpwords))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000519
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000520 # figure out how many phrasebook escapes we need
521 escapes = 0
522 while escapes * 256 < len(wordlist):
523 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000524 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000525
526 short = 256 - escapes
527
528 assert short > 0
529
Collin Winter6afaeb72007-08-03 17:06:41 +0000530 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000531
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000532 # statistics
533 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000534 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000535 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000536 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000537
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000538 # pick the most commonly used words, and sort the rest on falling
539 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000540
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000541 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000542 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000543 wordlist.extend(wordtail)
544
545 # generate lexicon from words
546
547 lexicon_offset = [0]
548 lexicon = ""
549 words = {}
550
551 # build a lexicon string
552 offset = 0
553 for w, x in wordlist:
554 # encoding: bit 7 indicates last character in word (chr(128)
555 # indicates the last character in an entire string)
556 ww = w[:-1] + chr(ord(w[-1])+128)
557 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000558 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000559 if o < 0:
560 o = offset
561 lexicon = lexicon + ww
562 offset = offset + len(w)
563 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000564 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000565
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000566 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000567
568 # generate phrasebook from names and lexicon
569 phrasebook = [0]
570 phrasebook_offset = [0] * len(unicode.chars)
571 for char in unicode.chars:
572 name = names[char]
573 if name:
574 w = name.split()
575 phrasebook_offset[char] = len(phrasebook)
576 for w in w:
577 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000578 if i < short:
579 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000580 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000581 # store as two bytes
582 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000583 phrasebook.append(i&255)
584
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000585 assert getsize(phrasebook) == 1
586
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000587 #
588 # unicode name hash table
589
590 # extract names
591 data = []
592 for char in unicode.chars:
593 record = unicode.table[char]
594 if record:
595 name = record[1].strip()
596 if name and name[0] != "<":
597 data.append((name, char))
598
599 # the magic number 47 was chosen to minimize the number of
600 # collisions on the current data set. if you like, change it
601 # and see what happens...
602
603 codehash = Hash("code", data, 47)
604
Collin Winter6afaeb72007-08-03 17:06:41 +0000605 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000606
607 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000608 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
609 print(file=fp)
610 print("#define NAME_MAXLEN", 256, file=fp)
611 print(file=fp)
612 print("/* lexicon */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000613 Array("lexicon", lexicon).dump(fp, trace)
614 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000615
616 # split decomposition index table
617 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
618
Collin Winter6afaeb72007-08-03 17:06:41 +0000619 print("/* code->name phrasebook */", file=fp)
620 print("#define phrasebook_shift", shift, file=fp)
621 print("#define phrasebook_short", short, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000622
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000623 Array("phrasebook", phrasebook).dump(fp, trace)
624 Array("phrasebook_offset1", offset1).dump(fp, trace)
625 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000626
Collin Winter6afaeb72007-08-03 17:06:41 +0000627 print("/* name->code dictionary */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000628 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000629
630 fp.close()
631
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000632
633def merge_old_version(version, new, old):
634 # Changes to exclusion file not implemented yet
635 if old.exclusions != new.exclusions:
Collin Wintera817e582007-08-22 23:05:06 +0000636 raise NotImplementedError("exclusions differ")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000637
638 # In these change records, 0xFF means "no change"
639 bidir_changes = [0xFF]*0x110000
640 category_changes = [0xFF]*0x110000
641 decimal_changes = [0xFF]*0x110000
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000642 mirrored_changes = [0xFF]*0x110000
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000643 # In numeric data, 0 means "no change",
644 # -1 means "did not have a numeric value
645 numeric_changes = [0] * 0x110000
646 # normalization_changes is a list of key-value pairs
647 normalization_changes = []
648 for i in range(0x110000):
649 if new.table[i] is None:
650 # Characters unassigned in the new version ought to
651 # be unassigned in the old one
652 assert old.table[i] is None
653 continue
654 # check characters unassigned in the old version
655 if old.table[i] is None:
656 # category 0 is "unassigned"
657 category_changes[i] = 0
658 continue
659 # check characters that differ
660 if old.table[i] != new.table[i]:
661 for k in range(len(old.table[i])):
662 if old.table[i][k] != new.table[i][k]:
663 value = old.table[i][k]
664 if k == 2:
665 #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
666 category_changes[i] = CATEGORY_NAMES.index(value)
667 elif k == 4:
668 #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
669 bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
670 elif k == 5:
671 #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
672 # We assume that all normalization changes are in 1:1 mappings
673 assert " " not in value
674 normalization_changes.append((i, value))
675 elif k == 6:
676 #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
677 # we only support changes where the old value is a single digit
678 assert value in "0123456789"
679 decimal_changes[i] = int(value)
680 elif k == 8:
681 # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
682 # Since 0 encodes "no change", the old value is better not 0
683 assert value != "0" and value != "-1"
684 if not value:
685 numeric_changes[i] = -1
686 else:
687 assert re.match("^[0-9]+$", value)
688 numeric_changes[i] = int(value)
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000689 elif k == 9:
690 if value == 'Y':
691 mirrored_changes[i] = '1'
692 else:
693 mirrored_changes[i] = '0'
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000694 elif k == 11:
695 # change to ISO comment, ignore
696 pass
697 elif k == 12:
698 # change to simple uppercase mapping; ignore
699 pass
700 elif k == 13:
701 # change to simple lowercase mapping; ignore
702 pass
703 elif k == 14:
704 # change to simple titlecase mapping; ignore
705 pass
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000706 elif k == 16:
707 # derived property changes; not yet
708 pass
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000709 else:
710 class Difference(Exception):pass
Collin Wintera817e582007-08-22 23:05:06 +0000711 raise Difference(hex(i), k, old.table[i], new.table[i])
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000712 new.changed.append((version, list(zip(bidir_changes, category_changes,
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000713 decimal_changes, mirrored_changes,
714 numeric_changes)),
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000715 normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000716
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000717
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000718# --------------------------------------------------------------------
719# the following support code is taken from the unidb utilities
720# Copyright (c) 1999-2000 by Secret Labs AB
721
722# load a unicode-data file from disk
723
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000724import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000725
726class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000727 # Record structure:
728 # [ID, name, category, combining, bidi, decomp, (6)
729 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
730 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
731 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000732
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000733 def __init__(self, filename, exclusions, eastasianwidth,
734 derivedprops, expand=1):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000735 self.changed = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000736 file = open(filename)
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000737 table = [None] * 0x110000
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000738 while 1:
739 s = file.readline()
740 if not s:
741 break
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000742 s = s.strip().split(";")
743 char = int(s[0], 16)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000744 table[char] = s
745
Martin v. Löwis97225da2002-11-24 23:05:09 +0000746 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000747 if expand:
748 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000749 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000750 s = table[i]
751 if s:
752 if s[1][-6:] == "First>":
753 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000754 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000755 elif s[1][-5:] == "Last>":
756 s[1] = ""
757 field = None
758 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000759 f2 = field[:]
760 f2[0] = "%X" % i
761 table[i] = f2
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000762
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000763 # public attributes
764 self.filename = filename
765 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000766 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000767
Martin v. Löwis677bde22002-11-23 22:08:15 +0000768 file = open(exclusions)
769 self.exclusions = {}
770 for s in file:
771 s = s.strip()
772 if not s:
773 continue
774 if s[0] == '#':
775 continue
776 char = int(s.split()[0],16)
777 self.exclusions[char] = 1
778
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000779 widths = [None] * 0x110000
780 for s in open(eastasianwidth):
781 s = s.strip()
782 if not s:
783 continue
784 if s[0] == '#':
785 continue
786 s = s.split()[0].split(';')
787 if '..' in s[0]:
788 first, last = [int(c, 16) for c in s[0].split('..')]
Georg Brandlbf82e372008-05-16 17:02:34 +0000789 chars = list(range(first, last+1))
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000790 else:
791 chars = [int(s[0], 16)]
792 for char in chars:
793 widths[char] = s[1]
794 for i in range(0, 0x110000):
795 if table[i] is not None:
796 table[i].append(widths[i])
797
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000798 for i in range(0, 0x110000):
799 if table[i] is not None:
800 table[i].append(set())
801 for s in open(derivedprops):
802 s = s.split('#', 1)[0].strip()
803 if not s:
804 continue
805
806 r, p = s.split(";")
807 r = r.strip()
808 p = p.strip()
809 if ".." in r:
810 first, last = [int(c, 16) for c in r.split('..')]
Georg Brandlbf82e372008-05-16 17:02:34 +0000811 chars = list(range(first, last+1))
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000812 else:
813 chars = [int(r, 16)]
814 for char in chars:
815 if table[char]:
816 # Some properties (e.g. Default_Ignorable_Code_Point)
817 # apply to unassigned code points; ignore them
818 table[char][-1].add(p)
819
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000820 def uselatin1(self):
821 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +0000822 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000823
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000824# hash table tools
825
826# this is a straight-forward reimplementation of Python's built-in
827# dictionary type, using a static data structure, and a custom string
828# hash algorithm.
829
830def myhash(s, magic):
831 h = 0
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000832 for c in map(ord, s.upper()):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000833 h = (h * magic) + c
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000834 ix = h & 0xff000000
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000835 if ix:
836 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
837 return h
838
839SIZES = [
840 (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
841 (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
842 (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
843 (2097152,5), (4194304,3), (8388608,33), (16777216,27)
844]
845
846class Hash:
847 def __init__(self, name, data, magic):
848 # turn a (key, value) list into a static hash table structure
849
850 # determine table size
851 for size, poly in SIZES:
852 if size > len(data):
853 poly = size + poly
854 break
855 else:
Collin Wintera817e582007-08-22 23:05:06 +0000856 raise AssertionError("ran out of polynominals")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000857
Collin Winter6afaeb72007-08-03 17:06:41 +0000858 print(size, "slots in hash table")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000859
860 table = [None] * size
861
862 mask = size-1
863
864 n = 0
865
866 hash = myhash
867
868 # initialize hash table
869 for key, value in data:
870 h = hash(key, magic)
871 i = (~h) & mask
872 v = table[i]
873 if v is None:
874 table[i] = value
875 continue
876 incr = (h ^ (h >> 3)) & mask;
877 if not incr:
878 incr = mask
879 while 1:
880 n = n + 1
881 i = (i + incr) & mask
882 v = table[i]
883 if v is None:
884 table[i] = value
885 break
886 incr = incr << 1
887 if incr > mask:
888 incr = incr ^ poly
889
Collin Winter6afaeb72007-08-03 17:06:41 +0000890 print(n, "collisions")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000891 self.collisions = n
892
893 for i in range(len(table)):
894 if table[i] is None:
895 table[i] = 0
896
897 self.data = Array(name + "_hash", table)
898 self.magic = magic
899 self.name = name
900 self.size = size
901 self.poly = poly
902
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000903 def dump(self, file, trace):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000904 # write data to file, as a C array
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000905 self.data.dump(file, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000906 file.write("#define %s_magic %d\n" % (self.name, self.magic))
907 file.write("#define %s_size %d\n" % (self.name, self.size))
908 file.write("#define %s_poly %d\n" % (self.name, self.poly))
909
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000910# stuff to deal with arrays of unsigned integers
911
912class Array:
913
914 def __init__(self, name, data):
915 self.name = name
916 self.data = data
917
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000918 def dump(self, file, trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000919 # write data to file, as a C array
920 size = getsize(self.data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000921 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +0000922 print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000923 file.write("static ")
924 if size == 1:
925 file.write("unsigned char")
926 elif size == 2:
927 file.write("unsigned short")
928 else:
929 file.write("unsigned int")
930 file.write(" " + self.name + "[] = {\n")
931 if self.data:
932 s = " "
933 for item in self.data:
934 i = str(item) + ", "
935 if len(s) + len(i) > 78:
936 file.write(s + "\n")
937 s = " " + i
938 else:
939 s = s + i
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000940 if s.strip():
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000941 file.write(s + "\n")
942 file.write("};\n\n")
943
944def getsize(data):
945 # return smallest possible integer size for the given array
946 maxdata = max(data)
947 if maxdata < 256:
948 return 1
949 elif maxdata < 65536:
950 return 2
951 else:
952 return 4
953
Tim Peters21013482000-09-25 07:13:41 +0000954def splitbins(t, trace=0):
955 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
956
957 t is a sequence of ints. This function can be useful to save space if
958 many of the ints are the same. t1 and t2 are lists of ints, and shift
959 is an int, chosen to minimize the combined size of t1 and t2 (in C
960 code), and where for each i in range(len(t)),
961 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
962 where mask is a bitmask isolating the last "shift" bits.
963
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000964 If optional arg trace is non-zero (default zero), progress info
965 is printed to sys.stderr. The higher the value, the more info
966 you'll get.
Tim Peters21013482000-09-25 07:13:41 +0000967 """
968
969 import sys
970 if trace:
971 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +0000972 print("%d+%d bins at shift %d; %d bytes" % (
973 len(t1), len(t2), shift, bytes), file=sys.stderr)
974 print("Size of original table:", len(t)*getsize(t), \
975 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +0000976 n = len(t)-1 # last valid index
977 maxshift = 0 # the most we can shift n and still have something left
978 if n > 0:
979 while n >> 1:
980 n >>= 1
981 maxshift += 1
982 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +0000983 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +0000984 t = tuple(t) # so slices can be dict keys
985 for shift in range(maxshift + 1):
986 t1 = []
987 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000988 size = 2**shift
989 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +0000990 for i in range(0, len(t), size):
991 bin = t[i:i+size]
992 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000993 if index is None:
Tim Peters21013482000-09-25 07:13:41 +0000994 index = len(t2)
995 bincache[bin] = index
996 t2.extend(bin)
997 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000998 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +0000999 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001000 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001001 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001002 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001003 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001004 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001005 t1, t2, shift = best
1006 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001007 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001008 dump(t1, t2, shift, bytes)
1009 if __debug__:
1010 # exhaustively verify that the decomposition is correct
1011 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001012 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001013 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1014 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001015
1016if __name__ == "__main__":
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001017 maketables(1)