#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys

# Path of this script and generator version; both are stamped into the
# headers written by the make* functions below.
SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.1.0"
# Input file name templates; "%s" is filled with "" for the current UCD
# or with "-<version>" for each entry of old_versions (see maketables).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

# Old UCD versions for which delta (change-record) tables are generated.
old_versions = ["3.2.0"]

# General-category abbreviations; the list position is the numeric code
# stored in the generated database records.
# NOTE(review): "Cn" appears at index 0 and again at index 17 --
# CATEGORY_NAMES.index("Cn") always yields 0, so slot 17 is unreachable.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class abbreviations; index 0 ("") is "no value".
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East Asian Width class abbreviations, indexed by stored code.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
NUMERIC_MASK = 0x200
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000065def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000066
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000067 print "--- Reading", UNICODE_DATA % "", "..."
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000068
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000069 version = ""
70 unicode = UnicodeData(UNICODE_DATA % version,
71 COMPOSITION_EXCLUSIONS % version,
Antoine Pitroue988e282009-04-27 21:53:26 +000072 EASTASIAN_WIDTH % version,
Amaury Forgeot d'Arcd0052d12009-10-06 19:56:32 +000073 UNIHAN % version,
Antoine Pitroue988e282009-04-27 21:53:26 +000074 DERIVEDNORMALIZATION_PROPS % version)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000075
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000076 print len(filter(None, unicode.table)), "characters"
77
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078 for version in old_versions:
79 print "--- Reading", UNICODE_DATA % ("-"+version), "..."
80 old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
81 COMPOSITION_EXCLUSIONS % ("-"+version),
Amaury Forgeot d'Arcd0052d12009-10-06 19:56:32 +000082 EASTASIAN_WIDTH % ("-"+version),
83 UNIHAN % ("-"+version))
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000084 print len(filter(None, old_unicode.table)), "characters"
85 merge_old_version(version, unicode, old_unicode)
86
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000087 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000088 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000089 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000090
# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h: unique property records plus
    index tables, decomposition data, NFC composition pair tables, and
    delta/normalization tables for each old version in unicode.changed.

    unicode is a UnicodeData instance; trace is forwarded to splitbins
    and Array.dump for diagnostics.
    """

    # record 0 is the dummy used for unassigned code points
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}                  # record tuple -> index into table
    index = [0] * len(unicode.chars)    # code point -> record index

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            # record[16] is presumably the quick-check data merged in
            # from DerivedNormalizationProps by UnicodeData -- confirm
            # against the UnicodeData constructor
            normalizationquickcheck = record[16]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    # flat array of packed decompositions: first word is
    # prefix-index | (length << 8), followed by the code points
    decomp_data = [0]
    decomp_prefix = [""]        # compatibility tags; "" = canonical
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0             # running size in bytes (2 per entry)

    comp_pairs = []                             # (first, last, composed)
    comp_first = [None] * len(unicode.chars)    # chars valid as NFC first
    comp_last = [None] * len(unicode.chars)     # chars valid as NFC last

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    # the packed length field in the C tables is small
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: pack prefix index and length into the first word
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions of non-excluded characters whose first
                # component has combining class 0
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp    # p (packed header word) unused here
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    # NOTE(review): decomp_data is a flat list of ints,
                    # so .index(decomp) -- decomp being a list -- can
                    # never match; every decomposition takes the
                    # ValueError path and is appended without sharing.
                    # Harmless (only affects table size); verify before
                    # "fixing", since a change alters the emitted table.
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # renumber NFC first/last characters densely and collect the
    # contiguous code point ranges they form (for the reindex structs)
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # dense total_first x total_last matrix: (first, last) -> composition
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        # deduplicate change records, building an index per code point
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        # C accessor: two-level table lookup, record 0 for out-of-range
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        # C accessor: old 1:1 normalization mappings as a switch
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()
344
# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h: unique character type records
    (case mappings, decimal/digit values, flag bits) with index tables,
    plus generated C helpers _PyUnicode_ToNumeric,
    _PyUnicode_IsWhitespace and _PyUnicode_IsLinebreak.
    """

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}                  # record tuple -> index into table
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric-value string (e.g. "1/2") -> [code points]
    spaces = []         # code points flagged as whitespace
    linebreaks = []     # code points flagged as linebreaks

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            delta = True   # NOTE(review): assigned but never read (dead)
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas, stored as 16-bit two's complement
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                # delta does not fit: store absolute values and flag it
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                # keep the numeric value as its source string; it is
                # emitted verbatim as a C expression below
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"
    print sum(map(len, numeric.values())), "numeric code points"
    print len(spaces), "whitespace code points"
    print len(linebreaks), "linebreak code points"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = numeric.items()
    numeric_items.sort()
    print >>fp, '/* Returns the numeric value as double for Unicode characters'
    print >>fp, ' * having this property, -1.0 otherwise.'
    print >>fp, ' */'
    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    for value, codepoints in numeric_items:
        # cases for code points >= 0x10000 are wrapped in
        # #ifdef Py_UNICODE_WIDE
        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print >>fp, '#ifdef Py_UNICODE_WIDE'
                haswide = True
            print >>fp, '    case 0x%04X:' % (codepoint,)
        if haswide and hasnonewide:
            print >>fp, '#endif'
        print >>fp, '        return (double) %s;' % (value,)
        if haswide and not hasnonewide:
            print >>fp, '#endif'
    print >>fp,'    }'
    print >>fp,'    return -1.0;'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsWhitespace()
    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
    print >>fp, '    return iswspace(ch);'
    print >>fp, '#else'
    print >>fp, '    switch (ch) {'

    haswide = False
    hasnonewide = False
    spaces.sort()
    for codepoint in spaces:
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp, '#endif'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsLinebreak()
    print >>fp, "/* Returns 1 for Unicode characters having the category 'Zl',"
    print >>fp, " * 'Zp' or type 'B', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    haswide = False
    hasnonewide = False
    linebreaks.sort()
    for codepoint in linebreaks:
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp,'}'
    print >>fp

    fp.close()
553
# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):
    """Write Modules/unicodename_db.h: a compressed code->name
    "phrasebook" (names encoded as sequences of lexicon word indexes)
    and a name->code hash table.
    """

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            # skip synthetic "<...>" placeholder names; keep real names
            # NUL-terminated so end-of-name survives the word encoding
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence. the
    # latter includes the trailing null byte.

    words = {}      # word -> list whose length is the occurrence count
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:     # NOTE: rebinds w to each word in turn
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need: words beyond the
    # one-byte range are addressed via (escape, byte) pairs
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}      # reused: word -> phrasebook word index

    # build a lexicon string
    offset = 0
    for w, x in wordlist:   # x (occurrence list) unused here
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    # common word: single byte
                    phrasebook.append(i)
                else:
                    # store as two bytes (escape byte + low byte)
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in one byte
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
717
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000718
def merge_old_version(version, new, old):
    """Record the delta between an older UCD snapshot and the current one.

    Compares old.table against new.table character by character and appends
    a change record (version, zipped change table, normalization_changes)
    to new.changed.  Only the fields needed for the version delta are
    tracked; fields we deliberately ignore (ISO comment, simple case
    mappings) are skipped, and any other difference raises an exception.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        # general category changed
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        # bidirectional class changed
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # decomposition mapping changed
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # decimal digit value changed
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # numeric value changed
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # mirrored property changed; store ints rather
                        # than '1'/'0' strings so the array stays
                        # homogeneous with its 0xFF default -- str() of
                        # the value emits the same C source either way
                        if value == 'Y':
                            mirrored_changes[i] = 1
                        else:
                            mirrored_changes[i] = 0
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000798
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000799
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000800# --------------------------------------------------------------------
801# the following support code is taken from the unidb utilities
802# Copyright (c) 1999-2000 by Secret Labs AB
803
804# load a unicode-data file from disk
805
class UnicodeData:
    """In-memory representation of the Unicode Character Database.

    Parses UnicodeData.txt plus several auxiliary data files and exposes:
      filename   -- path of the main data file
      table      -- list indexed by code point; each entry is the list of
                    semicolon-separated fields for that character (with
                    extra fields appended by the later parsing stages
                    below), or None for unassigned code points
      chars      -- the range of code points to process
      exclusions -- dict keyed by code points listed in the composition
                    exclusion file
      changed    -- change records, filled in later by merge_old_version()
    """

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivednormalizationprops=None, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        # UnicodeData.txt: one semicolon-separated record per assigned
        # character, keyed by its hex code point in field 0
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        # start of a <..., First>/<..., Last> range pair:
                        # blank the name and remember the record
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                    elif field:
                        # code point inside an open range: clone the
                        # "First" record, patching in this code point
                        f2 = field[:]
                        f2[0] = "%X" % i
                        table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # composition exclusions: one hex code point per non-comment line
        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        # east asian width: lines are CODE;WIDTH or FIRST..LAST;WIDTH;
        # the width code is appended to each assigned table record
        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # normalization quick-check flags, packed two bits per NF*_QC
        # property into a single int appended to each assigned record
        # (0 = Yes; 1 = Maybe, 2 = No per 'MN'.index(...) + 1)
        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    # each property may be listed at most once per char
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        # Unihan numeric properties: lines look like "U+XXXX  kTag  value";
        # only the three numeric tags are applied
        for line in open(unihan):
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
911
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000912# hash table tools
913
914# this is a straight-forward reimplementation of Python's built-in
915# dictionary type, using a static data structure, and a custom string
916# hash algorithm.
917
def myhash(s, magic):
    # Case-insensitive string hash for the static name-lookup table:
    # multiply-and-add over the uppercased characters, folding any bits
    # above the low 24 back into the hash whenever they appear.
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        overflow = h & 0xff000000
        if overflow:
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
926
# Candidate hash-table sizes (powers of two) paired with an offset; the
# Hash constructor below picks the first size larger than the data set
# and uses size + offset as the polynomial for collision probing.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
933
class Hash:
    """Build a static open-addressing hash table from (key, value) pairs.

    The table is dumped as C source via dump(); the runtime C lookup code
    must reproduce the same probe sequence from the recorded magic, size
    and poly constants, so the probing logic here must not change.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: smallest power of two larger than the
        # data set; its paired offset (plus the size) becomes the
        # probe polynomial -- see SIZES above
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynominals"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0  # total collision count, for reporting

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot occupied: derive a secondary probe increment from
            # the hash; it must be non-zero, so fall back to mask
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            # probe until a free slot turns up, mutating the increment
            # with the polynomial whenever it overflows the table size
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        # unused slots are encoded as 0 in the generated C table
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the constants
        # the C lookup code needs to replay the same hashing
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
997
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000998# stuff to deal with arrays of unsigned integers
999
class Array:
    """A named sequence of unsigned integers that can be written out as
    a static C array, using the smallest element type that fits."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # emit this array as C source, wrapping rows at ~78 columns
        size = getsize(self.data)
        if trace:
            sys.stderr.write("%s: %s bytes\n" % (self.name, size * len(self.data)))
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static %s %s[] = {\n" % (ctype, self.name))
        if self.data:
            line = "    "
            for item in self.data:
                piece = str(item) + ", "
                if len(line) + len(piece) > 78:
                    file.write(line + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1031
def getsize(data):
    """Return the smallest C integer width, in bytes (1, 2 or 4), that
    can represent every value in the given array."""
    biggest = max(data)
    if biggest >= 65536:
        return 4
    if biggest >= 256:
        return 2
    return 1
1041
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"

    # largest shift worth trying: floor(log2(last valid index))
    hi = len(t) - 1
    maxshift = 0
    while hi > 1:
        hi >>= 1
        maxshift += 1

    cheapest = sys.maxint          # smallest combined size found so far
    t = tuple(t)                   # so slices can be dict keys
    for shift in range(maxshift + 1):
        binsize = 1 << shift
        index = []                 # becomes t1
        flattened = []             # becomes t2
        bincache = {}
        for start in range(0, len(t), binsize):
            bin = t[start:start+binsize]
            pos = bincache.get(bin)
            if pos is None:
                # first time this bin is seen: append it to t2
                pos = len(flattened)
                bincache[bin] = pos
                flattened.extend(bin)
            index.append(pos >> shift)
        # combined memory cost of both tables at this shift
        cost = len(index)*getsize(index) + len(flattened)*getsize(flattened)
        if trace > 1:
            dump(index, flattened, shift, cost)
        if cost < cheapest:
            best = index, flattened, shift
            cheapest = cost
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, cheapest)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1    # low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001102
if __name__ == "__main__":
    # Script entry point.  maketables is defined earlier in the file;
    # the argument is presumably the trace flag threaded through to
    # splitbins/dump above -- TODO confirm against its definition.
    maketables(1)