blob: 15841d7c519e9d4ef1441d23724c422787b2d8b4 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.0 database file to
# Modules/unicodedata_db.h and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields
# 2000-11-03 fl expand first/last ranges
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#
16
17import sys
18
# name and version of this generator; both are echoed into the banner
# comment at the top of every generated header
SCRIPT = sys.argv[0]
VERSION = "1.1"

# input database, looked up relative to the current directory
UNICODE_DATA = "UnicodeData-Latest.txt"

# general category names; a record stores the list position of its name.
# "Cn" is listed twice, so list.index() always resolves it to slot 0.
CATEGORY_NAMES = [
    "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd", "Nl", "No",
    "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm", "Lo",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So",
]

# bidirectional class names; slot 0 is the empty string
BIDIRECTIONAL_NAMES = [
    "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO", "PDF", "EN",
    "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS", "ON",
]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 1 << 0      # 0x01
DECIMAL_MASK = 1 << 1    # 0x02
DIGIT_MASK = 1 << 2      # 0x04
LOWER_MASK = 1 << 3      # 0x08
LINEBREAK_MASK = 1 << 4  # 0x10
SPACE_MASK = 1 << 5      # 0x20
TITLE_MASK = 1 << 6      # 0x40
UPPER_MASK = 1 << 7      # 0x80
42
def maketables(trace=0):
    """Regenerate the Unicode property and type header files.

    Loads UNICODE_DATA through UnicodeData and writes two C headers,
    Modules/unicodedata_db.h and Objects/unicodetype_db.h (both paths are
    relative to the current directory).  Progress messages are written
    to sys.stdout; trace is handed on unchanged to splitbins().
    """
    unicode = UnicodeData(UNICODE_DATA)

    sys.stdout.write("--- Processing %s ...\n" % UNICODE_DATA)
    assigned = [record for record in unicode.table if record]
    sys.stdout.write("%d characters\n" % len(assigned))

    _makeunicodedata(unicode, trace)
    _makeunicodetype(unicode, trace)


def _lookup(item, table, cache):
    # return the position of item in table, appending it on first use;
    # cache maps every item already in table to its position
    i = cache.get(item)
    if i is None:
        cache[item] = i = len(table)
        table.append(item)
    return i


def _delta(field, char):
    # offset from char to the hex code point in field, wrapped to 16 bits;
    # an empty field (no case mapping) yields 0
    if field:
        return (int(field, 16) - char) & 0xffff
    return 0


def _makeunicodedata(unicode, trace):
    # build the property and decomposition tables and write them to
    # Modules/unicodedata_db.h

    # 1) database properties

    # slot 0 is the all-zero record used by characters without an entry.
    # The cache is keyed by record (record -> slot) so that a character
    # whose properties are all zero reuses slot 0 instead of adding a
    # duplicate.
    dummy = (0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            item = (category, combining, bidirectional, mirrored)
            index[char] = _lookup(item, table, cache)

    # 2) decomposition data

    # FIXME: <fl> using the encoding stuff from unidb would save
    # another 50k or so, but I'll leave that for 2.1...

    # slot 0 is the empty string, meaning "no decomposition"; the dict
    # replaces a linear list.index() search per character
    decomp_data = [""]
    decomp_cache = {"": 0}
    decomp_index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                i = _lookup(record[5], decomp_data, decomp_cache)
            else:
                i = 0
            decomp_index[char] = i

    FILE = "Modules/unicodedata_db.h"

    sys.stdout.write("--- Writing %s ...\n" % FILE)
    sys.stdout.write("%d unique properties\n" % len(table))
    sys.stdout.write("%d unique decomposition entries\n" % len(decomp_data))

    fp = open(FILE, "w")
    try:
        fp.write("/* this file was generated by %s %s */\n"
                 % (SCRIPT, VERSION))
        fp.write("\n")
        fp.write("/* a list of unique database records */\n")
        fp.write("const _PyUnicode_DatabaseRecord "
                 "_PyUnicode_Database_Records[] = {\n")
        for item in table:
            fp.write(" {%d, %d, %d, %d},\n" % item)
        fp.write("};\n")
        fp.write("\n")

        # FIXME: the following tables should be made static, and
        # the support code moved into unicodedatabase.c

        fp.write("/* string literals */\n")
        fp.write("const char *_PyUnicode_CategoryNames[] = {\n")
        for name in CATEGORY_NAMES:
            fp.write(" \"%s\",\n" % name)
        fp.write(" NULL\n")
        fp.write("};\n")

        fp.write("const char *_PyUnicode_BidirectionalNames[] = {\n")
        for name in BIDIRECTIONAL_NAMES:
            fp.write(" \"%s\",\n" % name)
        fp.write(" NULL\n")
        fp.write("};\n")

        fp.write("static const char *decomp_data[] = {\n")
        for name in decomp_data:
            fp.write(" \"%s\",\n" % name)
        fp.write(" NULL\n")
        fp.write("};\n")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fp.write("/* index tables for the database records */\n")
        fp.write("#define SHIFT %d\n" % shift)
        Array("index1", index1).dump(fp)
        Array("index2", index2).dump(fp)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fp.write("/* index tables for the decomposition data */\n")
        fp.write("#define DECOMP_SHIFT %d\n" % shift)
        Array("decomp_index1", index1).dump(fp)
        Array("decomp_index2", index2).dump(fp)
    finally:
        # flush and release the header even if generation fails part-way
        fp.close()


def _makeunicodetype(unicode, trace):
    # build the character type table and write it to
    # Objects/unicodetype_db.h

    # 3) unicode type data

    # slot 0 is the all-zero descriptor; keyed by descriptor so that
    # flag-less characters share it (see _makeunicodedata)
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
            upper = _delta(record[12], char)
            lower = _delta(record[13], char)
            title = _delta(record[14], char)
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (flags, upper, lower, title, decimal, digit)
            index[char] = _lookup(item, table, cache)

    FILE = "Objects/unicodetype_db.h"

    sys.stdout.write("--- Writing %s ...\n" % FILE)
    sys.stdout.write("%d unique character type entries\n" % len(table))

    fp = open(FILE, "w")
    try:
        fp.write("/* this file was generated by %s %s */\n"
                 % (SCRIPT, VERSION))
        fp.write("\n")
        fp.write("/* a list of unique character type descriptors */\n")
        fp.write("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {\n")
        for item in table:
            fp.write(" {%d, %d, %d, %d, %d, %d},\n" % item)
        fp.write("};\n")
        fp.write("\n")

        # split type index table
        index1, index2, shift = splitbins(index, trace)

        fp.write("/* type indexes */\n")
        fp.write("#define SHIFT %d\n" % shift)
        Array("index1", index1).dump(fp)
        Array("index2", index2).dump(fp)
    finally:
        fp.close()
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk
242
243import string, sys
244
class UnicodeData:
    """In-memory copy of a semicolon-separated unicode database file.

    Public attributes:
      filename -- the path the data was read from
      table    -- list of 65536 slots indexed by code point; each slot is
                  None or the list of fields from that character's line
      chars    -- the code points callers should iterate over
    """

    def __init__(self, filename, expand=1):
        """Load filename; if expand is true, fill in First/Last ranges.

        Field 0 of every line is the code point in hex.  A code point
        above 0xFFFF does not fit the table and raises IndexError.
        """
        table = [None] * 65536
        fp = open(filename)
        try:
            for line in fp.readlines():
                line = line.strip()
                if not line:
                    # tolerate blank lines instead of failing in int()
                    continue
                fields = line.split(";")
                table[int(fields[0], 16)] = fields
        finally:
            fp.close()

        # expand first-last ranges (ignore surrogates and private use)
        if expand:
            field = None
            for i in range(0, 0xD800):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s[:]
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    # give every code point in the range its own copy;
                    # sharing one list would leave all of them carrying
                    # the code of whichever slot was filled last
                    entry = field[:]
                    entry[0] = hex(i)
                    table[i] = entry

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        """Restrict the character range to ISO Latin 1."""
        self.chars = range(256)
283# stuff to deal with arrays of unsigned integers
284
class Array:
    """A named sequence of unsigned integers that can be written out as C."""

    def __init__(self, name, data):
        # name becomes the C identifier, data supplies the initializers
        self.name = name
        self.data = data

    def dump(self, file):
        """Write the data to file as a static C array definition."""
        # pick the narrowest unsigned C type that getsize() allows
        width = getsize(self.data)
        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            # pack as many "value, " items per line as fit in 78 columns
            line = " "
            for value in self.data:
                text = str(value) + ", "
                if len(line) + len(text) > 78:
                    file.write(line + "\n")
                    line = " " + text
                else:
                    line = line + text
            # flush whatever is left on the last, partial line
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
def getsize(data):
    """Return the smallest integer size, in bytes, that fits every item.

    data is a sequence of non-negative ints.  The result is 1, 2 or 4;
    an empty sequence reports 1.
    """
    if not data:
        # max() raises ValueError on an empty sequence; an empty array
        # needs nothing wider than the narrowest element type
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    An empty t yields ([], [], 0).

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys

    def report(t1, t2, shift, nbytes):
        sys.stderr.write("%d+%d bins at shift %d; %d bytes\n" % (
            len(t1), len(t2), shift, nbytes))

    if not t:
        # nothing to split, and getsize() cannot rate an empty table
        return [], [], 0

    if trace:
        sys.stderr.write("Size of original table: %d bytes\n"
                         % (len(t) * getsize(t)))

    # maxshift is the most we can shift the last valid index and still
    # have something left (t is non-empty here, so that index is >= 0)
    n = len(t) - 1
    maxshift = 0
    while n >> 1:
        n >>= 1
        maxshift += 1
    del n

    best = None        # (t1, t2, shift) of the smallest split so far
    best_size = 0      # its total size in bytes; unused while best is None
    t = tuple(t)       # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 1 << shift
        bincache = {}
        for i in range(0, len(t), size):
            chunk = t[i:i+size]
            index = bincache.get(chunk)
            if index is None:
                # first time this bin is seen: store it at the end of t2
                index = len(t2)
                bincache[chunk] = index
                t2.extend(chunk)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            report(t1, t2, shift, b)
        if best is None or b < best_size:
            best = t1, t2, shift
            best_size = b
    t1, t2, shift = best
    if trace:
        sys.stderr.write("Best: ")
        report(t1, t2, shift, best_size)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
if __name__ == "__main__":
    # run as a script: regenerate both headers with trace level 1
    maketables(trace=1)