#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.0 database file to
# Modules/unicodedata_db.h and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#
15
16import sys
17
# name and version of this script; both are written into the banner
# comment of every generated header
SCRIPT = sys.argv[0]
VERSION = "1.1"

# input file, looked up relative to the current directory
UNICODE_DATA = "UnicodeData-Latest.txt"

# general category names.  the position of a name in this list is the
# number stored in the generated records, so the order must not change.
# index 0 is also the category of the all-zero dummy record.  note that
# "Cn" appears twice; lookups by name always find the first one.
CATEGORY_NAMES = [
    "Cn",
    "Lu", "Ll", "Lt",
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Cs", "Co", "Cn",
    "Lm", "Lo",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    "Sm", "Sc", "Sk", "So",
]

# bidirectional category names.  as above, the list position is the
# number stored in the generated records; index 0 (the empty string)
# is what the all-zero dummy record refers to.
BIDIRECTIONAL_NAMES = [
    "",
    "L", "LRE", "LRO",
    "R", "AL", "RLE", "RLO",
    "PDF",
    "EN", "ES", "ET", "AN", "CS",
    "NSM", "BN", "B", "S", "WS", "ON",
]

# character type flags, one bit each
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
41
def maketables():
    """Regenerate the two C header files from the UNICODE_DATA file.

    Reads UNICODE_DATA and writes Modules/unicodedata_db.h (property
    records, name tables, decomposition strings and their index tables)
    and Objects/unicodetype_db.h (character type records and their index
    table).  Both output paths are relative to the current directory.
    The number of distinct type records is reported on stdout.

    Identical records are stored once; a record that is all zeros shares
    slot 0 with the dummy record used for unassigned code points.
    """

    db = UnicodeData(UNICODE_DATA)

    # 1) database properties
    table, index = _uniquerecords(db, _propertyrecord, (0, 0, 0, 0))

    # 2) decomposition data

    # FIXME: <fl> using the encoding stuff from unidb would save
    # another 50k or so, but I'll leave that for 2.1...

    decomp_data, decomp_index = _decomptables(db)

    fp = open("Modules/unicodedata_db.h", "w")
    try:
        _writerecords(
            fp,
            "/* a list of unique database records */",
            "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {",
            " {%d, %d, %d, %d},",
            table)

        # FIXME: the following tables should be made static, and
        # the support code moved into unicodedatabase.c

        fp.write("/* string literals */\n")
        _writestrings(
            fp, "const char *_PyUnicode_CategoryNames[] = {",
            CATEGORY_NAMES)
        _writestrings(
            fp, "const char *_PyUnicode_BidirectionalNames[] = {",
            BIDIRECTIONAL_NAMES)
        _writestrings(
            fp, "static const char *decomp_data[] = {",
            decomp_data)

        # split record index table
        _writeindex(
            fp, "/* index tables for the database records */",
            "SHIFT", "", index)

        # split decomposition index table
        _writeindex(
            fp, "/* index tables for the decomposition data */",
            "DECOMP_SHIFT", "decomp_", decomp_index)
    finally:
        # make sure the header is flushed and closed even on errors
        fp.close()

    #
    # 3) unicode type data

    table, index = _uniquerecords(db, _typerecord, (0, 0, 0, 0, 0, 0))

    sys.stdout.write("%d ctype entries\n" % len(table))

    fp = open("Objects/unicodetype_db.h", "w")
    try:
        _writerecords(
            fp,
            "/* a list of unique character type descriptors */",
            "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {",
            " {%d, %d, %d, %d, %d, %d},",
            table)

        # split type index table
        _writeindex(fp, "/* type indexes */", "SHIFT", "", index)
    finally:
        fp.close()


def _uniquerecords(db, makerecord, dummy):
    # collapse the per-character records into a list of distinct records
    # plus, for every character, the position of its record in that list.
    # characters without a database entry keep index 0 (the dummy record).
    table = [dummy]
    cache = {dummy: 0}  # record -> position in table
    index = [0] * len(db.chars)
    for char in db.chars:
        record = db.table[char]
        if record:
            item = makerecord(char, record)
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i
    return table, index


def _propertyrecord(char, record):
    # build the (category, combining, bidirectional, mirrored) record
    return (
        CATEGORY_NAMES.index(record[2]),
        int(record[3]),
        BIDIRECTIONAL_NAMES.index(record[4]),
        record[9] == "Y",
    )


def _casedelta(char, field):
    # distance from char to the mapped character in a hex field, modulo
    # 65536; an empty field means "no mapping" and gives 0
    if field:
        return (int(field, 16) - char) & 0xffff
    return 0


def _typerecord(char, record):
    # build the (flags, upper, lower, title, decimal, digit) record
    category = record[2]
    bidirectional = record[4]
    flags = 0
    if category in ("Lm", "Lt", "Lu", "Ll", "Lo"):
        flags |= ALPHA_MASK
    if category == "Ll":
        flags |= LOWER_MASK
    if category == "Zl" or bidirectional == "B":
        flags |= LINEBREAK_MASK
    if category == "Zs" or bidirectional in ("WS", "B", "S"):
        flags |= SPACE_MASK
    if category == "Lt":
        flags |= TITLE_MASK
    if category == "Lu":
        flags |= UPPER_MASK
    # use delta predictor for upper/lower/title
    upper = _casedelta(char, record[12])
    lower = _casedelta(char, record[13])
    title = _casedelta(char, record[14])
    # decimal digit, integer digit
    decimal = 0
    if record[6]:
        flags |= DECIMAL_MASK
        decimal = int(record[6])
    digit = 0
    if record[7]:
        flags |= DIGIT_MASK
        digit = int(record[7])
    return (flags, upper, lower, title, decimal, digit)


def _decomptables(db):
    # collect the distinct decomposition strings plus, for every
    # character, the position of its string (0 is the empty string)
    decomp_data = [""]
    cache = {"": 0}  # decomposition string -> position in decomp_data
    decomp_index = [0] * len(db.chars)
    for char in db.chars:
        record = db.table[char]
        if record and record[5]:
            i = cache.get(record[5])
            if i is None:
                cache[record[5]] = i = len(decomp_data)
                decomp_data.append(record[5])
            decomp_index[char] = i
    return decomp_data, decomp_index


def _writerecords(fp, comment, declaration, rowformat, table):
    # write the "generated by" banner followed by a C array of records
    fp.write("/* this file was generated by %s %s */\n" % (SCRIPT, VERSION))
    fp.write("\n")
    fp.write(comment + "\n")
    fp.write(declaration + "\n")
    for item in table:
        fp.write((rowformat % item) + "\n")
    fp.write("};\n")
    fp.write("\n")


def _writestrings(fp, declaration, names):
    # write a NULL-terminated C array of string literals
    fp.write(declaration + "\n")
    for name in names:
        fp.write(" \"%s\",\n" % name)
    fp.write(" NULL\n")
    fp.write("};\n")


def _writeindex(fp, comment, shiftname, prefix, index):
    # split an index table in two and write both halves plus the shift
    index1, index2, shift = splitbins(index)
    fp.write(comment + "\n")
    fp.write("#define %s %d\n" % (shiftname, shift))
    Array(prefix + "index1", index1).dump(fp)
    Array(prefix + "index2", index2).dump(fp)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000225
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000226# --------------------------------------------------------------------
227# the following support code is taken from the unidb utilities
228# Copyright (c) 1999-2000 by Secret Labs AB
229
230# load a unicode-data file from disk
231
232import string, sys
233
class UnicodeData:
    """Contents of a semicolon-separated unicode database file.

    Public attributes:

    filename -- the path the data was loaded from
    table -- list of 65536 entries indexed by character code; each entry
        is the list of fields from that character's line, or None if the
        file has no line for it
    chars -- the character codes to process (all 65536 by default)
    """

    def __init__(self, filename):
        """Load filename.

        The first field of every line is the character code in hex.
        Blank lines are ignored.  A code outside the 65536-entry table
        raises IndexError; a malformed code raises ValueError.
        """
        table = [None] * 65536
        fp = open(filename)
        try:
            for line in fp.readlines():
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines in the data file
                fields = line.split(";")
                table[int(fields[0], 16)] = fields
        finally:
            # do not leave the input file open
            fp.close()

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        """Restrict the character range to ISO Latin 1 (codes 0-255)."""
        self.chars = range(256)
255
256# stuff to deal with arrays of unsigned integers
257
class Array:
    """A named sequence of unsigned integers that can be dumped as C."""

    def __init__(self, name, data):
        # name: C identifier to use for the array
        # data: the integers to write
        self.name = name
        self.data = data

    def dump(self, file):
        """Write the data to file as a static C array definition.

        The element type is the narrowest unsigned type that getsize()
        reports for the data.  Values are packed onto lines that are
        flushed once adding another value would exceed 78 characters.
        """
        width = getsize(self.data)
        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static " + ctype + " " + self.name + "[] = {\n")
        if self.data:
            line = " "
            for value in self.data:
                text = str(value) + ", "
                if len(line) + len(text) > 78:
                    # current line is full; emit it and start a new one
                    file.write(line + "\n")
                    line = " " + text
                else:
                    line = line + text
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
288
def getsize(data):
    """Return the smallest integer size, in bytes, for the given array.

    The result is 1 if every value is below 256, 2 if every value is
    below 65536, and 4 otherwise.  An empty array needs no storage per
    item, so it gets the smallest size (1) instead of making max() fail.
    """
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    if maxdata < 65536:
        return 2
    return 4
298
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    An empty t yields ([], [], 0).

    If optional arg trace is true (default false), progress info is
    printed to sys.stderr.
    """

    t = tuple(t)    # so slices can be dict keys
    if not t:
        # nothing to split, and getsize() has nothing to measure
        return [], [], 0
    if trace:
        sys.stderr.write(
            "Size of original table: %d bytes\n" % (len(t)*getsize(t)))
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    while n >> 1:
        n >>= 1
        maxshift += 1
    del n
    best = None
    bestsize = None # smallest total size so far
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 1 << shift
        bincache = {}   # chunk of t -> offset of its copy in t2
        for i in range(0, len(t), size):
            chunk = t[i:i+size]
            offset = bincache.get(chunk)
            if offset is None:
                offset = len(t2)
                bincache[chunk] = offset
                t2.extend(chunk)
            t1.append(offset >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace:
            _tracebins(t1, t2, shift, b)
        if bestsize is None or b < bestsize:
            best = t1, t2, shift
            bestsize = b
    t1, t2, shift = best
    if trace:
        sys.stderr.write("Best: ")
        _tracebins(t1, t2, shift, bestsize)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best


def _tracebins(t1, t2, shift, size):
    # report one candidate split on stderr
    sys.stderr.write("%d+%d bins at shift %d; %d bytes\n" % (
        len(t1), len(t2), shift, size))
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000359
# regenerate both header files when run as a script; importing the
# module only defines the functions and classes above
if __name__ == "__main__":
    maketables()