blob: 4781ec4f5bd592cab93f1212d0aa525a4702a41c [file] [log] [blame]
#
# generate a compact version of the unicode property database
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#
10
11import sys
12
# script name and version, echoed into the header of the generated file
SCRIPT = sys.argv[0]
VERSION = "1.1"

# input file, relative to the current directory
UNICODE_DATA = "../UnicodeData-Latest.txt"

# General category abbreviations.  The position of a name in this list is
# the number stored in the generated records, so the order must not change.
# "Cn" appears twice; list.index() always yields the first slot (0), which
# is also the category of the all-zero dummy record.
CATEGORY_NAMES = [
    "Cn",
    "Lu", "Ll", "Lt",
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Cs", "Co", "Cn",
    "Lm", "Lo",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    "Sm", "Sc", "Sk", "So",
]

# Bidirectional category abbreviations, indexed the same way.  Slot 0 is
# the empty string.
BIDIRECTIONAL_NAMES = [
    "",
    "L", "LRE", "LRO",
    "R", "AL", "RLE", "RLO",
    "PDF",
    "EN", "ES", "ET", "AN", "CS",
    "NSM", "BN",
    "B", "S", "WS", "ON",
]
26
def maketable():
    """Generate the C header "unicodedata_db.h" from the UnicodeData file.

    Loads UNICODE_DATA, reduces every character that has a record to a
    (category, combining, bidirectional, mirrored) tuple, and writes to
    "unicodedata_db.h" in the current directory:

      - the list of unique property records,
      - the category and bidirectional name tables,
      - the list of unique decomposition strings,
      - two-level index tables (built by splitbins) that map a character
        to its property record and to its decomposition string.

    The header is written directly to the file object instead of through
    a temporarily replaced sys.stdout; the file is closed even when an
    error occurs, and sys.stdout is never touched.
    """

    db = UnicodeData(UNICODE_DATA)

    # 1) database properties

    # record 0 is the all-zero dummy used for characters without data
    dummy = (0, 0, 0, 0)
    table = [dummy]
    # maps record -> position in table; the dummy is registered under its
    # own value so an identical record reuses slot 0 instead of being
    # appended a second time
    cache = {dummy: 0}
    index = [0] * len(db.chars)

    for char in db.chars:
        record = db.table[char]
        if record:
            # extract database properties
            item = (
                CATEGORY_NAMES.index(record[2]),
                int(record[3]),
                BIDIRECTIONAL_NAMES.index(record[4]),
                record[9] == "Y",
            )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    # FIXME: <fl> using the encoding stuff from unidb would save
    # another 50k or so, but I'll leave that for 2.1...

    # slot 0 is the empty string, shared by all characters that have no
    # decomposition
    decomp_data = [""]
    decomp_cache = {}  # decomposition string -> position in decomp_data
    decomp_index = [0] * len(db.chars)

    for char in db.chars:
        record = db.table[char]
        if record:
            decomposition = record[5]
            if decomposition:
                # a dictionary lookup replaces a linear list search here
                i = decomp_cache.get(decomposition)
                if i is None:
                    decomp_cache[decomposition] = i = len(decomp_data)
                    decomp_data.append(decomposition)
            else:
                i = 0
            decomp_index[char] = i

    # split both index tables before any output is produced
    index1, index2, shift = splitbins(index)
    decomp_index1, decomp_index2, decomp_shift = splitbins(decomp_index)

    FILE = "unicodedata_db.h"

    out = open(FILE, "w")
    try:
        emit = out.write

        emit("/* this file was generated by %s %s */\n" % (SCRIPT, VERSION))
        emit("\n")
        emit("/* a list of unique database records */\n")
        emit("const _PyUnicode_DatabaseRecord "
             "_PyUnicode_Database_Records[] = {\n")
        for item in table:
            emit(" {%d, %d, %d, %d},\n" % item)
        emit("};\n")
        emit("\n")

        emit("/* string literals */\n")
        _write_string_table(
            out, "const char *_PyUnicode_CategoryNames[] = {",
            CATEGORY_NAMES)
        _write_string_table(
            out, "const char *_PyUnicode_BidirectionalNames[] = {",
            BIDIRECTIONAL_NAMES)
        _write_string_table(
            out, "static const char *decomp_data[] = {",
            decomp_data)

        emit("/* index tables used to find the right database record */\n")
        emit("#define SHIFT %d\n" % shift)
        Array("index1", index1).dump(out)
        Array("index2", index2).dump(out)

        emit("/* same, for the decomposition data */\n")
        emit("#define DECOMP_SHIFT %d\n" % decomp_shift)
        Array("decomp_index1", decomp_index1).dump(out)
        Array("decomp_index2", decomp_index2).dump(out)
    finally:
        out.close()


def _write_string_table(out, declaration, strings):
    # write a NULL-terminated C array of string literals to the file out;
    # declaration is the opening line of the array, up to and including "{"
    out.write(declaration + "\n")
    for s in strings:
        out.write(" \"%s\",\n" % s)
    out.write(" NULL\n")
    out.write("};\n")
126
127# --------------------------------------------------------------------
128# the following support code is taken from the unidb utilities
129# Copyright (c) 1999-2000 by Secret Labs AB
130
131# load a unicode-data file from disk
132
133import string, sys
134
class UnicodeData:
    """In-memory copy of a UnicodeData file, indexed by code point.

    Public attributes:
      filename -- the path that was loaded
      table    -- list of 65536 entries; table[code] is the list of
                  ";"-separated fields of the line for that code point,
                  or None if the file has no line for it
      chars    -- the code points to process: 0..65535, or 0..255 after
                  uselatin1() has been called
    """

    def __init__(self, filename):
        """Load the file named *filename*.

        Each line is split on ";" and stored under the code point given,
        in hexadecimal, by its first field.  Blank lines are skipped.
        The file is closed before returning, also on error.

        Raises ValueError if a first field is not a hexadecimal number
        and IndexError if a code point does not fit the 65536-entry table.
        """
        table = [None] * 65536
        fp = open(filename)
        try:
            while 1:
                line = fp.readline()
                if not line:
                    break
                line = line.strip()
                if not line:
                    # tolerate empty lines instead of failing on int("")
                    continue
                fields = line.split(";")
                table[int(fields[0], 16)] = fields
        finally:
            fp.close()

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        """Restrict the character range to ISO Latin 1 (0..255)."""
        self.chars = range(256)
156
157# stuff to deal with arrays of unsigned integers
158
class Array:
    """A named table of unsigned integers that can be dumped as a C array."""

    def __init__(self, name, data):
        # name: identifier of the C array; data: sequence of integers
        self.name = name
        self.data = data

    def dump(self, file):
        """Write the table to *file* as a static C array definition.

        The element type is the narrowest unsigned C type that getsize()
        reports for the data.  Values are packed onto lines so that no
        line grows beyond 78 characters.
        """
        narrow_types = {1: "unsigned char", 2: "unsigned short"}
        ctype = narrow_types.get(getsize(self.data), "unsigned int")
        file.write("static " + ctype + " " + self.name + "[] = {\n")
        if self.data:
            line = " "
            for value in self.data:
                text = str(value) + ", "
                if len(line) + len(text) <= 78:
                    line = line + text
                else:
                    # current line is full: flush it and start a new one
                    file.write(line + "\n")
                    line = " " + text
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
189
def getsize(data):
    """Return the smallest integer size, in bytes, for the given array.

    data is a sequence of non-negative integers.  The result is 1 if
    every value is below 256, 2 if every value is below 65536, and 4
    otherwise.  An empty sequence needs no storage per item and yields 1
    instead of failing in max().
    """
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
199
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints, typically with many repeated values.  The
    table is cut into bins of 2**shift items; each distinct bin is stored
    once in t2, and t1 records for every bin where its copy starts in t2
    (divided by the bin size).  Every shift that leaves at least one bin
    is tried, and the one with the smallest combined size of t1 and t2,
    as measured by getsize(), is returned.  For each i in range(len(t)),

        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is true (default false), progress info is
    written to sys.stderr.
    """

    import sys

    def report(first, second, nbits, nbytes):
        sys.stderr.write("%d+%d bins at shift %d; %d bytes\n" % (
            len(first), len(second), nbits, nbytes))

    if trace:
        sys.stderr.write("Size of original table: %d bytes\n" % (
            len(t)*getsize(t)))

    # the most we can shift the last valid index and still have
    # something left
    maxshift = 0
    last = len(t) - 1
    while last > 1:
        last = last >> 1
        maxshift = maxshift + 1

    t = tuple(t)        # so slices can be dict keys
    best = None         # best (t1, t2, shift) found so far
    smallest = None     # its total size in bytes
    for shift in range(maxshift + 1):
        size = 1 << shift
        t1 = []
        t2 = []
        seen = {}       # bin contents -> offset of its copy in t2
        for start in range(0, len(t), size):
            chunk = t[start:start+size]
            offset = seen.get(chunk)
            if offset is None:
                offset = len(t2)
                seen[chunk] = offset
                t2.extend(chunk)
            t1.append(offset >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace:
            report(t1, t2, shift, b)
        # strict comparison: on a tie the smaller shift wins
        if smallest is None or b < smallest:
            best = t1, t2, shift
            smallest = b

    t1, t2, shift = best
    if trace:
        sys.stderr.write("Best: ")
        report(t1, t2, shift, smallest)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1     # low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000260
# when run as a script, generate the header file
if __name__ == "__main__":
    maketable()