blob: f2e6dc86b34f5c32375aff8f59fbf18e3780d860 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
# makeunidb.py -- generate a compact version of the unicode property
# database (unicodedatabase.h); the output is written to unicodedata_db.h
4#
5
6import sys
7
# name and version of this script, quoted in the generated file's header
SCRIPT = sys.argv[0]
VERSION = "1.0"

# UnicodeData file to read (absolute, machine-specific path)
UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"

# General category codes.  A database record stores the position of its
# code in this list, so the order must not change.
# NOTE: "Cn" is listed twice (positions 0 and 17); lookups done with
# list.index() always resolve to position 0.
CATEGORY_NAMES = [
    "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs",
    "Co", "Cn", "Lm", "Lo", "Pc", "Pd", "Ps", "Pe",
    "Pi", "Pf", "Po", "Sm", "Sc", "Sk", "So"
]

# Bidirectional class codes.  As above, a record stores the position of
# its code; position 0 is the empty string.
BIDIRECTIONAL_NAMES = [
    "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN",
    "B", "S", "WS", "ON"
]
21
def maketable():
    """Generate the compact property database as C source.

    Loads UNICODE_DATA, reduces every character that has an entry to a
    (category, combining, bidirectional, mirrored, decomposition) record,
    keeps one copy of each distinct record, and writes the record table,
    the category/bidirectional name tables and a two-level index into the
    record table to "unicodedata_db.h" in the current directory.

    Raises ValueError if a record uses a category or bidirectional code
    that is not listed in CATEGORY_NAMES / BIDIRECTIONAL_NAMES.
    """

    unicode = UnicodeData(UNICODE_DATA)

    # extract unicode properties
    dummy = (0, 0, 0, 0, "NULL")
    table = [dummy]
    # maps record -> position in table; the dummy record lives at 0, so a
    # real record with the same values reuses it instead of being added
    # a second time
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            if record[5]:
                decomposition = '"%s"' % record[5]
            else:
                decomposition = "NULL"
            item = (
                category, combining, bidirectional, mirrored, decomposition
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # FIXME: we really should compress the decomposition stuff
    # (see the unidb utilities for one way to do this)

    # split index table (done before the output file is created, so a
    # failure here does not leave a truncated header behind)
    index1, index2, shift = splitbins(index)

    FILE = "unicodedata_db.h"

    # write straight to the file instead of rebinding sys.stdout, and
    # make sure the file is closed even if something goes wrong
    out = open(FILE, "w")
    try:
        write = out.write

        write("/* this file was generated by %s %s */\n" % (SCRIPT, VERSION))
        write("\n")
        write("/* a list of unique database records */\n")
        write("const _PyUnicode_DatabaseRecord "
              "_PyUnicode_Database_Records[] = {\n")
        for item in table:
            write(" {%d, %d, %d, %d, %s},\n" % item)
        write("};\n")
        write("\n")

        write("/* string literals */\n")
        write("const char *_PyUnicode_CategoryNames[] = {\n")
        for name in CATEGORY_NAMES:
            write(" \"%s\",\n" % name)
        write(" NULL\n")
        write("};\n")

        write("const char *_PyUnicode_BidirectionalNames[] = {\n")
        for name in BIDIRECTIONAL_NAMES:
            write(" \"%s\",\n" % name)
        write(" NULL\n")
        write("};\n")

        write("/* index tables used to find the right database record */\n")
        write("#define SHIFT %d\n" % shift)
        Array("index1", index1).dump(out)
        Array("index2", index2).dump(out)
    finally:
        out.close()
94
95# --------------------------------------------------------------------
96# the following support code is taken from the unidb utilities
97# Copyright (c) 1999-2000 by Secret Labs AB
98
99# load a unicode-data file from disk
100
101import string, sys
102
class UnicodeData:
    """In-memory copy of a unicode-data file.

    Public attributes:

    filename -- the path the data was loaded from
    table -- list of 65536 entries indexed by character code; an entry is
        the list of ";"-separated fields of that character's line, or
        None if the file has no line for the character
    chars -- the character codes to process (all 65536 unless
        uselatin1() has been called)
    """

    def __init__(self, filename):
        """Load filename; each line starts with a hexadecimal code.

        Blank lines are skipped.  Raises ValueError if the first field is
        not a hexadecimal number and IndexError if it is 65536 or more.
        """
        table = [None] * 65536
        infile = open(filename)
        try:
            while 1:
                line = infile.readline()
                if not line:
                    break
                line = line.strip()
                if not line:
                    # nothing to parse on an empty line
                    continue
                fields = line.split(";")
                char = int(fields[0], 16)
                table[char] = fields
        finally:
            # release the file handle even if a line fails to parse
            infile.close()

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        """Restrict the character range to ISO Latin 1 (codes 0-255)."""
        self.chars = range(256)
124
125# stuff to deal with arrays of unsigned integers
126
class Array:
    """A named array of unsigned integers that can be written out as C."""

    def __init__(self, name, data):
        # name: identifier used for the C array
        # data: the sequence of integers to emit
        self.name = name
        self.data = data

    def dump(self, file):
        """Write the data to file as a static C array definition.

        The element type is chosen from the size reported by getsize():
        unsigned char for 1, unsigned short for 2, unsigned int otherwise.
        Items are packed onto lines of at most 78 characters.
        """
        ctypes = {1: "unsigned char", 2: "unsigned short"}
        ctype = ctypes.get(getsize(self.data), "unsigned int")
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = " "
            for item in self.data:
                text = str(item) + ", "
                if len(line) + len(text) <= 78:
                    line = line + text
                else:
                    # current line is full: flush it and start a new one
                    file.write(line + "\n")
                    line = " " + text
            # flush whatever is left, unless it is only the indent
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
157
def getsize(data):
    """Return the smallest integer size, in bytes, for the given array.

    The result is 1 if every item is below 256, 2 if every item is below
    65536, and 4 otherwise.  An empty array needs no more than the
    smallest size, so it yields 1 instead of failing in max().
    """
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
167
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    An empty t yields ([], [], 0).

    If optional arg trace is true (default false), progress info is
    printed to sys.stderr.
    """

    import sys

    def report(t1, t2, shift, nbytes):
        # one line of progress info for a candidate split
        sys.stderr.write("%d+%d bins at shift %d; %d bytes\n" % (
            len(t1), len(t2), shift, nbytes))

    t = tuple(t)  # so slices can be dict keys
    if not t:
        # nothing to split, and getsize() cannot be asked about an
        # empty table
        return [], [], 0

    if trace:
        sys.stderr.write("Size of original table: %d bytes\n"
                         % (len(t) * getsize(t)))

    n = len(t) - 1  # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n

    best = None        # (t1, t2, shift) of the smallest split so far
    best_bytes = None  # its total size
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2 ** shift
        bincache = {}
        for i in range(0, len(t), size):
            chunk = t[i:i + size]
            index = bincache.get(chunk)
            if index is None:
                # first time this run of values is seen: store it in t2
                index = len(t2)
                bincache[chunk] = index
                t2.extend(chunk)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1) * getsize(t1) + len(t2) * getsize(t2)
        if trace:
            report(t1, t2, shift, b)
        if best is None or b < best_bytes:
            best = t1, t2, shift
            best_bytes = b

    t1, t2, shift = best
    if trace:
        sys.stderr.write("Best: ")
        report(t1, t2, shift, best_bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)  # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000228
# when run as a script (rather than imported), regenerate the database
if __name__ == "__main__":
    maketable()