blob: c36fadfedf31876dcb0ea996bca891c5657f8a9c [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
2# makeunidb.py -- generate a compact version of the unicode property
3# database (unicodedatabase.h)
4#
5
6import sys
7
8SCRIPT = sys.argv[0]
9VERSION = "1.0"
10
11UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"
12
13CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
14 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
15 "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
16 "So" ]
17
18BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
19 "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
20 "ON" ]
21
22def maketable():
23
24 unicode = UnicodeData(UNICODE_DATA)
25
26 # extract unicode properties
27 dummy = (0, 0, 0, 0, "NULL")
28 table = [dummy]
29 cache = {0: dummy}
30 index = [0] * len(unicode.chars)
31
32 DECOMPOSITION = [""]
33
34 for char in unicode.chars:
35 record = unicode.table[char]
36 if record:
37 # extract database properties
38 category = CATEGORY_NAMES.index(record[2])
39 combining = int(record[3])
40 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
41 mirrored = record[9] == "Y"
42 if record[5]:
43 decomposition = '"%s"' % record[5]
44 else:
45 decomposition = "NULL"
46 item = (
47 category, combining, bidirectional, mirrored, decomposition
48 )
49 # add entry to index and item tables
50 i = cache.get(item)
51 if i is None:
52 cache[item] = i = len(table)
53 table.append(item)
54 index[char] = i
55
56 # FIXME: we really should compress the decomposition stuff
57 # (see the unidb utilities for one way to do this)
58
59 FILE = "unicodedata_db.h"
60
61 sys.stdout = open(FILE, "w")
62
63 print "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
64 print
65 print "/* a list of unique database records */"
66 print "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
67 for item in table:
68 print " {%d, %d, %d, %d, %s}," % item
69 print "};"
70 print
71
72 print "/* string literals */"
73 print "const char *_PyUnicode_CategoryNames[] = {"
74 for name in CATEGORY_NAMES:
75 print " \"%s\"," % name
76 print " NULL"
77 print "};"
78
79 print "const char *_PyUnicode_BidirectionalNames[] = {"
80 for name in BIDIRECTIONAL_NAMES:
81 print " \"%s\"," % name
82 print " NULL"
83 print "};"
84
85 # split index table
86 index1, index2, shift = splitbins(index)
87
88 print "/* index tables used to find the right database record */"
89 print "#define SHIFT", shift
90 Array("index1", index1).dump(sys.stdout)
91 Array("index2", index2).dump(sys.stdout)
92
93 sys.stdout = sys.__stdout__
94
95# --------------------------------------------------------------------
96# the following support code is taken from the unidb utilities
97# Copyright (c) 1999-2000 by Secret Labs AB
98
99# load a unicode-data file from disk
100
101import string, sys
102
103class UnicodeData:
104
105 def __init__(self, filename):
106 file = open(filename)
107 table = [None] * 65536
108 while 1:
109 s = file.readline()
110 if not s:
111 break
112 s = string.split(string.strip(s), ";")
113 char = string.atoi(s[0], 16)
114 table[char] = s
115
116 # public attributes
117 self.filename = filename
118 self.table = table
119 self.chars = range(65536) # unicode
120
121 def uselatin1(self):
122 # restrict character range to ISO Latin 1
123 self.chars = range(256)
124
125# stuff to deal with arrays of unsigned integers
126
127class Array:
128
129 def __init__(self, name, data):
130 self.name = name
131 self.data = data
132
133 def dump(self, file):
134 # write data to file, as a C array
135 size = getsize(self.data)
136 # print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
137 file.write("static ")
138 if size == 1:
139 file.write("unsigned char")
140 elif size == 2:
141 file.write("unsigned short")
142 else:
143 file.write("unsigned int")
144 file.write(" " + self.name + "[] = {\n")
145 if self.data:
146 s = " "
147 for item in self.data:
148 i = str(item) + ", "
149 if len(s) + len(i) > 78:
150 file.write(s + "\n")
151 s = " " + i
152 else:
153 s = s + i
154 if string.strip(s):
155 file.write(s + "\n")
156 file.write("};\n\n")
157
158def getsize(data):
159 # return smallest possible integer size for the given array
160 maxdata = max(data)
161 if maxdata < 256:
162 return 1
163 elif maxdata < 65536:
164 return 2
165 else:
166 return 4
167
168def splitbins(bins):
169 # split a sparse integer table into two tables, such as:
170 # value = t2[(t1[char>>shift]<<shift)+(char&mask)]
171 # and value == 0 means no data
172 bytes = sys.maxint
173 for shift in range(16):
174 bin1 = []
175 bin2 = []
176 size = 2**shift
177 bincache = {}
178 for i in range(0, len(bins), size):
179 bin = bins[i:i+size]
180 index = bincache.get(tuple(bin))
181 if index is None:
182 index = len(bin2)
183 bincache[tuple(bin)] = index
184 for v in bin:
185 if v is None:
186 bin2.append(0)
187 else:
188 bin2.append(v)
189 bin1.append(index>>shift)
190 # determine memory size
191 b = len(bin1)*getsize(bin1) + len(bin2)*getsize(bin2)
192 if b < bytes:
193 best = shift, bin1, bin2
194 bytes = b
195 shift, bin1, bin2 = best
196## print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
197## len(bin1), len(bin2), shift, bytes
198## )
199 return bin1, bin2, shift
200
201if __name__ == "__main__":
202 maketable()