| # |
| # makeunidb.py -- generate a compact version of the unicode property |
| # database (unicodedatabase.h) |
| # |
| |
| import sys |
| |
# script name and generator version; both are written into the banner
# comment at the top of the generated header
SCRIPT = sys.argv[0]
VERSION = "1.0"

# UnicodeData source file read by maketable()
UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"

# General category codes.  The position of a name in this list is the
# numeric code stored in the generated records, and the list itself is
# emitted as a C string table, so the order must not change.  "Cn"
# appears twice (positions 0 and 17); list.index() always resolves it
# to position 0, the second entry only keeps the later positions stable.
CATEGORY_NAMES = [
    "Cn",
    "Lu", "Ll", "Lt",
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Cs", "Co", "Cn",
    "Lm", "Lo",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    "Sm", "Sc", "Sk", "So",
]

# Bidirectional category codes, indexed the same way.  Position 0 is the
# empty string, so an empty field in the data file maps to code 0.
BIDIRECTIONAL_NAMES = [
    "",
    "L", "LRE", "LRO",
    "R", "AL", "RLE", "RLO",
    "PDF",
    "EN", "ES", "ET", "AN", "CS",
    "NSM", "BN",
    "B", "S", "WS", "ON",
]
| |
def maketable(unicode_data=None, outfile=None):
    """Generate the compact unicode property header.

    Reads the UnicodeData file, collapses the per-character properties
    into a list of unique records, and writes a C header containing the
    record list, the category/bidirectional name tables and a two-level
    index that maps a character to its record.

    unicode_data -- path of the UnicodeData file; defaults to the
        module-level UNICODE_DATA.
    outfile -- path of the header to write; defaults to
        "unicodedata_db.h" in the current directory.

    Raises ValueError (from list.index) if the data file uses a
    category or bidirectional code missing from the name tables.
    """
    if unicode_data is None:
        unicode_data = UNICODE_DATA
    if outfile is None:
        outfile = "unicodedata_db.h"

    unidata = UnicodeData(unicode_data)

    # extract unicode properties
    #
    # record 0 is the "no data" record; the cache maps a record to its
    # position in the table, so the dummy must be registered under
    # position 0 (the original had key and value swapped, which would
    # have duplicated the dummy if a real record happened to equal it)
    dummy = (0, 0, 0, 0, "NULL")
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * len(unidata.chars)

    for char in unidata.chars:
        record = unidata.table[char]
        if not record:
            # unassigned: keep pointing at the dummy record
            continue
        # extract database properties
        category = CATEGORY_NAMES.index(record[2])
        combining = int(record[3])
        bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
        mirrored = record[9] == "Y"
        if record[5]:
            # emitted verbatim as a C string literal
            decomposition = '"%s"' % record[5]
        else:
            decomposition = "NULL"
        item = (
            category, combining, bidirectional, mirrored, decomposition
            )
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    # FIXME: we really should compress the decomposition stuff
    # (see the unidb utilities for one way to do this)

    # split index table (done before the output file is opened, so a
    # failure here does not leave a truncated header behind)
    index1, index2, shift = splitbins(index)

    # write straight to the file instead of rebinding sys.stdout; this
    # leaves the caller's stdout alone and lets us guarantee the file
    # is closed even if writing fails
    fp = open(outfile, "w")
    try:
        write = fp.write

        write("/* this file was generated by %s %s */\n" % (SCRIPT, VERSION))
        write("\n")
        write("/* a list of unique database records */\n")
        write("const _PyUnicode_DatabaseRecord "
              "_PyUnicode_Database_Records[] = {\n")
        for item in table:
            write(" {%d, %d, %d, %d, %s},\n" % item)
        write("};\n")
        write("\n")

        write("/* string literals */\n")
        write("const char *_PyUnicode_CategoryNames[] = {\n")
        for name in CATEGORY_NAMES:
            write(" \"%s\",\n" % name)
        write(" NULL\n")
        write("};\n")

        write("const char *_PyUnicode_BidirectionalNames[] = {\n")
        for name in BIDIRECTIONAL_NAMES:
            write(" \"%s\",\n" % name)
        write(" NULL\n")
        write("};\n")

        write("/* index tables used to find the right database record */\n")
        write("#define SHIFT %s\n" % shift)
        Array("index1", index1).dump(fp)
        Array("index2", index2).dump(fp)
    finally:
        fp.close()
| |
| # -------------------------------------------------------------------- |
| # the following support code is taken from the unidb utilities |
| # Copyright (c) 1999-2000 by Secret Labs AB |
| |
| # load a unicode-data file from disk |
| |
| import string, sys |
| |
class UnicodeData:
    """Contents of a UnicodeData-style file, indexed by code point.

    Public attributes:
    filename -- the path that was loaded
    table -- list of 65536 entries; table[char] is the list of
        semicolon-separated fields for that code point, or None if the
        file has no line for it
    chars -- the code points callers should iterate over
    """

    def __init__(self, filename):
        """Load filename; raises ValueError if a line does not start
        with a hexadecimal code point."""
        table = [None] * 65536
        fp = open(filename)
        try:
            while 1:
                line = fp.readline()
                if not line:
                    break
                line = line.strip()
                if not line:
                    # tolerate blank lines instead of failing in int()
                    continue
                fields = line.split(";")
                char = int(fields[0], 16)
                if char >= len(table):
                    # the table only covers 16-bit code points; entries
                    # beyond it are ignored rather than raising
                    # IndexError
                    continue
                table[char] = fields
        finally:
            # the original never closed the file
            fp.close()

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        """Restrict the character range to ISO Latin 1 (0..255)."""
        self.chars = range(256)
| |
| # stuff to deal with arrays of unsigned integers |
| |
class Array:
    """A named sequence of unsigned integers that can be written out
    as a static C array."""

    def __init__(self, name, data):
        # name -- C identifier used for the array
        # data -- sequence of non-negative integers
        self.name = name
        self.data = data

    def dump(self, file):
        """Write the data to file (any object with a write method) as
        a C array definition.

        The element type is the narrowest of unsigned char / short /
        int that getsize() reports for the data.
        """
        # getsize() takes max() of the data, which fails on an empty
        # sequence; an empty array trivially fits the narrowest type
        if self.data:
            size = getsize(self.data)
        else:
            size = 1
        # print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static " + ctype + " " + self.name + "[] = {\n")
        if self.data:
            s = " "
            for item in self.data:
                i = str(item) + ", "
                # start a new output line once this one would pass
                # 78 characters
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = " " + i
                else:
                    s = s + i
            # flush whatever is left on the last, partial line
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")
| |
def getsize(data):
    """Return the smallest integer size, in bytes (1, 2 or 4), able to
    hold every item of the given array of unsigned integers.

    An empty array needs no storage per item, so it reports the
    smallest size instead of failing inside max().
    """
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
| |
def splitbins(bins):
    """Split a sparse integer table into two smaller tables.

    Returns (t1, t2, shift) such that, for every valid position char,

        value = t2[(t1[char>>shift]<<shift)+(char&mask)]

    where mask is (1 << shift) - 1, and value == 0 means no data.
    None entries in bins are stored as 0.  Every shift from 0 to 15 is
    tried and the one with the smallest combined size (as measured by
    getsize) wins; on a tie the smallest shift is kept.

    An empty table yields ([], [], 0).
    """
    if not bins:
        # nothing to split; also avoids asking getsize() about an
        # empty table
        return [], [], 0

    # a None sentinel replaces the sys.maxint starting value, which
    # only exists on Python 2
    best = None
    best_bytes = None
    for shift in range(16):
        size = 1 << shift
        t1 = []
        t2 = []
        seen = {}  # chunk contents -> offset of that chunk in t2
        for start in range(0, len(bins), size):
            chunk = tuple(bins[start:start+size])
            offset = seen.get(chunk)
            if offset is None:
                # first time we see this chunk: append it to t2
                offset = len(t2)
                seen[chunk] = offset
                for v in chunk:
                    if v is None:
                        t2.append(0)
                    else:
                        t2.append(v)
            # offsets are multiples of the chunk size, so only the
            # chunk number needs to be stored
            t1.append(offset >> shift)
        # determine memory size
        nbytes = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if best is None or nbytes < best_bytes:
            best = t1, t2, shift
            best_bytes = nbytes
    ## print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
    ##     len(best[0]), len(best[1]), best[2], best_bytes
    ##     )
    return best
| |
# when run as a script (rather than imported), regenerate the header
if __name__ == "__main__":
    maketable()