| #! /usr/bin/env python |
| import sys |
| import string |
| import perfect_hash |
| |
| # This is a user of perfect_hash.py |
| # that takes as input the UnicodeData.txt file available from: |
| # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt |
| |
| # It generates a hash table from Unicode Character Name -> |
| # unicode code space value. |
| |
# Seeds handed to perfect_hash.generate_hash(); they determine which hash
# function is tried first.  With the UnicodeData.txt of 2000/06/24 they
# yielded a multiple of 1.7875.
f1Seed, f2Seed = 1694245428, -1917331657

# Initial multiplier (C) for trying to find a perfect hash function.
initC = 1.7875
# Multiplier bound: if this isn't None then, instead of continually
# increasing C, the search resets C back to initC and keeps looking for a
# solution.
# NOTE(review): the previous comment called this the "maximum allowed
# multiplier" although the variable is named minC -- confirm the exact
# comparison against perfect_hash.generate_hash().
minC = 1.7875

# Names used in the generated C code.
moduleName = "ucnhash"
structName = "_Py_UCNHashAPI"
dataArrayName = "aucn"
dataArrayType = "_Py_UnicodeCharacterName"

# Output files; both are named after the generated module.
headerFileName = moduleName + ".h"
cFileName = moduleName + ".c"

# Filled in by main(): `keys` collects (character name, hash code) tuples and
# `hashData` maps a character name to its code space value.
keys = list()
hashData = dict()
| |
def generateOutputFiles(perfHash, hashData):
    """Write the generated C header and C source file to disk.

    The header (headerFileName) receives perfHash.generate_header() followed
    by the typedef of the name/value record.  The C file (cFileName)
    receives an #include of that header, perfHash.generate_code(), whatever
    perfHash.generate_graph() writes, and finally the table of
    { name, value } records, one per entry of the module-level `keys` list
    and in the same order.

    Parameters:
        perfHash -- the object returned by perfect_hash.generate_hash(); only
                    its generate_header(), generate_code() and
                    generate_graph() methods are used here.
        hashData -- dictionary mapping each name stored in `keys` to its
                    integer value.

    Raises KeyError if a name in `keys` has no entry in hashData.
    """
    header = perfHash.generate_header(structName)
    # The record type is spelled via dataArrayType so that it cannot drift
    # away from the type name handed to generate_code() below.
    header = header + """
typedef struct
{
const char *pszUCN;
unsigned int uiValue;
} %s;

""" % dataArrayType

    code = perfHash.generate_code(moduleName,
                                  dataArrayName,
                                  dataArrayType,
                                  structName)

    # try/finally (rather than `with`) keeps this usable on the old
    # interpreters the script was written for, while still guaranteeing that
    # each file is flushed and closed even if a write fails.
    out = open(headerFileName, "w")
    try:
        out.write(header)
    finally:
        out.close()

    out = open(cFileName, "w")
    try:
        out.write("#include <%s>\n" % headerFileName)
        out.write(code)
        perfHash.generate_graph(out)
        out.write("""

static const %s %s[] =
{
""" % (dataArrayType, dataArrayName))
        # Emit the records in `keys` order; the hash code stored next to each
        # name is not needed for the output itself.
        for name, _hashcode in keys:
            out.write(' { "' + name + '", ' + hex(hashData[name]) + " },\n")
        out.write("};\n\n")
    finally:
        out.close()

    sys.stderr.write('\nGenerated output files: \n')
    sys.stderr.write('%s\n%s\n' % (headerFileName, cFileName))
| |
def main():
    """Read UnicodeData.txt (sys.argv[1]) and generate the output files.

    Every line is split on ';'; the first field is the hexadecimal code
    value and the second one the character name.  Names starting with '<'
    (controls and range start/end markers) are skipped.  Every other name is
    uppercased, appended to the module-level `keys` list together with a
    running hash code (0, 1, 2, ...), and recorded in the module-level
    `hashData` dictionary under that same uppercased name, so that the later
    lookup by the names stored in `keys` always succeeds.

    A line with fewer than two fields or with an empty name is reported on
    stderr with its real line number in the input file, and the script exits
    with status 1.  A first field that is not valid hexadecimal raises
    ValueError.
    """
    # Suck in UnicodeData.txt and spit out the generated files.
    source = open(sys.argv[1], 'r')
    lineNumber = 0
    nextHash = 0
    try:
        for line in source:
            lineNumber = lineNumber + 1
            fields = line.split(';')
            name = ''
            if len(fields) >= 2:
                name = fields[1].strip()
            if not name:
                # Too few fields, or nothing usable in the name field.
                sys.stderr.write('Ill-formated line!\n')
                sys.stderr.write('line #: %d\n' % lineNumber)
                sys.exit(1)
            # Any name starting with '<' is a control, or start/end
            # character, so skip it...
            if name[0] == "<":
                continue
            # force the name to uppercase
            name = name.upper()
            keys.append((name, nextHash))
            nextHash = nextHash + 1
            hashData[name] = int(fields[0], 16)
    finally:
        # Also runs when sys.exit() above raises SystemExit.
        source.close()

    sys.stderr.write('%i key/hash pairs read\n' % len(keys))
    perfHash = perfect_hash.generate_hash(keys, 1,
                                          minC, initC,
                                          f1Seed, f2Seed,
                                          # increment, tries
                                          0.0025, 50)
    generateOutputFiles(perfHash, hashData)
| |
if __name__ == '__main__':
    # Without an input file name there is nothing to do: print the usage
    # text straight to stderr (no rebinding of sys.stdout) and exit with a
    # non-zero status so that calling scripts can detect the failure.
    if len(sys.argv) < 2:
        sys.stderr.write('Usage: %s <input filename>\n' % sys.argv[0])
        sys.stderr.write(' The input file needs to be UnicodeData.txt\n')
        sys.exit(2)
    main()
| |