blob: 31a0590c50d9a12ebb7edd51232078dac9b57d44 [file] [log] [blame]
Marc-André Lemburgc5bb9c22000-06-28 16:49:29 +00001#! /usr/bin/env python
2import sys
3import string
4import perfect_hash
5
6# This is a user of perfect_hash.py
7# that takes as input the UnicodeData.txt file available from:
8# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
9
10# It generates a hash table from Unicode Character Name ->
11# unicode code space value.
12
# These variables determine which hash function is tried first.
# Yields a multiple of 1.7875 for UnicodeData.txt on 2000/06/24.
f1Seed = 1694245428
f2Seed = -1917331657

# Maximum allowed multiplier: if this isn't None then, instead of continually
# increasing C, the search resets it back to initC to keep searching for
# a solution.
# NOTE(review): the name says "min" while this comment describes an upper
# bound -- confirm against perfect_hash.generate_hash.
minC = 1.7875
# Initial multiplier for trying to find a perfect hash function.
initC = 1.7875

# Names passed to the perfect_hash code generators, and the names of the
# two files written by generateOutputFiles().
moduleName = "ucnhash"
dataArrayName = "aucn"
dataArrayType = "_Py_UnicodeCharacterName"
headerFileName = "ucnhash.h"
cFileName = "ucnhash.c"
structName = "_Py_UCNHashAPI"

# List of (upper-cased character name, sequence number) tuples, in input
# order; filled in by main().
keys = []
# Maps a character name to the integer parsed from the hex code field;
# filled in by main().
hashData = {}
34
def generateOutputFiles(perfHash, hashData):
    """Write the generated C header and C source files.

    perfHash -- object returned by perfect_hash.generate_hash(); its
                generate_header(), generate_code() and generate_graph()
                methods supply the generated text.
    hashData -- mapping from character name to its integer value.

    The header (headerFileName) receives the generated header text followed
    by the typedef of the name/value record.  The C file (cFileName)
    receives an #include of the header, the generated code, the graph
    written by perfHash.generate_graph(), and one table row per entry of
    the module-level ``keys`` list.  Both files are closed before
    returning, even if writing fails part-way.
    """
    # The record type and array name come from the same constants that are
    # handed to generate_code(), so the emitted definitions cannot drift
    # apart from the names given to the generator.
    typedef = """
typedef struct
{
 const char *pszUCN;
 unsigned int uiValue;
} %s;

""" % dataArrayType
    header = perfHash.generate_header(structName) + typedef

    code = perfHash.generate_code(moduleName,
                                  dataArrayName,
                                  dataArrayType,
                                  structName)

    out = open(headerFileName, "w")
    try:
        out.write(header)
    finally:
        out.close()

    out = open(cFileName, "w")
    try:
        out.write("#include \"%s\"\n" % headerFileName)
        out.write(code)
        perfHash.generate_graph(out)
        out.write("\n\nstatic const %s %s[] =\n{\n"
                  % (dataArrayType, dataArrayName))
        # One row per key, in the order the keys were collected.
        for entry in keys:
            name = entry[0]
            out.write(' { "' + name + '", ' + hex(hashData[name]) + " },\n")
        out.write("};\n\n")
    finally:
        out.close()

    sys.stderr.write('\nGenerated output files: \n')
    sys.stderr.write('%s\n%s\n' % (headerFileName, cFileName))
67
def main():
    """Read the UnicodeData.txt file named by sys.argv[1] and generate output.

    Each input line is split on ';'.  The first field is the code value in
    hexadecimal and the second is the character name.  Names starting with
    '<' are skipped.  Every other name is upper-cased, appended to the
    module-level ``keys`` list together with a running sequence number, and
    recorded in ``hashData`` under that same upper-cased name.

    A line with fewer than two fields, or with an empty name, is reported
    on stderr with its 1-based line number and the script exits with a
    non-zero status.  A code field that is not valid hexadecimal raises
    ValueError.  After reading, a perfect hash is generated from ``keys``
    and the output files are written.
    """
    # Suck in UnicodeData.txt and spit out the generated files.
    infile = open(sys.argv[1], 'r')
    try:
        lineNumber = 0      # physical line in the input, for diagnostics
        hashcode = 0        # sequence number of the next accepted name
        while 1:
            line = infile.readline()
            if line == "":
                break
            lineNumber = lineNumber + 1
            fields = line.split(';')
            name = ""
            if len(fields) >= 2:
                name = fields[1].strip()
            if not name:
                sys.stderr.write('Ill-formated line!\n')
                sys.stderr.write('line #: %d\n' % lineNumber)
                sys.exit(1)
            # Any name starting with '<' is a control, or start/end
            # character, so skip it...
            if name[0] == "<":
                continue
            # Force the name to uppercase, and use that one spelling for
            # both containers so later hashData lookups by key succeed.
            name = name.upper()
            value = int(fields[0], 16)
            keys.append((name, hashcode))
            hashData[name] = value
            hashcode = hashcode + 1
    finally:
        infile.close()

    sys.stderr.write('%i key/hash pairs read\n' % len(keys))
    # The second argument (1) is presumably the case-insensitivity flag --
    # confirm against perfect_hash.generate_hash.
    perfHash = perfect_hash.generate_hash(keys, 1,
                                          minC, initC,
                                          f1Seed, f2Seed,
                                          # increment, tries
                                          0.0025, 50)
    generateOutputFiles(perfHash, hashData)
101
if __name__ == '__main__':
    # Script entry point: with an input filename, run the generator;
    # with no arguments, show the usage text on stderr and exit.
    if len(sys.argv) != 1:
        main()
    else:
        # Route the usage text to stderr by rebinding stdout.
        sys.stdout = sys.stderr
        sys.stdout.write('Usage: %s <input filename>' % sys.argv[0] + '\n')
        sys.stdout.write(' The input file needs to be UnicodeData.txt' + '\n')
        sys.exit()
109