blob: bf857398b4dadc0b36781d52803c9714db7526bd [file] [log] [blame]
Marc-André Lemburg4e5302a2000-06-28 16:53:16 +00001#!/usr/bin/env/python
2
3# perfect_hash.py
4#
5# Outputs C code for a minimal perfect hash.
6# The hash is produced using the algorithm described in
7# "Optimal algorithms for minimal perfect hashing",
8# G. Havas, B.S. Majewski. Available as a technical report
9# from the CS department, University of Queensland
10# (ftp://ftp.cs.uq.oz.au/).
11#
12# This is a modified version of Andrew Kuchling's code
13# (http://starship.python.net/crew/amk/python/code/perfect-hash.html)
14# and generates C fragments suitable for compilation as a Python
15# extension module.
16#
17
18# Difference between this algorithm and gperf:
19# Gperf will complete in finite time with a successful function,
20# or by giving up.
21# This algorithm may never complete, although it is extremely likely
22# when c >= 2.
23
24# The algorithm works like this:
25# 0) You have K keys, that you want to perfectly hash to a bunch
26# of hash values.
27#
28# 1) Choose a number N larger than K. This is the number of
29# vertices in a graph G, and also the size of the resulting table.
30#
31# 2) Pick two random hash functions f1, f2, that output values from
32# 0...N-1.
33#
34# 3) for key in keys:
35# h1 = f1(key) ; h2 = f2(key)
36# Draw an edge between vertices h1 and h2 of the graph.
37# Associate the desired hash value with that edge.
38#
39# 4) Check if G is acyclic; if not, go back to step 1 and pick a bigger N.
40#
41# 5) Assign values to each vertex such that, for each edge, you can
42# add the values for the two vertices and get the desired value
43# for that edge -- which is the desired hash key. This task is
44# dead easy, because the graph is acyclic. This is done by
45# picking a vertex V, and assigning it a value of 0. You then do a
46# depth-first search, assigning values to new vertices so that
47# they sum up properly.
48#
49# 6) f1, f2, and G now make up your perfect hash function.
50
51
52import sys, whrandom, string
53import pprint
54import perfhash
55import time
56
57class Hash:
58 """Random hash function
59 For simplicity and speed, this doesn't implement any byte-level hashing
60 scheme. Instead, a random string is generated and prefixing to
61 str(key), and then Python's hashing function is used."""
62
63 def __init__(self, N, caseInsensitive=0):
64 self.N = N
65 junk = ""
66 for i in range(10):
67 junk = junk + whrandom.choice(string.letters + string.digits)
68 self.junk = junk
69 self.caseInsensitive = caseInsensitive
70 self.seed = perfhash.calcSeed(junk)
71
72 def __call__(self, key):
73 key = str(key)
74 if self.caseInsensitive:
75 key = string.upper(key)
76 x = perfhash.hash(self.seed, len(self.junk), key) % self.N
77 #h = hash(self.junk + key) % self.N
78 #assert x == h
79 return x
80
81 def generate_code(self):
82 s = """{
83 register int len;
84 register unsigned char *p;
85 register long x;
86
87 len = cch;
88 p = (unsigned char *) key;
89 x = %(junkSeed)d;
90 while (--len >= 0)
91 x = (1000003*x) ^ """ % \
92 {
93 "lenJunk" : len(self.junk),
94 "junkSeed" : self.seed,
95 }
96
97 if self.caseInsensitive:
98 s = s + "toupper(*(p++));"
99 else:
100 s = s + "*(p++);"
101 s = s + """
102 x ^= cch + %(lenJunk)d;
103 if (x == -1)
104 x = -2;
105 x %%= k_cHashElements;
106 /* ensure the returned value is positive so we mimic Python's %% operator */
107 if (x < 0)
108 x += k_cHashElements;
109 return x;
110}
111""" % { "lenJunk" : len(self.junk),
112 "junkSeed" : self.seed, }
113 return s
114
115
116WHITE, GREY, BLACK = 0,1,2
117class Graph:
118 """Graph class. This class isn't particularly efficient or general,
119 and only has the features I needed to implement this algorithm.
120
121 num_vertices -- number of vertices
122 edges -- maps 2-tuples of vertex numbers to the value for this
123 edge. If there's an edge between v1 and v2 (v1<v2),
124 (v1,v2) is a key and the value is the edge's value.
125 reachable_list -- maps a vertex V to the list of vertices
126 to which V is connected by edges. Used
127 for traversing the graph.
128 values -- numeric value for each vertex
129 """
130
131 def __init__(self, num_vertices):
132 self.num_vertices = num_vertices
133 self.edges = {}
134 self.reachable_list = {}
135 self.values = [-1] * num_vertices
136
137 def connect(self, vertex1, vertex2, value):
138 """Connect 'vertex1' and 'vertex2' with an edge, with associated
139 value 'value'"""
140
141 if vertex1 > vertex2: vertex1, vertex2 = vertex2, vertex1
142# if self.edges.has_key( (vertex1, vertex2) ):
143# raise ValueError, 'Collision: vertices already connected'
144 self.edges[ (vertex1, vertex2) ] = value
145
146 # Add vertices to each other's reachable list
147 if not self.reachable_list.has_key( vertex1 ):
148 self.reachable_list[ vertex1 ] = [vertex2]
149 else:
150 self.reachable_list[vertex1].append(vertex2)
151
152 if not self.reachable_list.has_key( vertex2 ):
153 self.reachable_list[ vertex2 ] = [vertex1]
154 else:
155 self.reachable_list[vertex2].append(vertex1)
156
157 def get_edge_value(self, vertex1, vertex2):
158 """Retrieve the value corresponding to the edge between
159 'vertex1' and 'vertex2'. Raises KeyError if no such edge"""
160 if vertex1 > vertex2:
161 vertex1, vertex2 = vertex2, vertex1
162 return self.edges[ (vertex1, vertex2) ]
163
164 def is_acyclic(self):
165 "Returns true if the graph is acyclic, otherwise false"
166
167 # This is done by doing a depth-first search of the graph;
168 # painting each vertex grey and then black. If the DFS
169 # ever finds a vertex that isn't white, there's a cycle.
170 colour = {}
171 for i in range(self.num_vertices): colour[i] = WHITE
172
173 # Loop over all vertices, taking white ones as starting
174 # points for a traversal.
175 for i in range(self.num_vertices):
176 if colour[i] == WHITE:
177
178 # List of vertices to visit
179 visit_list = [ (None,i) ]
180
181 # Do a DFS
182 while visit_list:
183 # Colour this vertex grey.
184 parent, vertex = visit_list[0] ; del visit_list[0]
185 colour[vertex] = GREY
186
187 # Make copy of list of neighbours, removing the vertex
188 # we arrived here from.
189 neighbours = self.reachable_list.get(vertex, []) [:]
190 if parent in neighbours: neighbours.remove( parent )
191
192 for neighbour in neighbours:
193 if colour[neighbour] == WHITE:
194 visit_list.insert(0, (vertex, neighbour) )
195 elif colour[neighbour] != WHITE:
196 # Aha! Already visited this node,
197 # so the graph isn't acyclic.
198 return 0
199
200 colour[vertex] = BLACK
201
202 # We got through, so the graph is acyclic.
203 return 1
204
205 def assign_values(self):
206 """Compute values for each vertex, so that they sum up
207 properly to the associated value for each edge."""
208
209 # Also done with a DFS; I simply copied the DFS code
210 # from is_acyclic(). (Should generalize the logic so
211 # one function could be used from both methods,
212 # but I couldn't be bothered.)
213
214 colour = {}
215 for i in range(self.num_vertices): colour[i] = WHITE
216
217 # Loop over all vertices, taking white ones as starting
218 # points for a traversal.
219 for i in range(self.num_vertices):
220 if colour[i] == WHITE:
221 # Set this vertex's value, arbitrarily, to zero.
222 self.set_vertex_value( i, 0 )
223
224 # List of vertices to visit
225 visit_list = [ (None,i) ]
226
227 # Do a DFS
228 while visit_list:
229 # Colour this vertex grey.
230 parent, vertex = visit_list[0] ; del visit_list[0]
231 colour[vertex] = GREY
232
233 # Make copy of list of neighbours, removing the vertex
234 # we arrived here from.
235 neighbours = self.reachable_list.get(vertex, []) [:]
236 if parent in neighbours: neighbours.remove( parent )
237
238 for neighbour in self.reachable_list.get(vertex, []):
239 edge_value = self.get_edge_value( vertex, neighbour )
240 if colour[neighbour] == WHITE:
241 visit_list.insert(0, (vertex, neighbour) )
242
243 # Set new vertex's value to the desired
244 # edge value, minus the value of the
245 # vertex we came here from.
246 new_val = (edge_value -
247 self.get_vertex_value( vertex ) )
248 self.set_vertex_value( neighbour,
249 new_val % self.num_vertices)
250
251 colour[vertex] = BLACK
252
253 # Returns nothing
254 return
255
256 def __getitem__(self, index):
257 if index < self.num_vertices: return index
258 raise IndexError
259
260 def get_vertex_value(self, vertex):
261 "Get value for a vertex"
262 return self.values[ vertex ]
263
264 def set_vertex_value(self, vertex, value):
265 "Set value for a vertex"
266 self.values[ vertex ] = value
267
268 def generate_code(self, out, width = 70):
269 "Return nicely formatted table"
270 out.write("{ ")
271 pos = 0
272 for v in self.values:
273 v=str(v)+', '
274 out.write(v)
275 pos = pos + len(v) + 1
276 if pos > width: out.write('\n '); pos = 0
277 out.write('};\n')
278
279
280class PerfectHash:
281 def __init__(self, cchMax, f1, f2, G, cHashElements, cKeys, maxHashValue):
282 self.cchMax = cchMax
283 self.f1 = f1
284 self.f2 = f2
285 self.G = G
286 self.cHashElements = cHashElements
287 self.cKeys = cKeys
288 # determine the necessary type for storing our hash function
289 # helper table:
290 self.type = self.determineType(maxHashValue)
291
292 def generate_header(self, structName):
293 header = """
Marc-André Lemburge3f257e2000-06-30 09:56:00 +0000294#include "Python.h"
Marc-André Lemburg4e5302a2000-06-28 16:53:16 +0000295#include <stdlib.h>
296
297/* --- C API ----------------------------------------------------*/
298/* C API for usage by other Python modules */
299typedef struct %(structName)s
300{
301 unsigned long cKeys;
302 unsigned long cchMax;
303 unsigned long (*hash)(const char *key, unsigned int cch);
304 const void *(*getValue)(unsigned long iKey);
305} %(structName)s;
306""" % { "structName" : structName }
307 return header
308
309 def determineType(self, maxHashValue):
310 if maxHashValue <= 255:
311 return "unsigned char"
312 elif maxHashValue <= 65535:
313 return "unsigned short"
314 else:
315 # Take the cheesy way out...
316 return "unsigned long"
317
318 def generate_code(self, moduleName, dataArrayName, dataArrayType, structName):
319 # Output C code for the hash functions and tables
320 code = """
321/*
322 * The hash is produced using the algorithm described in
323 * "Optimal algorithms for minimal perfect hashing",
324 * G. Havas, B.S. Majewski. Available as a technical report
325 * from the CS department, University of Queensland
326 * (ftp://ftp.cs.uq.oz.au/).
327 *
328 * Generated using a heavily tweaked version of Andrew Kuchling's
329 * perfect_hash.py:
330 * http://starship.python.net/crew/amk/python/code/perfect-hash.html
331 *
332 * Generated on: %s
333 */
334""" % time.ctime(time.time())
335 # MSVC SP3 was complaining when I actually used a global constant
336 code = code + """
337#define k_cHashElements %i
338#define k_cchMaxKey %d
339#define k_cKeys %i
340
341""" % (self.cHashElements, self.cchMax, self.cKeys)
342
343 code = code + """
344static const %s G[k_cHashElements];
345static const %s %s[k_cKeys];
346""" % (self.type, dataArrayType, dataArrayName)
347
348 code = code + """
349static long f1(const char *key, unsigned int cch)
350"""
351 code = code + self.f1.generate_code()
352 code = code + """
353
354static long f2(const char *key, unsigned int cch)
355"""
356 code = code + self.f2.generate_code()
357 code = code + """
358
359static unsigned long hash(const char *key, unsigned int cch)
360{
361 return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) %% k_cHashElements;
362}
363
364const void *getValue(unsigned long iKey)
365{
366 return &%(dataArrayName)s[iKey];
367}
368
369/* Helper for adding objects to dictionaries. Check for errors with
370 PyErr_Occurred() */
371static
372void insobj(PyObject *dict,
373 char *name,
374 PyObject *v)
375{
376 PyDict_SetItemString(dict, name, v);
377 Py_XDECREF(v);
378}
379
380static const %(structName)s hashAPI =
381{
382 k_cKeys,
383 k_cchMaxKey,
384 &hash,
385 &getValue,
386};
387
388static
389PyMethodDef Module_methods[] =
390{
391 {NULL, NULL},
392};
393
394static char *Module_docstring = "%(moduleName)s hash function module";
395
396/* Error reporting for module init functions */
397
398#define Py_ReportModuleInitError(modname) { \\
399 PyObject *exc_type, *exc_value, *exc_tb; \\
400 PyObject *str_type, *str_value; \\
401 \\
402 /* Fetch error objects and convert them to strings */ \\
403 PyErr_Fetch(&exc_type, &exc_value, &exc_tb); \\
404 if (exc_type && exc_value) { \\
405 str_type = PyObject_Str(exc_type); \\
406 str_value = PyObject_Str(exc_value); \\
407 } \\
408 else { \\
409 str_type = NULL; \\
410 str_value = NULL; \\
411 } \\
412 /* Try to format a more informative error message using the \\
413 original error */ \\
414 if (str_type && str_value && \\
415 PyString_Check(str_type) && PyString_Check(str_value)) \\
416 PyErr_Format( \\
417 PyExc_ImportError, \\
418 "initialization of module "modname" failed " \\
419 "(%%s:%%s)", \\
420 PyString_AS_STRING(str_type), \\
421 PyString_AS_STRING(str_value)); \\
422 else \\
423 PyErr_SetString( \\
424 PyExc_ImportError, \\
425 "initialization of module "modname" failed"); \\
426 Py_XDECREF(str_type); \\
427 Py_XDECREF(str_value); \\
428 Py_XDECREF(exc_type); \\
429 Py_XDECREF(exc_value); \\
430 Py_XDECREF(exc_tb); \\
431}
432
433
434/* Create PyMethodObjects and register them in the module\'s dict */
435DL_EXPORT(void)
436init%(moduleName)s(void)
437{
438 PyObject *module, *moddict;
439 /* Create module */
440 module = Py_InitModule4("%(moduleName)s", /* Module name */
441 Module_methods, /* Method list */
442 Module_docstring, /* Module doc-string */
443 (PyObject *)NULL, /* always pass this as *self */
444 PYTHON_API_VERSION); /* API Version */
445 if (module == NULL)
446 goto onError;
447 /* Add some constants to the module\'s dict */
448 moddict = PyModule_GetDict(module);
449 if (moddict == NULL)
450 goto onError;
451
452 /* Export C API */
453 insobj(
454 moddict,
455 "%(moduleName)sAPI",
456 PyCObject_FromVoidPtr((void *)&hashAPI, NULL));
457
458onError:
459 /* Check for errors and report them */
460 if (PyErr_Occurred())
461 Py_ReportModuleInitError("%(moduleName)s");
462 return;
463}
464""" % { "moduleName" : moduleName,
465 "dataArrayName" : dataArrayName,
466 "structName" : structName, }
467
468 return code
469
470 def generate_graph(self, out):
471 out.write("""
472static const unsigned short G[] =
473""")
474 self.G.generate_code(out)
475
476
477def generate_hash(keys, caseInsensitive=0,
478 minC=None, initC=None,
479 f1Seed=None, f2Seed=None,
480 cIncrement=None, cTries=None):
481 """Print out code for a perfect minimal hash. Input is a list of
482 (key, desired hash value) tuples. """
483
484 # K is the number of keys.
485 K = len(keys)
486
487 # We will be generating graphs of size N, where N = c * K.
488 # The larger C is, the fewer trial graphs will need to be made, but
489 # the resulting table is also larger. Increase this starting value
490 # if you're impatient. After 50 failures, c will be increased by 0.025.
491 if initC is None:
492 initC = 1.5
493
494 c = initC
495 if cIncrement is None:
496 cIncrement = 0.0025
497
498 if cTries is None:
499 cTries = 50
500
501 # Number of trial graphs so far
502 num_graphs = 0
503 sys.stderr.write('Generating graphs... ')
504
505 while 1:
506 # N is the number of vertices in the graph G
507 N = int(c*K)
508 num_graphs = num_graphs + 1
509 if (num_graphs % cTries) == 0:
510 # Enough failures at this multiplier,
511 # increase the multiplier and keep trying....
512 c = c + cIncrement
513
514 # Whats good with searching for a better
515 # hash function if we exceed the size
516 # of a function we've generated in the past....
517 if minC is not None and \
518 c > minC:
519 c = initC
520 sys.stderr.write(' -- c > minC, resetting c to %0.4f\n' % c)
521 else:
522 sys.stderr.write(' -- increasing c to %0.4f\n' % c)
523 sys.stderr.write('Generating graphs... ')
524
525 # Output a progress message
526 sys.stderr.write( str(num_graphs) + ' ')
527 sys.stderr.flush()
528
529 # Create graph w/ N vertices
530 G = Graph(N)
531 # Save the seeds used to generate
532 # the following two hash functions.
533 _seeds = whrandom._inst._seed
534
535 # Create 2 random hash functions
536 f1 = Hash(N, caseInsensitive)
537 f2 = Hash(N, caseInsensitive)
538
539 # Set the initial hash function seed values if passed in.
540 # Doing this protects our hash functions from
541 # changes to whrandom's behavior.
542 if f1Seed is not None:
543 f1.seed = f1Seed
544 f1Seed = None
545 fSpecifiedSeeds = 1
546 if f2Seed is not None:
547 f2.seed = f2Seed
548 f2Seed = None
549 fSpecifiedSeeds = 1
550
551 # Connect vertices given by the values of the two hash functions
552 # for each key. Associate the desired hash value with each
553 # edge.
554 for k, v in keys:
555 h1 = f1(k) ; h2 = f2(k)
556 G.connect( h1,h2, v)
557
558 # Check if the resulting graph is acyclic; if it is,
559 # we're done with step 1.
560 if G.is_acyclic():
561 break
562 elif fSpecifiedSeeds:
563 sys.stderr.write('\nThe initial f1/f2 seeds you specified didn\'t generate a perfect hash function: \n')
564 sys.stderr.write('f1 seed: %s\n' % f1.seed)
565 sys.stderr.write('f2 seed: %s\n' % f2.seed)
566 sys.stderr.write('multipler: %s\n' % c)
567 sys.stderr.write('Your data has likely changed, or you forgot what your initial multiplier should be.\n')
568 sys.stderr.write('continuing the search for a perfect hash function......\n')
569 fSpecifiedSeeds = 0
570
571 # Now we have an acyclic graph, so we assign values to each vertex
572 # such that, for each edge, you can add the values for the two vertices
573 # involved and get the desired value for that edge -- which is the
574 # desired hash key. This task is dead easy, because the graph is acyclic.
575 sys.stderr.write('\nAcyclic graph found; computing vertex values...\n')
576 G.assign_values()
577
578 sys.stderr.write('Checking uniqueness of hash values...\n')
579
580 # Sanity check the result by actually verifying that all the keys
581 # hash to the right value.
582 cchMaxKey = 0
583 maxHashValue = 0
584
585 for k, v in keys:
586 hash1 = G.values[ f1(k) ]
587 hash2 = G.values[ f2(k) ]
588 if hash1 > maxHashValue:
589 maxHashValue = hash1
590 if hash2 > maxHashValue:
591 maxHashValue = hash2
592 perfecthash = (hash1 + hash2) % N
593 assert perfecthash == v
594 cch = len(k)
595 if cch > cchMaxKey:
596 cchMaxKey = cch
597
598 sys.stderr.write('Found perfect hash function!\n')
599 sys.stderr.write('\nIn order to regenerate this hash function, \n')
600 sys.stderr.write('you need to pass these following values back in:\n')
601 sys.stderr.write('f1 seed: %s\n' % repr(f1.seed))
602 sys.stderr.write('f2 seed: %s\n' % repr(f2.seed))
603 sys.stderr.write('initial multipler: %s\n' % c)
604
605 return PerfectHash(cchMaxKey, f1, f2, G, N, len(keys), maxHashValue)