Marc-André Lemburg | 4e5302a | 2000-06-28 16:53:16 +0000 | [diff] [blame] | 1 | #!/usr/bin/env/python |
| 2 | |
| 3 | # perfect_hash.py |
| 4 | # |
| 5 | # Outputs C code for a minimal perfect hash. |
| 6 | # The hash is produced using the algorithm described in |
| 7 | # "Optimal algorithms for minimal perfect hashing", |
| 8 | # G. Havas, B.S. Majewski. Available as a technical report |
| 9 | # from the CS department, University of Queensland |
| 10 | # (ftp://ftp.cs.uq.oz.au/). |
| 11 | # |
| 12 | # This is a modified version of Andrew Kuchling's code |
| 13 | # (http://starship.python.net/crew/amk/python/code/perfect-hash.html) |
| 14 | # and generates C fragments suitable for compilation as a Python |
| 15 | # extension module. |
| 16 | # |
| 17 | |
| 18 | # Difference between this algorithm and gperf: |
| 19 | # Gperf will complete in finite time with a successful function, |
| 20 | # or by giving up. |
| 21 | # This algorithm may never complete, although it is extremely likely |
| 22 | # when c >= 2. |
| 23 | |
| 24 | # The algorithm works like this: |
| 25 | # 0) You have K keys, that you want to perfectly hash to a bunch |
| 26 | # of hash values. |
| 27 | # |
| 28 | # 1) Choose a number N larger than K. This is the number of |
| 29 | # vertices in a graph G, and also the size of the resulting table. |
| 30 | # |
| 31 | # 2) Pick two random hash functions f1, f2, that output values from |
| 32 | # 0...N-1. |
| 33 | # |
| 34 | # 3) for key in keys: |
| 35 | # h1 = f1(key) ; h2 = f2(key) |
| 36 | # Draw an edge between vertices h1 and h2 of the graph. |
| 37 | # Associate the desired hash value with that edge. |
| 38 | # |
| 39 | # 4) Check if G is acyclic; if not, go back to step 1 and pick a bigger N. |
| 40 | # |
| 41 | # 5) Assign values to each vertex such that, for each edge, you can |
| 42 | # add the values for the two vertices and get the desired value |
| 43 | # for that edge -- which is the desired hash key. This task is |
| 44 | # dead easy, because the graph is acyclic. This is done by |
| 45 | # picking a vertex V, and assigning it a value of 0. You then do a |
| 46 | # depth-first search, assigning values to new vertices so that |
| 47 | # they sum up properly. |
| 48 | # |
| 49 | # 6) f1, f2, and G now make up your perfect hash function. |
| 50 | |
| 51 | |
| 52 | import sys, whrandom, string |
| 53 | import pprint |
| 54 | import perfhash |
| 55 | import time |
| 56 | |
| 57 | class Hash: |
| 58 | """Random hash function |
| 59 | For simplicity and speed, this doesn't implement any byte-level hashing |
| 60 | scheme. Instead, a random string is generated and prefixing to |
| 61 | str(key), and then Python's hashing function is used.""" |
| 62 | |
| 63 | def __init__(self, N, caseInsensitive=0): |
| 64 | self.N = N |
| 65 | junk = "" |
| 66 | for i in range(10): |
| 67 | junk = junk + whrandom.choice(string.letters + string.digits) |
| 68 | self.junk = junk |
| 69 | self.caseInsensitive = caseInsensitive |
| 70 | self.seed = perfhash.calcSeed(junk) |
| 71 | |
| 72 | def __call__(self, key): |
| 73 | key = str(key) |
| 74 | if self.caseInsensitive: |
| 75 | key = string.upper(key) |
| 76 | x = perfhash.hash(self.seed, len(self.junk), key) % self.N |
| 77 | #h = hash(self.junk + key) % self.N |
| 78 | #assert x == h |
| 79 | return x |
| 80 | |
| 81 | def generate_code(self): |
| 82 | s = """{ |
| 83 | register int len; |
| 84 | register unsigned char *p; |
| 85 | register long x; |
| 86 | |
| 87 | len = cch; |
| 88 | p = (unsigned char *) key; |
| 89 | x = %(junkSeed)d; |
| 90 | while (--len >= 0) |
| 91 | x = (1000003*x) ^ """ % \ |
| 92 | { |
| 93 | "lenJunk" : len(self.junk), |
| 94 | "junkSeed" : self.seed, |
| 95 | } |
| 96 | |
| 97 | if self.caseInsensitive: |
| 98 | s = s + "toupper(*(p++));" |
| 99 | else: |
| 100 | s = s + "*(p++);" |
| 101 | s = s + """ |
| 102 | x ^= cch + %(lenJunk)d; |
| 103 | if (x == -1) |
| 104 | x = -2; |
| 105 | x %%= k_cHashElements; |
| 106 | /* ensure the returned value is positive so we mimic Python's %% operator */ |
| 107 | if (x < 0) |
| 108 | x += k_cHashElements; |
| 109 | return x; |
| 110 | } |
| 111 | """ % { "lenJunk" : len(self.junk), |
| 112 | "junkSeed" : self.seed, } |
| 113 | return s |
| 114 | |
| 115 | |
| 116 | WHITE, GREY, BLACK = 0,1,2 |
| 117 | class Graph: |
| 118 | """Graph class. This class isn't particularly efficient or general, |
| 119 | and only has the features I needed to implement this algorithm. |
| 120 | |
| 121 | num_vertices -- number of vertices |
| 122 | edges -- maps 2-tuples of vertex numbers to the value for this |
| 123 | edge. If there's an edge between v1 and v2 (v1<v2), |
| 124 | (v1,v2) is a key and the value is the edge's value. |
| 125 | reachable_list -- maps a vertex V to the list of vertices |
| 126 | to which V is connected by edges. Used |
| 127 | for traversing the graph. |
| 128 | values -- numeric value for each vertex |
| 129 | """ |
| 130 | |
| 131 | def __init__(self, num_vertices): |
| 132 | self.num_vertices = num_vertices |
| 133 | self.edges = {} |
| 134 | self.reachable_list = {} |
| 135 | self.values = [-1] * num_vertices |
| 136 | |
| 137 | def connect(self, vertex1, vertex2, value): |
| 138 | """Connect 'vertex1' and 'vertex2' with an edge, with associated |
| 139 | value 'value'""" |
| 140 | |
| 141 | if vertex1 > vertex2: vertex1, vertex2 = vertex2, vertex1 |
| 142 | # if self.edges.has_key( (vertex1, vertex2) ): |
| 143 | # raise ValueError, 'Collision: vertices already connected' |
| 144 | self.edges[ (vertex1, vertex2) ] = value |
| 145 | |
| 146 | # Add vertices to each other's reachable list |
| 147 | if not self.reachable_list.has_key( vertex1 ): |
| 148 | self.reachable_list[ vertex1 ] = [vertex2] |
| 149 | else: |
| 150 | self.reachable_list[vertex1].append(vertex2) |
| 151 | |
| 152 | if not self.reachable_list.has_key( vertex2 ): |
| 153 | self.reachable_list[ vertex2 ] = [vertex1] |
| 154 | else: |
| 155 | self.reachable_list[vertex2].append(vertex1) |
| 156 | |
| 157 | def get_edge_value(self, vertex1, vertex2): |
| 158 | """Retrieve the value corresponding to the edge between |
| 159 | 'vertex1' and 'vertex2'. Raises KeyError if no such edge""" |
| 160 | if vertex1 > vertex2: |
| 161 | vertex1, vertex2 = vertex2, vertex1 |
| 162 | return self.edges[ (vertex1, vertex2) ] |
| 163 | |
| 164 | def is_acyclic(self): |
| 165 | "Returns true if the graph is acyclic, otherwise false" |
| 166 | |
| 167 | # This is done by doing a depth-first search of the graph; |
| 168 | # painting each vertex grey and then black. If the DFS |
| 169 | # ever finds a vertex that isn't white, there's a cycle. |
| 170 | colour = {} |
| 171 | for i in range(self.num_vertices): colour[i] = WHITE |
| 172 | |
| 173 | # Loop over all vertices, taking white ones as starting |
| 174 | # points for a traversal. |
| 175 | for i in range(self.num_vertices): |
| 176 | if colour[i] == WHITE: |
| 177 | |
| 178 | # List of vertices to visit |
| 179 | visit_list = [ (None,i) ] |
| 180 | |
| 181 | # Do a DFS |
| 182 | while visit_list: |
| 183 | # Colour this vertex grey. |
| 184 | parent, vertex = visit_list[0] ; del visit_list[0] |
| 185 | colour[vertex] = GREY |
| 186 | |
| 187 | # Make copy of list of neighbours, removing the vertex |
| 188 | # we arrived here from. |
| 189 | neighbours = self.reachable_list.get(vertex, []) [:] |
| 190 | if parent in neighbours: neighbours.remove( parent ) |
| 191 | |
| 192 | for neighbour in neighbours: |
| 193 | if colour[neighbour] == WHITE: |
| 194 | visit_list.insert(0, (vertex, neighbour) ) |
| 195 | elif colour[neighbour] != WHITE: |
| 196 | # Aha! Already visited this node, |
| 197 | # so the graph isn't acyclic. |
| 198 | return 0 |
| 199 | |
| 200 | colour[vertex] = BLACK |
| 201 | |
| 202 | # We got through, so the graph is acyclic. |
| 203 | return 1 |
| 204 | |
| 205 | def assign_values(self): |
| 206 | """Compute values for each vertex, so that they sum up |
| 207 | properly to the associated value for each edge.""" |
| 208 | |
| 209 | # Also done with a DFS; I simply copied the DFS code |
| 210 | # from is_acyclic(). (Should generalize the logic so |
| 211 | # one function could be used from both methods, |
| 212 | # but I couldn't be bothered.) |
| 213 | |
| 214 | colour = {} |
| 215 | for i in range(self.num_vertices): colour[i] = WHITE |
| 216 | |
| 217 | # Loop over all vertices, taking white ones as starting |
| 218 | # points for a traversal. |
| 219 | for i in range(self.num_vertices): |
| 220 | if colour[i] == WHITE: |
| 221 | # Set this vertex's value, arbitrarily, to zero. |
| 222 | self.set_vertex_value( i, 0 ) |
| 223 | |
| 224 | # List of vertices to visit |
| 225 | visit_list = [ (None,i) ] |
| 226 | |
| 227 | # Do a DFS |
| 228 | while visit_list: |
| 229 | # Colour this vertex grey. |
| 230 | parent, vertex = visit_list[0] ; del visit_list[0] |
| 231 | colour[vertex] = GREY |
| 232 | |
| 233 | # Make copy of list of neighbours, removing the vertex |
| 234 | # we arrived here from. |
| 235 | neighbours = self.reachable_list.get(vertex, []) [:] |
| 236 | if parent in neighbours: neighbours.remove( parent ) |
| 237 | |
| 238 | for neighbour in self.reachable_list.get(vertex, []): |
| 239 | edge_value = self.get_edge_value( vertex, neighbour ) |
| 240 | if colour[neighbour] == WHITE: |
| 241 | visit_list.insert(0, (vertex, neighbour) ) |
| 242 | |
| 243 | # Set new vertex's value to the desired |
| 244 | # edge value, minus the value of the |
| 245 | # vertex we came here from. |
| 246 | new_val = (edge_value - |
| 247 | self.get_vertex_value( vertex ) ) |
| 248 | self.set_vertex_value( neighbour, |
| 249 | new_val % self.num_vertices) |
| 250 | |
| 251 | colour[vertex] = BLACK |
| 252 | |
| 253 | # Returns nothing |
| 254 | return |
| 255 | |
| 256 | def __getitem__(self, index): |
| 257 | if index < self.num_vertices: return index |
| 258 | raise IndexError |
| 259 | |
| 260 | def get_vertex_value(self, vertex): |
| 261 | "Get value for a vertex" |
| 262 | return self.values[ vertex ] |
| 263 | |
| 264 | def set_vertex_value(self, vertex, value): |
| 265 | "Set value for a vertex" |
| 266 | self.values[ vertex ] = value |
| 267 | |
| 268 | def generate_code(self, out, width = 70): |
| 269 | "Return nicely formatted table" |
| 270 | out.write("{ ") |
| 271 | pos = 0 |
| 272 | for v in self.values: |
| 273 | v=str(v)+', ' |
| 274 | out.write(v) |
| 275 | pos = pos + len(v) + 1 |
| 276 | if pos > width: out.write('\n '); pos = 0 |
| 277 | out.write('};\n') |
| 278 | |
| 279 | |
| 280 | class PerfectHash: |
| 281 | def __init__(self, cchMax, f1, f2, G, cHashElements, cKeys, maxHashValue): |
| 282 | self.cchMax = cchMax |
| 283 | self.f1 = f1 |
| 284 | self.f2 = f2 |
| 285 | self.G = G |
| 286 | self.cHashElements = cHashElements |
| 287 | self.cKeys = cKeys |
| 288 | # determine the necessary type for storing our hash function |
| 289 | # helper table: |
| 290 | self.type = self.determineType(maxHashValue) |
| 291 | |
| 292 | def generate_header(self, structName): |
| 293 | header = """ |
Marc-André Lemburg | e3f257e | 2000-06-30 09:56:00 +0000 | [diff] [blame] | 294 | #include "Python.h" |
Marc-André Lemburg | 4e5302a | 2000-06-28 16:53:16 +0000 | [diff] [blame] | 295 | #include <stdlib.h> |
| 296 | |
| 297 | /* --- C API ----------------------------------------------------*/ |
| 298 | /* C API for usage by other Python modules */ |
| 299 | typedef struct %(structName)s |
| 300 | { |
| 301 | unsigned long cKeys; |
| 302 | unsigned long cchMax; |
| 303 | unsigned long (*hash)(const char *key, unsigned int cch); |
| 304 | const void *(*getValue)(unsigned long iKey); |
| 305 | } %(structName)s; |
| 306 | """ % { "structName" : structName } |
| 307 | return header |
| 308 | |
| 309 | def determineType(self, maxHashValue): |
| 310 | if maxHashValue <= 255: |
| 311 | return "unsigned char" |
| 312 | elif maxHashValue <= 65535: |
| 313 | return "unsigned short" |
| 314 | else: |
| 315 | # Take the cheesy way out... |
| 316 | return "unsigned long" |
| 317 | |
| 318 | def generate_code(self, moduleName, dataArrayName, dataArrayType, structName): |
| 319 | # Output C code for the hash functions and tables |
| 320 | code = """ |
| 321 | /* |
| 322 | * The hash is produced using the algorithm described in |
| 323 | * "Optimal algorithms for minimal perfect hashing", |
| 324 | * G. Havas, B.S. Majewski. Available as a technical report |
| 325 | * from the CS department, University of Queensland |
| 326 | * (ftp://ftp.cs.uq.oz.au/). |
| 327 | * |
| 328 | * Generated using a heavily tweaked version of Andrew Kuchling's |
| 329 | * perfect_hash.py: |
| 330 | * http://starship.python.net/crew/amk/python/code/perfect-hash.html |
| 331 | * |
| 332 | * Generated on: %s |
| 333 | */ |
| 334 | """ % time.ctime(time.time()) |
| 335 | # MSVC SP3 was complaining when I actually used a global constant |
| 336 | code = code + """ |
| 337 | #define k_cHashElements %i |
| 338 | #define k_cchMaxKey %d |
| 339 | #define k_cKeys %i |
| 340 | |
| 341 | """ % (self.cHashElements, self.cchMax, self.cKeys) |
| 342 | |
| 343 | code = code + """ |
| 344 | static const %s G[k_cHashElements]; |
| 345 | static const %s %s[k_cKeys]; |
| 346 | """ % (self.type, dataArrayType, dataArrayName) |
| 347 | |
| 348 | code = code + """ |
| 349 | static long f1(const char *key, unsigned int cch) |
| 350 | """ |
| 351 | code = code + self.f1.generate_code() |
| 352 | code = code + """ |
| 353 | |
| 354 | static long f2(const char *key, unsigned int cch) |
| 355 | """ |
| 356 | code = code + self.f2.generate_code() |
| 357 | code = code + """ |
| 358 | |
| 359 | static unsigned long hash(const char *key, unsigned int cch) |
| 360 | { |
| 361 | return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) %% k_cHashElements; |
| 362 | } |
| 363 | |
| 364 | const void *getValue(unsigned long iKey) |
| 365 | { |
| 366 | return &%(dataArrayName)s[iKey]; |
| 367 | } |
| 368 | |
| 369 | /* Helper for adding objects to dictionaries. Check for errors with |
| 370 | PyErr_Occurred() */ |
| 371 | static |
| 372 | void insobj(PyObject *dict, |
| 373 | char *name, |
| 374 | PyObject *v) |
| 375 | { |
| 376 | PyDict_SetItemString(dict, name, v); |
| 377 | Py_XDECREF(v); |
| 378 | } |
| 379 | |
| 380 | static const %(structName)s hashAPI = |
| 381 | { |
| 382 | k_cKeys, |
| 383 | k_cchMaxKey, |
| 384 | &hash, |
| 385 | &getValue, |
| 386 | }; |
| 387 | |
| 388 | static |
| 389 | PyMethodDef Module_methods[] = |
| 390 | { |
| 391 | {NULL, NULL}, |
| 392 | }; |
| 393 | |
| 394 | static char *Module_docstring = "%(moduleName)s hash function module"; |
| 395 | |
| 396 | /* Error reporting for module init functions */ |
| 397 | |
| 398 | #define Py_ReportModuleInitError(modname) { \\ |
| 399 | PyObject *exc_type, *exc_value, *exc_tb; \\ |
| 400 | PyObject *str_type, *str_value; \\ |
| 401 | \\ |
| 402 | /* Fetch error objects and convert them to strings */ \\ |
| 403 | PyErr_Fetch(&exc_type, &exc_value, &exc_tb); \\ |
| 404 | if (exc_type && exc_value) { \\ |
| 405 | str_type = PyObject_Str(exc_type); \\ |
| 406 | str_value = PyObject_Str(exc_value); \\ |
| 407 | } \\ |
| 408 | else { \\ |
| 409 | str_type = NULL; \\ |
| 410 | str_value = NULL; \\ |
| 411 | } \\ |
| 412 | /* Try to format a more informative error message using the \\ |
| 413 | original error */ \\ |
| 414 | if (str_type && str_value && \\ |
| 415 | PyString_Check(str_type) && PyString_Check(str_value)) \\ |
| 416 | PyErr_Format( \\ |
| 417 | PyExc_ImportError, \\ |
| 418 | "initialization of module "modname" failed " \\ |
| 419 | "(%%s:%%s)", \\ |
| 420 | PyString_AS_STRING(str_type), \\ |
| 421 | PyString_AS_STRING(str_value)); \\ |
| 422 | else \\ |
| 423 | PyErr_SetString( \\ |
| 424 | PyExc_ImportError, \\ |
| 425 | "initialization of module "modname" failed"); \\ |
| 426 | Py_XDECREF(str_type); \\ |
| 427 | Py_XDECREF(str_value); \\ |
| 428 | Py_XDECREF(exc_type); \\ |
| 429 | Py_XDECREF(exc_value); \\ |
| 430 | Py_XDECREF(exc_tb); \\ |
| 431 | } |
| 432 | |
| 433 | |
| 434 | /* Create PyMethodObjects and register them in the module\'s dict */ |
| 435 | DL_EXPORT(void) |
| 436 | init%(moduleName)s(void) |
| 437 | { |
| 438 | PyObject *module, *moddict; |
| 439 | /* Create module */ |
| 440 | module = Py_InitModule4("%(moduleName)s", /* Module name */ |
| 441 | Module_methods, /* Method list */ |
| 442 | Module_docstring, /* Module doc-string */ |
| 443 | (PyObject *)NULL, /* always pass this as *self */ |
| 444 | PYTHON_API_VERSION); /* API Version */ |
| 445 | if (module == NULL) |
| 446 | goto onError; |
| 447 | /* Add some constants to the module\'s dict */ |
| 448 | moddict = PyModule_GetDict(module); |
| 449 | if (moddict == NULL) |
| 450 | goto onError; |
| 451 | |
| 452 | /* Export C API */ |
| 453 | insobj( |
| 454 | moddict, |
| 455 | "%(moduleName)sAPI", |
| 456 | PyCObject_FromVoidPtr((void *)&hashAPI, NULL)); |
| 457 | |
| 458 | onError: |
| 459 | /* Check for errors and report them */ |
| 460 | if (PyErr_Occurred()) |
| 461 | Py_ReportModuleInitError("%(moduleName)s"); |
| 462 | return; |
| 463 | } |
| 464 | """ % { "moduleName" : moduleName, |
| 465 | "dataArrayName" : dataArrayName, |
| 466 | "structName" : structName, } |
| 467 | |
| 468 | return code |
| 469 | |
| 470 | def generate_graph(self, out): |
| 471 | out.write(""" |
| 472 | static const unsigned short G[] = |
| 473 | """) |
| 474 | self.G.generate_code(out) |
| 475 | |
| 476 | |
| 477 | def generate_hash(keys, caseInsensitive=0, |
| 478 | minC=None, initC=None, |
| 479 | f1Seed=None, f2Seed=None, |
| 480 | cIncrement=None, cTries=None): |
| 481 | """Print out code for a perfect minimal hash. Input is a list of |
| 482 | (key, desired hash value) tuples. """ |
| 483 | |
| 484 | # K is the number of keys. |
| 485 | K = len(keys) |
| 486 | |
| 487 | # We will be generating graphs of size N, where N = c * K. |
| 488 | # The larger C is, the fewer trial graphs will need to be made, but |
| 489 | # the resulting table is also larger. Increase this starting value |
| 490 | # if you're impatient. After 50 failures, c will be increased by 0.025. |
| 491 | if initC is None: |
| 492 | initC = 1.5 |
| 493 | |
| 494 | c = initC |
| 495 | if cIncrement is None: |
| 496 | cIncrement = 0.0025 |
| 497 | |
| 498 | if cTries is None: |
| 499 | cTries = 50 |
| 500 | |
| 501 | # Number of trial graphs so far |
| 502 | num_graphs = 0 |
| 503 | sys.stderr.write('Generating graphs... ') |
| 504 | |
| 505 | while 1: |
| 506 | # N is the number of vertices in the graph G |
| 507 | N = int(c*K) |
| 508 | num_graphs = num_graphs + 1 |
| 509 | if (num_graphs % cTries) == 0: |
| 510 | # Enough failures at this multiplier, |
| 511 | # increase the multiplier and keep trying.... |
| 512 | c = c + cIncrement |
| 513 | |
| 514 | # Whats good with searching for a better |
| 515 | # hash function if we exceed the size |
| 516 | # of a function we've generated in the past.... |
| 517 | if minC is not None and \ |
| 518 | c > minC: |
| 519 | c = initC |
| 520 | sys.stderr.write(' -- c > minC, resetting c to %0.4f\n' % c) |
| 521 | else: |
| 522 | sys.stderr.write(' -- increasing c to %0.4f\n' % c) |
| 523 | sys.stderr.write('Generating graphs... ') |
| 524 | |
| 525 | # Output a progress message |
| 526 | sys.stderr.write( str(num_graphs) + ' ') |
| 527 | sys.stderr.flush() |
| 528 | |
| 529 | # Create graph w/ N vertices |
| 530 | G = Graph(N) |
| 531 | # Save the seeds used to generate |
| 532 | # the following two hash functions. |
| 533 | _seeds = whrandom._inst._seed |
| 534 | |
| 535 | # Create 2 random hash functions |
| 536 | f1 = Hash(N, caseInsensitive) |
| 537 | f2 = Hash(N, caseInsensitive) |
| 538 | |
| 539 | # Set the initial hash function seed values if passed in. |
| 540 | # Doing this protects our hash functions from |
| 541 | # changes to whrandom's behavior. |
| 542 | if f1Seed is not None: |
| 543 | f1.seed = f1Seed |
| 544 | f1Seed = None |
| 545 | fSpecifiedSeeds = 1 |
| 546 | if f2Seed is not None: |
| 547 | f2.seed = f2Seed |
| 548 | f2Seed = None |
| 549 | fSpecifiedSeeds = 1 |
| 550 | |
| 551 | # Connect vertices given by the values of the two hash functions |
| 552 | # for each key. Associate the desired hash value with each |
| 553 | # edge. |
| 554 | for k, v in keys: |
| 555 | h1 = f1(k) ; h2 = f2(k) |
| 556 | G.connect( h1,h2, v) |
| 557 | |
| 558 | # Check if the resulting graph is acyclic; if it is, |
| 559 | # we're done with step 1. |
| 560 | if G.is_acyclic(): |
| 561 | break |
| 562 | elif fSpecifiedSeeds: |
| 563 | sys.stderr.write('\nThe initial f1/f2 seeds you specified didn\'t generate a perfect hash function: \n') |
| 564 | sys.stderr.write('f1 seed: %s\n' % f1.seed) |
| 565 | sys.stderr.write('f2 seed: %s\n' % f2.seed) |
| 566 | sys.stderr.write('multipler: %s\n' % c) |
| 567 | sys.stderr.write('Your data has likely changed, or you forgot what your initial multiplier should be.\n') |
| 568 | sys.stderr.write('continuing the search for a perfect hash function......\n') |
| 569 | fSpecifiedSeeds = 0 |
| 570 | |
| 571 | # Now we have an acyclic graph, so we assign values to each vertex |
| 572 | # such that, for each edge, you can add the values for the two vertices |
| 573 | # involved and get the desired value for that edge -- which is the |
| 574 | # desired hash key. This task is dead easy, because the graph is acyclic. |
| 575 | sys.stderr.write('\nAcyclic graph found; computing vertex values...\n') |
| 576 | G.assign_values() |
| 577 | |
| 578 | sys.stderr.write('Checking uniqueness of hash values...\n') |
| 579 | |
| 580 | # Sanity check the result by actually verifying that all the keys |
| 581 | # hash to the right value. |
| 582 | cchMaxKey = 0 |
| 583 | maxHashValue = 0 |
| 584 | |
| 585 | for k, v in keys: |
| 586 | hash1 = G.values[ f1(k) ] |
| 587 | hash2 = G.values[ f2(k) ] |
| 588 | if hash1 > maxHashValue: |
| 589 | maxHashValue = hash1 |
| 590 | if hash2 > maxHashValue: |
| 591 | maxHashValue = hash2 |
| 592 | perfecthash = (hash1 + hash2) % N |
| 593 | assert perfecthash == v |
| 594 | cch = len(k) |
| 595 | if cch > cchMaxKey: |
| 596 | cchMaxKey = cch |
| 597 | |
| 598 | sys.stderr.write('Found perfect hash function!\n') |
| 599 | sys.stderr.write('\nIn order to regenerate this hash function, \n') |
| 600 | sys.stderr.write('you need to pass these following values back in:\n') |
| 601 | sys.stderr.write('f1 seed: %s\n' % repr(f1.seed)) |
| 602 | sys.stderr.write('f2 seed: %s\n' % repr(f2.seed)) |
| 603 | sys.stderr.write('initial multipler: %s\n' % c) |
| 604 | |
| 605 | return PerfectHash(cchMaxKey, f1, f2, G, N, len(keys), maxHashValue) |