| #!/usr/bin/env/python |
| |
| # perfect_hash.py |
| # |
| # Outputs C code for a minimal perfect hash. |
| # The hash is produced using the algorithm described in |
| # "Optimal algorithms for minimal perfect hashing", |
| # G. Havas, B.S. Majewski. Available as a technical report |
| # from the CS department, University of Queensland |
| # (ftp://ftp.cs.uq.oz.au/). |
| # |
| # This is a modified version of Andrew Kuchling's code |
| # (http://starship.python.net/crew/amk/python/code/perfect-hash.html) |
| # and generates C fragments suitable for compilation as a Python |
| # extension module. |
| # |
| |
| # Difference between this algorithm and gperf: |
| # Gperf will complete in finite time with a successful function, |
| # or by giving up. |
| # This algorithm may never complete, although it is extremely likely |
| # when c >= 2. |
| |
| # The algorithm works like this: |
| # 0) You have K keys, that you want to perfectly hash to a bunch |
| # of hash values. |
| # |
| # 1) Choose a number N larger than K. This is the number of |
| # vertices in a graph G, and also the size of the resulting table. |
| # |
| # 2) Pick two random hash functions f1, f2, that output values from |
| # 0...N-1. |
| # |
| # 3) for key in keys: |
| # h1 = f1(key) ; h2 = f2(key) |
| # Draw an edge between vertices h1 and h2 of the graph. |
| # Associate the desired hash value with that edge. |
| # |
| # 4) Check if G is acyclic; if not, go back to step 1 and pick a bigger N. |
| # |
| # 5) Assign values to each vertex such that, for each edge, you can |
| # add the values for the two vertices and get the desired value |
| # for that edge -- which is the desired hash key. This task is |
| # dead easy, because the graph is acyclic. This is done by |
| # picking a vertex V, and assigning it a value of 0. You then do a |
| # depth-first search, assigning values to new vertices so that |
| # they sum up properly. |
| # |
| # 6) f1, f2, and G now make up your perfect hash function. |
| |
| |
| import sys, whrandom, string |
| import pprint |
| import perfhash |
| import time |
| |
| class Hash: |
| """Random hash function |
| For simplicity and speed, this doesn't implement any byte-level hashing |
| scheme. Instead, a random string is generated and prefixing to |
| str(key), and then Python's hashing function is used.""" |
| |
| def __init__(self, N, caseInsensitive=0): |
| self.N = N |
| junk = "" |
| for i in range(10): |
| junk = junk + whrandom.choice(string.letters + string.digits) |
| self.junk = junk |
| self.caseInsensitive = caseInsensitive |
| self.seed = perfhash.calcSeed(junk) |
| |
| def __call__(self, key): |
| key = str(key) |
| if self.caseInsensitive: |
| key = string.upper(key) |
| x = perfhash.hash(self.seed, len(self.junk), key) % self.N |
| #h = hash(self.junk + key) % self.N |
| #assert x == h |
| return x |
| |
| def generate_code(self): |
| s = """{ |
| register int len; |
| register unsigned char *p; |
| register long x; |
| |
| len = cch; |
| p = (unsigned char *) key; |
| x = %(junkSeed)d; |
| while (--len >= 0) |
| x = (1000003*x) ^ """ % \ |
| { |
| "lenJunk" : len(self.junk), |
| "junkSeed" : self.seed, |
| } |
| |
| if self.caseInsensitive: |
| s = s + "toupper(*(p++));" |
| else: |
| s = s + "*(p++);" |
| s = s + """ |
| x ^= cch + %(lenJunk)d; |
| if (x == -1) |
| x = -2; |
| x %%= k_cHashElements; |
| /* ensure the returned value is positive so we mimic Python's %% operator */ |
| if (x < 0) |
| x += k_cHashElements; |
| return x; |
| } |
| """ % { "lenJunk" : len(self.junk), |
| "junkSeed" : self.seed, } |
| return s |
| |
| |
| WHITE, GREY, BLACK = 0,1,2 |
| class Graph: |
| """Graph class. This class isn't particularly efficient or general, |
| and only has the features I needed to implement this algorithm. |
| |
| num_vertices -- number of vertices |
| edges -- maps 2-tuples of vertex numbers to the value for this |
| edge. If there's an edge between v1 and v2 (v1<v2), |
| (v1,v2) is a key and the value is the edge's value. |
| reachable_list -- maps a vertex V to the list of vertices |
| to which V is connected by edges. Used |
| for traversing the graph. |
| values -- numeric value for each vertex |
| """ |
| |
| def __init__(self, num_vertices): |
| self.num_vertices = num_vertices |
| self.edges = {} |
| self.reachable_list = {} |
| self.values = [-1] * num_vertices |
| |
| def connect(self, vertex1, vertex2, value): |
| """Connect 'vertex1' and 'vertex2' with an edge, with associated |
| value 'value'""" |
| |
| if vertex1 > vertex2: vertex1, vertex2 = vertex2, vertex1 |
| # if self.edges.has_key( (vertex1, vertex2) ): |
| # raise ValueError, 'Collision: vertices already connected' |
| self.edges[ (vertex1, vertex2) ] = value |
| |
| # Add vertices to each other's reachable list |
| if not self.reachable_list.has_key( vertex1 ): |
| self.reachable_list[ vertex1 ] = [vertex2] |
| else: |
| self.reachable_list[vertex1].append(vertex2) |
| |
| if not self.reachable_list.has_key( vertex2 ): |
| self.reachable_list[ vertex2 ] = [vertex1] |
| else: |
| self.reachable_list[vertex2].append(vertex1) |
| |
| def get_edge_value(self, vertex1, vertex2): |
| """Retrieve the value corresponding to the edge between |
| 'vertex1' and 'vertex2'. Raises KeyError if no such edge""" |
| if vertex1 > vertex2: |
| vertex1, vertex2 = vertex2, vertex1 |
| return self.edges[ (vertex1, vertex2) ] |
| |
| def is_acyclic(self): |
| "Returns true if the graph is acyclic, otherwise false" |
| |
| # This is done by doing a depth-first search of the graph; |
| # painting each vertex grey and then black. If the DFS |
| # ever finds a vertex that isn't white, there's a cycle. |
| colour = {} |
| for i in range(self.num_vertices): colour[i] = WHITE |
| |
| # Loop over all vertices, taking white ones as starting |
| # points for a traversal. |
| for i in range(self.num_vertices): |
| if colour[i] == WHITE: |
| |
| # List of vertices to visit |
| visit_list = [ (None,i) ] |
| |
| # Do a DFS |
| while visit_list: |
| # Colour this vertex grey. |
| parent, vertex = visit_list[0] ; del visit_list[0] |
| colour[vertex] = GREY |
| |
| # Make copy of list of neighbours, removing the vertex |
| # we arrived here from. |
| neighbours = self.reachable_list.get(vertex, []) [:] |
| if parent in neighbours: neighbours.remove( parent ) |
| |
| for neighbour in neighbours: |
| if colour[neighbour] == WHITE: |
| visit_list.insert(0, (vertex, neighbour) ) |
| elif colour[neighbour] != WHITE: |
| # Aha! Already visited this node, |
| # so the graph isn't acyclic. |
| return 0 |
| |
| colour[vertex] = BLACK |
| |
| # We got through, so the graph is acyclic. |
| return 1 |
| |
| def assign_values(self): |
| """Compute values for each vertex, so that they sum up |
| properly to the associated value for each edge.""" |
| |
| # Also done with a DFS; I simply copied the DFS code |
| # from is_acyclic(). (Should generalize the logic so |
| # one function could be used from both methods, |
| # but I couldn't be bothered.) |
| |
| colour = {} |
| for i in range(self.num_vertices): colour[i] = WHITE |
| |
| # Loop over all vertices, taking white ones as starting |
| # points for a traversal. |
| for i in range(self.num_vertices): |
| if colour[i] == WHITE: |
| # Set this vertex's value, arbitrarily, to zero. |
| self.set_vertex_value( i, 0 ) |
| |
| # List of vertices to visit |
| visit_list = [ (None,i) ] |
| |
| # Do a DFS |
| while visit_list: |
| # Colour this vertex grey. |
| parent, vertex = visit_list[0] ; del visit_list[0] |
| colour[vertex] = GREY |
| |
| # Make copy of list of neighbours, removing the vertex |
| # we arrived here from. |
| neighbours = self.reachable_list.get(vertex, []) [:] |
| if parent in neighbours: neighbours.remove( parent ) |
| |
| for neighbour in self.reachable_list.get(vertex, []): |
| edge_value = self.get_edge_value( vertex, neighbour ) |
| if colour[neighbour] == WHITE: |
| visit_list.insert(0, (vertex, neighbour) ) |
| |
| # Set new vertex's value to the desired |
| # edge value, minus the value of the |
| # vertex we came here from. |
| new_val = (edge_value - |
| self.get_vertex_value( vertex ) ) |
| self.set_vertex_value( neighbour, |
| new_val % self.num_vertices) |
| |
| colour[vertex] = BLACK |
| |
| # Returns nothing |
| return |
| |
| def __getitem__(self, index): |
| if index < self.num_vertices: return index |
| raise IndexError |
| |
| def get_vertex_value(self, vertex): |
| "Get value for a vertex" |
| return self.values[ vertex ] |
| |
| def set_vertex_value(self, vertex, value): |
| "Set value for a vertex" |
| self.values[ vertex ] = value |
| |
| def generate_code(self, out, width = 70): |
| "Return nicely formatted table" |
| out.write("{ ") |
| pos = 0 |
| for v in self.values: |
| v=str(v)+', ' |
| out.write(v) |
| pos = pos + len(v) + 1 |
| if pos > width: out.write('\n '); pos = 0 |
| out.write('};\n') |
| |
| |
| class PerfectHash: |
| def __init__(self, cchMax, f1, f2, G, cHashElements, cKeys, maxHashValue): |
| self.cchMax = cchMax |
| self.f1 = f1 |
| self.f2 = f2 |
| self.G = G |
| self.cHashElements = cHashElements |
| self.cKeys = cKeys |
| # determine the necessary type for storing our hash function |
| # helper table: |
| self.type = self.determineType(maxHashValue) |
| |
| def generate_header(self, structName): |
| header = """ |
| #include "Python.h" |
| #include <stdlib.h> |
| |
| /* --- C API ----------------------------------------------------*/ |
| /* C API for usage by other Python modules */ |
| typedef struct %(structName)s |
| { |
| unsigned long cKeys; |
| unsigned long cchMax; |
| unsigned long (*hash)(const char *key, unsigned int cch); |
| const void *(*getValue)(unsigned long iKey); |
| } %(structName)s; |
| """ % { "structName" : structName } |
| return header |
| |
| def determineType(self, maxHashValue): |
| if maxHashValue <= 255: |
| return "unsigned char" |
| elif maxHashValue <= 65535: |
| return "unsigned short" |
| else: |
| # Take the cheesy way out... |
| return "unsigned long" |
| |
| def generate_code(self, moduleName, dataArrayName, dataArrayType, structName): |
| # Output C code for the hash functions and tables |
| code = """ |
| /* |
| * The hash is produced using the algorithm described in |
| * "Optimal algorithms for minimal perfect hashing", |
| * G. Havas, B.S. Majewski. Available as a technical report |
| * from the CS department, University of Queensland |
| * (ftp://ftp.cs.uq.oz.au/). |
| * |
| * Generated using a heavily tweaked version of Andrew Kuchling's |
| * perfect_hash.py: |
| * http://starship.python.net/crew/amk/python/code/perfect-hash.html |
| * |
| * Generated on: %s |
| */ |
| """ % time.ctime(time.time()) |
| # MSVC SP3 was complaining when I actually used a global constant |
| code = code + """ |
| #define k_cHashElements %i |
| #define k_cchMaxKey %d |
| #define k_cKeys %i |
| |
| """ % (self.cHashElements, self.cchMax, self.cKeys) |
| |
| code = code + """ |
| static const %s G[k_cHashElements]; |
| static const %s %s[k_cKeys]; |
| """ % (self.type, dataArrayType, dataArrayName) |
| |
| code = code + """ |
| static long f1(const char *key, unsigned int cch) |
| """ |
| code = code + self.f1.generate_code() |
| code = code + """ |
| |
| static long f2(const char *key, unsigned int cch) |
| """ |
| code = code + self.f2.generate_code() |
| code = code + """ |
| |
| static unsigned long hash(const char *key, unsigned int cch) |
| { |
| return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) %% k_cHashElements; |
| } |
| |
| const void *getValue(unsigned long iKey) |
| { |
| return &%(dataArrayName)s[iKey]; |
| } |
| |
| /* Helper for adding objects to dictionaries. Check for errors with |
| PyErr_Occurred() */ |
| static |
| void insobj(PyObject *dict, |
| char *name, |
| PyObject *v) |
| { |
| PyDict_SetItemString(dict, name, v); |
| Py_XDECREF(v); |
| } |
| |
| static const %(structName)s hashAPI = |
| { |
| k_cKeys, |
| k_cchMaxKey, |
| &hash, |
| &getValue, |
| }; |
| |
| static |
| PyMethodDef Module_methods[] = |
| { |
| {NULL, NULL}, |
| }; |
| |
| static char *Module_docstring = "%(moduleName)s hash function module"; |
| |
| /* Error reporting for module init functions */ |
| |
| #define Py_ReportModuleInitError(modname) { \\ |
| PyObject *exc_type, *exc_value, *exc_tb; \\ |
| PyObject *str_type, *str_value; \\ |
| \\ |
| /* Fetch error objects and convert them to strings */ \\ |
| PyErr_Fetch(&exc_type, &exc_value, &exc_tb); \\ |
| if (exc_type && exc_value) { \\ |
| str_type = PyObject_Str(exc_type); \\ |
| str_value = PyObject_Str(exc_value); \\ |
| } \\ |
| else { \\ |
| str_type = NULL; \\ |
| str_value = NULL; \\ |
| } \\ |
| /* Try to format a more informative error message using the \\ |
| original error */ \\ |
| if (str_type && str_value && \\ |
| PyString_Check(str_type) && PyString_Check(str_value)) \\ |
| PyErr_Format( \\ |
| PyExc_ImportError, \\ |
| "initialization of module "modname" failed " \\ |
| "(%%s:%%s)", \\ |
| PyString_AS_STRING(str_type), \\ |
| PyString_AS_STRING(str_value)); \\ |
| else \\ |
| PyErr_SetString( \\ |
| PyExc_ImportError, \\ |
| "initialization of module "modname" failed"); \\ |
| Py_XDECREF(str_type); \\ |
| Py_XDECREF(str_value); \\ |
| Py_XDECREF(exc_type); \\ |
| Py_XDECREF(exc_value); \\ |
| Py_XDECREF(exc_tb); \\ |
| } |
| |
| |
| /* Create PyMethodObjects and register them in the module\'s dict */ |
| DL_EXPORT(void) |
| init%(moduleName)s(void) |
| { |
| PyObject *module, *moddict; |
| /* Create module */ |
| module = Py_InitModule4("%(moduleName)s", /* Module name */ |
| Module_methods, /* Method list */ |
| Module_docstring, /* Module doc-string */ |
| (PyObject *)NULL, /* always pass this as *self */ |
| PYTHON_API_VERSION); /* API Version */ |
| if (module == NULL) |
| goto onError; |
| /* Add some constants to the module\'s dict */ |
| moddict = PyModule_GetDict(module); |
| if (moddict == NULL) |
| goto onError; |
| |
| /* Export C API */ |
| insobj( |
| moddict, |
| "%(moduleName)sAPI", |
| PyCObject_FromVoidPtr((void *)&hashAPI, NULL)); |
| |
| onError: |
| /* Check for errors and report them */ |
| if (PyErr_Occurred()) |
| Py_ReportModuleInitError("%(moduleName)s"); |
| return; |
| } |
| """ % { "moduleName" : moduleName, |
| "dataArrayName" : dataArrayName, |
| "structName" : structName, } |
| |
| return code |
| |
| def generate_graph(self, out): |
| out.write(""" |
| static const unsigned short G[] = |
| """) |
| self.G.generate_code(out) |
| |
| |
| def generate_hash(keys, caseInsensitive=0, |
| minC=None, initC=None, |
| f1Seed=None, f2Seed=None, |
| cIncrement=None, cTries=None): |
| """Print out code for a perfect minimal hash. Input is a list of |
| (key, desired hash value) tuples. """ |
| |
| # K is the number of keys. |
| K = len(keys) |
| |
| # We will be generating graphs of size N, where N = c * K. |
| # The larger C is, the fewer trial graphs will need to be made, but |
| # the resulting table is also larger. Increase this starting value |
| # if you're impatient. After 50 failures, c will be increased by 0.025. |
| if initC is None: |
| initC = 1.5 |
| |
| c = initC |
| if cIncrement is None: |
| cIncrement = 0.0025 |
| |
| if cTries is None: |
| cTries = 50 |
| |
| # Number of trial graphs so far |
| num_graphs = 0 |
| sys.stderr.write('Generating graphs... ') |
| |
| while 1: |
| # N is the number of vertices in the graph G |
| N = int(c*K) |
| num_graphs = num_graphs + 1 |
| if (num_graphs % cTries) == 0: |
| # Enough failures at this multiplier, |
| # increase the multiplier and keep trying.... |
| c = c + cIncrement |
| |
| # Whats good with searching for a better |
| # hash function if we exceed the size |
| # of a function we've generated in the past.... |
| if minC is not None and \ |
| c > minC: |
| c = initC |
| sys.stderr.write(' -- c > minC, resetting c to %0.4f\n' % c) |
| else: |
| sys.stderr.write(' -- increasing c to %0.4f\n' % c) |
| sys.stderr.write('Generating graphs... ') |
| |
| # Output a progress message |
| sys.stderr.write( str(num_graphs) + ' ') |
| sys.stderr.flush() |
| |
| # Create graph w/ N vertices |
| G = Graph(N) |
| # Save the seeds used to generate |
| # the following two hash functions. |
| _seeds = whrandom._inst._seed |
| |
| # Create 2 random hash functions |
| f1 = Hash(N, caseInsensitive) |
| f2 = Hash(N, caseInsensitive) |
| |
| # Set the initial hash function seed values if passed in. |
| # Doing this protects our hash functions from |
| # changes to whrandom's behavior. |
| if f1Seed is not None: |
| f1.seed = f1Seed |
| f1Seed = None |
| fSpecifiedSeeds = 1 |
| if f2Seed is not None: |
| f2.seed = f2Seed |
| f2Seed = None |
| fSpecifiedSeeds = 1 |
| |
| # Connect vertices given by the values of the two hash functions |
| # for each key. Associate the desired hash value with each |
| # edge. |
| for k, v in keys: |
| h1 = f1(k) ; h2 = f2(k) |
| G.connect( h1,h2, v) |
| |
| # Check if the resulting graph is acyclic; if it is, |
| # we're done with step 1. |
| if G.is_acyclic(): |
| break |
| elif fSpecifiedSeeds: |
| sys.stderr.write('\nThe initial f1/f2 seeds you specified didn\'t generate a perfect hash function: \n') |
| sys.stderr.write('f1 seed: %s\n' % f1.seed) |
| sys.stderr.write('f2 seed: %s\n' % f2.seed) |
| sys.stderr.write('multipler: %s\n' % c) |
| sys.stderr.write('Your data has likely changed, or you forgot what your initial multiplier should be.\n') |
| sys.stderr.write('continuing the search for a perfect hash function......\n') |
| fSpecifiedSeeds = 0 |
| |
| # Now we have an acyclic graph, so we assign values to each vertex |
| # such that, for each edge, you can add the values for the two vertices |
| # involved and get the desired value for that edge -- which is the |
| # desired hash key. This task is dead easy, because the graph is acyclic. |
| sys.stderr.write('\nAcyclic graph found; computing vertex values...\n') |
| G.assign_values() |
| |
| sys.stderr.write('Checking uniqueness of hash values...\n') |
| |
| # Sanity check the result by actually verifying that all the keys |
| # hash to the right value. |
| cchMaxKey = 0 |
| maxHashValue = 0 |
| |
| for k, v in keys: |
| hash1 = G.values[ f1(k) ] |
| hash2 = G.values[ f2(k) ] |
| if hash1 > maxHashValue: |
| maxHashValue = hash1 |
| if hash2 > maxHashValue: |
| maxHashValue = hash2 |
| perfecthash = (hash1 + hash2) % N |
| assert perfecthash == v |
| cch = len(k) |
| if cch > cchMaxKey: |
| cchMaxKey = cch |
| |
| sys.stderr.write('Found perfect hash function!\n') |
| sys.stderr.write('\nIn order to regenerate this hash function, \n') |
| sys.stderr.write('you need to pass these following values back in:\n') |
| sys.stderr.write('f1 seed: %s\n' % repr(f1.seed)) |
| sys.stderr.write('f2 seed: %s\n' % repr(f2.seed)) |
| sys.stderr.write('initial multipler: %s\n' % c) |
| |
| return PerfectHash(cchMaxKey, f1, f2, G, N, len(keys), maxHashValue) |