Blame - Tools/perfecthash/perfect_hash.py - platform/external/python/cpython3

blob: bf857398b4dadc0b36781d52803c9714db7526bd [file] [log] [blame]

Marc-André Lemburg	4e5302a	2000-06-28 16:53:16 +0000	[diff] [blame]	1	#!/usr/bin/env/python
				2
				3	# perfect_hash.py
				4	#
				5	# Outputs C code for a minimal perfect hash.
				6	# The hash is produced using the algorithm described in
				7	# "Optimal algorithms for minimal perfect hashing",
				8	# G. Havas, B.S. Majewski. Available as a technical report
				9	# from the CS department, University of Queensland
				10	# (ftp://ftp.cs.uq.oz.au/).
				11	#
				12	# This is a modified version of Andrew Kuchling's code
				13	# (http://starship.python.net/crew/amk/python/code/perfect-hash.html)
				14	# and generates C fragments suitable for compilation as a Python
				15	# extension module.
				16	#
				17
				18	# Difference between this algorithm and gperf:
				19	# Gperf will complete in finite time with a successful function,
				20	# or by giving up.
				21	# This algorithm may never complete, although it is extremely likely
				22	# when c >= 2.
				23
				24	# The algorithm works like this:
				25	# 0) You have K keys, that you want to perfectly hash to a bunch
				26	# of hash values.
				27	#
				28	# 1) Choose a number N larger than K. This is the number of
				29	# vertices in a graph G, and also the size of the resulting table.
				30	#
				31	# 2) Pick two random hash functions f1, f2, that output values from
				32	# 0...N-1.
				33	#
				34	# 3) for key in keys:
				35	# h1 = f1(key) ; h2 = f2(key)
				36	# Draw an edge between vertices h1 and h2 of the graph.
				37	# Associate the desired hash value with that edge.
				38	#
				39	# 4) Check if G is acyclic; if not, go back to step 1 and pick a bigger N.
				40	#
				41	# 5) Assign values to each vertex such that, for each edge, you can
				42	# add the values for the two vertices and get the desired value
				43	# for that edge -- which is the desired hash key. This task is
				44	# dead easy, because the graph is acyclic. This is done by
				45	# picking a vertex V, and assigning it a value of 0. You then do a
				46	# depth-first search, assigning values to new vertices so that
				47	# they sum up properly.
				48	#
				49	# 6) f1, f2, and G now make up your perfect hash function.
				50
				51
				52	import sys, whrandom, string
				53	import pprint
				54	import perfhash
				55	import time
				56
				57	class Hash:
				58	"""Random hash function
				59	For simplicity and speed, this doesn't implement any byte-level hashing
				60	scheme. Instead, a random string is generated and prefixing to
				61	str(key), and then Python's hashing function is used."""
				62
				63	def __init__(self, N, caseInsensitive=0):
				64	self.N = N
				65	junk = ""
				66	for i in range(10):
				67	junk = junk + whrandom.choice(string.letters + string.digits)
				68	self.junk = junk
				69	self.caseInsensitive = caseInsensitive
				70	self.seed = perfhash.calcSeed(junk)
				71
				72	def __call__(self, key):
				73	key = str(key)
				74	if self.caseInsensitive:
				75	key = string.upper(key)
				76	x = perfhash.hash(self.seed, len(self.junk), key) % self.N
				77	#h = hash(self.junk + key) % self.N
				78	#assert x == h
				79	return x
				80
				81	def generate_code(self):
				82	s = """{
				83	register int len;
				84	register unsigned char *p;
				85	register long x;
				86
				87	len = cch;
				88	p = (unsigned char *) key;
				89	x = %(junkSeed)d;
				90	while (--len >= 0)
				91	x = (1000003*x) ^ """ % \
				92	{
				93	"lenJunk" : len(self.junk),
				94	"junkSeed" : self.seed,
				95	}
				96
				97	if self.caseInsensitive:
				98	s = s + "toupper(*(p++));"
				99	else:
				100	s = s + "*(p++);"
				101	s = s + """
				102	x ^= cch + %(lenJunk)d;
				103	if (x == -1)
				104	x = -2;
				105	x %%= k_cHashElements;
				106	/* ensure the returned value is positive so we mimic Python's %% operator */
				107	if (x < 0)
				108	x += k_cHashElements;
				109	return x;
				110	}
				111	""" % { "lenJunk" : len(self.junk),
				112	"junkSeed" : self.seed, }
				113	return s
				114
				115
				116	WHITE, GREY, BLACK = 0,1,2
				117	class Graph:
				118	"""Graph class. This class isn't particularly efficient or general,
				119	and only has the features I needed to implement this algorithm.
				120
				121	num_vertices -- number of vertices
				122	edges -- maps 2-tuples of vertex numbers to the value for this
				123	edge. If there's an edge between v1 and v2 (v1<v2),
				124	(v1,v2) is a key and the value is the edge's value.
				125	reachable_list -- maps a vertex V to the list of vertices
				126	to which V is connected by edges. Used
				127	for traversing the graph.
				128	values -- numeric value for each vertex
				129	"""
				130
				131	def __init__(self, num_vertices):
				132	self.num_vertices = num_vertices
				133	self.edges = {}
				134	self.reachable_list = {}
				135	self.values = [-1] * num_vertices
				136
				137	def connect(self, vertex1, vertex2, value):
				138	"""Connect 'vertex1' and 'vertex2' with an edge, with associated
				139	value 'value'"""
				140
				141	if vertex1 > vertex2: vertex1, vertex2 = vertex2, vertex1
				142	# if self.edges.has_key( (vertex1, vertex2) ):
				143	# raise ValueError, 'Collision: vertices already connected'
				144	self.edges[ (vertex1, vertex2) ] = value
				145
				146	# Add vertices to each other's reachable list
				147	if not self.reachable_list.has_key( vertex1 ):
				148	self.reachable_list[ vertex1 ] = [vertex2]
				149	else:
				150	self.reachable_list[vertex1].append(vertex2)
				151
				152	if not self.reachable_list.has_key( vertex2 ):
				153	self.reachable_list[ vertex2 ] = [vertex1]
				154	else:
				155	self.reachable_list[vertex2].append(vertex1)
				156
				157	def get_edge_value(self, vertex1, vertex2):
				158	"""Retrieve the value corresponding to the edge between
				159	'vertex1' and 'vertex2'. Raises KeyError if no such edge"""
				160	if vertex1 > vertex2:
				161	vertex1, vertex2 = vertex2, vertex1
				162	return self.edges[ (vertex1, vertex2) ]
				163
				164	def is_acyclic(self):
				165	"Returns true if the graph is acyclic, otherwise false"
				166
				167	# This is done by doing a depth-first search of the graph;
				168	# painting each vertex grey and then black. If the DFS
				169	# ever finds a vertex that isn't white, there's a cycle.
				170	colour = {}
				171	for i in range(self.num_vertices): colour[i] = WHITE
				172
				173	# Loop over all vertices, taking white ones as starting
				174	# points for a traversal.
				175	for i in range(self.num_vertices):
				176	if colour[i] == WHITE:
				177
				178	# List of vertices to visit
				179	visit_list = [ (None,i) ]
				180
				181	# Do a DFS
				182	while visit_list:
				183	# Colour this vertex grey.
				184	parent, vertex = visit_list[0] ; del visit_list[0]
				185	colour[vertex] = GREY
				186
				187	# Make copy of list of neighbours, removing the vertex
				188	# we arrived here from.
				189	neighbours = self.reachable_list.get(vertex, []) [:]
				190	if parent in neighbours: neighbours.remove( parent )
				191
				192	for neighbour in neighbours:
				193	if colour[neighbour] == WHITE:
				194	visit_list.insert(0, (vertex, neighbour) )
				195	elif colour[neighbour] != WHITE:
				196	# Aha! Already visited this node,
				197	# so the graph isn't acyclic.
				198	return 0
				199
				200	colour[vertex] = BLACK
				201
				202	# We got through, so the graph is acyclic.
				203	return 1
				204
				205	def assign_values(self):
				206	"""Compute values for each vertex, so that they sum up
				207	properly to the associated value for each edge."""
				208
				209	# Also done with a DFS; I simply copied the DFS code
				210	# from is_acyclic(). (Should generalize the logic so
				211	# one function could be used from both methods,
				212	# but I couldn't be bothered.)
				213
				214	colour = {}
				215	for i in range(self.num_vertices): colour[i] = WHITE
				216
				217	# Loop over all vertices, taking white ones as starting
				218	# points for a traversal.
				219	for i in range(self.num_vertices):
				220	if colour[i] == WHITE:
				221	# Set this vertex's value, arbitrarily, to zero.
				222	self.set_vertex_value( i, 0 )
				223
				224	# List of vertices to visit
				225	visit_list = [ (None,i) ]
				226
				227	# Do a DFS
				228	while visit_list:
				229	# Colour this vertex grey.
				230	parent, vertex = visit_list[0] ; del visit_list[0]
				231	colour[vertex] = GREY
				232
				233	# Make copy of list of neighbours, removing the vertex
				234	# we arrived here from.
				235	neighbours = self.reachable_list.get(vertex, []) [:]
				236	if parent in neighbours: neighbours.remove( parent )
				237
				238	for neighbour in self.reachable_list.get(vertex, []):
				239	edge_value = self.get_edge_value( vertex, neighbour )
				240	if colour[neighbour] == WHITE:
				241	visit_list.insert(0, (vertex, neighbour) )
				242
				243	# Set new vertex's value to the desired
				244	# edge value, minus the value of the
				245	# vertex we came here from.
				246	new_val = (edge_value -
				247	self.get_vertex_value( vertex ) )
				248	self.set_vertex_value( neighbour,
				249	new_val % self.num_vertices)
				250
				251	colour[vertex] = BLACK
				252
				253	# Returns nothing
				254	return
				255
				256	def __getitem__(self, index):
				257	if index < self.num_vertices: return index
				258	raise IndexError
				259
				260	def get_vertex_value(self, vertex):
				261	"Get value for a vertex"
				262	return self.values[ vertex ]
				263
				264	def set_vertex_value(self, vertex, value):
				265	"Set value for a vertex"
				266	self.values[ vertex ] = value
				267
				268	def generate_code(self, out, width = 70):
				269	"Return nicely formatted table"
				270	out.write("{ ")
				271	pos = 0
				272	for v in self.values:
				273	v=str(v)+', '
				274	out.write(v)
				275	pos = pos + len(v) + 1
				276	if pos > width: out.write('\n '); pos = 0
				277	out.write('};\n')
				278
				279
				280	class PerfectHash:
				281	def __init__(self, cchMax, f1, f2, G, cHashElements, cKeys, maxHashValue):
				282	self.cchMax = cchMax
				283	self.f1 = f1
				284	self.f2 = f2
				285	self.G = G
				286	self.cHashElements = cHashElements
				287	self.cKeys = cKeys
				288	# determine the necessary type for storing our hash function
				289	# helper table:
				290	self.type = self.determineType(maxHashValue)
				291
				292	def generate_header(self, structName):
				293	header = """
Marc-André Lemburg	e3f257e	2000-06-30 09:56:00 +0000	[diff] [blame]	294	#include "Python.h"
Marc-André Lemburg	4e5302a	2000-06-28 16:53:16 +0000	[diff] [blame]	295	#include <stdlib.h>
				296
				297	/* --- C API ----------------------------------------------------*/
				298	/* C API for usage by other Python modules */
				299	typedef struct %(structName)s
				300	{
				301	unsigned long cKeys;
				302	unsigned long cchMax;
				303	unsigned long (hash)(const char key, unsigned int cch);
				304	const void (getValue)(unsigned long iKey);
				305	} %(structName)s;
				306	""" % { "structName" : structName }
				307	return header
				308
				309	def determineType(self, maxHashValue):
				310	if maxHashValue <= 255:
				311	return "unsigned char"
				312	elif maxHashValue <= 65535:
				313	return "unsigned short"
				314	else:
				315	# Take the cheesy way out...
				316	return "unsigned long"
				317
				318	def generate_code(self, moduleName, dataArrayName, dataArrayType, structName):
				319	# Output C code for the hash functions and tables
				320	code = """
				321	/*
				322	* The hash is produced using the algorithm described in
				323	* "Optimal algorithms for minimal perfect hashing",
				324	* G. Havas, B.S. Majewski. Available as a technical report
				325	* from the CS department, University of Queensland
				326	* (ftp://ftp.cs.uq.oz.au/).
				327	*
				328	* Generated using a heavily tweaked version of Andrew Kuchling's
				329	* perfect_hash.py:
				330	* http://starship.python.net/crew/amk/python/code/perfect-hash.html
				331	*
				332	* Generated on: %s
				333	*/
				334	""" % time.ctime(time.time())
				335	# MSVC SP3 was complaining when I actually used a global constant
				336	code = code + """
				337	#define k_cHashElements %i
				338	#define k_cchMaxKey %d
				339	#define k_cKeys %i
				340
				341	""" % (self.cHashElements, self.cchMax, self.cKeys)
				342
				343	code = code + """
				344	static const %s G[k_cHashElements];
				345	static const %s %s[k_cKeys];
				346	""" % (self.type, dataArrayType, dataArrayName)
				347
				348	code = code + """
				349	static long f1(const char *key, unsigned int cch)
				350	"""
				351	code = code + self.f1.generate_code()
				352	code = code + """
				353
				354	static long f2(const char *key, unsigned int cch)
				355	"""
				356	code = code + self.f2.generate_code()
				357	code = code + """
				358
				359	static unsigned long hash(const char *key, unsigned int cch)
				360	{
				361	return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) %% k_cHashElements;
				362	}
				363
				364	const void *getValue(unsigned long iKey)
				365	{
				366	return &%(dataArrayName)s[iKey];
				367	}
				368
				369	/* Helper for adding objects to dictionaries. Check for errors with
				370	PyErr_Occurred() */
				371	static
				372	void insobj(PyObject *dict,
				373	char *name,
				374	PyObject *v)
				375	{
				376	PyDict_SetItemString(dict, name, v);
				377	Py_XDECREF(v);
				378	}
				379
				380	static const %(structName)s hashAPI =
				381	{
				382	k_cKeys,
				383	k_cchMaxKey,
				384	&hash,
				385	&getValue,
				386	};
				387
				388	static
				389	PyMethodDef Module_methods[] =
				390	{
				391	{NULL, NULL},
				392	};
				393
				394	static char *Module_docstring = "%(moduleName)s hash function module";
				395
				396	/* Error reporting for module init functions */
				397
				398	#define Py_ReportModuleInitError(modname) { \\
				399	PyObject exc_type, exc_value, *exc_tb; \\
				400	PyObject str_type, str_value; \\
				401	\\
				402	/* Fetch error objects and convert them to strings */ \\
				403	PyErr_Fetch(&exc_type, &exc_value, &exc_tb); \\
				404	if (exc_type && exc_value) { \\
				405	str_type = PyObject_Str(exc_type); \\
				406	str_value = PyObject_Str(exc_value); \\
				407	} \\
				408	else { \\
				409	str_type = NULL; \\
				410	str_value = NULL; \\
				411	} \\
				412	/* Try to format a more informative error message using the \\
				413	original error */ \\
				414	if (str_type && str_value && \\
				415	PyString_Check(str_type) && PyString_Check(str_value)) \\
				416	PyErr_Format( \\
				417	PyExc_ImportError, \\
				418	"initialization of module "modname" failed " \\
				419	"(%%s:%%s)", \\
				420	PyString_AS_STRING(str_type), \\
				421	PyString_AS_STRING(str_value)); \\
				422	else \\
				423	PyErr_SetString( \\
				424	PyExc_ImportError, \\
				425	"initialization of module "modname" failed"); \\
				426	Py_XDECREF(str_type); \\
				427	Py_XDECREF(str_value); \\
				428	Py_XDECREF(exc_type); \\
				429	Py_XDECREF(exc_value); \\
				430	Py_XDECREF(exc_tb); \\
				431	}
				432
				433
				434	/* Create PyMethodObjects and register them in the module\'s dict */
				435	DL_EXPORT(void)
				436	init%(moduleName)s(void)
				437	{
				438	PyObject module, moddict;
				439	/* Create module */
				440	module = Py_InitModule4("%(moduleName)s", /* Module name */
				441	Module_methods, /* Method list */
				442	Module_docstring, /* Module doc-string */
				443	(PyObject )NULL, / always pass this as self /
				444	PYTHON_API_VERSION); /* API Version */
				445	if (module == NULL)
				446	goto onError;
				447	/* Add some constants to the module\'s dict */
				448	moddict = PyModule_GetDict(module);
				449	if (moddict == NULL)
				450	goto onError;
				451
				452	/* Export C API */
				453	insobj(
				454	moddict,
				455	"%(moduleName)sAPI",
				456	PyCObject_FromVoidPtr((void *)&hashAPI, NULL));
				457
				458	onError:
				459	/* Check for errors and report them */
				460	if (PyErr_Occurred())
				461	Py_ReportModuleInitError("%(moduleName)s");
				462	return;
				463	}
				464	""" % { "moduleName" : moduleName,
				465	"dataArrayName" : dataArrayName,
				466	"structName" : structName, }
				467
				468	return code
				469
				470	def generate_graph(self, out):
				471	out.write("""
				472	static const unsigned short G[] =
				473	""")
				474	self.G.generate_code(out)
				475
				476
				477	def generate_hash(keys, caseInsensitive=0,
				478	minC=None, initC=None,
				479	f1Seed=None, f2Seed=None,
				480	cIncrement=None, cTries=None):
				481	"""Print out code for a perfect minimal hash. Input is a list of
				482	(key, desired hash value) tuples. """
				483
				484	# K is the number of keys.
				485	K = len(keys)
				486
				487	# We will be generating graphs of size N, where N = c * K.
				488	# The larger C is, the fewer trial graphs will need to be made, but
				489	# the resulting table is also larger. Increase this starting value
				490	# if you're impatient. After 50 failures, c will be increased by 0.025.
				491	if initC is None:
				492	initC = 1.5
				493
				494	c = initC
				495	if cIncrement is None:
				496	cIncrement = 0.0025
				497
				498	if cTries is None:
				499	cTries = 50
				500
				501	# Number of trial graphs so far
				502	num_graphs = 0
				503	sys.stderr.write('Generating graphs... ')
				504
				505	while 1:
				506	# N is the number of vertices in the graph G
				507	N = int(c*K)
				508	num_graphs = num_graphs + 1
				509	if (num_graphs % cTries) == 0:
				510	# Enough failures at this multiplier,
				511	# increase the multiplier and keep trying....
				512	c = c + cIncrement
				513
				514	# Whats good with searching for a better
				515	# hash function if we exceed the size
				516	# of a function we've generated in the past....
				517	if minC is not None and \
				518	c > minC:
				519	c = initC
				520	sys.stderr.write(' -- c > minC, resetting c to %0.4f\n' % c)
				521	else:
				522	sys.stderr.write(' -- increasing c to %0.4f\n' % c)
				523	sys.stderr.write('Generating graphs... ')
				524
				525	# Output a progress message
				526	sys.stderr.write( str(num_graphs) + ' ')
				527	sys.stderr.flush()
				528
				529	# Create graph w/ N vertices
				530	G = Graph(N)
				531	# Save the seeds used to generate
				532	# the following two hash functions.
				533	_seeds = whrandom._inst._seed
				534
				535	# Create 2 random hash functions
				536	f1 = Hash(N, caseInsensitive)
				537	f2 = Hash(N, caseInsensitive)
				538
				539	# Set the initial hash function seed values if passed in.
				540	# Doing this protects our hash functions from
				541	# changes to whrandom's behavior.
				542	if f1Seed is not None:
				543	f1.seed = f1Seed
				544	f1Seed = None
				545	fSpecifiedSeeds = 1
				546	if f2Seed is not None:
				547	f2.seed = f2Seed
				548	f2Seed = None
				549	fSpecifiedSeeds = 1
				550
				551	# Connect vertices given by the values of the two hash functions
				552	# for each key. Associate the desired hash value with each
				553	# edge.
				554	for k, v in keys:
				555	h1 = f1(k) ; h2 = f2(k)
				556	G.connect( h1,h2, v)
				557
				558	# Check if the resulting graph is acyclic; if it is,
				559	# we're done with step 1.
				560	if G.is_acyclic():
				561	break
				562	elif fSpecifiedSeeds:
				563	sys.stderr.write('\nThe initial f1/f2 seeds you specified didn\'t generate a perfect hash function: \n')
				564	sys.stderr.write('f1 seed: %s\n' % f1.seed)
				565	sys.stderr.write('f2 seed: %s\n' % f2.seed)
				566	sys.stderr.write('multipler: %s\n' % c)
				567	sys.stderr.write('Your data has likely changed, or you forgot what your initial multiplier should be.\n')
				568	sys.stderr.write('continuing the search for a perfect hash function......\n')
				569	fSpecifiedSeeds = 0
				570
				571	# Now we have an acyclic graph, so we assign values to each vertex
				572	# such that, for each edge, you can add the values for the two vertices
				573	# involved and get the desired value for that edge -- which is the
				574	# desired hash key. This task is dead easy, because the graph is acyclic.
				575	sys.stderr.write('\nAcyclic graph found; computing vertex values...\n')
				576	G.assign_values()
				577
				578	sys.stderr.write('Checking uniqueness of hash values...\n')
				579
				580	# Sanity check the result by actually verifying that all the keys
				581	# hash to the right value.
				582	cchMaxKey = 0
				583	maxHashValue = 0
				584
				585	for k, v in keys:
				586	hash1 = G.values[ f1(k) ]
				587	hash2 = G.values[ f2(k) ]
				588	if hash1 > maxHashValue:
				589	maxHashValue = hash1
				590	if hash2 > maxHashValue:
				591	maxHashValue = hash2
				592	perfecthash = (hash1 + hash2) % N
				593	assert perfecthash == v
				594	cch = len(k)
				595	if cch > cchMaxKey:
				596	cchMaxKey = cch
				597
				598	sys.stderr.write('Found perfect hash function!\n')
				599	sys.stderr.write('\nIn order to regenerate this hash function, \n')
				600	sys.stderr.write('you need to pass these following values back in:\n')
				601	sys.stderr.write('f1 seed: %s\n' % repr(f1.seed))
				602	sys.stderr.write('f2 seed: %s\n' % repr(f2.seed))
				603	sys.stderr.write('initial multipler: %s\n' % c)
				604
				605	return PerfectHash(cchMaxKey, f1, f2, G, N, len(keys), maxHashValue)