Blame - Tools/unicode/makeunicodedata.py - platform/external/python/cpython2

blob: 3a362ec7f02174314e10434d5bf0df034a141f4f [file] [log] [blame]

Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	1	#
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	2	# (re)generate unicode property and type databases
				3	#
				4	# this script converts a unicode 3.0 database file to
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	5	# Modules/unicodedata_db.h, Modules/unicodename_db.h,
				6	# and Objects/unicodetype_db.h
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	7	#
				8	# history:
				9	# 2000-09-24 fl created (based on bits and pieces from unidb)
				10	# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	11	# 2000-09-25 fl added character type table
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	12	# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	13	# 2000-11-03 fl expand first/last ranges
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	14	# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	15	#
				16	# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	17	#
				18
				19	import sys
				20
				21	SCRIPT = sys.argv[0]
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	22	VERSION = "2.1"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	23
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	24	UNICODE_DATA = "UnicodeData-Latest.txt"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	25
				26	CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
				27	"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
				28	"Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
				29	"So" ]
				30
				31	BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
				32	"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
				33	"ON" ]
				34
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	35	# note: should match definitions in Objects/unicodectype.c
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	36	ALPHA_MASK = 0x01
				37	DECIMAL_MASK = 0x02
				38	DIGIT_MASK = 0x04
				39	LOWER_MASK = 0x08
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	40	LINEBREAK_MASK = 0x10
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	41	SPACE_MASK = 0x20
				42	TITLE_MASK = 0x40
				43	UPPER_MASK = 0x80
				44
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	45	def maketables(trace=0):
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	46
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	47	print "--- Reading", UNICODE_DATA, "..."
				48
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	49	unicode = UnicodeData(UNICODE_DATA)
				50
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	51	print len(filter(None, unicode.table)), "characters"
				52
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	53	makeunicodedata(unicode, trace)
				54	makeunicodetype(unicode, trace)
				55	makeunicodename(unicode, trace)
				56
				57	# --------------------------------------------------------------------
				58	# unicode character properties
				59
				60	def makeunicodedata(unicode, trace):
				61
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	62	dummy = (0, 0, 0, 0)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	63	table = [dummy]
				64	cache = {0: dummy}
				65	index = [0] * len(unicode.chars)
				66
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	67	FILE = "Modules/unicodedata_db.h"
				68
				69	print "--- Preparing", FILE, "..."
				70
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	71	# 1) database properties
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	72
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	73	for char in unicode.chars:
				74	record = unicode.table[char]
				75	if record:
				76	# extract database properties
				77	category = CATEGORY_NAMES.index(record[2])
				78	combining = int(record[3])
				79	bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
				80	mirrored = record[9] == "Y"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	81	item = (
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	82	category, combining, bidirectional, mirrored
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	83	)
				84	# add entry to index and item tables
				85	i = cache.get(item)
				86	if i is None:
				87	cache[item] = i = len(table)
				88	table.append(item)
				89	index[char] = i
				90
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	91	# 2) decomposition data
				92
				93	# FIXME: <fl> using the encoding stuff from unidb would save
				94	# another 50k or so, but I'll leave that for 2.1...
				95
				96	decomp_data = [""]
				97	decomp_index = [0] * len(unicode.chars)
				98
				99	for char in unicode.chars:
				100	record = unicode.table[char]
				101	if record:
				102	if record[5]:
				103	try:
				104	i = decomp_data.index(record[5])
				105	except ValueError:
				106	i = len(decomp_data)
				107	decomp_data.append(record[5])
				108	else:
				109	i = 0
				110	decomp_index[char] = i
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	111
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	112	print len(table), "unique properties"
				113	print len(decomp_data), "unique decomposition entries"
				114
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	115	print "--- Writing", FILE, "..."
				116
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	117	fp = open(FILE, "w")
				118	print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
				119	print >>fp
				120	print >>fp, "/* a list of unique database records */"
				121	print >>fp, \
				122	"const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	123	for item in table:
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	124	print >>fp, " {%d, %d, %d, %d}," % item
				125	print >>fp, "};"
				126	print >>fp
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	127
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	128	# FIXME: <fl> the following tables could be made static, and
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	129	# the support code moved into unicodedatabase.c
				130
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	131	print >>fp, "/* string literals */"
				132	print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	133	for name in CATEGORY_NAMES:
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	134	print >>fp, " \"%s\"," % name
				135	print >>fp, " NULL"
				136	print >>fp, "};"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	137
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	138	print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	139	for name in BIDIRECTIONAL_NAMES:
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	140	print >>fp, " \"%s\"," % name
				141	print >>fp, " NULL"
				142	print >>fp, "};"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	143
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	144	print >>fp, "static const char *decomp_data[] = {"
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	145	for name in decomp_data:
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	146	print >>fp, " \"%s\"," % name
				147	print >>fp, " NULL"
				148	print >>fp, "};"
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	149
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	150	# split record index table
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	151	index1, index2, shift = splitbins(index, trace)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	152
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	153	print >>fp, "/* index tables for the database records */"
				154	print >>fp, "#define SHIFT", shift
				155	Array("index1", index1).dump(fp)
				156	Array("index2", index2).dump(fp)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	157
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	158	# split decomposition index table
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	159	index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	160
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	161	print >>fp, "/* index tables for the decomposition data */"
				162	print >>fp, "#define DECOMP_SHIFT", shift
				163	Array("decomp_index1", index1).dump(fp)
				164	Array("decomp_index2", index2).dump(fp)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	165
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	166	fp.close()
				167
				168	# --------------------------------------------------------------------
				169	# unicode character type tables
				170
				171	def makeunicodetype(unicode, trace):
				172
				173	FILE = "Objects/unicodetype_db.h"
				174
				175	print "--- Preparing", FILE, "..."
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	176
				177	# extract unicode types
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	178	dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	179	table = [dummy]
				180	cache = {0: dummy}
				181	index = [0] * len(unicode.chars)
				182
				183	for char in unicode.chars:
				184	record = unicode.table[char]
				185	if record:
				186	# extract database properties
				187	category = record[2]
				188	bidirectional = record[4]
				189	flags = 0
				190	if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
				191	flags \|= ALPHA_MASK
				192	if category == "Ll":
				193	flags \|= LOWER_MASK
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	194	if category == "Zl" or bidirectional == "B":
				195	flags \|= LINEBREAK_MASK
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	196	if category == "Zs" or bidirectional in ("WS", "B", "S"):
				197	flags \|= SPACE_MASK
Fredrik Lundh	375732c	2000-09-25 23:03:34 +0000	[diff] [blame]	198	if category == "Lt":
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	199	flags \|= TITLE_MASK
				200	if category == "Lu":
				201	flags \|= UPPER_MASK
				202	# use delta predictor for upper/lower/title
				203	if record[12]:
				204	upper = (int(record[12], 16) - char) & 0xffff
				205	else:
				206	upper = 0
				207	if record[13]:
				208	lower = (int(record[13], 16) - char) & 0xffff
				209	else:
				210	lower = 0
				211	if record[14]:
				212	title = (int(record[14], 16) - char) & 0xffff
				213	else:
				214	title = 0
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	215	# decimal digit, integer digit
				216	decimal = 0
				217	if record[6]:
				218	flags \|= DECIMAL_MASK
				219	decimal = int(record[6])
				220	digit = 0
				221	if record[7]:
				222	flags \|= DIGIT_MASK
				223	digit = int(record[7])
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	224	item = (
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	225	flags, upper, lower, title, decimal, digit
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	226	)
				227	# add entry to index and item tables
				228	i = cache.get(item)
				229	if i is None:
				230	cache[item] = i = len(table)
				231	table.append(item)
				232	index[char] = i
				233
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	234	print len(table), "unique character type entries"
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	235
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	236	print "--- Writing", FILE, "..."
				237
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	238	fp = open(FILE, "w")
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	239	print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
				240	print >>fp
				241	print >>fp, "/* a list of unique character type descriptors */"
				242	print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	243	for item in table:
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	244	print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
				245	print >>fp, "};"
				246	print >>fp
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	247
				248	# split decomposition index table
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	249	index1, index2, shift = splitbins(index, trace)
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	250
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	251	print >>fp, "/* type indexes */"
				252	print >>fp, "#define SHIFT", shift
				253	Array("index1", index1).dump(fp)
				254	Array("index2", index2).dump(fp)
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	255
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	256	fp.close()
				257
				258	# --------------------------------------------------------------------
				259	# unicode name database
				260
				261	def makeunicodename(unicode, trace):
				262
				263	FILE = "Modules/unicodename_db.h"
				264
				265	print "--- Preparing", FILE, "..."
				266
				267	# collect names
				268	names = [None] * len(unicode.chars)
				269
				270	for char in unicode.chars:
				271	record = unicode.table[char]
				272	if record:
				273	name = record[1].strip()
				274	if name and name[0] != "<":
				275	names[char] = name + chr(0)
				276
				277	print len(filter(lambda n: n is not None, names)), "distinct names"
				278
				279	# collect unique words from names (note that we differ between
				280	# words inside a sentence, and words ending a sentence. the
				281	# latter includes the trailing null byte.
				282
				283	words = {}
				284	n = b = 0
				285	for char in unicode.chars:
				286	name = names[char]
				287	if name:
				288	w = name.split()
				289	b = b + len(name)
				290	n = n + len(w)
				291	for w in w:
				292	l = words.get(w)
				293	if l:
				294	l.append(None)
				295	else:
				296	words[w] = [len(words)]
				297
				298	print n, "words in text;", b, "bytes"
				299
				300	wordlist = words.items()
				301
				302	# sort on falling frequency
				303	wordlist.sort(lambda a, b: len(b[1])-len(a[1]))
				304
				305	# statistics
				306	n = 0
				307	for i in range(128):
				308	n = n + len(wordlist[i][1])
				309	print n, "short words (7-bit indices)"
				310
				311	# pick the 128 most commonly used words, and sort the rest on
				312	# falling length (to maximize overlap)
				313
				314	wordlist, wordtail = wordlist[:128], wordlist[128:]
				315	wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
				316	wordlist.extend(wordtail)
				317
				318	# generate lexicon from words
				319
				320	lexicon_offset = [0]
				321	lexicon = ""
				322	words = {}
				323
				324	# build a lexicon string
				325	offset = 0
				326	for w, x in wordlist:
				327	# encoding: bit 7 indicates last character in word (chr(128)
				328	# indicates the last character in an entire string)
				329	ww = w[:-1] + chr(ord(w[-1])+128)
				330	# reuse string tails, when possible
				331	o = string.find(lexicon, ww)
				332	if o < 0:
				333	o = offset
				334	lexicon = lexicon + ww
				335	offset = offset + len(w)
				336	words[w] = len(lexicon_offset)
				337	lexicon_offset.append(offset)
				338
				339	print len(words), "words in lexicon;", len(lexicon), "bytes"
				340
				341	assert len(words) < 32768 # 15-bit word indices
				342
				343	lexicon = map(ord, lexicon)
				344
				345	# generate phrasebook from names and lexicon
				346	phrasebook = [0]
				347	phrasebook_offset = [0] * len(unicode.chars)
				348	for char in unicode.chars:
				349	name = names[char]
				350	if name:
				351	w = name.split()
				352	phrasebook_offset[char] = len(phrasebook)
				353	for w in w:
				354	i = words[w]
				355	if i < 128:
				356	phrasebook.append(128+i)
				357	else:
				358	phrasebook.append(i>>8)
				359	phrasebook.append(i&255)
				360
				361	#
				362	# unicode name hash table
				363
				364	# extract names
				365	data = []
				366	for char in unicode.chars:
				367	record = unicode.table[char]
				368	if record:
				369	name = record[1].strip()
				370	if name and name[0] != "<":
				371	data.append((name, char))
				372
				373	# the magic number 47 was chosen to minimize the number of
				374	# collisions on the current data set. if you like, change it
				375	# and see what happens...
				376
				377	codehash = Hash("code", data, 47)
				378
				379	print "--- Writing", FILE, "..."
				380
				381	fp = open(FILE, "w")
				382	print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
				383	print >>fp
				384	print >>fp, "#define NAME_MAXLEN", 256
				385	print >>fp
				386	print >>fp, "/* lexicon */"
				387	Array("lexicon", lexicon).dump(fp)
				388	Array("lexicon_offset", lexicon_offset).dump(fp)
				389
				390	# split decomposition index table
				391	offset1, offset2, shift = splitbins(phrasebook_offset, trace)
				392
				393	print >>fp, "/* code->name phrasebook */"
				394	print >>fp, "#define phrasebook_shift", shift
				395
				396	Array("phrasebook", phrasebook).dump(fp)
				397	Array("phrasebook_offset1", offset1).dump(fp)
				398	Array("phrasebook_offset2", offset2).dump(fp)
				399
				400	print >>fp, "/* name->code dictionary */"
				401	codehash.dump(fp)
				402
				403	fp.close()
				404
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	405	# --------------------------------------------------------------------
				406	# the following support code is taken from the unidb utilities
				407	# Copyright (c) 1999-2000 by Secret Labs AB
				408
				409	# load a unicode-data file from disk
				410
				411	import string, sys
				412
				413	class UnicodeData:
				414
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	415	def __init__(self, filename, expand=1):
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	416	file = open(filename)
				417	table = [None] * 65536
				418	while 1:
				419	s = file.readline()
				420	if not s:
				421	break
				422	s = string.split(string.strip(s), ";")
				423	char = string.atoi(s[0], 16)
				424	table[char] = s
				425
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	426	# expand first-last ranges (ignore surrogates and private use)
				427	if expand:
				428	field = None
				429	for i in range(0, 0xD800):
				430	s = table[i]
				431	if s:
				432	if s[1][-6:] == "First>":
				433	s[1] = ""
				434	field = s[:]
				435	elif s[1][-5:] == "Last>":
				436	s[1] = ""
				437	field = None
				438	elif field:
				439	field[0] = hex(i)
				440	table[i] = field
				441
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	442	# public attributes
				443	self.filename = filename
				444	self.table = table
				445	self.chars = range(65536) # unicode
				446
				447	def uselatin1(self):
				448	# restrict character range to ISO Latin 1
				449	self.chars = range(256)
				450
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	451	# hash table tools
				452
				453	# this is a straight-forward reimplementation of Python's built-in
				454	# dictionary type, using a static data structure, and a custom string
				455	# hash algorithm.
				456
				457	def myhash(s, magic):
				458	h = 0
				459	for c in map(ord, string.upper(s)):
				460	h = (h * magic) + c
				461	ix = h & 0xff000000
				462	if ix:
				463	h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
				464	return h
				465
				466	SIZES = [
				467	(4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
				468	(1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
				469	(65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
				470	(2097152,5), (4194304,3), (8388608,33), (16777216,27)
				471	]
				472
				473	class Hash:
				474	def __init__(self, name, data, magic):
				475	# turn a (key, value) list into a static hash table structure
				476
				477	# determine table size
				478	for size, poly in SIZES:
				479	if size > len(data):
				480	poly = size + poly
				481	break
				482	else:
				483	raise AssertionError, "ran out of polynominals"
				484
				485	print size, "slots in hash table"
				486
				487	table = [None] * size
				488
				489	mask = size-1
				490
				491	n = 0
				492
				493	hash = myhash
				494
				495	# initialize hash table
				496	for key, value in data:
				497	h = hash(key, magic)
				498	i = (~h) & mask
				499	v = table[i]
				500	if v is None:
				501	table[i] = value
				502	continue
				503	incr = (h ^ (h >> 3)) & mask;
				504	if not incr:
				505	incr = mask
				506	while 1:
				507	n = n + 1
				508	i = (i + incr) & mask
				509	v = table[i]
				510	if v is None:
				511	table[i] = value
				512	break
				513	incr = incr << 1
				514	if incr > mask:
				515	incr = incr ^ poly
				516
				517	print n, "collisions"
				518	self.collisions = n
				519
				520	for i in range(len(table)):
				521	if table[i] is None:
				522	table[i] = 0
				523
				524	self.data = Array(name + "_hash", table)
				525	self.magic = magic
				526	self.name = name
				527	self.size = size
				528	self.poly = poly
				529
				530	def dump(self, file):
				531	# write data to file, as a C array
				532	self.data.dump(file)
				533	file.write("#define %s_magic %d\n" % (self.name, self.magic))
				534	file.write("#define %s_size %d\n" % (self.name, self.size))
				535	file.write("#define %s_poly %d\n" % (self.name, self.poly))
				536
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	537	# stuff to deal with arrays of unsigned integers
				538
				539	class Array:
				540
				541	def __init__(self, name, data):
				542	self.name = name
				543	self.data = data
				544
				545	def dump(self, file):
				546	# write data to file, as a C array
				547	size = getsize(self.data)
				548	# print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
				549	file.write("static ")
				550	if size == 1:
				551	file.write("unsigned char")
				552	elif size == 2:
				553	file.write("unsigned short")
				554	else:
				555	file.write("unsigned int")
				556	file.write(" " + self.name + "[] = {\n")
				557	if self.data:
				558	s = " "
				559	for item in self.data:
				560	i = str(item) + ", "
				561	if len(s) + len(i) > 78:
				562	file.write(s + "\n")
				563	s = " " + i
				564	else:
				565	s = s + i
				566	if string.strip(s):
				567	file.write(s + "\n")
				568	file.write("};\n\n")
				569
				570	def getsize(data):
				571	# return smallest possible integer size for the given array
				572	maxdata = max(data)
				573	if maxdata < 256:
				574	return 1
				575	elif maxdata < 65536:
				576	return 2
				577	else:
				578	return 4
				579
Tim Peters	2101348	2000-09-25 07:13:41 +0000	[diff] [blame]	580	def splitbins(t, trace=0):
				581	"""t, trace=0 -> (t1, t2, shift). Split a table to save space.
				582
				583	t is a sequence of ints. This function can be useful to save space if
				584	many of the ints are the same. t1 and t2 are lists of ints, and shift
				585	is an int, chosen to minimize the combined size of t1 and t2 (in C
				586	code), and where for each i in range(len(t)),
				587	t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
				588	where mask is a bitmask isolating the last "shift" bits.
				589
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	590	If optional arg trace is non-zero (default zero), progress info
				591	is printed to sys.stderr. The higher the value, the more info
				592	you'll get.
Tim Peters	2101348	2000-09-25 07:13:41 +0000	[diff] [blame]	593	"""
				594
				595	import sys
				596	if trace:
				597	def dump(t1, t2, shift, bytes):
				598	print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
				599	len(t1), len(t2), shift, bytes)
				600	print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
				601	"bytes"
				602	n = len(t)-1 # last valid index
				603	maxshift = 0 # the most we can shift n and still have something left
				604	if n > 0:
				605	while n >> 1:
				606	n >>= 1
				607	maxshift += 1
				608	del n
				609	bytes = sys.maxint # smallest total size so far
				610	t = tuple(t) # so slices can be dict keys
				611	for shift in range(maxshift + 1):
				612	t1 = []
				613	t2 = []
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	614	size = 2**shift
				615	bincache = {}
Tim Peters	2101348	2000-09-25 07:13:41 +0000	[diff] [blame]	616	for i in range(0, len(t), size):
				617	bin = t[i:i+size]
				618	index = bincache.get(bin)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	619	if index is None:
Tim Peters	2101348	2000-09-25 07:13:41 +0000	[diff] [blame]	620	index = len(t2)
				621	bincache[bin] = index
				622	t2.extend(bin)
				623	t1.append(index >> shift)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	624	# determine memory size
Tim Peters	2101348	2000-09-25 07:13:41 +0000	[diff] [blame]	625	b = len(t1)getsize(t1) + len(t2)getsize(t2)
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	626	if trace > 1:
Tim Peters	2101348	2000-09-25 07:13:41 +0000	[diff] [blame]	627	dump(t1, t2, shift, b)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	628	if b < bytes:
Tim Peters	2101348	2000-09-25 07:13:41 +0000	[diff] [blame]	629	best = t1, t2, shift
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	630	bytes = b
Tim Peters	2101348	2000-09-25 07:13:41 +0000	[diff] [blame]	631	t1, t2, shift = best
				632	if trace:
				633	print >>sys.stderr, "Best:",
				634	dump(t1, t2, shift, bytes)
				635	if __debug__:
				636	# exhaustively verify that the decomposition is correct
				637	mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
				638	for i in xrange(len(t)):
				639	assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
				640	return best
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	641
				642	if __name__ == "__main__":
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	643	maketables(1)