blob: 7186780f81e157ab62951e0ae007bc4e92cd0d05 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl    created (based on bits and pieces from unidb)
# 2000-09-25 fl    merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl    added character type table
# 2000-09-26 fl    added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl    expand first/last ranges
# 2001-01-19 fl    added character name tables (2.1)
# 2001-01-21 fl    added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd    use string methods
# 2002-10-18 mvl   update to Unicode 3.2
# 2002-10-22 mvl   generate NFC tables
# 2002-11-24 mvl   expand all ranges, sort names version-independently
# 2002-11-25 mvl   add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
25
26import sys
27
SCRIPT = sys.argv[0]
VERSION = "2.3"

# version of the Unicode Character Database this script expects to read
UNIDATA_VERSION = "3.2.0"
UNICODE_DATA = "UnicodeData.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
EASTASIAN_WIDTH = "EastAsianWidth.txt"

# General-category abbreviations; a category's index in this list is the
# numeric value stored in the generated database records.
# NOTE(review): "Cn" appears at index 0 *and* index 17 -- presumably to
# match the numbering expected by the C side; confirm before changing.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class abbreviations, encoded the same way ("" = none)
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East Asian width classes, encoded the same way
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# character-type flag bits for the unicodetype records
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
57
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000058def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000059
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000060 print "--- Reading", UNICODE_DATA, "..."
61
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000062 unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS,
63 EASTASIAN_WIDTH)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000064
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000065 print len(filter(None, unicode.table)), "characters"
66
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000067 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000068 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000069 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000070
71# --------------------------------------------------------------------
72# unicode character properties
73
def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h for the unicodedata C module:
    unique property records, compressed decomposition data, and the
    NFC composition-pair tables.

    unicode is a UnicodeData instance; trace is forwarded to splitbins
    and Array.dump.  Writes the header file as a side effect and prints
    progress/statistics to stdout.
    """

    # record layout:
    # (category, combining, bidirectional, mirrored, east-asian-width);
    # entry 0 is the shared record for unassigned code points
    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables; identical records are
            # shared through the cache
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # prefix: the "<compat>"-style tag, if any
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs the prefix index (low byte)
                # and the decomposition length (high bits), followed by
                # the decomposed code points
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions whose first character has combining
                # class 0, minus the composition exclusions
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data holds ints while decomp is a
                # list, so this lookup presumably always raises ValueError
                # and identical decompositions are never shared -- only
                # wastes table space; verify before "fixing".
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # assign dense indexes to the NFC first/last characters, remembering
    # the contiguous code point ranges they occupy (for the C reindex
    # tables below)
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # flat total_first x total_last matrix of composed characters
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    fp.close()
282
283# --------------------------------------------------------------------
284# unicode character type tables
285
286def makeunicodetype(unicode, trace):
287
288 FILE = "Objects/unicodetype_db.h"
289
290 print "--- Preparing", FILE, "..."
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000291
292 # extract unicode types
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000293 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000294 table = [dummy]
295 cache = {0: dummy}
296 index = [0] * len(unicode.chars)
297
298 for char in unicode.chars:
299 record = unicode.table[char]
300 if record:
301 # extract database properties
302 category = record[2]
303 bidirectional = record[4]
304 flags = 0
305 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
306 flags |= ALPHA_MASK
307 if category == "Ll":
308 flags |= LOWER_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000309 if category == "Zl" or bidirectional == "B":
310 flags |= LINEBREAK_MASK
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000311 if category == "Zs" or bidirectional in ("WS", "B", "S"):
312 flags |= SPACE_MASK
Fredrik Lundh375732c2000-09-25 23:03:34 +0000313 if category == "Lt":
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000314 flags |= TITLE_MASK
315 if category == "Lu":
316 flags |= UPPER_MASK
317 # use delta predictor for upper/lower/title
318 if record[12]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000319 upper = int(record[12], 16) - char
320 assert -32768 <= upper <= 32767
321 upper = upper & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000322 else:
323 upper = 0
324 if record[13]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000325 lower = int(record[13], 16) - char
326 assert -32768 <= lower <= 32767
327 lower = lower & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000328 else:
329 lower = 0
330 if record[14]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000331 title = int(record[14], 16) - char
332 assert -32768 <= lower <= 32767
333 title = title & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000334 else:
335 title = 0
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000336 # decimal digit, integer digit
337 decimal = 0
338 if record[6]:
339 flags |= DECIMAL_MASK
340 decimal = int(record[6])
341 digit = 0
342 if record[7]:
343 flags |= DIGIT_MASK
344 digit = int(record[7])
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000345 item = (
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000346 upper, lower, title, decimal, digit, flags
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000347 )
348 # add entry to index and item tables
349 i = cache.get(item)
350 if i is None:
351 cache[item] = i = len(table)
352 table.append(item)
353 index[char] = i
354
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000355 print len(table), "unique character type entries"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000356
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000357 print "--- Writing", FILE, "..."
358
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000359 fp = open(FILE, "w")
Fred Drake9c685052000-10-26 03:56:46 +0000360 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
361 print >>fp
362 print >>fp, "/* a list of unique character type descriptors */"
363 print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000364 for item in table:
Fred Drake9c685052000-10-26 03:56:46 +0000365 print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
366 print >>fp, "};"
367 print >>fp
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000368
369 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000370 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000371
Fred Drake9c685052000-10-26 03:56:46 +0000372 print >>fp, "/* type indexes */"
373 print >>fp, "#define SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000374 Array("index1", index1).dump(fp, trace)
375 Array("index2", index2).dump(fp, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000376
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000377 fp.close()
378
379# --------------------------------------------------------------------
380# unicode name database
381
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h: the compressed code->name
    phrasebook/lexicon and the static name->code hash table.

    unicode is a UnicodeData instance; trace is forwarded to splitbins,
    Array.dump and Hash.dump.  Writes the header file as a side effect
    and prints statistics to stdout.
    """

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names; "<...>" placeholder names are skipped, and a NUL
    # byte marks the end of each stored name
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names.  we distinguish words inside a
    # name from words ending a name: the latter keep the trailing NUL
    # byte, so they get separate lexicon entries.

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                # one list element per occurrence: the list length is
                # the word's frequency (the stored value is unused)
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon: word indexes below
    # "short" take one byte, the rest use a two-byte escape sequence
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
542
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000543# --------------------------------------------------------------------
544# the following support code is taken from the unidb utilities
545# Copyright (c) 1999-2000 by Secret Labs AB
546
547# load a unicode-data file from disk
548
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000549import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000550
class UnicodeData:
    """In-memory view of the Unicode Character Database text files.

    Public attributes:
        filename   -- path of the UnicodeData.txt file that was read
        table      -- list indexed by code point; each entry is the
                      list of semicolon-separated UnicodeData.txt
                      fields with the East Asian width appended as
                      field 15, or None for unassigned code points
        chars      -- the code point range covered (0..0x10FFFF)
        exclusions -- dict keyed by the code points listed in
                      CompositionExclusions.txt
    """

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s
        file.close()

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s[:]
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    # BUGFIX: give every code point in the range its own
                    # copy of the record.  The old code stored the *same*
                    # list object for the whole range, so the East Asian
                    # width append below mutated one shared list (field
                    # 15 wrong for every member but the first) and the
                    # code point field kept being overwritten.
                    f2 = field[:]
                    f2[0] = hex(i)
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # composition exclusions: one code point per non-comment line
        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1
        file.close()

        # east asian width: "XXXX;W" or "XXXX..YYYY;W" per non-comment
        # line; append the width class as field 15 of every record
        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
618
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000619# hash table tools
620
621# this is a straight-forward reimplementation of Python's built-in
622# dictionary type, using a static data structure, and a custom string
623# hash algorithm.
624
def myhash(s, magic):
    # Case-insensitive multiplicative string hash: fold each character
    # of the upper-cased name into the accumulator, then mix any
    # overflow from the top byte back into the low 24 bits so the
    # result stays bounded.
    h = 0
    for c in s.upper():
        h = h * magic + ord(c)
        top = h & 0xff000000
        if top:
            h = (h ^ ((top >> 24) & 0xff)) & 0x00ffffff
    return h
633
# Candidate hash-table sizes (powers of two), each paired with a small
# constant; Hash.__init__ picks the first size strictly larger than the
# data set and uses size + constant as the "poly" value that perturbs
# the collision probe sequence.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
640
class Hash:
    """Static open-addressing hash table, emitted as a C array.

    Built from a list of (key, value) pairs; keys are strings hashed
    with myhash(key, magic).  Collisions are resolved by double
    hashing: the probe increment is derived from the hash value and,
    when it overflows the table mask, stepped with the table's poly
    constant.  The C lookup code must mirror this exact probe sequence.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: the smallest power of two in SIZES that
        # is strictly larger than the data set
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynominals"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0   # collision counter

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # secondary probe increment, guaranteed non-zero
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                # vary the increment; fold overflow back in with poly
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        # empty slots are emitted as 0 in the C table
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write the table plus the constants the C lookup code needs
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
704
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000705# stuff to deal with arrays of unsigned integers
706
class Array:
    """A named sequence of unsigned integers, dumpable as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array; the element type is the
        # smallest unsigned type that fits the largest value, and lines
        # are wrapped before column 78
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")
738
def getsize(data):
    """Return the smallest unsigned C integer width, in bytes (1, 2 or
    4), able to hold every value of *data*."""
    biggest = max(data)
    if biggest < 256:
        return 1
    if biggest < 65536:
        return 2
    return 4
748
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    # try every chunk size 2**shift and keep the cheapest decomposition
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}   # chunk -> index of its first copy in t2
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)   # t1 stores chunk numbers
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000810
if __name__ == "__main__":
    # regenerate all tables; trace level 1 prints splitbins statistics
    maketables(1)