#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.0 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl expand first/last ranges
# 2001-01-19 fl added character name tables (2.1)
# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
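#
# usage (assumed invocation; the output paths above are relative to the
# working directory, so run from the top of the Python source tree with
# UnicodeData-Latest.txt available):
#
#     python makeunicodedata.py
#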

import sys

SCRIPT = sys.argv[0]
VERSION = "2.1"

UNICODE_DATA = "UnicodeData-Latest.txt"

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
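
# the masks above are or'ed together into the flags field of each
# character type record (see makeunicodetype below); e.g. an uppercase
# letter is flagged ALPHA_MASK | UPPER_MASK == 0x81, and a lowercase
# letter ALPHA_MASK | LOWER_MASK == 0x09 (values derived from the
# definitions above).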

def maketables(trace=0):

    print "--- Reading", UNICODE_DATA, "..."

    unicode = UnicodeData(UNICODE_DATA)

    print len(filter(None, unicode.table)), "characters"

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            item = (
                category, combining, bidirectional, mirrored
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = string.split(record[5])
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i
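
    # layout note: decomp_data is one flat array; each decomposition
    # starts with a header word whose low byte indexes decomp_prefix and
    # whose high byte holds the number of code points that follow (e.g.
    # 0 + (2<<8) == 0x200 for a two-code-point canonical decomposition
    # with the empty prefix).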

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)
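
    # index1/index2 follow the splitbins() convention: the record for
    # character c is meant to be looked up (on the C side, outside this
    # script) as
    #
    #     index2[(index1[c >> SHIFT] << SHIFT) + (c & ((1 << SHIFT) - 1))]
    #
    # which gives an offset into _PyUnicode_Database_Records.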

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
            if record[12]:
                upper = (int(record[12], 16) - char) & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = (int(record[13], 16) - char) & 0xffff
            else:
                lower = 0
            if record[14]:
                title = (int(record[14], 16) - char) & 0xffff
            else:
                title = 0
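            # (e.g. a lowercase letter at 0x61 whose uppercase mapping is
            # 0x41 stores (0x41 - 0x61) & 0xffff == 0xffe0; adding the
            # delta back modulo 0x10000 recovers 0x41.  the decoding is
            # assumed to happen in the consuming C code, not here.)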
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                flags, upper, lower, title, decimal, digit
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split the character type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we differentiate
    # between words inside a name and words ending a name; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]
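
    # words now maps each word to a list whose length is its frequency
    # (the initial element is just a placeholder id); these counts drive
    # the frequency sort below.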

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency
    wordlist.sort(lambda a, b: len(b[1])-len(a[1]))

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = string.find(lexicon, ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)
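
    # (e.g. the word "SIGN" inside a name is stored as "SIG" plus
    # chr(ord("N") + 128); a word that ends a name still carries the
    # trailing NUL added above, so its last stored byte is chr(128).)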

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk
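#
# each table entry is the raw semicolon-separated record for one code
# point; the fields this script uses are (by index): 1 name, 2 general
# category, 3 combining class, 4 bidirectional category, 5 decomposition,
# 6 decimal digit value, 7 digit value, 9 mirrored flag, and 12-14 the
# uppercase/lowercase/titlecase mappings.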

import string, sys

class UnicodeData:

    def __init__(self, filename, expand=1):
        file = open(filename)
        table = [None] * 65536
        while 1:
            s = file.readline()
            if not s:
                break
            s = string.split(string.strip(s), ";")
            char = string.atoi(s[0], 16)
            table[char] = s

        # expand first-last ranges (ignore surrogates and private use)
        if expand:
            field = None
            for i in range(0, 0xD800):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s[:]
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    field[0] = hex(i)
                    table[i] = field

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, string.upper(s)):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
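
# each SIZES entry pairs a power-of-two table size with a polynomial;
# Hash below picks the first size larger than the data set and uses the
# polynomial to fold the probe increment back into range when it grows
# past the table mask (see the collision loop in Hash.__init__).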

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if string.strip(s):
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
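
# typical use, as in the make* functions above:
#
#     index1, index2, shift = splitbins(index, trace)
#
# the two halves are then dumped as C arrays and recombined at lookup
# time using the formula from the docstring.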

if __name__ == "__main__":
    maketables(1)