Blame - Tools/unicode/makeunicodedata.py - platform/external/python/cpython3

blob: 3b2fd1152d70e661427b599afc93dc5af470e365 [file] [log] [blame]

Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	1	#
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	2	# (re)generate unicode property and type databases
				3	#
				4	# this script converts a unicode 3.0 database file to
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	5	# Modules/unicodedata_db.h, Modules/unicodename_db.h,
				6	# and Objects/unicodetype_db.h
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	7	#
				8	# history:
				9	# 2000-09-24 fl created (based on bits and pieces from unidb)
				10	# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	11	# 2000-09-25 fl added character type table
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	12	# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	13	# 2000-11-03 fl expand first/last ranges
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	14	# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	15	# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	16	#
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	17	# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	18	#
				19
				20	import sys
				21
				22	SCRIPT = sys.argv[0]
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	23	VERSION = "2.1"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	24
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	25	UNICODE_DATA = "UnicodeData-Latest.txt"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	26
				27	CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
				28	"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
				29	"Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
				30	"So" ]
				31
				32	BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
				33	"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
				34	"ON" ]
				35
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	36	# note: should match definitions in Objects/unicodectype.c
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	37	ALPHA_MASK = 0x01
				38	DECIMAL_MASK = 0x02
				39	DIGIT_MASK = 0x04
				40	LOWER_MASK = 0x08
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	41	LINEBREAK_MASK = 0x10
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	42	SPACE_MASK = 0x20
				43	TITLE_MASK = 0x40
				44	UPPER_MASK = 0x80
				45
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	46	def maketables(trace=0):
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	47
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	48	print "--- Reading", UNICODE_DATA, "..."
				49
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	50	unicode = UnicodeData(UNICODE_DATA)
				51
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	52	print len(filter(None, unicode.table)), "characters"
				53
Fredrik Lundh	b2dfd73	2001-01-21 23:31:52 +0000	[diff] [blame]	54	makeunicodename(unicode, trace)
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	55	makeunicodedata(unicode, trace)
Fredrik Lundh	b2dfd73	2001-01-21 23:31:52 +0000	[diff] [blame]	56	makeunicodetype(unicode, trace)
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	57
				58	# --------------------------------------------------------------------
				59	# unicode character properties
				60
				61	def makeunicodedata(unicode, trace):
				62
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	63	dummy = (0, 0, 0, 0)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	64	table = [dummy]
				65	cache = {0: dummy}
				66	index = [0] * len(unicode.chars)
				67
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	68	FILE = "Modules/unicodedata_db.h"
				69
				70	print "--- Preparing", FILE, "..."
				71
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	72	# 1) database properties
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	73
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	74	for char in unicode.chars:
				75	record = unicode.table[char]
				76	if record:
				77	# extract database properties
				78	category = CATEGORY_NAMES.index(record[2])
				79	combining = int(record[3])
				80	bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
				81	mirrored = record[9] == "Y"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	82	item = (
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	83	category, combining, bidirectional, mirrored
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	84	)
				85	# add entry to index and item tables
				86	i = cache.get(item)
				87	if i is None:
				88	cache[item] = i = len(table)
				89	table.append(item)
				90	index[char] = i
				91
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	92	# 2) decomposition data
				93
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	94	decomp_data = [0]
				95	decomp_prefix = [""]
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	96	decomp_index = [0] * len(unicode.chars)
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	97	decomp_size = 0
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	98
				99	for char in unicode.chars:
				100	record = unicode.table[char]
				101	if record:
				102	if record[5]:
Walter Dörwald	aaab30e	2002-09-11 20:36:02 +0000	[diff] [blame]	103	decomp = record[5].split()
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	104	# prefix
				105	if decomp[0][0] == "<":
				106	prefix = decomp.pop(0)
				107	else:
				108	prefix = ""
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	109	try:
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	110	i = decomp_prefix.index(prefix)
				111	except ValueError:
				112	i = len(decomp_prefix)
				113	decomp_prefix.append(prefix)
				114	prefix = i
				115	assert prefix < 256
				116	# content
				117	decomp = [prefix + (len(decomp)<<8)] +\
				118	map(lambda s: int(s, 16), decomp)
				119	try:
				120	i = decomp_data.index(decomp)
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	121	except ValueError:
				122	i = len(decomp_data)
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	123	decomp_data.extend(decomp)
				124	decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	125	else:
				126	i = 0
				127	decomp_index[char] = i
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	128
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	129	print len(table), "unique properties"
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	130	print len(decomp_prefix), "unique decomposition prefixes"
				131	print len(decomp_data), "unique decomposition entries:",
				132	print decomp_size, "bytes"
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	133
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	134	print "--- Writing", FILE, "..."
				135
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	136	fp = open(FILE, "w")
				137	print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
				138	print >>fp
				139	print >>fp, "/* a list of unique database records */"
				140	print >>fp, \
				141	"const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	142	for item in table:
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	143	print >>fp, " {%d, %d, %d, %d}," % item
				144	print >>fp, "};"
				145	print >>fp
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	146
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	147	# FIXME: <fl> the following tables could be made static, and
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	148	# the support code moved into unicodedatabase.c
				149
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	150	print >>fp, "/* string literals */"
				151	print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	152	for name in CATEGORY_NAMES:
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	153	print >>fp, " \"%s\"," % name
				154	print >>fp, " NULL"
				155	print >>fp, "};"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	156
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	157	print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	158	for name in BIDIRECTIONAL_NAMES:
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	159	print >>fp, " \"%s\"," % name
				160	print >>fp, " NULL"
				161	print >>fp, "};"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	162
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	163	print >>fp, "static const char *decomp_prefix[] = {"
				164	for name in decomp_prefix:
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	165	print >>fp, " \"%s\"," % name
				166	print >>fp, " NULL"
				167	print >>fp, "};"
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	168
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	169	# split record index table
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	170	index1, index2, shift = splitbins(index, trace)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	171
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	172	print >>fp, "/* index tables for the database records */"
				173	print >>fp, "#define SHIFT", shift
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	174	Array("index1", index1).dump(fp, trace)
				175	Array("index2", index2).dump(fp, trace)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	176
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	177	# split decomposition index table
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	178	index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	179
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	180	print >>fp, "/* decomposition data */"
				181	Array("decomp_data", decomp_data).dump(fp, trace)
				182
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	183	print >>fp, "/* index tables for the decomposition data */"
				184	print >>fp, "#define DECOMP_SHIFT", shift
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	185	Array("decomp_index1", index1).dump(fp, trace)
				186	Array("decomp_index2", index2).dump(fp, trace)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	187
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	188	fp.close()
				189
				190	# --------------------------------------------------------------------
				191	# unicode character type tables
				192
				193	def makeunicodetype(unicode, trace):
				194
				195	FILE = "Objects/unicodetype_db.h"
				196
				197	print "--- Preparing", FILE, "..."
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	198
				199	# extract unicode types
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	200	dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	201	table = [dummy]
				202	cache = {0: dummy}
				203	index = [0] * len(unicode.chars)
				204
				205	for char in unicode.chars:
				206	record = unicode.table[char]
				207	if record:
				208	# extract database properties
				209	category = record[2]
				210	bidirectional = record[4]
				211	flags = 0
				212	if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
				213	flags \|= ALPHA_MASK
				214	if category == "Ll":
				215	flags \|= LOWER_MASK
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	216	if category == "Zl" or bidirectional == "B":
				217	flags \|= LINEBREAK_MASK
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	218	if category == "Zs" or bidirectional in ("WS", "B", "S"):
				219	flags \|= SPACE_MASK
Fredrik Lundh	375732c	2000-09-25 23:03:34 +0000	[diff] [blame]	220	if category == "Lt":
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	221	flags \|= TITLE_MASK
				222	if category == "Lu":
				223	flags \|= UPPER_MASK
				224	# use delta predictor for upper/lower/title
				225	if record[12]:
Martin v. Löwis	99ac328	2002-10-18 17:34:18 +0000	[diff] [blame]	226	upper = int(record[12], 16) - char
				227	assert -32768 <= upper <= 32767
				228	upper = upper & 0xffff
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	229	else:
				230	upper = 0
				231	if record[13]:
Martin v. Löwis	99ac328	2002-10-18 17:34:18 +0000	[diff] [blame]	232	lower = int(record[13], 16) - char
				233	assert -32768 <= lower <= 32767
				234	lower = lower & 0xffff
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	235	else:
				236	lower = 0
				237	if record[14]:
Martin v. Löwis	99ac328	2002-10-18 17:34:18 +0000	[diff] [blame]	238	title = int(record[14], 16) - char
				239	assert -32768 <= lower <= 32767
				240	title = title & 0xffff
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	241	else:
				242	title = 0
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	243	# decimal digit, integer digit
				244	decimal = 0
				245	if record[6]:
				246	flags \|= DECIMAL_MASK
				247	decimal = int(record[6])
				248	digit = 0
				249	if record[7]:
				250	flags \|= DIGIT_MASK
				251	digit = int(record[7])
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	252	item = (
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	253	flags, upper, lower, title, decimal, digit
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	254	)
				255	# add entry to index and item tables
				256	i = cache.get(item)
				257	if i is None:
				258	cache[item] = i = len(table)
				259	table.append(item)
				260	index[char] = i
				261
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	262	print len(table), "unique character type entries"
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	263
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	264	print "--- Writing", FILE, "..."
				265
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	266	fp = open(FILE, "w")
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	267	print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
				268	print >>fp
				269	print >>fp, "/* a list of unique character type descriptors */"
				270	print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	271	for item in table:
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	272	print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
				273	print >>fp, "};"
				274	print >>fp
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	275
				276	# split decomposition index table
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	277	index1, index2, shift = splitbins(index, trace)
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	278
Fred Drake	9c68505	2000-10-26 03:56:46 +0000	[diff] [blame]	279	print >>fp, "/* type indexes */"
				280	print >>fp, "#define SHIFT", shift
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	281	Array("index1", index1).dump(fp, trace)
				282	Array("index2", index2).dump(fp, trace)
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	283
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	284	fp.close()
				285
				286	# --------------------------------------------------------------------
				287	# unicode name database
				288
				289	def makeunicodename(unicode, trace):
				290
				291	FILE = "Modules/unicodename_db.h"
				292
				293	print "--- Preparing", FILE, "..."
				294
				295	# collect names
				296	names = [None] * len(unicode.chars)
				297
				298	for char in unicode.chars:
				299	record = unicode.table[char]
				300	if record:
				301	name = record[1].strip()
				302	if name and name[0] != "<":
				303	names[char] = name + chr(0)
				304
				305	print len(filter(lambda n: n is not None, names)), "distinct names"
				306
				307	# collect unique words from names (note that we differ between
				308	# words inside a sentence, and words ending a sentence. the
				309	# latter includes the trailing null byte.
				310
				311	words = {}
				312	n = b = 0
				313	for char in unicode.chars:
				314	name = names[char]
				315	if name:
				316	w = name.split()
				317	b = b + len(name)
				318	n = n + len(w)
				319	for w in w:
				320	l = words.get(w)
				321	if l:
				322	l.append(None)
				323	else:
				324	words[w] = [len(words)]
				325
				326	print n, "words in text;", b, "bytes"
				327
				328	wordlist = words.items()
				329
				330	# sort on falling frequency
Martin v. Löwis	99ac328	2002-10-18 17:34:18 +0000	[diff] [blame]	331	# XXX: different Python versions produce a different order
				332	# for words with equal frequency
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	333	wordlist.sort(lambda a, b: len(b[1])-len(a[1]))
				334
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	335	# figure out how many phrasebook escapes we need
				336	escapes = 0
				337	while escapes * 256 < len(wordlist):
				338	escapes = escapes + 1
				339	print escapes, "escapes"
				340
				341	short = 256 - escapes
				342
				343	assert short > 0
				344
				345	print short, "short indexes in lexicon"
				346
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	347	# statistics
				348	n = 0
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	349	for i in range(short):
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	350	n = n + len(wordlist[i][1])
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	351	print n, "short indexes in phrasebook"
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	352
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	353	# pick the most commonly used words, and sort the rest on falling
				354	# length (to maximize overlap)
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	355
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	356	wordlist, wordtail = wordlist[:short], wordlist[short:]
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	357	wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
				358	wordlist.extend(wordtail)
				359
				360	# generate lexicon from words
				361
				362	lexicon_offset = [0]
				363	lexicon = ""
				364	words = {}
				365
				366	# build a lexicon string
				367	offset = 0
				368	for w, x in wordlist:
				369	# encoding: bit 7 indicates last character in word (chr(128)
				370	# indicates the last character in an entire string)
				371	ww = w[:-1] + chr(ord(w[-1])+128)
				372	# reuse string tails, when possible
Walter Dörwald	aaab30e	2002-09-11 20:36:02 +0000	[diff] [blame]	373	o = lexicon.find(ww)
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	374	if o < 0:
				375	o = offset
				376	lexicon = lexicon + ww
				377	offset = offset + len(w)
				378	words[w] = len(lexicon_offset)
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	379	lexicon_offset.append(o)
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	380
				381	lexicon = map(ord, lexicon)
				382
				383	# generate phrasebook from names and lexicon
				384	phrasebook = [0]
				385	phrasebook_offset = [0] * len(unicode.chars)
				386	for char in unicode.chars:
				387	name = names[char]
				388	if name:
				389	w = name.split()
				390	phrasebook_offset[char] = len(phrasebook)
				391	for w in w:
				392	i = words[w]
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	393	if i < short:
				394	phrasebook.append(i)
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	395	else:
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	396	# store as two bytes
				397	phrasebook.append((i>>8) + short)
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	398	phrasebook.append(i&255)
				399
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	400	assert getsize(phrasebook) == 1
				401
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	402	#
				403	# unicode name hash table
				404
				405	# extract names
				406	data = []
				407	for char in unicode.chars:
				408	record = unicode.table[char]
				409	if record:
				410	name = record[1].strip()
				411	if name and name[0] != "<":
				412	data.append((name, char))
				413
				414	# the magic number 47 was chosen to minimize the number of
				415	# collisions on the current data set. if you like, change it
				416	# and see what happens...
				417
				418	codehash = Hash("code", data, 47)
				419
				420	print "--- Writing", FILE, "..."
				421
				422	fp = open(FILE, "w")
				423	print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
				424	print >>fp
				425	print >>fp, "#define NAME_MAXLEN", 256
				426	print >>fp
				427	print >>fp, "/* lexicon */"
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	428	Array("lexicon", lexicon).dump(fp, trace)
				429	Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	430
				431	# split decomposition index table
				432	offset1, offset2, shift = splitbins(phrasebook_offset, trace)
				433
				434	print >>fp, "/* code->name phrasebook */"
				435	print >>fp, "#define phrasebook_shift", shift
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	436	print >>fp, "#define phrasebook_short", short
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	437
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	438	Array("phrasebook", phrasebook).dump(fp, trace)
				439	Array("phrasebook_offset1", offset1).dump(fp, trace)
				440	Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	441
				442	print >>fp, "/* name->code dictionary */"
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	443	codehash.dump(fp, trace)
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	444
				445	fp.close()
				446
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	447	# --------------------------------------------------------------------
				448	# the following support code is taken from the unidb utilities
				449	# Copyright (c) 1999-2000 by Secret Labs AB
				450
				451	# load a unicode-data file from disk
				452
Walter Dörwald	aaab30e	2002-09-11 20:36:02 +0000	[diff] [blame]	453	import sys
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	454
				455	class UnicodeData:
				456
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	457	def __init__(self, filename, expand=1):
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	458	file = open(filename)
Martin v. Löwis	9def6a3	2002-10-18 16:11:54 +0000	[diff] [blame]	459	table = [None] * 0x110000
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	460	while 1:
				461	s = file.readline()
				462	if not s:
				463	break
Walter Dörwald	aaab30e	2002-09-11 20:36:02 +0000	[diff] [blame]	464	s = s.strip().split(";")
				465	char = int(s[0], 16)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	466	table[char] = s
				467
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	468	# expand first-last ranges (ignore surrogates and private use)
				469	if expand:
				470	field = None
				471	for i in range(0, 0xD800):
				472	s = table[i]
				473	if s:
				474	if s[1][-6:] == "First>":
				475	s[1] = ""
				476	field = s[:]
				477	elif s[1][-5:] == "Last>":
				478	s[1] = ""
				479	field = None
				480	elif field:
				481	field[0] = hex(i)
				482	table[i] = field
				483
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	484	# public attributes
				485	self.filename = filename
				486	self.table = table
Martin v. Löwis	9def6a3	2002-10-18 16:11:54 +0000	[diff] [blame]	487	self.chars = range(0x110000) # unicode 3.2
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	488
				489	def uselatin1(self):
				490	# restrict character range to ISO Latin 1
				491	self.chars = range(256)
				492
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	493	# hash table tools
				494
				495	# this is a straight-forward reimplementation of Python's built-in
				496	# dictionary type, using a static data structure, and a custom string
				497	# hash algorithm.
				498
				499	def myhash(s, magic):
				500	h = 0
Walter Dörwald	aaab30e	2002-09-11 20:36:02 +0000	[diff] [blame]	501	for c in map(ord, s.upper()):
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	502	h = (h * magic) + c
				503	ix = h & 0xff000000
				504	if ix:
				505	h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
				506	return h
				507
				508	SIZES = [
				509	(4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
				510	(1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
				511	(65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
				512	(2097152,5), (4194304,3), (8388608,33), (16777216,27)
				513	]
				514
				515	class Hash:
				516	def __init__(self, name, data, magic):
				517	# turn a (key, value) list into a static hash table structure
				518
				519	# determine table size
				520	for size, poly in SIZES:
				521	if size > len(data):
				522	poly = size + poly
				523	break
				524	else:
				525	raise AssertionError, "ran out of polynominals"
				526
				527	print size, "slots in hash table"
				528
				529	table = [None] * size
				530
				531	mask = size-1
				532
				533	n = 0
				534
				535	hash = myhash
				536
				537	# initialize hash table
				538	for key, value in data:
				539	h = hash(key, magic)
				540	i = (~h) & mask
				541	v = table[i]
				542	if v is None:
				543	table[i] = value
				544	continue
				545	incr = (h ^ (h >> 3)) & mask;
				546	if not incr:
				547	incr = mask
				548	while 1:
				549	n = n + 1
				550	i = (i + incr) & mask
				551	v = table[i]
				552	if v is None:
				553	table[i] = value
				554	break
				555	incr = incr << 1
				556	if incr > mask:
				557	incr = incr ^ poly
				558
				559	print n, "collisions"
				560	self.collisions = n
				561
				562	for i in range(len(table)):
				563	if table[i] is None:
				564	table[i] = 0
				565
				566	self.data = Array(name + "_hash", table)
				567	self.magic = magic
				568	self.name = name
				569	self.size = size
				570	self.poly = poly
				571
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	572	def dump(self, file, trace):
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	573	# write data to file, as a C array
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	574	self.data.dump(file, trace)
Fredrik Lundh	9e9bcda	2001-01-21 17:01:31 +0000	[diff] [blame]	575	file.write("#define %s_magic %d\n" % (self.name, self.magic))
				576	file.write("#define %s_size %d\n" % (self.name, self.size))
				577	file.write("#define %s_poly %d\n" % (self.name, self.poly))
				578
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	579	# stuff to deal with arrays of unsigned integers
				580
				581	class Array:
				582
				583	def __init__(self, name, data):
				584	self.name = name
				585	self.data = data
				586
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	587	def dump(self, file, trace=0):
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	588	# write data to file, as a C array
				589	size = getsize(self.data)
Fredrik Lundh	7b7dd10	2001-01-21 22:41:08 +0000	[diff] [blame]	590	if trace:
				591	print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	592	file.write("static ")
				593	if size == 1:
				594	file.write("unsigned char")
				595	elif size == 2:
				596	file.write("unsigned short")
				597	else:
				598	file.write("unsigned int")
				599	file.write(" " + self.name + "[] = {\n")
				600	if self.data:
				601	s = " "
				602	for item in self.data:
				603	i = str(item) + ", "
				604	if len(s) + len(i) > 78:
				605	file.write(s + "\n")
				606	s = " " + i
				607	else:
				608	s = s + i
Walter Dörwald	aaab30e	2002-09-11 20:36:02 +0000	[diff] [blame]	609	if s.strip():
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	610	file.write(s + "\n")
				611	file.write("};\n\n")
				612
				613	def getsize(data):
				614	# return smallest possible integer size for the given array
				615	maxdata = max(data)
				616	if maxdata < 256:
				617	return 1
				618	elif maxdata < 65536:
				619	return 2
				620	else:
				621	return 4
				622
Tim Peters	2101348	2000-09-25 07:13:41 +0000	[diff] [blame]	623	def splitbins(t, trace=0):
				624	"""t, trace=0 -> (t1, t2, shift). Split a table to save space.
				625
				626	t is a sequence of ints. This function can be useful to save space if
				627	many of the ints are the same. t1 and t2 are lists of ints, and shift
				628	is an int, chosen to minimize the combined size of t1 and t2 (in C
				629	code), and where for each i in range(len(t)),
				630	t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
				631	where mask is a bitmask isolating the last "shift" bits.
				632
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	633	If optional arg trace is non-zero (default zero), progress info
				634	is printed to sys.stderr. The higher the value, the more info
				635	you'll get.
Tim Peters	2101348	2000-09-25 07:13:41 +0000	[diff] [blame]	636	"""
				637
				638	import sys
				639	if trace:
				640	def dump(t1, t2, shift, bytes):
				641	print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
				642	len(t1), len(t2), shift, bytes)
				643	print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
				644	"bytes"
				645	n = len(t)-1 # last valid index
				646	maxshift = 0 # the most we can shift n and still have something left
				647	if n > 0:
				648	while n >> 1:
				649	n >>= 1
				650	maxshift += 1
				651	del n
				652	bytes = sys.maxint # smallest total size so far
				653	t = tuple(t) # so slices can be dict keys
				654	for shift in range(maxshift + 1):
				655	t1 = []
				656	t2 = []
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	657	size = 2**shift
				658	bincache = {}
Tim Peters	2101348	2000-09-25 07:13:41 +0000	[diff] [blame]	659	for i in range(0, len(t), size):
				660	bin = t[i:i+size]
				661	index = bincache.get(bin)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	662	if index is None:
Tim Peters	2101348	2000-09-25 07:13:41 +0000	[diff] [blame]	663	index = len(t2)
				664	bincache[bin] = index
				665	t2.extend(bin)
				666	t1.append(index >> shift)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	667	# determine memory size
Tim Peters	2101348	2000-09-25 07:13:41 +0000	[diff] [blame]	668	b = len(t1)getsize(t1) + len(t2)getsize(t2)
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	669	if trace > 1:
Tim Peters	2101348	2000-09-25 07:13:41 +0000	[diff] [blame]	670	dump(t1, t2, shift, b)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	671	if b < bytes:
Tim Peters	2101348	2000-09-25 07:13:41 +0000	[diff] [blame]	672	best = t1, t2, shift
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	673	bytes = b
Tim Peters	2101348	2000-09-25 07:13:41 +0000	[diff] [blame]	674	t1, t2, shift = best
				675	if trace:
				676	print >>sys.stderr, "Best:",
				677	dump(t1, t2, shift, bytes)
				678	if __debug__:
				679	# exhaustively verify that the decomposition is correct
				680	mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
				681	for i in xrange(len(t)):
				682	assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
				683	return best
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	684
				685	if __name__ == "__main__":
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame]	686	maketables(1)