#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
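# usage sketch (an assumption inferred from the relative paths used
# below, not part of the original notes): run this script from the top
# of a Python source tree, with UnicodeData.txt and
# CompositionExclusions.txt in the current directory, e.g.
#
#   python makeunicodedata.py
#
# it then rewrites the three generated headers listed above in place.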

import sys

SCRIPT = sys.argv[0]
VERSION = "2.1"

UNICODE_DATA = "UnicodeData.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
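# illustrative examples (not from the original comments): an uppercase
# letter such as U+0041 ends up with ALPHA_MASK | UPPER_MASK (0x81),
# and a decimal digit such as U+0030 with DECIMAL_MASK | DIGIT_MASK
# (0x06); see the flag assignments in makeunicodetype below.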

def maketables(trace=0):

    print "--- Reading", UNICODE_DATA, "..."

    unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS)

    print len(filter(None, unicode.table)), "characters"

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            item = (
                category, combining, bidirectional, mirrored
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

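    # worked example (illustrative, not in the original comments): U+00C5
    # has the canonical decomposition "0041 030A"; with no <prefix> tag it
    # is stored as [0 + (2<<8), 0x0041, 0x030A], and since U+0041 is a
    # starter (combining class 0) and U+00C5 is not excluded, the pair
    # (0x0041, 0x030A) -> 0x00C5 is also recorded for NFC composition.

    # the loop below renumbers the NFC first/last characters densely and
    # collects the contiguous code point ranges they occupy (written out
    # later as the nfc_first/nfc_last reindex tables)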
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

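    # comp_data is a total_first x total_last matrix, flattened row-major;
    # a pair is meant to be looked up as
    # comp_data[comp_first[first]*total_last + comp_last[last]]
    # (the table is then compressed through splitbins below)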
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
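            # (illustrative, not an original comment: for U+0061 'a' the
            # uppercase mapping is U+0041, so the stored delta is -0x20,
            # kept as the 16-bit two's complement value 0xffe0)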
            if record[12]:
                upper = int(record[12], 16) - char
                assert -32768 <= upper <= 32767
                upper = upper & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
                assert -32768 <= lower <= 32767
                lower = lower & 0xffff
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
                assert -32768 <= title <= 32767
                title = title & 0xffff
            else:
                title = 0
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                flags, upper, lower, title, decimal, digit
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split the type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

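    # words maps each word to a list whose length is the word's frequency
    # in the name list (the first element is only the discovery order);
    # just the length matters for the frequency sort further down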
    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency
    # XXX: different Python versions produce a different order
    # for words with equal frequency
    wordlist.sort(lambda a, b: len(b[1])-len(a[1]))

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

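    # every phrasebook entry has to fit in one byte: indexes below 'short'
    # are stored directly, larger ones as an escape byte (>= short)
    # followed by the low byte; the assert below checks the table is 8-bit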
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

import sys

class UnicodeData:

    def __init__(self, filename, exclusions, expand=1):
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges (ignore surrogates and private use)
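        # (e.g. the CJK ideograph and Hangul syllable blocks appear in
        # UnicodeData.txt only as <..., First>/<..., Last> pairs; the
        # loop below copies the First record onto every code point in
        # between, with the name field blanked)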
        if expand:
            field = None
            for i in range(0, 0xD800):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s[:]
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    field[0] = hex(i)
                    table[i] = field

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
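
# each SIZES entry is a (table size, polynomial) pair: sizes are powers
# of two, and the polynomial is used in Hash.__init__ below to fold an
# overflowing probe increment back into range (an LFSR-style step that
# keeps the increment inside the table and nonzero)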

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

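# illustrative sketch of the splitbins round trip (shown only as a
# comment, not part of the build):
#
#     t = [1, 1, 1, 1, 9, 9, 9, 9, 1, 1, 1, 1]
#     t1, t2, shift = splitbins(t)
#     mask = (1 << shift) - 1
#     for i in range(len(t)):
#         assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
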
if __name__ == "__main__":
    maketables(1)