Blame - Tools/unicode/makeunicodedata.py - platform/external/python/cpython2

blob: 4781ec4f5bd592cab93f1212d0aa525a4702a41c [file] [log] [blame]

#
# generate a compact version of the unicode property database
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#
				10
				11	import sys
				12
				13	SCRIPT = sys.argv[0]
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	14	VERSION = "1.1"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	15
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	16	UNICODE_DATA = "../UnicodeData-Latest.txt"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	17
				18	CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
				19	"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
				20	"Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
				21	"So" ]
				22
				23	BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
				24	"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
				25	"ON" ]
				26
				27	def maketable():
				28
				29	unicode = UnicodeData(UNICODE_DATA)
				30
				31	# extract unicode properties
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	32	dummy = (0, 0, 0, 0)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	33	table = [dummy]
				34	cache = {0: dummy}
				35	index = [0] * len(unicode.chars)
				36
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	37	# 1) database properties
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	38	for char in unicode.chars:
				39	record = unicode.table[char]
				40	if record:
				41	# extract database properties
				42	category = CATEGORY_NAMES.index(record[2])
				43	combining = int(record[3])
				44	bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
				45	mirrored = record[9] == "Y"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	46	item = (
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	47	category, combining, bidirectional, mirrored
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	48	)
				49	# add entry to index and item tables
				50	i = cache.get(item)
				51	if i is None:
				52	cache[item] = i = len(table)
				53	table.append(item)
				54	index[char] = i
				55
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	56	# 2) decomposition data
				57
				58	# FIXME: <fl> using the encoding stuff from unidb would save
				59	# another 50k or so, but I'll leave that for 2.1...
				60
				61	decomp_data = [""]
				62	decomp_index = [0] * len(unicode.chars)
				63
				64	for char in unicode.chars:
				65	record = unicode.table[char]
				66	if record:
				67	if record[5]:
				68	try:
				69	i = decomp_data.index(record[5])
				70	except ValueError:
				71	i = len(decomp_data)
				72	decomp_data.append(record[5])
				73	else:
				74	i = 0
				75	decomp_index[char] = i
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	76
				77	FILE = "unicodedata_db.h"
				78
				79	sys.stdout = open(FILE, "w")
				80
				81	print "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
				82	print
				83	print "/* a list of unique database records */"
				84	print "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
				85	for item in table:
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	86	print " {%d, %d, %d, %d}," % item
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	87	print "};"
				88	print
				89
				90	print "/* string literals */"
				91	print "const char *_PyUnicode_CategoryNames[] = {"
				92	for name in CATEGORY_NAMES:
				93	print " \"%s\"," % name
				94	print " NULL"
				95	print "};"
				96
				97	print "const char *_PyUnicode_BidirectionalNames[] = {"
				98	for name in BIDIRECTIONAL_NAMES:
				99	print " \"%s\"," % name
				100	print " NULL"
				101	print "};"
				102
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	103	print "static const char *decomp_data[] = {"
				104	for name in decomp_data:
				105	print " \"%s\"," % name
				106	print " NULL"
				107	print "};"
				108
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	109	# split index table
				110	index1, index2, shift = splitbins(index)
				111
				112	print "/* index tables used to find the right database record */"
				113	print "#define SHIFT", shift
				114	Array("index1", index1).dump(sys.stdout)
				115	Array("index2", index2).dump(sys.stdout)
				116
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	117	# split index table
				118	index1, index2, shift = splitbins(decomp_index)
				119
				120	print "/* same, for the decomposition data */"
				121	print "#define DECOMP_SHIFT", shift
				122	Array("decomp_index1", index1).dump(sys.stdout)
				123	Array("decomp_index2", index2).dump(sys.stdout)
				124
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	125	sys.stdout = sys.__stdout__
				126
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk
				132
				133	import string, sys
				134
				135	class UnicodeData:
				136
				137	def __init__(self, filename):
				138	file = open(filename)
				139	table = [None] * 65536
				140	while 1:
				141	s = file.readline()
				142	if not s:
				143	break
				144	s = string.split(string.strip(s), ";")
				145	char = string.atoi(s[0], 16)
				146	table[char] = s
				147
				148	# public attributes
				149	self.filename = filename
				150	self.table = table
				151	self.chars = range(65536) # unicode
				152
				153	def uselatin1(self):
				154	# restrict character range to ISO Latin 1
				155	self.chars = range(256)
				156
# stuff to deal with arrays of unsigned integers
				158
				159	class Array:
				160
				161	def __init__(self, name, data):
				162	self.name = name
				163	self.data = data
				164
				165	def dump(self, file):
				166	# write data to file, as a C array
				167	size = getsize(self.data)
				168	# print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
				169	file.write("static ")
				170	if size == 1:
				171	file.write("unsigned char")
				172	elif size == 2:
				173	file.write("unsigned short")
				174	else:
				175	file.write("unsigned int")
				176	file.write(" " + self.name + "[] = {\n")
				177	if self.data:
				178	s = " "
				179	for item in self.data:
				180	i = str(item) + ", "
				181	if len(s) + len(i) > 78:
				182	file.write(s + "\n")
				183	s = " " + i
				184	else:
				185	s = s + i
				186	if string.strip(s):
				187	file.write(s + "\n")
				188	file.write("};\n\n")
				189
				190	def getsize(data):
				191	# return smallest possible integer size for the given array
				192	maxdata = max(data)
				193	if maxdata < 256:
				194	return 1
				195	elif maxdata < 65536:
				196	return 2
				197	else:
				198	return 4
				199
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If several shifts give the same combined size, the smallest shift wins.
    An empty t yields ([], [], 0).

    If optional arg trace is true (default false), progress info is
    printed to sys.stderr.
    """

    import sys

    def dump(t1, t2, shift, nbytes):
        # one line of progress info for a candidate split
        sys.stderr.write("%d+%d bins at shift %d; %d bytes\n" % (
            len(t1), len(t2), shift, nbytes))

    if len(t) == 0:
        # nothing to split; also keeps getsize() away from empty lists
        return [], [], 0

    if trace:
        sys.stderr.write("Size of original table: %d bytes\n"
                         % (len(t) * getsize(t)))

    n = len(t) - 1    # last valid index (>= 0 here)
    maxshift = 0      # the most we can shift n and still have something left
    while n >> 1:
        n >>= 1
        maxshift += 1
    del n

    t = tuple(t)      # so slices can be dict keys
    best = None       # (t1, t2, shift) of the smallest split so far
    best_size = None  # its combined size in bytes
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2 ** shift
        bincache = {}  # maps a bin (tuple slice of t) to its offset in t2
        for i in range(0, len(t), size):
            chunk = t[i:i + size]
            offset = bincache.get(chunk)
            if offset is None:
                offset = len(t2)
                bincache[chunk] = offset
                t2.extend(chunk)
            t1.append(offset >> shift)
        # determine memory size
        b = len(t1) * getsize(t1) + len(t2) * getsize(t2)
        if trace:
            dump(t1, t2, shift, b)
        if best is None or b < best_size:
            best = t1, t2, shift
            best_size = b
    t1, t2, shift = best
    if trace:
        sys.stderr.write("Best: ")
        dump(t1, t2, shift, best_size)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)  # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	260
				261	if __name__ == "__main__":
				262	maketable()