Blame - Tools/unicode/makeunicodedata.py - platform/external/python/cpython2

blob: f2e6dc86b34f5c32375aff8f59fbf18e3780d860 [file] [log] [blame]

Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	1	#
				2	# makeunicodedata.py -- generate a compact version of the unicode property
				3	# database (unicodedata_db.h)
				4	#
				5
				6	import sys
				7
				8	SCRIPT = sys.argv[0]
				9	VERSION = "1.0"
				10
				11	UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"
				12
				13	CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
				14	"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
				15	"Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
				16	"So" ]
				17
				18	BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
				19	"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
				20	"ON" ]
				21
				22	def maketable():
				23
				24	unicode = UnicodeData(UNICODE_DATA)
				25
				26	# extract unicode properties
				27	dummy = (0, 0, 0, 0, "NULL")
				28	table = [dummy]
				29	cache = {0: dummy}
				30	index = [0] * len(unicode.chars)
				31
				32	DECOMPOSITION = [""]
				33
				34	for char in unicode.chars:
				35	record = unicode.table[char]
				36	if record:
				37	# extract database properties
				38	category = CATEGORY_NAMES.index(record[2])
				39	combining = int(record[3])
				40	bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
				41	mirrored = record[9] == "Y"
				42	if record[5]:
				43	decomposition = '"%s"' % record[5]
				44	else:
				45	decomposition = "NULL"
				46	item = (
				47	category, combining, bidirectional, mirrored, decomposition
				48	)
				49	# add entry to index and item tables
				50	i = cache.get(item)
				51	if i is None:
				52	cache[item] = i = len(table)
				53	table.append(item)
				54	index[char] = i
				55
				56	# FIXME: we really should compress the decomposition stuff
				57	# (see the unidb utilities for one way to do this)
				58
				59	FILE = "unicodedata_db.h"
				60
				61	sys.stdout = open(FILE, "w")
				62
				63	print "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
				64	print
				65	print "/* a list of unique database records */"
				66	print "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
				67	for item in table:
				68	print " {%d, %d, %d, %d, %s}," % item
				69	print "};"
				70	print
				71
				72	print "/* string literals */"
				73	print "const char *_PyUnicode_CategoryNames[] = {"
				74	for name in CATEGORY_NAMES:
				75	print " \"%s\"," % name
				76	print " NULL"
				77	print "};"
				78
				79	print "const char *_PyUnicode_BidirectionalNames[] = {"
				80	for name in BIDIRECTIONAL_NAMES:
				81	print " \"%s\"," % name
				82	print " NULL"
				83	print "};"
				84
				85	# split index table
				86	index1, index2, shift = splitbins(index)
				87
				88	print "/* index tables used to find the right database record */"
				89	print "#define SHIFT", shift
				90	Array("index1", index1).dump(sys.stdout)
				91	Array("index2", index2).dump(sys.stdout)
				92
				93	sys.stdout = sys.__stdout__
				94
				95	# --------------------------------------------------------------------
				96	# the following support code is taken from the unidb utilities
				97	# Copyright (c) 1999-2000 by Secret Labs AB
				98
				99	# load a unicode-data file from disk
				100
				101	import string, sys
				102
				103	class UnicodeData:
				104
				105	def __init__(self, filename):
				106	file = open(filename)
				107	table = [None] * 65536
				108	while 1:
				109	s = file.readline()
				110	if not s:
				111	break
				112	s = string.split(string.strip(s), ";")
				113	char = string.atoi(s[0], 16)
				114	table[char] = s
				115
				116	# public attributes
				117	self.filename = filename
				118	self.table = table
				119	self.chars = range(65536) # unicode
				120
				121	def uselatin1(self):
				122	# restrict character range to ISO Latin 1
				123	self.chars = range(256)
				124
				125	# stuff to deal with arrays of unsigned integers
				126
				127	class Array:
				128
				129	def __init__(self, name, data):
				130	self.name = name
				131	self.data = data
				132
				133	def dump(self, file):
				134	# write data to file, as a C array
				135	size = getsize(self.data)
				136	# print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
				137	file.write("static ")
				138	if size == 1:
				139	file.write("unsigned char")
				140	elif size == 2:
				141	file.write("unsigned short")
				142	else:
				143	file.write("unsigned int")
				144	file.write(" " + self.name + "[] = {\n")
				145	if self.data:
				146	s = " "
				147	for item in self.data:
				148	i = str(item) + ", "
				149	if len(s) + len(i) > 78:
				150	file.write(s + "\n")
				151	s = " " + i
				152	else:
				153	s = s + i
				154	if string.strip(s):
				155	file.write(s + "\n")
				156	file.write("};\n\n")
				157
				158	def getsize(data):
				159	# return smallest possible integer size for the given array
				160	maxdata = max(data)
				161	if maxdata < 256:
				162	return 1
				163	elif maxdata < 65536:
				164	return 2
				165	else:
				166	return 4
				167
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    An empty t yields ([], [], 0).

    If optional arg trace is true (default false), progress info is
    printed to sys.stderr.
    """

    import sys

    t = tuple(t)    # so slices can be dict keys
    if not t:
        # nothing to split; the identity above holds vacuously
        return [], [], 0

    def dump(t1, t2, shift, nbytes):
        sys.stderr.write("%d+%d bins at shift %d; %d bytes\n" % (
            len(t1), len(t2), shift, nbytes))

    if trace:
        sys.stderr.write("Size of original table: %d bytes\n"
                         % (len(t) * getsize(t)))

    n = len(t) - 1  # last valid index (>= 0, since t is not empty)
    maxshift = 0    # the most we can shift n and still have something left
    while n >> 1:
        n >>= 1
        maxshift += 1
    del n

    best = None         # (t1, t2, shift) of the smallest split so far
    best_size = None    # its total size in bytes
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}   # bin contents -> offset of that bin in t2
        for i in range(0, len(t), size):
            chunk = t[i:i+size]
            index = bincache.get(chunk)
            if index is None:
                # first time this bin is seen: store it at the end of t2
                index = len(t2)
                bincache[chunk] = index
                t2.extend(chunk)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace:
            dump(t1, t2, shift, b)
        if best is None or b < best_size:
            best = t1, t2, shift
            best_size = b
    t1, t2, shift = best
    if trace:
        sys.stderr.write("Best: ")
        dump(t1, t2, shift, best_size)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	228
				229	if __name__ == "__main__":
				230	maketable()