Blame - Tools/unicode/makeunicodedata.py - platform/external/python/cpython2

blob: b8411ad48bed6ec10e65fca6a6ed0645a7ac1197 [file] [log] [blame]

#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.0 database file to
# Modules/unicodedata_db.h and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#
				14
				15	import sys
				16
				17	SCRIPT = sys.argv[0]
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	18	VERSION = "1.1"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	19
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	20	UNICODE_DATA = "UnicodeData-Latest.txt"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	21
				22	CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
				23	"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
				24	"Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
				25	"So" ]
				26
				27	BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
				28	"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
				29	"ON" ]
				30
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	31	ALPHA_MASK = 0x01
				32	DECIMAL_MASK = 0x02
				33	DIGIT_MASK = 0x04
				34	LOWER_MASK = 0x08
				35	NUMERIC_MASK = 0x10
				36	SPACE_MASK = 0x20
				37	TITLE_MASK = 0x40
				38	UPPER_MASK = 0x80
				39
				40	def maketables():
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	41
				42	unicode = UnicodeData(UNICODE_DATA)
				43
				44	# extract unicode properties
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	45	dummy = (0, 0, 0, 0)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	46	table = [dummy]
				47	cache = {0: dummy}
				48	index = [0] * len(unicode.chars)
				49
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	50	# 1) database properties
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	51	for char in unicode.chars:
				52	record = unicode.table[char]
				53	if record:
				54	# extract database properties
				55	category = CATEGORY_NAMES.index(record[2])
				56	combining = int(record[3])
				57	bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
				58	mirrored = record[9] == "Y"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	59	item = (
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	60	category, combining, bidirectional, mirrored
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	61	)
				62	# add entry to index and item tables
				63	i = cache.get(item)
				64	if i is None:
				65	cache[item] = i = len(table)
				66	table.append(item)
				67	index[char] = i
				68
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	69	# 2) decomposition data
				70
				71	# FIXME: <fl> using the encoding stuff from unidb would save
				72	# another 50k or so, but I'll leave that for 2.1...
				73
				74	decomp_data = [""]
				75	decomp_index = [0] * len(unicode.chars)
				76
				77	for char in unicode.chars:
				78	record = unicode.table[char]
				79	if record:
				80	if record[5]:
				81	try:
				82	i = decomp_data.index(record[5])
				83	except ValueError:
				84	i = len(decomp_data)
				85	decomp_data.append(record[5])
				86	else:
				87	i = 0
				88	decomp_index[char] = i
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	89
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	90	FILE = "Modules/unicodedata_db.h"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	91
				92	sys.stdout = open(FILE, "w")
				93
				94	print "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
				95	print
				96	print "/* a list of unique database records */"
				97	print "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
				98	for item in table:
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	99	print " {%d, %d, %d, %d}," % item
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	100	print "};"
				101	print
				102
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	103	# FIXME: the following tables should be made static, and
				104	# the support code moved into unicodedatabase.c
				105
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	106	print "/* string literals */"
				107	print "const char *_PyUnicode_CategoryNames[] = {"
				108	for name in CATEGORY_NAMES:
				109	print " \"%s\"," % name
				110	print " NULL"
				111	print "};"
				112
				113	print "const char *_PyUnicode_BidirectionalNames[] = {"
				114	for name in BIDIRECTIONAL_NAMES:
				115	print " \"%s\"," % name
				116	print " NULL"
				117	print "};"
				118
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	119	print "static const char *decomp_data[] = {"
				120	for name in decomp_data:
				121	print " \"%s\"," % name
				122	print " NULL"
				123	print "};"
				124
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	125	# split record index table
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	126	index1, index2, shift = splitbins(index)
				127
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	128	print "/* index tables for the database records */"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	129	print "#define SHIFT", shift
				130	Array("index1", index1).dump(sys.stdout)
				131	Array("index2", index2).dump(sys.stdout)
				132
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	133	# split decomposition index table
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	134	index1, index2, shift = splitbins(decomp_index)
				135
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	136	print "/* index tables for the decomposition data */"
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	137	print "#define DECOMP_SHIFT", shift
				138	Array("decomp_index1", index1).dump(sys.stdout)
				139	Array("decomp_index2", index2).dump(sys.stdout)
				140
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	141	sys.stdout = sys.__stdout__
				142
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	143	#
				144	# 3) unicode type data
				145
				146	# extract unicode types
				147	dummy = (0, 0, 0, 0)
				148	table = [dummy]
				149	cache = {0: dummy}
				150	index = [0] * len(unicode.chars)
				151
				152	for char in unicode.chars:
				153	record = unicode.table[char]
				154	if record:
				155	# extract database properties
				156	category = record[2]
				157	bidirectional = record[4]
				158	flags = 0
				159	if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
				160	flags \|= ALPHA_MASK
				161	if category == "Ll":
				162	flags \|= LOWER_MASK
				163	if category == "Zs" or bidirectional in ("WS", "B", "S"):
				164	flags \|= SPACE_MASK
				165	if category in ["Lt", "Lu"]:
				166	flags \|= TITLE_MASK
				167	if category == "Lu":
				168	flags \|= UPPER_MASK
				169	# use delta predictor for upper/lower/title
				170	if record[12]:
				171	upper = (int(record[12], 16) - char) & 0xffff
				172	else:
				173	upper = 0
				174	if record[13]:
				175	lower = (int(record[13], 16) - char) & 0xffff
				176	else:
				177	lower = 0
				178	if record[14]:
				179	title = (int(record[14], 16) - char) & 0xffff
				180	else:
				181	title = 0
				182	item = (
				183	flags, upper, lower, title
				184	)
				185	# add entry to index and item tables
				186	i = cache.get(item)
				187	if i is None:
				188	cache[item] = i = len(table)
				189	table.append(item)
				190	index[char] = i
				191
				192	FILE = "Objects/unicodetype_db.h"
				193
				194	sys.stdout = open(FILE, "w")
				195
				196	print "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
				197	print
				198	print "/* a list of unique character type descriptors */"
				199	print "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
				200	for item in table:
				201	print " {%d, %d, %d, %d}," % item
				202	print "};"
				203	print
				204
				205	# split decomposition index table
				206	index1, index2, shift = splitbins(index)
				207
				208	print "/* type indexes */"
				209	print "#define SHIFT", shift
				210	Array("index1", index1).dump(sys.stdout)
				211	Array("index2", index2).dump(sys.stdout)
				212
				213	sys.stdout = sys.__stdout__
				214
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk
				220
				221	import string, sys
				222
				223	class UnicodeData:
				224
				225	def __init__(self, filename):
				226	file = open(filename)
				227	table = [None] * 65536
				228	while 1:
				229	s = file.readline()
				230	if not s:
				231	break
				232	s = string.split(string.strip(s), ";")
				233	char = string.atoi(s[0], 16)
				234	table[char] = s
				235
				236	# public attributes
				237	self.filename = filename
				238	self.table = table
				239	self.chars = range(65536) # unicode
				240
				241	def uselatin1(self):
				242	# restrict character range to ISO Latin 1
				243	self.chars = range(256)
				244
				245	# stuff to deal with arrays of unsigned integers
				246
				247	class Array:
				248
				249	def __init__(self, name, data):
				250	self.name = name
				251	self.data = data
				252
				253	def dump(self, file):
				254	# write data to file, as a C array
				255	size = getsize(self.data)
				256	# print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
				257	file.write("static ")
				258	if size == 1:
				259	file.write("unsigned char")
				260	elif size == 2:
				261	file.write("unsigned short")
				262	else:
				263	file.write("unsigned int")
				264	file.write(" " + self.name + "[] = {\n")
				265	if self.data:
				266	s = " "
				267	for item in self.data:
				268	i = str(item) + ", "
				269	if len(s) + len(i) > 78:
				270	file.write(s + "\n")
				271	s = " " + i
				272	else:
				273	s = s + i
				274	if string.strip(s):
				275	file.write(s + "\n")
				276	file.write("};\n\n")
				277
				278	def getsize(data):
				279	# return smallest possible integer size for the given array
				280	maxdata = max(data)
				281	if maxdata < 256:
				282	return 1
				283	elif maxdata < 65536:
				284	return 2
				285	else:
				286	return 4
				287
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is true (default false), progress info is
    printed to sys.stderr.

    An empty t yields ([], [], 0).
    """

    import sys

    if len(t) == 0:
        # nothing to split; the identity above holds trivially
        return [], [], 0

    def dump(t1, t2, shift, size):
        sys.stderr.write("%d+%d bins at shift %d; %d bytes\n" % (
            len(t1), len(t2), shift, size))

    if trace:
        sys.stderr.write("Size of original table: %d bytes\n" %
                         (len(t)*getsize(t)))
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    best = None         # (t1, t2, shift) of the smallest split so far
    best_size = None    # its total size in bytes
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            chunk = t[i:i+size]
            index = bincache.get(chunk)
            if index is None:
                # first time this run of values is seen: store it in t2
                index = len(t2)
                bincache[chunk] = index
                t2.extend(chunk)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace:
            dump(t1, t2, shift, b)
        if best is None or b < best_size:
            best = t1, t2, shift
            best_size = b
    t1, t2, shift = best
    if trace:
        sys.stderr.write("Best: ")
        dump(t1, t2, shift, best_size)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	348
				349	if __name__ == "__main__":
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	350	maketables()