Blame - Tools/unicode/makeunicodedata.py - platform/external/python/cpython2

blob: faca17f369309dc144868a867918cfe16a0f2f8b [file] [log] [blame]

#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.0 database file to
# Modules/unicodedata_db.h and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#
				15
				16	import sys
				17
				18	SCRIPT = sys.argv[0]
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	19	VERSION = "1.1"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	20
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	21	UNICODE_DATA = "UnicodeData-Latest.txt"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	22
				23	CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
				24	"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
				25	"Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
				26	"So" ]
				27
				28	BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
				29	"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
				30	"ON" ]
				31
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	32	# note: should match definitions in Objects/unicodectype.c
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	33	ALPHA_MASK = 0x01
				34	DECIMAL_MASK = 0x02
				35	DIGIT_MASK = 0x04
				36	LOWER_MASK = 0x08
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	37	LINEBREAK_MASK = 0x10
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	38	SPACE_MASK = 0x20
				39	TITLE_MASK = 0x40
				40	UPPER_MASK = 0x80
				41
				42	def maketables():
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	43
				44	unicode = UnicodeData(UNICODE_DATA)
				45
				46	# extract unicode properties
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	47	dummy = (0, 0, 0, 0)
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	48	table = [dummy]
				49	cache = {0: dummy}
				50	index = [0] * len(unicode.chars)
				51
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	52	# 1) database properties
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	53	for char in unicode.chars:
				54	record = unicode.table[char]
				55	if record:
				56	# extract database properties
				57	category = CATEGORY_NAMES.index(record[2])
				58	combining = int(record[3])
				59	bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
				60	mirrored = record[9] == "Y"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	61	item = (
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	62	category, combining, bidirectional, mirrored
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	63	)
				64	# add entry to index and item tables
				65	i = cache.get(item)
				66	if i is None:
				67	cache[item] = i = len(table)
				68	table.append(item)
				69	index[char] = i
				70
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	71	# 2) decomposition data
				72
				73	# FIXME: <fl> using the encoding stuff from unidb would save
				74	# another 50k or so, but I'll leave that for 2.1...
				75
				76	decomp_data = [""]
				77	decomp_index = [0] * len(unicode.chars)
				78
				79	for char in unicode.chars:
				80	record = unicode.table[char]
				81	if record:
				82	if record[5]:
				83	try:
				84	i = decomp_data.index(record[5])
				85	except ValueError:
				86	i = len(decomp_data)
				87	decomp_data.append(record[5])
				88	else:
				89	i = 0
				90	decomp_index[char] = i
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	91
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	92	FILE = "Modules/unicodedata_db.h"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	93
				94	sys.stdout = open(FILE, "w")
				95
				96	print "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
				97	print
				98	print "/* a list of unique database records */"
				99	print "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
				100	for item in table:
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	101	print " {%d, %d, %d, %d}," % item
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	102	print "};"
				103	print
				104
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	105	# FIXME: the following tables should be made static, and
				106	# the support code moved into unicodedatabase.c
				107
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	108	print "/* string literals */"
				109	print "const char *_PyUnicode_CategoryNames[] = {"
				110	for name in CATEGORY_NAMES:
				111	print " \"%s\"," % name
				112	print " NULL"
				113	print "};"
				114
				115	print "const char *_PyUnicode_BidirectionalNames[] = {"
				116	for name in BIDIRECTIONAL_NAMES:
				117	print " \"%s\"," % name
				118	print " NULL"
				119	print "};"
				120
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	121	print "static const char *decomp_data[] = {"
				122	for name in decomp_data:
				123	print " \"%s\"," % name
				124	print " NULL"
				125	print "};"
				126
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	127	# split record index table
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	128	index1, index2, shift = splitbins(index)
				129
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	130	print "/* index tables for the database records */"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	131	print "#define SHIFT", shift
				132	Array("index1", index1).dump(sys.stdout)
				133	Array("index2", index2).dump(sys.stdout)
				134
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	135	# split decomposition index table
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	136	index1, index2, shift = splitbins(decomp_index)
				137
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	138	print "/* index tables for the decomposition data */"
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	139	print "#define DECOMP_SHIFT", shift
				140	Array("decomp_index1", index1).dump(sys.stdout)
				141	Array("decomp_index2", index2).dump(sys.stdout)
				142
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	143	sys.stdout = sys.__stdout__
				144
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	145	#
				146	# 3) unicode type data
				147
				148	# extract unicode types
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	149	dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	150	table = [dummy]
				151	cache = {0: dummy}
				152	index = [0] * len(unicode.chars)
				153
				154	for char in unicode.chars:
				155	record = unicode.table[char]
				156	if record:
				157	# extract database properties
				158	category = record[2]
				159	bidirectional = record[4]
				160	flags = 0
				161	if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
				162	flags \|= ALPHA_MASK
				163	if category == "Ll":
				164	flags \|= LOWER_MASK
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	165	if category == "Zl" or bidirectional == "B":
				166	flags \|= LINEBREAK_MASK
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	167	if category == "Zs" or bidirectional in ("WS", "B", "S"):
				168	flags \|= SPACE_MASK
Fredrik Lundh	375732c	2000-09-25 23:03:34 +0000	[diff] [blame^]	169	if category == "Lt":
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	170	flags \|= TITLE_MASK
				171	if category == "Lu":
				172	flags \|= UPPER_MASK
				173	# use delta predictor for upper/lower/title
				174	if record[12]:
				175	upper = (int(record[12], 16) - char) & 0xffff
				176	else:
				177	upper = 0
				178	if record[13]:
				179	lower = (int(record[13], 16) - char) & 0xffff
				180	else:
				181	lower = 0
				182	if record[14]:
				183	title = (int(record[14], 16) - char) & 0xffff
				184	else:
				185	title = 0
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	186	# decimal digit, integer digit
				187	decimal = 0
				188	if record[6]:
				189	flags \|= DECIMAL_MASK
				190	decimal = int(record[6])
				191	digit = 0
				192	if record[7]:
				193	flags \|= DIGIT_MASK
				194	digit = int(record[7])
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	195	item = (
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	196	flags, upper, lower, title, decimal, digit
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	197	)
				198	# add entry to index and item tables
				199	i = cache.get(item)
				200	if i is None:
				201	cache[item] = i = len(table)
				202	table.append(item)
				203	index[char] = i
				204
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	205	print len(table), "ctype entries"
				206
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	207	FILE = "Objects/unicodetype_db.h"
				208
				209	sys.stdout = open(FILE, "w")
				210
				211	print "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
				212	print
				213	print "/* a list of unique character type descriptors */"
				214	print "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
				215	for item in table:
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	216	print " {%d, %d, %d, %d, %d, %d}," % item
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	217	print "};"
				218	print
				219
				220	# split decomposition index table
				221	index1, index2, shift = splitbins(index)
				222
				223	print "/* type indexes */"
				224	print "#define SHIFT", shift
				225	Array("index1", index1).dump(sys.stdout)
				226	Array("index2", index2).dump(sys.stdout)
				227
				228	sys.stdout = sys.__stdout__
				229
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	230	# --------------------------------------------------------------------
				231	# the following support code is taken from the unidb utilities
				232	# Copyright (c) 1999-2000 by Secret Labs AB
				233
				234	# load a unicode-data file from disk
				235
				236	import string, sys
				237
				238	class UnicodeData:
				239
				240	def __init__(self, filename):
				241	file = open(filename)
				242	table = [None] * 65536
				243	while 1:
				244	s = file.readline()
				245	if not s:
				246	break
				247	s = string.split(string.strip(s), ";")
				248	char = string.atoi(s[0], 16)
				249	table[char] = s
				250
				251	# public attributes
				252	self.filename = filename
				253	self.table = table
				254	self.chars = range(65536) # unicode
				255
				256	def uselatin1(self):
				257	# restrict character range to ISO Latin 1
				258	self.chars = range(256)
				259
				260	# stuff to deal with arrays of unsigned integers
				261
				262	class Array:
				263
				264	def __init__(self, name, data):
				265	self.name = name
				266	self.data = data
				267
				268	def dump(self, file):
				269	# write data to file, as a C array
				270	size = getsize(self.data)
				271	# print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
				272	file.write("static ")
				273	if size == 1:
				274	file.write("unsigned char")
				275	elif size == 2:
				276	file.write("unsigned short")
				277	else:
				278	file.write("unsigned int")
				279	file.write(" " + self.name + "[] = {\n")
				280	if self.data:
				281	s = " "
				282	for item in self.data:
				283	i = str(item) + ", "
				284	if len(s) + len(i) > 78:
				285	file.write(s + "\n")
				286	s = " " + i
				287	else:
				288	s = s + i
				289	if string.strip(s):
				290	file.write(s + "\n")
				291	file.write("};\n\n")
				292
				293	def getsize(data):
				294	# return smallest possible integer size for the given array
				295	maxdata = max(data)
				296	if maxdata < 256:
				297	return 1
				298	elif maxdata < 65536:
				299	return 2
				300	else:
				301	return 4
				302
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    An empty t yields ([], [], 0).

    If optional arg trace is true (default false), progress info is
    printed to sys.stderr.
    """

    import sys

    t = tuple(t)  # so slices can be dict keys
    if not t:
        # nothing to split, and getsize() has nothing to measure
        return [], [], 0

    def report(t1, t2, shift, nbytes):
        sys.stderr.write("%d+%d bins at shift %d; %d bytes\n" % (
            len(t1), len(t2), shift, nbytes))

    if trace:
        sys.stderr.write("Size of original table: %d bytes\n"
                         % (len(t) * getsize(t)))

    n = len(t) - 1  # last valid index; >= 0 because t is not empty
    maxshift = 0    # the most we can shift n and still have something left
    while n >> 1:
        n >>= 1
        maxshift += 1
    del n

    best = None
    best_bytes = None  # smallest total size so far
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2 ** shift
        bincache = {}
        for i in range(0, len(t), size):
            chunk = t[i:i + size]
            index = bincache.get(chunk)
            if index is None:
                # first time this run of values is seen: store it in t2
                index = len(t2)
                bincache[chunk] = index
                t2.extend(chunk)
            t1.append(index >> shift)
        # determine memory size
        nbytes = len(t1) * getsize(t1) + len(t2) * getsize(t2)
        if trace:
            report(t1, t2, shift, nbytes)
        if best_bytes is None or nbytes < best_bytes:
            best = t1, t2, shift
            best_bytes = nbytes
    t1, t2, shift = best
    if trace:
        sys.stderr.write("Best: ")
        report(t1, t2, shift, best_bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)  # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	363
				364	if __name__ == "__main__":
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	365	maketables()