Blame - Tools/unicode/gencodec.py - platform/external/python/cpython2

blob: 7bce3d5f7f09d73029159b46fa98901b24f11747 [file] [log] [blame]

Marc-André Lemburg	c5694c8	2005-10-21 13:45:17 +0000	[diff] [blame^]	1	""" Unicode Mapping Parser and Codec Generator.
				2
				3	This script parses Unicode mapping files as available from the Unicode
				4	site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
				5	modules from them. The codecs use the standard character mapping codec
				6	to actually apply the mapping.
				7
				8	Synopsis: gencodec.py dir codec_prefix
				9
				10	All files in dir are scanned and those producing non-empty mappings
				11	will be written to <codec_prefix><mapname>.py with <mapname> being the
				12	first part of the map's filename ('a' in a.b.c.txt) converted to
				13	lowercase with hyphens replaced by underscores.
				14
				15	The tool also writes marshalled versions of the mapping tables to the
				16	same location (with .mapping extension).
				17
				18	Written by Marc-Andre Lemburg (mal@lemburg.com). Modified to generate
				19	Unicode table maps for decoding.
				20
				21	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				22	(c) Copyright Guido van Rossum, 2000.
				23	(c) Copyright Marc-Andre Lemburg, 2005.
				24
				25	"""#"
				26
				27	import re, os, time, marshal, codecs
				28
				29	# Maximum allowed size of charmap tables
				30	MAX_TABLE_SIZE = 8192
				31
				32	# Standard undefined Unicode code point
				33	UNI_UNDEFINED = unichr(0xFFFE)
				34
				35	mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)'
				36	'\s+'
				37	'((?:(?:0x[0-9a-fA-Z]+\|<[A-Za-z]+>)\+?)*)'
				38	'\s*'
				39	'(#.+)?')
				40
				41	def parsecodes(codes,
				42	len=len, filter=filter,range=range):
				43
				44	""" Converts code combinations to either a single code integer
				45	or a tuple of integers.
				46
				47	meta-codes (in angular brackets, e.g. <LR> and <RL>) are
				48	ignored.
				49
				50	Empty codes or illegal ones are returned as None.
				51
				52	"""
				53	if not codes:
				54	return None
				55	l = codes.split('+')
				56	if len(l) == 1:
				57	return int(l[0],16)
				58	for i in range(len(l)):
				59	try:
				60	l[i] = int(l[i],16)
				61	except ValueError:
				62	l[i] = None
				63	l = filter(lambda x: x is not None, l)
				64	if len(l) == 1:
				65	return l[0]
				66	else:
				67	return tuple(l)
				68
				69	def readmap(filename):
				70
				71	f = open(filename,'r')
				72	lines = f.readlines()
				73	f.close()
				74	enc2uni = {}
				75	identity = []
				76	unmapped = range(256)
				77
				78	# UTC mapping tables per convention don't include the identity
				79	# mappings for code points 0x00 - 0x1F and 0x7F, unless these are
				80	# explicitly mapped to different characters or undefined
				81	for i in range(32) + [127]:
				82	identity.append(i)
				83	unmapped.remove(i)
				84	enc2uni[i] = (i, 'CONTROL CHARACTER')
				85
				86	for line in lines:
				87	line = line.strip()
				88	if not line or line[0] == '#':
				89	continue
				90	m = mapRE.match(line)
				91	if not m:
				92	#print '* not matched: %s' % repr(line)
				93	continue
				94	enc,uni,comment = m.groups()
				95	enc = parsecodes(enc)
				96	uni = parsecodes(uni)
				97	if comment is None:
				98	comment = ''
				99	else:
				100	comment = comment[1:].strip()
				101	if enc < 256:
				102	if enc in unmapped:
				103	unmapped.remove(enc)
				104	if enc == uni:
				105	identity.append(enc)
				106	enc2uni[enc] = (uni,comment)
				107	else:
				108	enc2uni[enc] = (uni,comment)
				109
				110	# If there are more identity-mapped entries than unmapped entries,
				111	# it pays to generate an identity dictionary first, and add explicit
				112	# mappings to None for the rest
				113	if len(identity) >= len(unmapped):
				114	for enc in unmapped:
				115	enc2uni[enc] = (None, "")
				116	enc2uni['IDENTITY'] = 256
				117
				118	return enc2uni
				119
				120	def hexrepr(t):
				121
				122	if t is None:
				123	return 'None'
				124	try:
				125	len(t)
				126	except:
				127	return '0x%04x' % t
				128	try:
				129	return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')'
				130	except TypeError, why:
				131	print '* failed to convert %r: %s' % (t, why)
				132	raise
				133
				134	def python_mapdef_code(varname, map, comments=1):
				135
				136	l = []
				137	append = l.append
				138	if map.has_key("IDENTITY"):
				139	append("%s = codecs.make_identity_dict(range(%d))" %
				140	(varname, map["IDENTITY"]))
				141	append("%s.update({" % varname)
				142	splits = 1
				143	del map["IDENTITY"]
				144	identity = 1
				145	else:
				146	append("%s = {" % varname)
				147	splits = 0
				148	identity = 0
				149
				150	mappings = map.items()
				151	mappings.sort()
				152	i = 0
				153	for mapkey, mapvalue in mappings:
				154	mapcomment = ''
				155	if isinstance(mapkey, tuple):
				156	(mapkey, mapcomment) = mapkey
				157	if isinstance(mapvalue, tuple):
				158	(mapvalue, mapcomment) = mapvalue
				159	if mapkey is None:
				160	continue
				161	if (identity and
				162	mapkey == mapvalue and
				163	mapkey < 256):
				164	# No need to include identity mappings, since these
				165	# are already set for the first 256 code points.
				166	continue
				167	key = hexrepr(mapkey)
				168	value = hexrepr(mapvalue)
				169	if mapcomment and comments:
				170	append(' %s: %s,\t# %s' % (key, value, mapcomment))
				171	else:
				172	append(' %s: %s,' % (key, value))
				173	i += 1
				174	if i == 4096:
				175	# Split the definition into parts to that the Python
				176	# parser doesn't dump core
				177	if splits == 0:
				178	append('}')
				179	else:
				180	append('})')
				181	append('%s.update({' % varname)
				182	i = 0
				183	splits = splits + 1
				184	if splits == 0:
				185	append('}')
				186	else:
				187	append('})')
				188
				189	return l
				190
				191	def python_tabledef_code(varname, map, comments=1):
				192
				193	l = []
				194	append = l.append
				195	append('%s = (' % varname)
				196
				197	# Analyze map and create table dict
				198	mappings = map.items()
				199	mappings.sort()
				200	table = {}
				201	maxkey = 0
				202	if map.has_key('IDENTITY'):
				203	for key in range(256):
				204	table[key] = (key, '')
				205	maxkey = 255
				206	del map['IDENTITY']
				207	for mapkey, mapvalue in mappings:
				208	mapcomment = ''
				209	if isinstance(mapkey, tuple):
				210	(mapkey, mapcomment) = mapkey
				211	if isinstance(mapvalue, tuple):
				212	(mapvalue, mapcomment) = mapvalue
				213	if mapkey is None:
				214	continue
				215	table[mapkey] = (mapvalue, mapcomment)
				216	if mapkey > maxkey:
				217	maxkey = mapkey
				218	if maxkey > MAX_TABLE_SIZE:
				219	# Table too large
				220	return None
				221
				222	# Create table code
				223	for key in range(maxkey + 1):
				224	if key not in table:
				225	mapvalue = None
				226	mapcomment = 'UNDEFINED'
				227	else:
				228	mapvalue, mapcomment = table[key]
				229	if mapvalue is None:
				230	mapchar = UNI_UNDEFINED
				231	else:
				232	if isinstance(mapvalue, tuple):
				233	# 1-n mappings not supported
				234	return None
				235	else:
				236	mapchar = unichr(mapvalue)
				237	if mapcomment and comments:
				238	append(' %r\t# %s -> %s' % (mapchar,
				239	hexrepr(key),
				240	mapcomment))
				241	else:
				242	append(' %r' % mapchar)
				243
				244	append(')')
				245	return l
				246
				247	def codegen(name, map, comments=1):
				248
				249	""" Returns Python source for the given map.
				250
				251	Comments are included in the source, if comments is true (default).
				252
				253	"""
				254	# Generate code
				255	decoding_map_code = python_mapdef_code(
				256	'decoding_map',
				257	map,
				258	comments=comments)
				259	decoding_table_code = python_tabledef_code(
				260	'decoding_table',
				261	map,
				262	comments=comments)
				263	encoding_map_code = python_mapdef_code(
				264	'encoding_map',
				265	codecs.make_encoding_map(map),
				266	comments=comments)
				267
				268	l = [
				269	'''\
				270	""" Python Character Mapping Codec generated from '%s' with gencodec.py.
				271
				272	"""#"
				273
				274	import codecs
				275
				276	### Codec APIs
				277
				278	class Codec(codecs.Codec):
				279
				280	def encode(self,input,errors='strict'):
				281
				282	return codecs.charmap_encode(input,errors,encoding_map)
				283
				284	def decode(self,input,errors='strict'):
				285	''' % name
				286	]
				287	if decoding_table_code:
				288	l.append('''\
				289	return codecs.charmap_decode(input,errors,decoding_table)''')
				290	else:
				291	l.append('''\
				292	return codecs.charmap_decode(input,errors,decoding_map)''')
				293
				294	l.append('''
				295	class StreamWriter(Codec,codecs.StreamWriter):
				296	pass
				297
				298	class StreamReader(Codec,codecs.StreamReader):
				299	pass
				300
				301	### encodings module API
				302
				303	def getregentry():
				304
				305	return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
				306
				307	### Decoding Map
				308	''')
				309	l.extend(decoding_map_code)
				310
				311	# Add optional decoding table
				312	if decoding_table_code:
				313	l.append('''
				314	### Decoding Table
				315	''')
				316	l.extend(decoding_table_code)
				317
				318	l.append('''
				319	### Encoding Map
				320	''')
				321	l.extend(encoding_map_code)
				322
				323	return '\n'.join(l)
				324
				325	def pymap(name,map,pyfile,comments=1):
				326
				327	code = codegen(name,map,comments)
				328	f = open(pyfile,'w')
				329	f.write(code)
				330	f.close()
				331
				332	def marshalmap(name,map,marshalfile):
				333
				334	d = {}
				335	for e,(u,c) in map.items():
				336	d[e] = (u,c)
				337	f = open(marshalfile,'wb')
				338	marshal.dump(d,f)
				339	f.close()
				340
				341	def convertdir(dir,prefix='',comments=1):
				342
				343	mapnames = os.listdir(dir)
				344	for mapname in mapnames:
				345	mappathname = os.path.join(dir, mapname)
				346	name = os.path.split(mapname)[1]
				347	name = name.replace('-','_')
				348	name = name.split('.')[0]
				349	name = name.lower()
				350	codefile = name + '.py'
				351	marshalfile = name + '.mapping'
				352	print 'converting %s to %s and %s' % (mapname,
				353	prefix + codefile,
				354	prefix + marshalfile)
				355	try:
				356	map = readmap(os.path.join(dir,mapname))
				357	if not map:
				358	print '* map is empty; skipping'
				359	else:
				360	pymap(mappathname, map, prefix + codefile,comments)
				361	marshalmap(mappathname, map, prefix + marshalfile)
				362	except ValueError, why:
				363	print '* conversion failed: %s' % why
				364	raise
				365
				366	def rewritepythondir(dir,prefix='',comments=1):
				367
				368	mapnames = os.listdir(dir)
				369	for mapname in mapnames:
				370	if not mapname.endswith('.mapping'):
				371	continue
				372	codefile = mapname[:-len('.mapping')] + '.py'
				373	print 'converting %s to %s' % (mapname,
				374	prefix + codefile)
				375	try:
				376	map = marshal.load(open(os.path.join(dir,mapname),
				377	'rb'))
				378	if not map:
				379	print '* map is empty; skipping'
				380	else:
				381	pymap(mapname, map, prefix + codefile,comments)
				382	except ValueError, why:
				383	print '* conversion failed: %s' % why
				384
				385	if __name__ == '__main__':
				386
				387	import sys
				388	if 1:
				389	apply(convertdir,tuple(sys.argv[1:]))
				390	else:
				391	apply(rewritepythondir,tuple(sys.argv[1:]))