Blame - python/lib/Lib/unicodedata.py - platform/tools/idea

blob: 6e6f5eb477fcc7640b6ca54b2ea4d4afd5824978 [file] [log] [blame]

Tor Norbye	3a2425a	2013-11-04 10:16:08 -0800	[diff] [blame]	1	from bisect import bisect_left
				2	import operator
				3	import java.lang.Character
				4
				5	# XXX - this is intended as a stopgap measure until 2.5.1, which will have a Java implementation
				6	# requires java 6 for `normalize` function
				7	# only has one version of the database
				8	# does not normalize ideographs
				9
				10	_codepoints = {}
				11	_eaw = {}
				12	_names = {}
				13	_segments = []
				14	_eaw_segments = []
				15	Nonesuch = object()
				16
				17	def get_int(col):
				18	try:
				19	return int(col)
				20	except ValueError:
				21	return None
				22
				23	def get_yn(col):
				24	if col == 'Y': return 1
				25	else: return 0
				26
				27	def get_numeric(col):
				28	try:
				29	return float(col)
				30	except ValueError:
				31	try:
				32	a, b = col.split('/')
				33	return float(a)/float(b)
				34	except:
				35	return None
				36
				37	def init_unicodedata(data):
				38	for row in data:
				39	cols = row.split(';')
				40	codepoint = int(cols[0], 16)
				41	name = cols[1]
				42	if name == '<CJK Ideograph, Last>':
				43	lookup_name = 'CJK UNIFIED IDEOGRAPH'
				44	else:
				45	lookup_name = name
				46	data = (
				47	cols[2],
				48	get_int(cols[3]),
				49	cols[4],
				50	cols[5],
				51	get_int(cols[6]),
				52	get_int(cols[7]),
				53	get_numeric(cols[8]),
				54	get_yn(cols[9]),
				55	lookup_name,
				56	)
				57
				58	if name.find('First') >= 0:
				59	start = codepoint
				60	elif name.find('Last') >= 0:
				61	_segments.append((start, (start, codepoint), data))
				62	else:
				63	_names[name] = unichr(codepoint)
				64	_codepoints[codepoint] = data
				65
				66	def init_east_asian_width(data):
				67	for row in data:
				68	if row.startswith('#'):
				69	continue
				70	row = row.partition('#')[0]
				71	cols = row.split(';')
				72	if len(cols) < 2:
				73	continue
				74	cr = cols[0].split('..')
				75	width = cols[1].rstrip()
				76	if len(cr) == 1:
				77	codepoint = int(cr[0], 16)
				78	_eaw[codepoint] = width
				79	else:
				80	start = int(cr[0], 16)
				81	end = int(cr[1], 16)
				82	_eaw_segments.append((start, (start, end), width))
				83
				84	# xxx - need to normalize the segments, so
				85	# <CJK Ideograph, Last> ==> CJK UNIFIED IDEOGRAPH;
				86	# may need to do some sort of analysis against CPython for the normalization!
				87
				88	def name(unichr, default=None):
				89	codepoint = get_codepoint(unichr, "name")
				90	v = _codepoints.get(codepoint, None)
				91	if v is None:
				92	v = check_segments(codepoint, _segments)
				93	if v is not None:
				94	return "%s-%X" % (v[8], codepoint)
				95
				96	if v is None:
				97	if default is not Nonesuch:
				98	return default
				99	raise ValueError()
				100	return v[8]
				101
				102	# xxx - also need to add logic here so that if it's CJK UNIFIED
				103	# IDEOGRAPH-8000, we go against the segment to verify the prefix
				104
				105	def lookup(name):
				106	return _names[name]
				107
				108	def check_segments(codepoint, segments):
				109	i = bisect_left(segments, (codepoint,))
				110	if i < len(segments):
				111	segment = segments[i - 1]
				112	if codepoint <= segment[1][1]:
				113	return segment[2]
				114	return None
				115
				116
				117	def get_codepoint(unichr, fn=None):
				118	if not(isinstance(unichr, unicode)):
				119	raise TypeError(fn, "() argument 1 must be unicode, not " + type(unichr))
				120	if len(unichr) > 1 or len(unichr) == 0:
				121	raise TypeError("need a single Unicode character as parameter")
				122	return ord(unichr)
				123
				124	def get_eaw(unichr, default, fn):
				125	codepoint = get_codepoint(unichr, fn)
				126	v = _eaw.get(codepoint, None)
				127	if v is None:
				128	v = check_segments(codepoint, _eaw_segments)
				129
				130	if v is None:
				131	if default is not Nonesuch:
				132	return default
				133	raise ValueError()
				134	return v
				135
				136	def get(unichr, default, fn, getter):
				137	codepoint = get_codepoint(unichr, fn)
				138	data = _codepoints.get(codepoint, None)
				139	if data is None:
				140	data = check_segments(codepoint, _segments)
				141	if data is None:
				142	if default is not Nonesuch:
				143	return default
				144	raise ValueError()
				145	v = getter(data)
				146	if v is None:
				147	if default is not Nonesuch:
				148	return default
				149	raise ValueError()
				150	else:
				151	return v
				152
				153	category_getter = operator.itemgetter(0)
				154	combining_getter = operator.itemgetter(1)
				155	bidirectional_getter = operator.itemgetter(2)
				156	decomposition_getter = operator.itemgetter(3)
				157	decimal_getter = operator.itemgetter(4)
				158	digit_getter = operator.itemgetter(5)
				159	numeric_getter = operator.itemgetter(6)
				160	mirrored_getter = operator.itemgetter(7)
				161
				162	def decimal(unichr, default=Nonesuch):
				163	return get(unichr, default, 'decimal', decimal_getter)
				164
				165	def decomposition(unichr, default=''):
				166	return get(unichr, default, 'decomposition', decomposition_getter)
				167
				168	def digit(unichr, default=Nonesuch):
				169	return get(unichr, default, 'digit', digit_getter)
				170
				171	def numeric(unichr, default=Nonesuch):
				172	return get(unichr, default, 'numeric', numeric_getter)
				173
				174	def category(unichr):
				175	return get(unichr, 'Cn', 'catgegory', category_getter)
				176
				177	def bidirectional(unichr):
				178	return get(unichr, '', 'bidirectional', bidirectional_getter)
				179
				180	def combining(unichr):
				181	return get(unichr, 0, 'combining', combining_getter)
				182
				183	def mirrored(unichr):
				184	return get(unichr, 0, 'mirrored', mirrored_getter)
				185
				186	def east_asian_width(unichr):
				187	return get_eaw(unichr, 'N', 'east_asian_width')
				188
				189	def jymirrored(unichr):
				190	return java.lang.Character.isMirrored(get_codepoint(unichr, 'mirrored'))
				191
				192	try:
				193	from java.text import Normalizer
				194
				195	_forms = {
				196	'NFC': Normalizer.Form.NFC,
				197	'NFKC': Normalizer.Form.NFKC,
				198	'NFD': Normalizer.Form.NFD,
				199	'NFKD': Normalizer.Form.NFKD
				200	}
				201
				202	def normalize(form, unistr):
				203	"""
				204	Return the normal form 'form' for the Unicode string unistr. Valid
				205	values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
				206	"""
				207
				208	try:
				209	normalizer_form = _forms[form]
				210	except KeyError:
				211	raise ValueError('invalid normalization form')
				212	return Normalizer.normalize(unistr, normalizer_form)
				213
				214	except ImportError:
				215	pass
				216
				217
				218	def init():
				219	import pkgutil
				220	import os.path
				221	import StringIO
				222	import sys
				223
				224	my_path = os.path.dirname(__file__)
				225	loader = pkgutil.get_loader('unicodedata')
				226	init_unicodedata(StringIO.StringIO(loader.get_data(os.path.join(my_path, 'UnicodeData.txt'))))
				227	init_east_asian_width(StringIO.StringIO(loader.get_data(os.path.join(my_path, 'EastAsianWidth.txt'))))
				228
				229	init()