Blame - tools/localedata/extract_icu_data.py - platform/frameworks/base

blob: 9dceba2163ebaa23e3daecd5d264d48e9790bd71 [file] [log] [blame]

Roozbeh Pournader	b927c55	2016-01-15 11:23:42 -0800	[diff] [blame]	1	#!/usr/bin/env python
				2	#
				3	# Copyright 2016 The Android Open Source Project. All Rights Reserved.
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the "License");
				6	# you may not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# http://www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an "AS IS" BASIS,
				13	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
				16	#
				17
				18	"""Generate a C++ data table containing locale data."""
				19
				20	import collections
				21	import glob
				22	import os.path
				23	import sys
				24
				25
				26	def get_locale_parts(locale):
				27	"""Split a locale into three parts, for langauge, script, and region."""
				28	parts = locale.split('_')
				29	if len(parts) == 1:
				30	return (parts[0], None, None)
				31	elif len(parts) == 2:
				32	if len(parts[1]) == 4: # parts[1] is a script
				33	return (parts[0], parts[1], None)
				34	else:
				35	return (parts[0], None, parts[1])
				36	else:
				37	assert len(parts) == 3
				38	return tuple(parts)
				39
				40
				41	def read_likely_subtags(input_file_name):
				42	"""Read and parse ICU's likelySubtags.txt."""
				43	with open(input_file_name) as input_file:
				44	likely_script_dict = {
				45	# Android's additions for pseudo-locales. These internal codes make
				46	# sure that the pseudo-locales would not match other English or
				47	# Arabic locales. (We can't use private-use ISO 15924 codes, since
				48	# they may be used by apps for other purposes.)
				49	"en_XA": "~~~A",
				50	"ar_XB": "~~~B",
Roozbeh Pournader	9bddb48	2017-01-13 17:37:04 -0800	[diff] [blame]	51	# Removed data from later versions of ICU
				52	"ji": "Hebr", # Old code for Yiddish, still used in Java and Android
Roozbeh Pournader	b927c55	2016-01-15 11:23:42 -0800	[diff] [blame]	53	}
				54	representative_locales = {
				55	# Android's additions
				56	"en_Latn_GB", # representative for en_Latn_001
				57	"es_Latn_MX", # representative for es_Latn_419
				58	"es_Latn_US", # representative for es_Latn_419 (not the best idea,
				59	# but Android has been shipping with it for quite a
				60	# while. Fortunately, MX < US, so if both exist, MX
				61	# would be chosen.)
				62	}
				63	for line in input_file:
				64	line = unicode(line, 'UTF-8').strip(u' \n\uFEFF').encode('UTF-8')
				65	if line.startswith('//'):
				66	continue
				67	if '{' in line and '}' in line:
				68	from_locale = line[:line.index('{')]
				69	to_locale = line[line.index('"')+1:line.rindex('"')]
				70	from_lang, from_scr, from_region = get_locale_parts(from_locale)
				71	_, to_scr, to_region = get_locale_parts(to_locale)
				72	if from_lang == 'und':
				73	continue # not very useful for our purposes
Roozbeh Pournader	9bddb48	2017-01-13 17:37:04 -0800	[diff] [blame]	74	if from_region is None and to_region not in ['001', 'ZZ']:
Roozbeh Pournader	b927c55	2016-01-15 11:23:42 -0800	[diff] [blame]	75	representative_locales.add(to_locale)
				76	if from_scr is None:
				77	likely_script_dict[from_locale] = to_scr
				78	return likely_script_dict, frozenset(representative_locales)
				79
				80
				81	# From packLanguageOrRegion() in ResourceTypes.cpp
				82	def pack_language_or_region(inp, base):
				83	"""Pack langauge or region in a two-byte tuple."""
				84	if inp is None:
				85	return (0, 0)
				86	elif len(inp) == 2:
				87	return ord(inp[0]), ord(inp[1])
				88	else:
				89	assert len(inp) == 3
				90	base = ord(base)
				91	first = ord(inp[0]) - base
				92	second = ord(inp[1]) - base
				93	third = ord(inp[2]) - base
				94
				95	return (0x80 \| (third << 2) \| (second >>3),
				96	((second << 5) \| first) & 0xFF)
				97
				98
				99	# From packLanguage() in ResourceTypes.cpp
				100	def pack_language(language):
				101	"""Pack language in a two-byte tuple."""
				102	return pack_language_or_region(language, 'a')
				103
				104
				105	# From packRegion() in ResourceTypes.cpp
				106	def pack_region(region):
				107	"""Pack region in a two-byte tuple."""
				108	return pack_language_or_region(region, '0')
				109
				110
				111	def pack_to_uint32(locale):
				112	"""Pack language+region of locale into a 32-bit unsigned integer."""
				113	lang, _, region = get_locale_parts(locale)
				114	plang = pack_language(lang)
				115	pregion = pack_region(region)
				116	return (plang[0] << 24) \| (plang[1] << 16) \| (pregion[0] << 8) \| pregion[1]
				117
				118
				119	def dump_script_codes(all_scripts):
				120	"""Dump the SCRIPT_CODES table."""
				121	print 'const char SCRIPT_CODES[][4] = {'
				122	for index, script in enumerate(all_scripts):
				123	print " /* %-2d */ {'%c', '%c', '%c', '%c'}," % (
				124	index, script[0], script[1], script[2], script[3])
				125	print '};'
				126	print
				127
				128
				129	def dump_script_data(likely_script_dict, all_scripts):
				130	"""Dump the script data."""
				131	print
				132	print 'const std::unordered_map<uint32_t, uint8_t> LIKELY_SCRIPTS({'
				133	for locale in sorted(likely_script_dict.keys()):
				134	script = likely_script_dict[locale]
				135	print ' {0x%08Xu, %2du}, // %s -> %s' % (
				136	pack_to_uint32(locale),
				137	all_scripts.index(script),
				138	locale.replace('_', '-'),
				139	script)
				140	print '});'
				141
				142
				143	def pack_to_uint64(locale):
				144	"""Pack a full locale into a 64-bit unsigned integer."""
				145	_, script, _ = get_locale_parts(locale)
				146	return ((pack_to_uint32(locale) << 32) \|
				147	(ord(script[0]) << 24) \|
				148	(ord(script[1]) << 16) \|
				149	(ord(script[2]) << 8) \|
				150	ord(script[3]))
				151
				152
				153	def dump_representative_locales(representative_locales):
				154	"""Dump the set of representative locales."""
				155	print
				156	print 'std::unordered_set<uint64_t> REPRESENTATIVE_LOCALES({'
				157	for locale in sorted(representative_locales):
				158	print ' 0x%08Xllu, // %s' % (
				159	pack_to_uint64(locale),
				160	locale)
				161	print '});'
				162
				163
				164	def read_and_dump_likely_data(icu_data_dir):
				165	"""Read and dump the likely-script data."""
				166	likely_subtags_txt = os.path.join(icu_data_dir, 'misc', 'likelySubtags.txt')
				167	likely_script_dict, representative_locales = read_likely_subtags(
				168	likely_subtags_txt)
				169
				170	all_scripts = list(set(likely_script_dict.values()))
				171	assert len(all_scripts) <= 256
				172	all_scripts.sort()
				173
				174	dump_script_codes(all_scripts)
				175	dump_script_data(likely_script_dict, all_scripts)
				176	dump_representative_locales(representative_locales)
				177	return likely_script_dict
				178
				179
				180	def read_parent_data(icu_data_dir):
				181	"""Read locale parent data from ICU data files."""
				182	all_icu_data_files = glob.glob(os.path.join(icu_data_dir, '', '.txt'))
				183	parent_dict = {}
				184	for data_file in all_icu_data_files:
				185	locale = os.path.splitext(os.path.basename(data_file))[0]
				186	with open(data_file) as input_file:
				187	for line in input_file:
				188	if '%%Parent' in line:
				189	parent = line[line.index('"')+1:line.rindex('"')]
				190	if locale in parent_dict:
				191	# Different files shouldn't have different parent info
				192	assert parent_dict[locale] == parent
				193	else:
				194	parent_dict[locale] = parent
				195	elif locale.startswith('ar_') and 'default{"latn"}' in line:
				196	# Arabic parent overrides for ASCII digits. Since
				197	# Unicode extensions are not supported in ResourceTypes,
				198	# we will use ar-015 (Arabic, Northern Africa) instead
				199	# of the more correct ar-u-nu-latn.
				200	parent_dict[locale] = 'ar_015'
				201	return parent_dict
				202
				203
				204	def get_likely_script(locale, likely_script_dict):
				205	"""Find the likely script for a locale, given the likely-script dictionary.
				206	"""
				207	if locale.count('_') == 2:
				208	# it already has a script
				209	return locale.split('_')[1]
				210	elif locale in likely_script_dict:
				211	return likely_script_dict[locale]
				212	else:
				213	language = locale.split('_')[0]
				214	return likely_script_dict[language]
				215
				216
				217	def dump_parent_data(script_organized_dict):
				218	"""Dump information for parents of locales."""
				219	sorted_scripts = sorted(script_organized_dict.keys())
				220	print
				221	for script in sorted_scripts:
				222	parent_dict = script_organized_dict[script]
				223	print ('const std::unordered_map<uint32_t, uint32_t> %s_PARENTS({'
				224	% script.upper())
				225	for locale in sorted(parent_dict.keys()):
				226	parent = parent_dict[locale]
				227	print ' {0x%08Xu, 0x%08Xu}, // %s -> %s' % (
				228	pack_to_uint32(locale),
				229	pack_to_uint32(parent),
				230	locale.replace('_', '-'),
				231	parent.replace('_', '-'))
				232	print '});'
				233	print
				234
				235	print 'const struct {'
				236	print ' const char script[4];'
				237	print ' const std::unordered_map<uint32_t, uint32_t>* map;'
				238	print '} SCRIPT_PARENTS[] = {'
				239	for script in sorted_scripts:
				240	print " {{'%c', '%c', '%c', '%c'}, &%s_PARENTS}," % (
				241	script[0], script[1], script[2], script[3],
				242	script.upper())
				243	print '};'
				244
				245
				246	def dump_parent_tree_depth(parent_dict):
				247	"""Find and dump the depth of the parent tree."""
				248	max_depth = 1
				249	for locale, _ in parent_dict.items():
				250	depth = 1
				251	while locale in parent_dict:
				252	locale = parent_dict[locale]
				253	depth += 1
				254	max_depth = max(max_depth, depth)
				255	assert max_depth < 5 # Our algorithms assume small max_depth
				256	print
				257	print 'const size_t MAX_PARENT_DEPTH = %d;' % max_depth
				258
				259
				260	def read_and_dump_parent_data(icu_data_dir, likely_script_dict):
				261	"""Read parent data from ICU and dump it."""
				262	parent_dict = read_parent_data(icu_data_dir)
				263	script_organized_dict = collections.defaultdict(dict)
				264	for locale in parent_dict:
				265	parent = parent_dict[locale]
				266	if parent == 'root':
				267	continue
				268	script = get_likely_script(locale, likely_script_dict)
				269	script_organized_dict[script][locale] = parent_dict[locale]
				270	dump_parent_data(script_organized_dict)
				271	dump_parent_tree_depth(parent_dict)
				272
				273
				274	def main():
				275	"""Read the data files from ICU and dump the output to a C++ file."""
				276	source_root = sys.argv[1]
				277	icu_data_dir = os.path.join(
				278	source_root,
				279	'external', 'icu', 'icu4c', 'source', 'data')
				280
				281	print '// Auto-generated by %s' % sys.argv[0]
				282	print
				283	likely_script_dict = read_and_dump_likely_data(icu_data_dir)
				284	read_and_dump_parent_data(icu_data_dir, likely_script_dict)
				285
				286
				287	if __name__ == '__main__':
				288	main()