#!/usr/bin/env python
#
# Copyright 2016 The Android Open Source Project. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
17
"""Generate a C++ data table containing locale data."""
19
import collections
import glob
import os.path
import sys
24
25
def get_locale_parts(locale):
    """Split a locale into three parts, for language, script, and region.

    Args:
        locale: An underscore-separated locale name, e.g. 'en', 'en_US',
            'sr_Latn' or 'zh_Hant_TW'.

    Returns:
        A (language, script, region) tuple. Script and/or region are None
        when the locale does not contain them.

    Raises:
        ValueError: If the locale has more than three subtags.
    """
    parts = locale.split('_')
    if len(parts) > 3:
        # Raised explicitly (instead of asserted) so the check survives -O.
        raise ValueError('too many subtags in locale %r' % (locale,))
    if len(parts) == 3:
        return tuple(parts)
    if len(parts) == 2:
        language, second = parts
        # A four-character second subtag is a script; otherwise a region.
        if len(second) == 4:
            return (language, second, None)
        return (language, None, second)
    return (parts[0], None, None)
39
40
def read_likely_subtags(input_file_name):
    """Read and parse ICU's likelySubtags.txt.

    Only lines of the form from_locale{"to_locale"} are used; comment lines
    starting with '//' and entries whose source language is 'und' are
    skipped.

    Args:
        input_file_name: Path of the UTF-8 encoded likelySubtags.txt file.

    Returns:
        A (likely_script_dict, representative_locales) pair.
        likely_script_dict maps every script-less source locale to the
        script of the locale it expands to. representative_locales is a
        frozenset of the expanded locales whose source locale has no region
        and whose own region is not '001', plus Android's additions.
    """
    likely_script_dict = {
        # Android's additions for pseudo-locales. These internal codes make
        # sure that the pseudo-locales would not match other English or
        # Arabic locales. (We can't use private-use ISO 15924 codes, since
        # they may be used by apps for other purposes.)
        "en_XA": "~~~A",
        "ar_XB": "~~~B",
    }
    representative_locales = {
        # Android's additions
        "en_Latn_GB",  # representative for en_Latn_001
        "es_Latn_MX",  # representative for es_Latn_419
        "es_Latn_US",  # representative for es_Latn_419 (not the best idea,
                       # but Android has been shipping with it for quite a
                       # while. Fortunately, MX < US, so if both exist, MX
                       # would be chosen.)
    }
    # Read bytes and decode explicitly so the result does not depend on the
    # interpreter version or on the platform's default text encoding.
    with open(input_file_name, 'rb') as input_file:
        for raw_line in input_file:
            # Strip padding, the newline and a possible byte-order mark.
            line = raw_line.decode('UTF-8').strip(u' \n\uFEFF')
            if not isinstance(line, str):
                # Python 2: continue with a native byte string.
                line = line.encode('UTF-8')
            if line.startswith('//'):
                continue
            if '{' not in line or '}' not in line:
                continue
            from_locale = line[:line.index('{')]
            to_locale = line[line.index('"') + 1:line.rindex('"')]
            from_lang, from_scr, from_region = get_locale_parts(from_locale)
            _, to_scr, to_region = get_locale_parts(to_locale)
            if from_lang == 'und':
                continue  # not very useful for our purposes
            if from_region is None and to_region != '001':
                representative_locales.add(to_locale)
            if from_scr is None:
                likely_script_dict[from_locale] = to_scr
    return likely_script_dict, frozenset(representative_locales)
77
78
# From packLanguageOrRegion() in ResourceTypes.cpp
def pack_language_or_region(inp, base):
    """Pack language or region in a two-byte tuple.

    Args:
        inp: A two- or three-character code, or None.
        base: The character that three-character codes are measured from.

    Returns:
        A tuple of two integers. None packs to (0, 0). A two-character code
        packs to the ordinals of its characters. A three-character code is
        packed as offsets from base, with the high bit of the first byte
        set.

    Raises:
        ValueError: If inp is neither None nor two or three characters long.
    """
    if inp is None:
        return (0, 0)
    if len(inp) == 2:
        return (ord(inp[0]), ord(inp[1]))
    if len(inp) != 3:
        # Raised explicitly (instead of asserted) so that a bad code is
        # never silently mispacked when assertions are disabled.
        raise ValueError('cannot pack %r: expected 2 or 3 characters'
                         % (inp,))

    offset = ord(base)
    first = ord(inp[0]) - offset
    second = ord(inp[1]) - offset
    third = ord(inp[2]) - offset

    return (0x80 | (third << 2) | (second >> 3),
            ((second << 5) | first) & 0xFF)
95
96
# From packLanguage() in ResourceTypes.cpp
def pack_language(language):
    """Pack language in a two-byte tuple.

    Three-letter languages are packed relative to the letter 'a'.
    """
    return pack_language_or_region(language, 'a')
101
102
# From packRegion() in ResourceTypes.cpp
def pack_region(region):
    """Pack region in a two-byte tuple.

    Three-character regions are packed relative to the digit '0'.
    """
    return pack_language_or_region(region, '0')
107
108
def pack_to_uint32(locale):
    """Pack language+region of locale into a 32-bit unsigned integer.

    The two packed language bytes form the upper 16 bits and the two packed
    region bytes the lower 16 bits. The script of the locale is ignored.
    """
    lang, _, region = get_locale_parts(locale)
    lang_hi, lang_lo = pack_language(lang)
    region_hi, region_lo = pack_region(region)
    return (lang_hi << 24) | (lang_lo << 16) | (region_hi << 8) | region_lo
115
116
def dump_script_codes(all_scripts):
    """Dump the SCRIPT_CODES table.

    Writes one C++ row per script to standard output, annotated with the
    script's index in all_scripts.

    Args:
        all_scripts: A sequence of four-character script codes.
    """
    # Parenthesized single-argument print calls produce the same output
    # under Python 2's print statement and Python 3's print function.
    print('const char SCRIPT_CODES[][4] = {')
    for index, script in enumerate(all_scripts):
        print(" /* %-2d */ {'%c', '%c', '%c', '%c'}," % (
            index, script[0], script[1], script[2], script[3]))
    print('};')
    print('')
125
126
def dump_script_data(likely_script_dict, all_scripts):
    """Dump the script data.

    Writes the LIKELY_SCRIPTS C++ map to standard output: one entry per
    locale, in sorted order, mapping the packed locale to the index of its
    script in all_scripts.

    Args:
        likely_script_dict: Maps locale names to script codes.
        all_scripts: List of script codes; must contain every script that
            appears as a value of likely_script_dict.
    """
    print('')
    print('const std::unordered_map<uint32_t, uint8_t> LIKELY_SCRIPTS({')
    for locale in sorted(likely_script_dict):
        script = likely_script_dict[locale]
        print(' {0x%08Xu, %2du}, // %s -> %s' % (
            pack_to_uint32(locale),
            all_scripts.index(script),
            locale.replace('_', '-'),
            script))
    print('});')
139
140
def pack_to_uint64(locale):
    """Pack a full locale into a 64-bit unsigned integer.

    The upper 32 bits hold the packed language and region; the lower 32
    bits hold the ordinals of the first four script characters, first
    character highest. The locale must contain a script.
    """
    _, script, _ = get_locale_parts(locale)
    return ((pack_to_uint32(locale) << 32) |
            (ord(script[0]) << 24) |
            (ord(script[1]) << 16) |
            (ord(script[2]) << 8) |
            ord(script[3]))
149
150
def dump_representative_locales(representative_locales):
    """Dump the set of representative locales.

    Writes the REPRESENTATIVE_LOCALES C++ set to standard output, one
    packed 64-bit value per locale, in sorted locale order.

    Args:
        representative_locales: An iterable of locale names that each
            contain a script.
    """
    print('')
    print('std::unordered_set<uint64_t> REPRESENTATIVE_LOCALES({')
    for locale in sorted(representative_locales):
        print(' 0x%08Xllu, // %s' % (
            pack_to_uint64(locale),
            locale))
    print('});')
160
161
def read_and_dump_likely_data(icu_data_dir):
    """Read and dump the likely-script data.

    Args:
        icu_data_dir: ICU data directory; misc/likelySubtags.txt is read
            from it.

    Returns:
        The dictionary mapping locales to their likely scripts.
    """
    likely_subtags_txt = os.path.join(icu_data_dir, 'misc', 'likelySubtags.txt')
    likely_script_dict, representative_locales = read_likely_subtags(
        likely_subtags_txt)

    all_scripts = sorted(set(likely_script_dict.values()))
    # Script indices are emitted as uint8_t values in LIKELY_SCRIPTS.
    assert len(all_scripts) <= 256, 'too many scripts for a uint8_t index'

    dump_script_codes(all_scripts)
    dump_script_data(likely_script_dict, all_scripts)
    dump_representative_locales(representative_locales)
    return likely_script_dict
176
177
def read_parent_data(icu_data_dir):
    """Read locale parent data from ICU data files.

    Scans every '*.txt' file found one directory level below icu_data_dir.
    The locale is taken from the file name, and its parent from the quoted
    value on a line containing '%%Parent'.

    Args:
        icu_data_dir: ICU data directory.

    Returns:
        A dictionary mapping locale names to the names of their parents.
    """
    # Sorted so that files are processed in the same order on every run.
    all_icu_data_files = sorted(
        glob.glob(os.path.join(icu_data_dir, '*', '*.txt')))
    parent_dict = {}
    for data_file in all_icu_data_files:
        locale = os.path.splitext(os.path.basename(data_file))[0]
        # Read bytes: only ASCII markers are searched for, so the files'
        # remaining content never needs decoding.
        with open(data_file, 'rb') as input_file:
            for line in input_file:
                if b'%%Parent' in line:
                    parent = line[line.index(b'"') + 1:line.rindex(b'"')]
                    if not isinstance(parent, str):
                        # Python 3: turn the bytes into a native string.
                        parent = parent.decode('UTF-8')
                    if locale in parent_dict:
                        # Different files shouldn't have different parent
                        # info
                        assert parent_dict[locale] == parent, (
                            'conflicting parents for %s' % locale)
                    else:
                        parent_dict[locale] = parent
                elif locale.startswith('ar_') and b'default{"latn"}' in line:
                    # Arabic parent overrides for ASCII digits. Since
                    # Unicode extensions are not supported in ResourceTypes,
                    # we will use ar-015 (Arabic, Northern Africa) instead
                    # of the more correct ar-u-nu-latn.
                    parent_dict[locale] = 'ar_015'
    return parent_dict
200
201
def get_likely_script(locale, likely_script_dict):
    """Find the likely script for a locale, given the likely-script dictionary.

    Args:
        locale: An underscore-separated locale name.
        likely_script_dict: Maps script-less locales to their likely script.

    Returns:
        The script named in the locale if it has one; otherwise the
        dictionary entry for the locale, or failing that, for its language.

    Raises:
        KeyError: If neither the locale nor its language is in the
            dictionary.
    """
    subtags = locale.split('_')
    if len(subtags) == 3:
        # it already has a script
        return subtags[1]
    if len(subtags) == 2 and len(subtags[1]) == 4:
        # language_Script: a four-character second subtag is a script, so
        # it must win over the language's default script.
        return subtags[1]
    if locale in likely_script_dict:
        return likely_script_dict[locale]
    return likely_script_dict[subtags[0]]
213
214
def dump_parent_data(script_organized_dict):
    """Dump information for parents of locales.

    Writes one <SCRIPT>_PARENTS C++ map per script to standard output,
    followed by the SCRIPT_PARENTS array that pairs each script code with
    a pointer to its map. Scripts and locales are emitted in sorted order.

    Args:
        script_organized_dict: Maps each script code to a dictionary that
            maps locale names to the names of their parents.
    """
    sorted_scripts = sorted(script_organized_dict)
    print('')
    for script in sorted_scripts:
        parent_dict = script_organized_dict[script]
        print('const std::unordered_map<uint32_t, uint32_t> %s_PARENTS({'
              % script.upper())
        for locale in sorted(parent_dict):
            parent = parent_dict[locale]
            print(' {0x%08Xu, 0x%08Xu}, // %s -> %s' % (
                pack_to_uint32(locale),
                pack_to_uint32(parent),
                locale.replace('_', '-'),
                parent.replace('_', '-')))
        print('});')
        print('')

    print('const struct {')
    print(' const char script[4];')
    print(' const std::unordered_map<uint32_t, uint32_t>* map;')
    print('} SCRIPT_PARENTS[] = {')
    for script in sorted_scripts:
        print(" {{'%c', '%c', '%c', '%c'}, &%s_PARENTS}," % (
            script[0], script[1], script[2], script[3],
            script.upper()))
    print('};')
242
243
def dump_parent_tree_depth(parent_dict):
    """Find and dump the depth of the parent tree.

    The depth of a locale is the number of locales on its parent chain,
    itself included. The largest depth is written to standard output as
    the MAX_PARENT_DEPTH C++ constant.

    Args:
        parent_dict: Maps locale names to the names of their parents.

    Raises:
        ValueError: If the parent chains contain a cycle.
    """
    # No acyclic chain can visit more locales than there are keys, plus
    # the final parent that is not itself a key.
    longest_possible = len(parent_dict) + 1
    max_depth = 1
    for start in parent_dict:
        locale = start
        depth = 1
        while locale in parent_dict:
            locale = parent_dict[locale]
            depth += 1
            if depth > longest_possible:
                raise ValueError('cycle in parent data at %r' % (start,))
        max_depth = max(max_depth, depth)
    assert max_depth < 5  # Our algorithms assume small max_depth
    print('')
    print('const size_t MAX_PARENT_DEPTH = %d;' % max_depth)
256
257
def read_and_dump_parent_data(icu_data_dir, likely_script_dict):
    """Read parent data from ICU and dump it.

    Locales whose parent is 'root' are left out of the per-script parent
    maps, but still take part in the parent tree depth computation.

    Args:
        icu_data_dir: ICU data directory.
        likely_script_dict: Maps script-less locales to their likely script.
    """
    parent_dict = read_parent_data(icu_data_dir)
    # Group the parent entries by the script of the child locale.
    script_organized_dict = collections.defaultdict(dict)
    for locale, parent in parent_dict.items():
        if parent == 'root':
            continue
        script = get_likely_script(locale, likely_script_dict)
        script_organized_dict[script][locale] = parent
    dump_parent_data(script_organized_dict)
    dump_parent_tree_depth(parent_dict)
269 dump_parent_tree_depth(parent_dict)
270
271
def main():
    """Read the data files from ICU and dump the output to a C++ file.

    The first command-line argument is the source root; ICU data is read
    from external/icu/icu4c/source/data below it. The generated C++ code
    is written to standard output.
    """
    if len(sys.argv) < 2:
        # Exit with a usage message rather than an IndexError traceback.
        sys.exit('Usage: %s <source_root>' % sys.argv[0])
    source_root = sys.argv[1]
    icu_data_dir = os.path.join(
        source_root,
        'external', 'icu', 'icu4c', 'source', 'data')

    print('// Auto-generated by %s' % sys.argv[0])
    print('')
    likely_script_dict = read_and_dump_likely_data(icu_data_dir)
    read_and_dump_parent_data(icu_data_dir, likely_script_dict)
283
284
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()