Roozbeh Pournader | b927c55 | 2016-01-15 11:23:42 -0800 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | # |
| 3 | # Copyright 2016 The Android Open Source Project. All Rights Reserved. |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | # you may not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | # |
| 17 | |
| 18 | """Generate a C++ data table containing locale data.""" |
| 19 | |
| 20 | import collections |
| 21 | import glob |
| 22 | import os.path |
| 23 | import sys |
| 24 | |
| 25 | |
def get_locale_parts(locale):
    """Break a locale ID into a (language, script, region) triple.

    The locale is split on underscores. Components that are absent from the
    locale are returned as None.

    Args:
        locale: An underscore-separated locale ID such as 'en', 'en_US',
            'sr_Latn' or 'zh_Hant_TW'.

    Returns:
        A 3-tuple (language, script, region).

    Raises:
        AssertionError: if the locale has more than three components.
    """
    parts = locale.split('_')
    count = len(parts)
    if count == 1:
        return (parts[0], None, None)
    if count == 2:
        language, second = parts
        # A four-character second component is treated as a script code;
        # anything else is treated as a region.
        if len(second) == 4:
            return (language, second, None)
        return (language, None, second)
    assert count == 3
    return tuple(parts)
| 39 | |
| 40 | |
def read_likely_subtags(input_file_name):
    """Read and parse ICU's likelySubtags.txt.

    Every line containing both '{' and '}' is treated as a mapping of the
    form from_locale{"to_locale"}. Lines starting with '//' and mappings
    whose source language is 'und' are ignored.

    Args:
        input_file_name: Path of the UTF-8 encoded likelySubtags.txt file.

    Returns:
        A (likely_script_dict, representative_locales) pair:
        likely_script_dict maps every script-less source locale to the
        script of its target locale; representative_locales is a frozenset
        of the target locales of region-less source locales whose target
        region is neither '001' nor 'ZZ'. Both start out with Android's
        hard-coded additions.
    """
    likely_script_dict = {
        # Android's additions for pseudo-locales. These internal codes make
        # sure that the pseudo-locales would not match other English or
        # Arabic locales. (We can't use private-use ISO 15924 codes, since
        # they may be used by apps for other purposes.)
        "en_XA": "~~~A",
        "ar_XB": "~~~B",
        # Removed data from later versions of ICU
        "ji": "Hebr",  # Old code for Yiddish, still used in Java and Android
    }
    representative_locales = {
        # Android's additions
        "en_Latn_GB",  # representative for en_Latn_001
        "es_Latn_MX",  # representative for es_Latn_419
        "es_Latn_US",  # representative for es_Latn_419 (not the best idea,
                       # but Android has been shipping with it for quite a
                       # while. Fortunately, MX < US, so if both exist, MX
                       # would be chosen.)
    }
    # Read raw bytes and decode explicitly: this does not depend on the
    # platform's default text encoding and avoids the Python 2-only
    # `unicode` builtin, so the same code runs on Python 2 and Python 3.
    with open(input_file_name, 'rb') as input_file:
        for raw_line in input_file:
            # Strip in Unicode space so that a leading byte-order mark
            # (U+FEFF) is removed together with spaces and newlines.
            line = raw_line.decode('UTF-8').strip(u' \n\uFEFF')
            if not isinstance(line, str):
                # Python 2: go back to a UTF-8 byte string, as before.
                line = line.encode('UTF-8')
            if line.startswith('//'):
                continue
            if '{' in line and '}' in line:
                from_locale = line[:line.index('{')]
                to_locale = line[line.index('"')+1:line.rindex('"')]
                from_lang, from_scr, from_region = get_locale_parts(
                    from_locale)
                _, to_scr, to_region = get_locale_parts(to_locale)
                if from_lang == 'und':
                    continue  # not very useful for our purposes
                if from_region is None and to_region not in ['001', 'ZZ']:
                    representative_locales.add(to_locale)
                if from_scr is None:
                    likely_script_dict[from_locale] = to_scr
    return likely_script_dict, frozenset(representative_locales)
| 79 | |
| 80 | |
| 81 | # From packLanguageOrRegion() in ResourceTypes.cpp |
def pack_language_or_region(inp, base):
    """Pack a language or region code into a tuple of two byte values.

    Args:
        inp: None, or a code of two or three characters.
        base: The character that three-character codes are measured from.
            It is not used for two-character codes.

    Returns:
        (0, 0) for None; the two character ordinals for a two-character
        code; otherwise a compressed pair. In the compressed pair the first
        value has bit 7 set and combines the third character's offset
        (shifted left by two) with the upper bits of the second offset. The
        second value holds the low bits of the second offset (shifted left
        by five) combined with the first offset, truncated to one byte.

    Raises:
        AssertionError: if inp is not None and is not 2 or 3 characters long.
    """
    if inp is None:
        return (0, 0)
    if len(inp) == 2:
        return (ord(inp[0]), ord(inp[1]))

    assert len(inp) == 3
    origin = ord(base)
    first, second, third = [ord(char) - origin for char in inp]

    high_byte = 0x80 | (third << 2) | (second >> 3)
    low_byte = ((second << 5) | first) & 0xFF
    return (high_byte, low_byte)
| 97 | |
| 98 | |
| 99 | # From packLanguage() in ResourceTypes.cpp |
def pack_language(language):
    """Return the two-byte packed form of a language code.

    Delegates to pack_language_or_region() with 'a' as the base character.
    """
    packed = pack_language_or_region(language, base='a')
    return packed
| 103 | |
| 104 | |
| 105 | # From packRegion() in ResourceTypes.cpp |
def pack_region(region):
    """Return the two-byte packed form of a region code.

    Delegates to pack_language_or_region() with '0' as the base character.
    """
    packed = pack_language_or_region(region, base='0')
    return packed
| 109 | |
| 110 | |
def pack_to_uint32(locale):
    """Combine the language and region of a locale into one 32-bit value.

    The two packed language bytes fill the upper 16 bits and the two packed
    region bytes fill the lower 16 bits. The script, if any, is ignored.
    """
    language, _, region = get_locale_parts(locale)
    lang_high, lang_low = pack_language(language)
    region_high, region_low = pack_region(region)
    language_bits = (lang_high << 24) | (lang_low << 16)
    region_bits = (region_high << 8) | region_low
    return language_bits | region_bits
| 117 | |
| 118 | |
def dump_script_codes(all_scripts):
    """Dump the SCRIPT_CODES table to standard output.

    Emits a C++ array with one row per script, each annotated with its
    index in all_scripts, followed by an empty line.

    Args:
        all_scripts: A sequence of four-character script codes. The position
            of a script in this sequence is the index shown in the table.
    """
    # print is called with a single parenthesised argument throughout: that
    # form produces identical output under the Python 2 print statement and
    # the Python 3 print function.
    print('const char SCRIPT_CODES[][4] = {')
    for index, script in enumerate(all_scripts):
        print(" /* %-2d */ {'%c', '%c', '%c', '%c'}," % (
            index, script[0], script[1], script[2], script[3]))
    print('};')
    print('')
| 127 | |
| 128 | |
def dump_script_data(likely_script_dict, all_scripts):
    """Dump the LIKELY_SCRIPTS map to standard output.

    Emits one entry per locale, in sorted locale order, mapping the packed
    language+region value of the locale to the index of its likely script.

    Args:
        likely_script_dict: Maps locale IDs to four-character script codes.
        all_scripts: The list of script codes whose positions are used as
            script indexes. Every script in likely_script_dict must be in it.

    Raises:
        ValueError: if a script is missing from all_scripts.
    """
    # Single-argument parenthesised print gives identical output under the
    # Python 2 print statement and the Python 3 print function.
    print('')
    print('const std::unordered_map<uint32_t, uint8_t> LIKELY_SCRIPTS({')
    for locale in sorted(likely_script_dict):
        script = likely_script_dict[locale]
        print(' {0x%08Xu, %2du}, // %s -> %s' % (
            pack_to_uint32(locale),
            all_scripts.index(script),
            locale.replace('_', '-'),
            script))
    print('});')
| 141 | |
| 142 | |
def pack_to_uint64(locale):
    """Encode a complete locale, script included, as one 64-bit value.

    The upper 32 bits hold the packed language and region; the lower 32
    bits hold the ordinals of the four script characters, first character
    in the most significant byte. The locale must contain a script.
    """
    _, script, _ = get_locale_parts(locale)
    upper_half = pack_to_uint32(locale) << 32
    script_bits = ((ord(script[0]) << 24) |
                   (ord(script[1]) << 16) |
                   (ord(script[2]) << 8) |
                   ord(script[3]))
    return upper_half | script_bits
| 151 | |
| 152 | |
def dump_representative_locales(representative_locales):
    """Dump the REPRESENTATIVE_LOCALES set to standard output.

    Emits one packed 64-bit value per locale, in sorted locale order, with
    the locale ID as a trailing comment.

    Args:
        representative_locales: An iterable of full locale IDs. Each one
            must include a script, since the script is packed into the value.
    """
    # Single-argument parenthesised print gives identical output under the
    # Python 2 print statement and the Python 3 print function.
    print('')
    print('std::unordered_set<uint64_t> REPRESENTATIVE_LOCALES({')
    for locale in sorted(representative_locales):
        print(' 0x%08Xllu, // %s' % (
            pack_to_uint64(locale),
            locale))
    print('});')
| 162 | |
| 163 | |
def read_and_dump_likely_data(icu_data_dir):
    """Load the likely-subtags data and write out the tables built from it.

    Reads misc/likelySubtags.txt under icu_data_dir, then dumps the script
    code table, the likely-script map and the representative locales.

    Returns:
        The likely-script dictionary, for use by later stages.
    """
    subtags_path = os.path.join(icu_data_dir, 'misc', 'likelySubtags.txt')
    likely_script_dict, representative_locales = read_likely_subtags(
        subtags_path)

    # Script indexes are emitted as uint8_t, so at most 256 distinct scripts
    # can be represented.
    distinct_scripts = set(likely_script_dict.values())
    assert len(distinct_scripts) <= 256
    all_scripts = sorted(distinct_scripts)

    dump_script_codes(all_scripts)
    dump_script_data(likely_script_dict, all_scripts)
    dump_representative_locales(representative_locales)
    return likely_script_dict
| 178 | |
| 179 | |
def read_parent_data(icu_data_dir):
    """Collect locale parent information from the ICU data files.

    Scans every *.txt file one directory level below icu_data_dir. The
    locale is the file name without its extension, and its parent is the
    quoted value on a line containing '%%Parent'.

    Returns:
        A dict mapping locale IDs to parent locale IDs.

    Raises:
        AssertionError: if two files give different parents for one locale.
    """
    parent_dict = {}
    pattern = os.path.join(icu_data_dir, '*', '*.txt')
    for data_file in glob.glob(pattern):
        file_name = os.path.basename(data_file)
        locale = os.path.splitext(file_name)[0]
        is_regional_arabic = locale.startswith('ar_')
        with open(data_file) as input_file:
            for line in input_file:
                if '%%Parent' in line:
                    start = line.index('"') + 1
                    end = line.rindex('"')
                    parent = line[start:end]
                    # Records the parent on first sight; afterwards checks
                    # that different files agree on the parent info.
                    recorded = parent_dict.setdefault(locale, parent)
                    assert recorded == parent
                    continue
                if is_regional_arabic and 'default{"latn"}' in line:
                    # Arabic parent overrides for ASCII digits. Since
                    # Unicode extensions are not supported in ResourceTypes,
                    # we will use ar-015 (Arabic, Northern Africa) instead
                    # of the more correct ar-u-nu-latn.
                    parent_dict[locale] = 'ar_015'
    return parent_dict
| 202 | |
| 203 | |
def get_likely_script(locale, likely_script_dict):
    """Find the likely script for a locale, given the likely-script dictionary.

    Args:
        locale: An underscore-separated locale ID.
        likely_script_dict: Maps script-less locale IDs to script codes.

    Returns:
        The script contained in the locale itself if there is one; otherwise
        the dictionary entry for the whole locale; otherwise the dictionary
        entry for its language.

    Raises:
        KeyError: if neither the locale nor its language is in the dictionary.
    """
    parts = locale.split('_')
    # A locale already carries a script when it has three components, or
    # when it has two and the second is four characters long (for example
    # 'sr_Latn'). The two-component case uses the same length rule as
    # get_locale_parts(); without it such locales would wrongly fall back to
    # the default script of their language.
    if len(parts) == 3 or (len(parts) == 2 and len(parts[1]) == 4):
        return parts[1]
    if locale in likely_script_dict:
        return likely_script_dict[locale]
    return likely_script_dict[parts[0]]
| 215 | |
| 216 | |
def dump_parent_data(script_organized_dict):
    """Dump information for parents of locales to standard output.

    Emits one <SCRIPT>_PARENTS map per script, mapping packed locale values
    to packed parent values, followed by a SCRIPT_PARENTS array that pairs
    every script code with a pointer to its map. Scripts and locales are
    written in sorted order.

    Args:
        script_organized_dict: Maps a four-character script code to a dict
            of {locale: parent locale} for the locales using that script.
    """
    # Single-argument parenthesised print gives identical output under the
    # Python 2 print statement and the Python 3 print function.
    sorted_scripts = sorted(script_organized_dict)
    print('')
    for script in sorted_scripts:
        parent_dict = script_organized_dict[script]
        print('const std::unordered_map<uint32_t, uint32_t> %s_PARENTS({'
              % script.upper())
        for locale in sorted(parent_dict):
            parent = parent_dict[locale]
            print(' {0x%08Xu, 0x%08Xu}, // %s -> %s' % (
                pack_to_uint32(locale),
                pack_to_uint32(parent),
                locale.replace('_', '-'),
                parent.replace('_', '-')))
        print('});')
        print('')

    print('const struct {')
    print(' const char script[4];')
    print(' const std::unordered_map<uint32_t, uint32_t>* map;')
    print('} SCRIPT_PARENTS[] = {')
    for script in sorted_scripts:
        print(" {{'%c', '%c', '%c', '%c'}, &%s_PARENTS}," % (
            script[0], script[1], script[2], script[3],
            script.upper()))
    print('};')
| 244 | |
| 245 | |
def dump_parent_tree_depth(parent_dict):
    """Find and dump the depth of the parent tree.

    For every locale the chain of parents is followed until a locale with
    no entry in parent_dict is reached. The longest chain, counted in
    locales, is written out as MAX_PARENT_DEPTH.

    Args:
        parent_dict: Maps locale IDs to their parent locale IDs.

    Raises:
        AssertionError: if any chain reaches a depth of 5 or more, which
            includes the case of a cycle in parent_dict.
    """
    depth_limit = 5  # Our algorithms assume small max_depth
    max_depth = 1
    for locale in parent_dict:
        depth = 1
        while locale in parent_dict:
            locale = parent_dict[locale]
            depth += 1
            # Checked while walking, not after all walks, so that cyclic
            # parent data fails fast instead of looping forever. Raised
            # explicitly so the check also runs under `python -O`.
            if depth >= depth_limit:
                raise AssertionError(
                    'parent chain reaches depth %d at %s; expected less '
                    'than %d' % (depth, locale, depth_limit))
        max_depth = max(max_depth, depth)
    # Single-argument parenthesised print gives identical output under the
    # Python 2 print statement and the Python 3 print function.
    print('')
    print('const size_t MAX_PARENT_DEPTH = %d;' % max_depth)
| 258 | |
| 259 | |
def read_and_dump_parent_data(icu_data_dir, likely_script_dict):
    """Load the ICU parent data and write out the parent tables.

    Locales whose parent is 'root' are left out of the per-script tables.
    The tree depth is still computed over the complete parent data.
    """
    parent_dict = read_parent_data(icu_data_dir)

    # Group the parent links by the likely script of the child locale.
    by_script = collections.defaultdict(dict)
    for locale, parent in parent_dict.items():
        if parent != 'root':
            script = get_likely_script(locale, likely_script_dict)
            by_script[script][locale] = parent

    dump_parent_data(by_script)
    dump_parent_tree_depth(parent_dict)
| 272 | |
| 273 | |
def main():
    """Read the data files from ICU and dump the output to a C++ file.

    The first command-line argument is the root of the source tree; the ICU
    data is read from external/icu/icu4c/source/data below it. All output
    goes to standard output.
    """
    if len(sys.argv) < 2:
        # Exit with a usage message rather than an IndexError traceback.
        sys.exit('Usage: %s <source_root>' % sys.argv[0])
    source_root = sys.argv[1]
    icu_data_dir = os.path.join(
        source_root,
        'external', 'icu', 'icu4c', 'source', 'data')

    # Single-argument parenthesised print gives identical output under the
    # Python 2 print statement and the Python 3 print function.
    print('// Auto-generated by %s' % sys.argv[0])
    print('')
    likely_script_dict = read_and_dump_likely_data(icu_data_dir)
    read_and_dump_parent_data(icu_data_dir, likely_script_dict)
| 285 | |
| 286 | |
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()