blob: c5598f0d87eda00035fce2f6863a692b6f2efa6c [file] [log] [blame]
Roozbeh Pournader0e969e22016-03-09 23:08:45 -08001#!/usr/bin/env python
2
import collections
import copy
import glob
import io
import itertools
from os import path
import sys
from xml.etree import ElementTree

from fontTools import ttLib
12
# U+FE0F (VARIATION SELECTOR-16); filtered out of sequences before they are
# compared against the emoji font's coverage.
EMOJI_VS = 0xFE0F

# Maps a lowercase language code to the four-letter script code that the
# language is written in. Consulted by lang_to_script().
LANG_TO_SCRIPT = {
    'as': 'Beng',
    'bg': 'Cyrl',
    'bn': 'Beng',
    'cu': 'Cyrl',
    'cy': 'Latn',
    'da': 'Latn',
    'de': 'Latn',
    'en': 'Latn',
    'es': 'Latn',
    'et': 'Latn',
    'eu': 'Latn',
    'fr': 'Latn',
    'ga': 'Latn',
    'gu': 'Gujr',
    'hi': 'Deva',
    'hr': 'Latn',
    'hu': 'Latn',
    'hy': 'Armn',
    'ja': 'Jpan',
    'kn': 'Knda',
    'ko': 'Kore',
    'ml': 'Mlym',
    'mn': 'Cyrl',
    'mr': 'Deva',
    'nb': 'Latn',
    'nn': 'Latn',
    'or': 'Orya',
    'pa': 'Guru',
    'pt': 'Latn',
    'sl': 'Latn',
    'ta': 'Taml',
    'te': 'Telu',
    'tk': 'Latn',
}
50
def lang_to_script(lang_code):
    """Return the four-letter script code for a language tag.

    The tag is lowercased and looked up in LANG_TO_SCRIPT. While there is no
    match, the last hyphen-separated subtag is examined: if it is four
    alphabetic characters it is taken to be the script itself and returned
    title-cased; otherwise it is dropped and the lookup is retried on the
    shorter tag.

    Raises:
        AssertionError: if no subtags remain and the language is unknown.
    """
    lang = lang_code.lower()
    while lang not in LANG_TO_SCRIPT:
        hyphen_idx = lang.rfind('-')
        assert hyphen_idx != -1, (
            'We do not know what script the "%s" language is written in.'
            % lang_code)
        assumed_script = lang[hyphen_idx+1:]
        if len(assumed_script) == 4 and assumed_script.isalpha():
            # This is actually the script
            return assumed_script.title()
        lang = lang[:hyphen_idx]
    return LANG_TO_SCRIPT[lang]
64
65
def printable(inp):
    """Format a code point, a sequence, or a set of them for messages.

    Args:
        inp: an int code point, a tuple of items (a character sequence), or
            a set/frozenset of items. Items are formatted recursively.

    Returns:
        'U+XXXX' for a code point, '<a, b>' for a tuple and '{a, b}' for a
        set. Set members appear in the set's own iteration order.
    """
    # isinstance() rather than an exact type check, so that frozensets and
    # subclasses are formatted too instead of failing in the int branch.
    if isinstance(inp, (set, frozenset)):  # set of character sequences
        return '{' + ', '.join([printable(seq) for seq in inp]) + '}'
    if isinstance(inp, tuple):  # character sequence
        return '<' + ', '.join([printable(ch) for ch in inp]) + '>'
    # single character
    return 'U+%04X' % inp
73
74
def open_font(font):
    """Open a font from the fonts directory with fontTools.

    Args:
        font: a (font_file, index) pair. font_file is joined onto the
            module-level _fonts_dir. When index is not None it is passed to
            ttLib.TTFont as fontNumber.

    Returns:
        The ttLib.TTFont object for the font.
    """
    font_file, index = font
    font_path = path.join(_fonts_dir, font_file)
    if index is None:
        return ttLib.TTFont(font_path)
    return ttLib.TTFont(font_path, fontNumber=index)
82
83
def get_best_cmap(font):
    """Return the widest Unicode character map of a font.

    Looks for a format 12 subtable with platform 3 / encoding 10 and a
    format 4 subtable with platform 3 / encoding 1, and prefers the former.

    Args:
        font: a (font_file, index) pair, as accepted by open_font().

    Returns:
        The `cmap` dictionary of the chosen subtable.

    Raises:
        AssertionError: if either kind of subtable appears more than once,
            or if the font has neither of them.
    """
    ttfont = open_font(font)
    all_unicode_cmap = None
    bmp_cmap = None
    for cmap in ttfont['cmap'].tables:
        specifier = (cmap.format, cmap.platformID, cmap.platEncID)
        if specifier == (4, 3, 1):
            assert bmp_cmap is None, 'More than one BMP cmap in %s' % (font, )
            bmp_cmap = cmap
        elif specifier == (12, 3, 10):
            assert all_unicode_cmap is None, (
                'More than one UCS-4 cmap in %s' % (font, ))
            all_unicode_cmap = cmap

    best_cmap = all_unicode_cmap if all_unicode_cmap else bmp_cmap
    # Fail with a readable message instead of an AttributeError on None when
    # the font carries neither subtable.
    assert best_cmap is not None, (
        'No BMP or UCS-4 cmap found in %s' % (font, ))
    return best_cmap.cmap
99
100
def get_variation_sequences_cmap(font):
    """Return the variation-sequences cmap subtable of a font.

    Args:
        font: a (font_file, index) pair, as accepted by open_font().

    Returns:
        The subtable with format 14, platform 0, encoding 5, or None when
        the font has no such subtable.

    Raises:
        AssertionError: if the font has more than one such subtable.
    """
    ttfont = open_font(font)
    vs_cmap = None
    for cmap in ttfont['cmap'].tables:
        specifier = (cmap.format, cmap.platformID, cmap.platEncID)
        if specifier == (14, 0, 5):
            assert vs_cmap is None, 'More than one VS cmap in %s' % (font, )
            vs_cmap = cmap
    return vs_cmap
110
111
def get_emoji_map(font):
    """Build a map from everything the emoji font renders to glyph names.

    Keys are single code points (from the best cmap), (base, selector)
    pairs (from the variation-sequences cmap) and longer code point tuples
    (from the ligature lookups in GSUB).

    Args:
        font: a (font_file, index) pair, as accepted by open_font().

    Returns:
        A dict mapping code points and code point tuples to glyph names.

    Raises:
        AssertionError: if the font has no variation-sequences cmap, if a
            GSUB lookup is not of type 4, or if a ligature sequence (or a
            prefix of it of length two or more) is already mapped.
    """
    # Add normal characters
    emoji_map = copy.copy(get_best_cmap(font))
    reverse_cmap = {glyph: code for code, glyph in emoji_map.items()}

    # Add variation sequences
    vs_cmap = get_variation_sequences_cmap(font)
    assert vs_cmap is not None, 'No VS cmap in %s' % (font, )
    vs_dict = vs_cmap.uvsDict
    for vs in vs_dict:
        for base, glyph in vs_dict[vs]:
            if glyph is None:
                # No dedicated glyph: the sequence renders as the base
                # character's own glyph.
                emoji_map[(base, vs)] = emoji_map[base]
            else:
                emoji_map[(base, vs)] = glyph

    # Add GSUB rules
    ttfont = open_font(font)
    for lookup in ttfont['GSUB'].table.LookupList.Lookup:
        assert lookup.LookupType == 4, 'We only understand type 4 lookups'
        for subtable in lookup.SubTable:
            ligatures = subtable.ligatures
            for first_glyph in ligatures:
                for ligature in ligatures[first_glyph]:
                    glyph_names = [first_glyph] + ligature.Component
                    sequence = tuple(
                        reverse_cmap[glyph] for glyph in glyph_names)
                    # Make sure no starting subsequence of 'sequence' has been
                    # seen before.
                    for sub_len in range(2, len(sequence)+1):
                        subsequence = sequence[:sub_len]
                        assert subsequence not in emoji_map, (
                            'Sequence %r is already mapped in %s' % (
                                subsequence, font))
                    emoji_map[sequence] = ligature.LigGlyph

    return emoji_map
145
146
def assert_font_supports_any_of_chars(font, chars):
    """Exit the program unless the font maps at least one of `chars`.

    Args:
        font: a (font_file, index) pair, as accepted by open_font().
        chars: an iterable of code points.
    """
    best_cmap = get_best_cmap(font)
    if not any(char in best_cmap for char in chars):
        sys.exit('None of characters in %s were found in %s' % (chars, font))
153
154
def assert_font_supports_all_of_chars(font, chars):
    """Assert that the font's best cmap maps every code point in `chars`.

    Args:
        font: a (font_file, index) pair, as accepted by open_font().
        chars: an iterable of code points.

    Raises:
        AssertionError: naming the first code point that is missing.
    """
    best_cmap = get_best_cmap(font)
    for char in chars:
        assert char in best_cmap, (
            'U+%04X was not found in %s' % (char, font))
160
161
def assert_font_supports_none_of_chars(font, chars):
    """Assert that the font's best cmap maps none of the items in `chars`.

    Args:
        font: a (font_file, index) pair, as accepted by open_font().
        chars: an iterable of items to look up in the cmap.

    Raises:
        AssertionError: naming the first item that is present.
    """
    best_cmap = get_best_cmap(font)
    for char in chars:
        assert char not in best_cmap, (
            'U+%04X was found in %s' % (char, font))
167
168
def assert_font_supports_all_sequences(font, sequences):
    """Assert that every (base, selector) pair is in the font's VS cmap.

    A pair counts as supported only when the selector's entry list contains
    (base, None).

    Args:
        font: a (font_file, index) pair, as accepted by open_font().
        sequences: an iterable of (base, variation selector) pairs.

    Raises:
        AssertionError: naming the first pair (in sorted order) not found.
    """
    vs_dict = get_variation_sequences_cmap(font).uvsDict
    for base, vs in sorted(sequences):
        assert vs in vs_dict and (base, None) in vs_dict[vs], (
            '<U+%04X, U+%04X> was not found in %s' % (base, vs, font))
174
175
def check_hyphens(hyphens_dir):
    """Check that fonts for every hyphenated script contain a hyphen.

    Each 'hyph-<lang>.hyb' file in `hyphens_dir` names a language; every font
    registered in _script_to_font_map for that language's script must map
    U+002D or U+2010.

    Raises:
        AssertionError: if a .hyb file is not named 'hyph-*', or if a script
            has no fonts.
    """
    # Find all the scripts that need automatic hyphenation
    scripts = set()
    for hyb_file in glob.iglob(path.join(hyphens_dir, '*.hyb')):
        hyb_file = path.basename(hyb_file)
        assert hyb_file.startswith('hyph-'), (
            'Unknown hyphenation file %s' % hyb_file)
        # The language code sits between the first '-' and the first '.'.
        lang_code = hyb_file[hyb_file.index('-')+1:hyb_file.index('.')]
        scripts.add(lang_to_script(lang_code))

    HYPHENS = {0x002D, 0x2010}
    for script in scripts:
        fonts = _script_to_font_map[script]
        assert fonts, 'No fonts found for the "%s" script' % script
        for font in fonts:
            assert_font_supports_any_of_chars(font, HYPHENS)
192
193
class FontRecord(object):
    """One <font> entry of fonts.xml, as stored in the fallback chain."""

    def __init__(self, name, scripts, variant, weight, style, font):
        self.name = name  # the family's 'name' attribute, or None
        self.scripts = scripts  # scripts derived from the 'lang' attribute
        self.variant = variant  # the family's 'variant' attribute, or None
        self.weight = weight  # integer font weight
        self.style = style  # 'normal' or 'italic'
        self.font = font  # (font_file, index) pair

    def __repr__(self):
        return 'FontRecord(%r, %r, %r, %r, %r, %r)' % (
            self.name, self.scripts, self.variant,
            self.weight, self.style, self.font)
202
203
def parse_fonts_xml(fonts_xml_path):
    """Parse fonts.xml into the module-level font tables.

    Rebuilds two globals:
      _fallback_chain: a list with one FontRecord per <font>, in file order.
      _script_to_font_map: maps a script code to a set of
          (font_file, index) pairs. Named families are registered under
          Latn, Grek and Cyrl; unnamed ones under the scripts derived from
          their 'lang' attribute.

    Raises:
        AssertionError: if a family or font has unexpected attributes,
            children, weight or style.
    """
    global _script_to_font_map, _fallback_chain
    _script_to_font_map = collections.defaultdict(set)
    _fallback_chain = []
    tree = ElementTree.parse(fonts_xml_path)
    for family in tree.findall('family'):
        name = family.get('name')
        variant = family.get('variant')
        langs = family.get('lang')
        if name:
            assert variant is None, (
                'No variant expected for LGC font %s.' % name)
            assert langs is None, (
                'No language expected for LGC fonts %s.' % name)
        else:
            assert variant in {None, 'elegant', 'compact'}, (
                'Unexpected value for variant: %s' % variant)

        if langs:
            langs = langs.split()
            scripts = {lang_to_script(lang) for lang in langs}
        else:
            scripts = set()

        # The same for every font of the family, so decide it once.
        if name:  # non-empty names are used for default LGC fonts
            map_scripts = {'Latn', 'Grek', 'Cyrl'}
        else:
            map_scripts = scripts

        for child in family:
            assert child.tag == 'font', (
                'Unknown tag <%s>' % child.tag)
            font_file = child.text
            weight = int(child.get('weight'))
            assert weight % 100 == 0, (
                'Font weight "%d" is not a multiple of 100.' % weight)

            style = child.get('style')
            assert style in {'normal', 'italic'}, (
                'Unknown style "%s"' % style)

            index = child.get('index')
            if index:
                index = int(index)

            _fallback_chain.append(FontRecord(
                name,
                frozenset(scripts),
                variant,
                weight,
                style,
                (font_file, index)))

            for script in map_scripts:
                _script_to_font_map[script].add((font_file, index))
258
259
def check_emoji_coverage(all_emoji, equivalent_emoji):
    """Check the coverage of the emoji font found in the fallback chain.

    Args:
        all_emoji: every code point and sequence the font must support.
        equivalent_emoji: maps a sequence to the one it must share a glyph
            with.
    """
    emoji_font = get_emoji_font()
    check_emoji_font_coverage(emoji_font, all_emoji, equivalent_emoji)
Doug Felt4970cda2016-07-08 17:42:15 -0700263
264
def get_emoji_font():
    """Return the (font_file, index) pair of the only emoji font.

    The emoji font is the fallback chain record whose scripts include
    'Zsye'.

    Raises:
        AssertionError: if there is not exactly one such record.
    """
    emoji_fonts = [
        record.font for record in _fallback_chain
        if 'Zsye' in record.scripts]
    assert len(emoji_fonts) == 1, 'There are %d emoji fonts.' % len(emoji_fonts)
    return emoji_fonts[0]
Roozbeh Pournader5dde0872016-03-31 13:54:56 -0700271
Doug Felt4970cda2016-07-08 17:42:15 -0700272
def check_emoji_font_coverage(emoji_font, all_emoji, equivalent_emoji):
    """Check that the emoji font covers exactly the expected emoji.

    Args:
        emoji_font: a (font_file, index) pair, as accepted by open_font().
        all_emoji: every code point and sequence the font must support.
        equivalent_emoji: maps a sequence to the one it must share a glyph
            with.

    Raises:
        AssertionError: if an expected emoji is missing, if the font
            supports something unexpected, if two equivalent sequences map
            to different glyphs, or if two non-equivalent sequences map to
            the same glyph.
    """
    coverage = get_emoji_map(emoji_font)
    for sequence in all_emoji:
        assert sequence in coverage, (
            '%s is not supported in the emoji font.' % printable(sequence))

    for sequence in coverage:
        if sequence in {0x0000, 0x000D, 0x0020}:
            # The font needs to support a few extra characters, which is OK
            continue
        assert sequence in all_emoji, (
            'Emoji font should not support %s.' % printable(sequence))

    # Keys are a mix of ints and tuples, which Python 3 refuses to compare.
    # Order single code points first, then sequences.
    for first, second in sorted(
            equivalent_emoji.items(),
            key=lambda item: (isinstance(item[0], tuple), item[0])):
        assert coverage[first] == coverage[second], (
            '%s and %s should map to the same glyph.' % (
                printable(first),
                printable(second)))

    # Group the sequences by glyph in one pass instead of rescanning the
    # whole map for every glyph.
    sequences_by_glyph = collections.defaultdict(list)
    for seq, glyph in coverage.items():
        sequences_by_glyph[glyph].append(seq)

    for glyph, maps_to_glyph in sequences_by_glyph.items():
        if len(maps_to_glyph) > 1:
            # There are more than one sequences mapping to the same glyph. We
            # need to make sure they were expected to be equivalent.
            equivalent_seqs = set()
            for seq in maps_to_glyph:
                equivalent_seq = seq
                while equivalent_seq in equivalent_emoji:
                    equivalent_seq = equivalent_emoji[equivalent_seq]
                equivalent_seqs.add(equivalent_seq)
            assert len(equivalent_seqs) == 1, (
                'The sequences %s should not result in the same glyph %s' % (
                    printable(equivalent_seqs),
                    glyph))
Roozbeh Pournader8cd1b1b2016-07-25 14:04:34 -0700307
Roozbeh Pournaderfa1facc2016-03-16 13:53:47 -0700308
def check_emoji_defaults(default_emoji):
    """Check that default-emoji characters only come from the emoji font.

    Walks the fallback chain. The emoji font itself is skipped, as are
    script-less fonts that come after it. Every other font must map none of
    `default_emoji`. Fonts ahead of the emoji font must, between them, map
    every 'Emoji' character that is not in `default_emoji`, apart from those
    with age 7.0.

    Args:
        default_emoji: the set of code points and sequences that default to
            emoji style.

    Raises:
        AssertionError: if either condition is violated.
    """
    missing_text_chars = _emoji_properties['Emoji'] - default_emoji
    # Sort once, outside the loop. The set mixes ints and tuples, which
    # Python 3 refuses to compare, so order code points before sequences.
    sorted_default_emoji = sorted(
        default_emoji, key=lambda item: (isinstance(item, tuple), item))
    emoji_font_seen = False
    for record in _fallback_chain:
        if 'Zsye' in record.scripts:
            emoji_font_seen = True
            # No need to check the emoji font
            continue
        # For later fonts, we only check them if they have a script
        # defined, since the defined script may get them to a higher
        # score even if they appear after the emoji font.
        if emoji_font_seen and not record.scripts:
            continue

        # Check default emoji-style characters
        assert_font_supports_none_of_chars(record.font, sorted_default_emoji)

        # Mark default text-style characters appearing in fonts above the emoji
        # font as seen
        if not emoji_font_seen:
            missing_text_chars -= set(get_best_cmap(record.font))

    # Noto does not have monochrome glyphs for Unicode 7.0 wingdings and
    # webdings yet.
    missing_text_chars -= _chars_by_age['7.0']
    assert missing_text_chars == set(), (
        'Text style version of some emoji characters are missing: ' +
        repr(missing_text_chars))
Roozbeh Pournaderfa1facc2016-03-16 13:53:47 -0700337
338
def parse_unicode_datafile(file_path, reverse=False):
    """Parse a semicolon-separated Unicode data file.

    Everything from '#' to the end of a line is a comment. On each remaining
    line the first field is a single code point, a 'start..end' range or a
    space-separated sequence of code points (all hexadecimal), and the
    second field is the property value. Any further fields are ignored.

    Args:
        file_path: path of the data file, read as UTF-8.
        reverse: if true, return a dictionary that maps each property value
            to the set of characters having it, useful for some binary
            properties. Otherwise return a dictionary that maps characters
            to property values, assuming there is only one property in the
            file.

    Returns:
        The dictionary described above. Single characters are ints and
        sequences are tuples of ints.

    Raises:
        AssertionError: in non-reverse mode, if a character is listed twice.
    """
    if reverse:
        output_dict = collections.defaultdict(set)
    else:
        output_dict = {}
    # The data files carry non-ASCII text in their comments, so do not rely
    # on the locale's default encoding.
    with io.open(file_path, encoding='utf-8') as datafile:
        for line in datafile:
            if '#' in line:
                line = line[:line.index('#')]
            line = line.strip()
            if not line:
                continue

            chars, prop = line.split(';')[:2]
            chars = chars.strip()
            prop = prop.strip()

            if ' ' in chars:  # character sequence
                sequence = [int(ch, 16) for ch in chars.split(' ')]
                additions = [tuple(sequence)]
            elif '..' in chars:  # character range
                char_start, char_end = chars.split('..')
                char_start = int(char_start, 16)
                char_end = int(char_end, 16)
                # range() instead of xrange(), which Python 3 does not have.
                additions = range(char_start, char_end+1)
            else:  # single character
                additions = [int(chars, 16)]
            if reverse:
                output_dict[prop].update(additions)
            else:
                for addition in additions:
                    assert addition not in output_dict
                    output_dict[addition] = prop
    return output_dict
377
378
def parse_standardized_variants(file_path):
    """Collect the text-style and emoji-style variation sequences.

    Everything from '#' to the end of a line is a comment. Each remaining
    line must have exactly three semicolon-separated fields; the first holds
    the base character and the variation selector in hexadecimal, separated
    by a space, and the second is the description.

    Args:
        file_path: path of the data file, read as UTF-8.

    Returns:
        A (text_set, emoji_set) pair of sets of (base, selector) tuples,
        holding the sequences described as 'text style' and 'emoji style'
        respectively. Other descriptions are ignored.
    """
    emoji_set = set()
    text_set = set()
    # Read as UTF-8 explicitly rather than with the locale's default
    # encoding.
    with io.open(file_path, encoding='utf-8') as datafile:
        for line in datafile:
            if '#' in line:
                line = line[:line.index('#')]
            line = line.strip()
            if not line:
                continue
            sequence, description, _ = line.split(';')
            sequence = sequence.strip().split(' ')
            base = int(sequence[0], 16)
            vs = int(sequence[1], 16)
            description = description.strip()
            if description == 'text style':
                text_set.add((base, vs))
            elif description == 'emoji style':
                emoji_set.add((base, vs))
    return text_set, emoji_set
399
400
def parse_ucd(ucd_path):
    """Load the Unicode data files from `ucd_path` into module globals.

    Sets:
      _emoji_properties: property name -> set of characters, from
          emoji-data.txt.
      _chars_by_age: age -> set of characters, from DerivedAge.txt.
      _text_variation_sequences, _emoji_variation_sequences: sets of
          (base, selector) pairs, from StandardizedVariants.txt.
      _emoji_sequences: sequence -> property, from emoji-sequences.txt.
      _emoji_zwj_sequences: sequence -> property, from
          emoji-zwj-sequences.txt.
    """
    global _emoji_properties, _chars_by_age
    global _text_variation_sequences, _emoji_variation_sequences
    global _emoji_sequences, _emoji_zwj_sequences
    _emoji_properties = parse_unicode_datafile(
        path.join(ucd_path, 'emoji-data.txt'), reverse=True)
    _chars_by_age = parse_unicode_datafile(
        path.join(ucd_path, 'DerivedAge.txt'), reverse=True)
    sequences = parse_standardized_variants(
        path.join(ucd_path, 'StandardizedVariants.txt'))
    _text_variation_sequences, _emoji_variation_sequences = sequences
    _emoji_sequences = parse_unicode_datafile(
        path.join(ucd_path, 'emoji-sequences.txt'))
    _emoji_zwj_sequences = parse_unicode_datafile(
        path.join(ucd_path, 'emoji-zwj-sequences.txt'))
416
Doug Felt4970cda2016-07-08 17:42:15 -0700417
def flag_sequence(territory_code):
    """Return the flag sequence for a territory code, as a tuple of ints.

    Each uppercase letter is mapped to the code point at the same offset
    from U+1F1E6 as the letter is from 'A'.

    Args:
        territory_code: an iterable of uppercase letters, e.g. 'US' or
            ('U', 'S').
    """
    return tuple(0x1F1E6 + ord(ch) - ord('A') for ch in territory_code)
420
421
# Flag sequences that are treated as unsupported (tofu) flags.
UNSUPPORTED_FLAGS = frozenset({
    flag_sequence('BL'), flag_sequence('BQ'), flag_sequence('DG'),
    flag_sequence('EA'), flag_sequence('EH'), flag_sequence('FK'),
    flag_sequence('GF'), flag_sequence('GP'), flag_sequence('GS'),
    flag_sequence('MF'), flag_sequence('MQ'), flag_sequence('NC'),
    flag_sequence('PM'), flag_sequence('RE'), flag_sequence('TF'),
    flag_sequence('UN'), flag_sequence('WF'), flag_sequence('XK'),
    flag_sequence('YT'),
})

# Flags (keys) that must map to the same glyph as another flag (values).
EQUIVALENT_FLAGS = {
    flag_sequence('BV'): flag_sequence('NO'),
    flag_sequence('CP'): flag_sequence('FR'),
    flag_sequence('HM'): flag_sequence('AU'),
    flag_sequence('SJ'): flag_sequence('NO'),
    flag_sequence('UM'): flag_sequence('US'),
}

COMBINING_KEYCAP = 0x20E3

# Characters that Android defaults to emoji style, different from the
# recommendations in UTR #51
ANDROID_DEFAULT_EMOJI = frozenset({
    0x2600,  # BLACK SUN WITH RAYS
    0x2601,  # CLOUD
    0x260E,  # BLACK TELEPHONE
    0x261D,  # WHITE UP POINTING INDEX
    0x263A,  # WHITE SMILING FACE
    0x2660,  # BLACK SPADE SUIT
    0x2663,  # BLACK CLUB SUIT
    0x2665,  # BLACK HEART SUIT
    0x2666,  # BLACK DIAMOND SUIT
    0x270C,  # VICTORY HAND
    0x2744,  # SNOWFLAKE
    0x2764,  # HEAVY BLACK HEART
})

# Legacy single code points (keys) that must map to the same glyph as the
# standard sequence (values).
LEGACY_ANDROID_EMOJI = {
    0xFE4E5: flag_sequence('JP'),
    0xFE4E6: flag_sequence('US'),
    0xFE4E7: flag_sequence('FR'),
    0xFE4E8: flag_sequence('DE'),
    0xFE4E9: flag_sequence('IT'),
    0xFE4EA: flag_sequence('GB'),
    0xFE4EB: flag_sequence('ES'),
    0xFE4EC: flag_sequence('RU'),
    0xFE4ED: flag_sequence('CN'),
    0xFE4EE: flag_sequence('KR'),
    0xFE82C: (ord('#'), COMBINING_KEYCAP),
    0xFE82E: (ord('1'), COMBINING_KEYCAP),
    0xFE82F: (ord('2'), COMBINING_KEYCAP),
    0xFE830: (ord('3'), COMBINING_KEYCAP),
    0xFE831: (ord('4'), COMBINING_KEYCAP),
    0xFE832: (ord('5'), COMBINING_KEYCAP),
    0xFE833: (ord('6'), COMBINING_KEYCAP),
    0xFE834: (ord('7'), COMBINING_KEYCAP),
    0xFE835: (ord('8'), COMBINING_KEYCAP),
    0xFE836: (ord('9'), COMBINING_KEYCAP),
    0xFE837: (ord('0'), COMBINING_KEYCAP),
}

# ZWJ sequences (keys) that must map to the same glyph as a single code
# point (values).
ZWJ_IDENTICALS = {
    # KISS
    (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468): 0x1F48F,
    # COUPLE WITH HEART
    (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F468): 0x1F491,
    # FAMILY
    (0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466): 0x1F46A,
}
490
Doug Felt4970cda2016-07-08 17:42:15 -0700491
def is_fitzpatrick_modifier(cp):
    """Return whether `cp` lies in U+1F3FB..U+1F3FF, inclusive."""
    return 0x1F3FB <= cp <= 0x1F3FF


def reverse_emoji(seq):
    """Return `seq` reversed, with each modifier kept after its emoji.

    Args:
        seq: a sequence of code points.

    Returns:
        A tuple holding the code points in reverse order, except that every
        Fitzpatrick modifier still directly follows the code point it
        followed in `seq`.
    """
    rev = list(reversed(seq))
    # if there are fitzpatrick modifiers in the sequence, keep them after
    # the emoji they modify
    i = 1
    while i < len(rev):
        if is_fitzpatrick_modifier(rev[i-1]):
            rev[i], rev[i-1] = rev[i-1], rev[i]
            # The modifier now sits at index i. Step past it, otherwise the
            # next iteration would swap it again and carry it to the end of
            # the sequence.
            i += 2
        else:
            i += 1
    return tuple(rev)
Doug Felt4970cda2016-07-08 17:42:15 -0700504
505
def compute_expected_emoji():
    """Compute what the emoji font is expected to support.

    Uses the data loaded by parse_ucd() together with the module's tables of
    flags, legacy code points and ZWJ sequences.

    Returns:
        A tuple (all_emoji, default_emoji, equivalent_emoji):
          all_emoji: set of every code point and sequence expected in the
              emoji font.
          default_emoji: set of the code points and sequences that default
              to emoji style.
          equivalent_emoji: dict mapping a sequence to the one it must share
              a glyph with.
    """
    equivalent_emoji = {}
    sequence_pieces = set()
    all_sequences = set()
    all_sequences.update(_emoji_variation_sequences)

    # add zwj sequences not in the current emoji-zwj-sequences.txt
    adjusted_emoji_zwj_sequences = dict(_emoji_zwj_sequences)
    # single parent families
    additional_emoji_zwj = (
        (0x1F468, 0x200D, 0x1F466),
        (0x1F468, 0x200D, 0x1F467),
        (0x1F468, 0x200D, 0x1F466, 0x200D, 0x1F466),
        (0x1F468, 0x200D, 0x1F467, 0x200D, 0x1F466),
        (0x1F468, 0x200D, 0x1F467, 0x200D, 0x1F467),
        (0x1F469, 0x200D, 0x1F466),
        (0x1F469, 0x200D, 0x1F467),
        (0x1F469, 0x200D, 0x1F466, 0x200D, 0x1F466),
        (0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F466),
        (0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F467),
    )
    # sequences formed from man and woman and optional fitzpatrick modifier
    modified_extensions = (
        0x2696,
        0x2708,
        0x1F3A8,
        0x1F680,
        0x1F692,
    )
    for seq in additional_emoji_zwj:
        adjusted_emoji_zwj_sequences[seq] = 'Emoji_ZWJ_Sequence'
    for ext in modified_extensions:
        for base in (0x1F468, 0x1F469):
            seq = (base, 0x200D, ext)
            adjusted_emoji_zwj_sequences[seq] = 'Emoji_ZWJ_Sequence'
            for modifier in range(0x1F3FB, 0x1F400):
                seq = (base, modifier, 0x200D, ext)
                adjusted_emoji_zwj_sequences[seq] = 'Emoji_ZWJ_Sequence'

    for sequence in _emoji_sequences:
        sequence = tuple(ch for ch in sequence if ch != EMOJI_VS)
        all_sequences.add(sequence)
        sequence_pieces.update(sequence)

    for sequence in adjusted_emoji_zwj_sequences:
        sequence = tuple(ch for ch in sequence if ch != EMOJI_VS)
        all_sequences.add(sequence)
        sequence_pieces.update(sequence)
        # Add reverse of all emoji ZWJ sequences, which are added to the fonts
        # as a workaround to get the sequences work in RTL text.
        reversed_seq = reverse_emoji(sequence)
        all_sequences.add(reversed_seq)
        # A palindromic sequence is its own reverse. Recording it as
        # equivalent to itself would put a self-loop into the map, which
        # code that follows the equivalence chain could never leave.
        if reversed_seq != sequence:
            equivalent_emoji[reversed_seq] = sequence

    # Add all two-letter flag sequences, as even the unsupported ones should
    # resolve to a flag tofu.
    all_letters = [chr(code) for code in range(ord('A'), ord('Z')+1)]
    all_two_letter_codes = itertools.product(all_letters, repeat=2)
    all_flags = {flag_sequence(code) for code in all_two_letter_codes}
    all_sequences.update(all_flags)
    tofu_flags = UNSUPPORTED_FLAGS | (all_flags - set(_emoji_sequences))

    all_emoji = (
        _emoji_properties['Emoji'] |
        all_sequences |
        sequence_pieces |
        set(LEGACY_ANDROID_EMOJI))
    default_emoji = (
        _emoji_properties['Emoji_Presentation'] |
        ANDROID_DEFAULT_EMOJI |
        all_sequences |
        set(LEGACY_ANDROID_EMOJI))

    # Every tofu flag must share its glyph with the smallest one.
    first_tofu_flag = min(tofu_flags)
    for flag in tofu_flags:
        if flag != first_tofu_flag:
            equivalent_emoji[flag] = first_tofu_flag
    equivalent_emoji.update(EQUIVALENT_FLAGS)
    equivalent_emoji.update(LEGACY_ANDROID_EMOJI)
    equivalent_emoji.update(ZWJ_IDENTICALS)
    for seq in _emoji_variation_sequences:
        equivalent_emoji[seq] = seq[0]

    return all_emoji, default_emoji, equivalent_emoji
Roozbeh Pournaderfa1facc2016-03-16 13:53:47 -0700591
592
def main():
    """Run the font checks against a build output directory.

    Command-line arguments:
      argv[1]: the target output directory; fonts are read from its 'fonts'
          subdirectory, fonts.xml from 'etc' and hyphenation data from
          'usr/hyphen-data'.
      argv[2]: 'true' to also run the emoji checks.
      argv[3]: directory of the Unicode data files; only read when argv[2]
          is 'true'.
    """
    global _fonts_dir
    target_out = sys.argv[1]
    _fonts_dir = path.join(target_out, 'fonts')

    fonts_xml_path = path.join(target_out, 'etc', 'fonts.xml')
    parse_fonts_xml(fonts_xml_path)

    hyphens_dir = path.join(target_out, 'usr', 'hyphen-data')
    check_hyphens(hyphens_dir)

    check_emoji = sys.argv[2]
    if check_emoji == 'true':
        ucd_path = sys.argv[3]
        parse_ucd(ucd_path)
        all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji()
        check_emoji_coverage(all_emoji, equivalent_emoji)
        check_emoji_defaults(default_emoji)
Roozbeh Pournaderfa1facc2016-03-16 13:53:47 -0700611
Roozbeh Pournader0e969e22016-03-09 23:08:45 -0800612
# Run the checks only when executed as a script, not when imported.
if __name__ == '__main__':
    main()