blob: 7ec46a3ee86b91cf9dfff016cc1a943afd314d3e [file] [log] [blame]
Roozbeh Pournader0e969e22016-03-09 23:08:45 -08001#!/usr/bin/env python
2
3import collections
Roozbeh Pournader5dde0872016-03-31 13:54:56 -07004import copy
Roozbeh Pournader0e969e22016-03-09 23:08:45 -08005import glob
Roozbeh Pournader5dde0872016-03-31 13:54:56 -07006import itertools
Roozbeh Pournader0e969e22016-03-09 23:08:45 -08007from os import path
8import sys
9from xml.etree import ElementTree
10
11from fontTools import ttLib
12
# U+FE0F VARIATION SELECTOR-16.
EMOJI_VS = 0xFE0F

# Script (four-letter code) used for each language code that this checker
# knows about.
LANG_TO_SCRIPT = {
    'as': 'Beng',
    'bn': 'Beng',
    'cy': 'Latn',
    'da': 'Latn',
    'de': 'Latn',
    'en': 'Latn',
    'es': 'Latn',
    'et': 'Latn',
    'eu': 'Latn',
    'fr': 'Latn',
    'ga': 'Latn',
    'gu': 'Gujr',
    'hi': 'Deva',
    'hr': 'Latn',
    'hu': 'Latn',
    'hy': 'Armn',
    'ja': 'Jpan',
    'kn': 'Knda',
    'ko': 'Kore',
    'ml': 'Mlym',
    'mn': 'Cyrl',
    'mr': 'Deva',
    'nb': 'Latn',
    'nn': 'Latn',
    'or': 'Orya',
    'pa': 'Guru',
    'pt': 'Latn',
    'sl': 'Latn',
    'ta': 'Taml',
    'te': 'Telu',
    'tk': 'Latn',
}


def lang_to_script(lang_code):
    """Return the script code for a hyphen-separated language code.

    The code is lower-cased and looked up in LANG_TO_SCRIPT. While there is
    no match, the last hyphen-separated subtag is examined: a subtag made of
    exactly four letters is taken to be the script itself and is returned in
    title case; any other subtag is dropped and the lookup is retried.

    Raises AssertionError when the subtags run out without a match.
    """
    remaining = lang_code.lower()
    while True:
        if remaining in LANG_TO_SCRIPT:
            return LANG_TO_SCRIPT[remaining]
        head, hyphen, last_subtag = remaining.rpartition('-')
        assert hyphen, (
            'We do not know what script the "%s" language is written in.'
            % lang_code)
        if len(last_subtag) == 4 and last_subtag.isalpha():
            # This is actually the script
            return last_subtag.title()
        remaining = head
62
63
def printable(inp):
    """Return a human-readable rendering of code points for error messages.

    Args:
        inp: an integer code point, a tuple of code points (a character
            sequence), or a set/frozenset of either.

    Returns:
        'U+XXXX' for a single character, '<U+XXXX, ...>' for a sequence and
        '{...}' for a set. Set members are listed in sorted order of their
        rendered text so that the message is the same on every run.
    """
    if isinstance(inp, (set, frozenset)):  # set of character sequences
        # Sort the rendered strings rather than the members: a set may mix
        # integers and tuples, which cannot always be compared directly.
        return '{' + ', '.join(sorted(printable(seq) for seq in inp)) + '}'
    if isinstance(inp, tuple):  # character sequence
        return '<' + ', '.join(printable(ch) for ch in inp) + '>'
    # single character
    return 'U+%04X' % inp
71
72
def open_font(font):
    """Open a font described by a (file name, index) pair.

    The file name is resolved against the module-level _fonts_dir. The index
    is passed to fontTools as fontNumber only when it is not None.
    """
    font_file, index = font
    options = {}
    if index is not None:
        options['fontNumber'] = index
    return ttLib.TTFont(path.join(_fonts_dir, font_file), **options)
80
81
def get_best_cmap(font):
    """Return the code point to glyph name mapping of the font's best cmap.

    The (format 12, platform 3, encoding 10) subtable is preferred; the
    (format 4, platform 3, encoding 1) subtable is the fallback.

    Args:
        font: a (file name, index) pair as accepted by open_font().

    Raises:
        AssertionError: if the font has more than one subtable of either
            kind, or has neither of them.
    """
    ttfont = open_font(font)
    all_unicode_cmap = None
    bmp_cmap = None
    for cmap in ttfont['cmap'].tables:
        specifier = (cmap.format, cmap.platformID, cmap.platEncID)
        if specifier == (4, 3, 1):
            assert bmp_cmap is None, 'More than one BMP cmap in %s' % (font, )
            bmp_cmap = cmap
        elif specifier == (12, 3, 10):
            assert all_unicode_cmap is None, (
                'More than one UCS-4 cmap in %s' % (font, ))
            all_unicode_cmap = cmap

    best = all_unicode_cmap if all_unicode_cmap is not None else bmp_cmap
    # Without this check a font lacking both subtables would fail with an
    # unhelpful AttributeError on None.
    assert best is not None, (
        'No BMP or UCS-4 cmap found in %s' % (font, ))
    return best.cmap
97
98
def get_variation_sequences_cmap(font):
    """Return the font's (format 14, platform 0, encoding 5) cmap subtable.

    Returns None when the font has no such subtable and asserts that there
    is at most one.
    """
    opened = open_font(font)
    found = None
    for table in opened['cmap'].tables:
        if (table.format, table.platformID, table.platEncID) != (14, 0, 5):
            continue
        assert found is None, 'More than one VS cmap in %s' % (font, )
        found = table
    return found
108
109
def get_emoji_map(font):
    """Build a map from characters and character sequences to glyph names.

    Keys are single code points (from the best cmap), (base, selector)
    pairs (from the variation-sequence cmap) and tuples of code points
    (from GSUB ligature lookups). Values are glyph names.
    """
    # Start from a copy of the plain character map.
    emoji_map = copy.copy(get_best_cmap(font))
    code_for_glyph = {glyph: code for code, glyph in emoji_map.items()}

    # Variation sequences: a glyph of None means "same glyph as the base".
    uvs_dict = get_variation_sequences_cmap(font).uvsDict
    for selector, mappings in uvs_dict.items():
        for base, glyph in mappings:
            emoji_map[(base, selector)] = (
                emoji_map[base] if glyph is None else glyph)

    # GSUB ligatures: translate the glyph names back to code points.
    opened = open_font(font)
    for lookup in opened['GSUB'].table.LookupList.Lookup:
        assert lookup.LookupType == 4, 'We only understand type 4 lookups'
        for subtable in lookup.SubTable:
            for first_glyph, ligature_list in subtable.ligatures.items():
                for ligature in ligature_list:
                    glyph_names = [first_glyph] + ligature.Component
                    sequence = tuple(
                        code_for_glyph[name] for name in glyph_names)
                    # Make sure no starting subsequence of 'sequence'
                    # (including the sequence itself) has been seen before.
                    assert not any(
                        sequence[:prefix_len] in emoji_map
                        for prefix_len in range(2, len(sequence) + 1))
                    emoji_map[sequence] = ligature.LigGlyph

    return emoji_map
143
144
def assert_font_supports_any_of_chars(font, chars):
    """Exit the program unless the font maps at least one of chars."""
    supported = get_best_cmap(font)
    if not any(char in supported for char in chars):
        sys.exit('None of characters in %s were found in %s' % (chars, font))
151
152
def assert_font_supports_all_of_chars(font, chars):
    """Assert that every character in chars is in the font's best cmap."""
    supported = get_best_cmap(font)
    # Only the first missing character is reported.
    first_missing = next((ch for ch in chars if ch not in supported), None)
    assert first_missing is None, (
        'U+%04X was not found in %s' % (first_missing, font))
158
159
def assert_font_supports_none_of_chars(font, chars):
    """Assert that no character in chars is in the font's best cmap."""
    supported = get_best_cmap(font)
    # Only the first offending character is reported.
    first_found = next((ch for ch in chars if ch in supported), None)
    assert first_found is None, (
        'U+%04X was found in %s' % (first_found, font))
165
166
def assert_font_supports_all_sequences(font, sequences):
    """Assert that the font's VS cmap lists every (base, selector) pair.

    A pair counts as supported when the selector's entry in the font's
    uvsDict contains (base, None).
    """
    uvs_dict = get_variation_sequences_cmap(font).uvsDict
    for base, selector in sorted(sequences):
        is_listed = (
            selector in uvs_dict and (base, None) in uvs_dict[selector])
        assert is_listed, (
            '<U+%04X, U+%04X> was not found in %s' % (base, selector, font))
172
173
def check_hyphens(hyphens_dir):
    """Check that fonts for every hyphenated script contain a hyphen.

    The scripts are derived from the 'hyph-<lang>.hyb' file names found in
    hyphens_dir. Every font registered for such a script must support
    U+002D or U+2010.
    """
    prefix = 'hyph-'

    # Find all the scripts that need automatic hyphenation
    scripts = set()
    for hyb_path in glob.iglob(path.join(hyphens_dir, '*.hyb')):
        file_name = path.basename(hyb_path)
        assert file_name.startswith(prefix), (
            'Unknown hyphenation file %s' % file_name)
        # The language code sits between the prefix and the first dot.
        lang_code = file_name[len(prefix):].split('.', 1)[0]
        scripts.add(lang_to_script(lang_code))

    HYPHENS = {0x002D, 0x2010}
    for script in scripts:
        fonts = _script_to_font_map[script]
        assert fonts, 'No fonts found for the "%s" script' % script
        for font in fonts:
            assert_font_supports_any_of_chars(font, HYPHENS)
190
191
class FontRecord(object):
    """One <font> entry of fonts.xml together with its family attributes."""

    def __init__(self, name, scripts, variant, weight, style, font):
        self.name = name  # family 'name' attribute, or None
        self.scripts = scripts  # scripts derived from the family 'lang'
        self.variant = variant  # family 'variant' attribute, or None
        self.weight = weight  # integer font weight
        self.style = style  # font 'style' attribute
        self.font = font  # (file name, index) pair

    def __repr__(self):
        return (
            'FontRecord(name=%r, scripts=%r, variant=%r, weight=%r, '
            'style=%r, font=%r)' % (
                self.name, self.scripts, self.variant, self.weight,
                self.style, self.font))
200
201
def parse_fonts_xml(fonts_xml_path):
    """Parse fonts.xml into the module-level font tables.

    Rebuilds two globals:
      _fallback_chain: list of FontRecord, in document order.
      _script_to_font_map: script code -> set of (file name, index) pairs.

    Structural expectations on the file are enforced with assertions.
    """
    global _script_to_font_map, _fallback_chain
    _script_to_font_map = collections.defaultdict(set)
    _fallback_chain = []

    for family in ElementTree.parse(fonts_xml_path).findall('family'):
        name = family.get('name')
        variant = family.get('variant')
        langs = family.get('lang')
        if name:
            assert variant is None, (
                'No variant expected for LGC font %s.' % name)
            assert langs is None, (
                'No language expected for LGC fonts %s.' % name)
        else:
            assert variant in {None, 'elegant', 'compact'}, (
                'Unexpected value for variant: %s' % variant)

        if langs:
            scripts = {lang_to_script(lang) for lang in langs.split()}
        else:
            scripts = set()

        # non-empty names are used for default LGC fonts
        map_scripts = {'Latn', 'Grek', 'Cyrl'} if name else scripts

        for child in family:
            assert child.tag == 'font', (
                'Unknown tag <%s>' % child.tag)
            font_file = child.text

            weight = int(child.get('weight'))
            assert weight % 100 == 0, (
                'Font weight "%d" is not a multiple of 100.' % weight)

            style = child.get('style')
            assert style in {'normal', 'italic'}, (
                'Unknown style "%s"' % style)

            index = child.get('index')
            if index:
                index = int(index)

            font = (font_file, index)
            record = FontRecord(
                name, frozenset(scripts), variant, weight, style, font)
            _fallback_chain.append(record)

            for script in map_scripts:
                _script_to_font_map[script].add(font)
256
257
def check_emoji_coverage(all_emoji, equivalent_emoji):
    """Run the emoji coverage checks against the emoji font."""
    check_emoji_font_coverage(get_emoji_font(), all_emoji, equivalent_emoji)
Doug Felt4970cda2016-07-08 17:42:15 -0700261
262
def get_emoji_font():
    """Return the font of the single 'Zsye' record in the fallback chain.

    Asserts that exactly one record in _fallback_chain has 'Zsye' among its
    scripts.
    """
    emoji_fonts = []
    for record in _fallback_chain:
        if 'Zsye' in record.scripts:
            emoji_fonts.append(record.font)
    count = len(emoji_fonts)
    assert count == 1, 'There are %d emoji fonts.' % count
    return emoji_fonts[0]
Roozbeh Pournader5dde0872016-03-31 13:54:56 -0700269
Doug Felt4970cda2016-07-08 17:42:15 -0700270
def check_emoji_font_coverage(emoji_font, all_emoji, equivalent_emoji):
    """Check that the emoji font covers exactly the expected emoji.

    Args:
        emoji_font: (file name, index) pair of the emoji font.
        all_emoji: set of characters and sequences the font must support.
        equivalent_emoji: dict mapping a character or sequence to another
            one that must be rendered with the same glyph.

    Raises:
        AssertionError: if an expected emoji is missing, an unexpected one
            is present, two equivalent entries map to different glyphs, or
            two entries not declared equivalent share a glyph.
    """
    coverage = get_emoji_map(emoji_font)
    for sequence in all_emoji:
        assert sequence in coverage, (
            '%s is not supported in the emoji font.' % printable(sequence))

    for sequence in coverage:
        if sequence in {0x0000, 0x000D, 0x0020}:
            # The font needs to support a few extra characters, which is OK
            continue
        assert sequence in all_emoji, (
            'Emoji font should not support %s.' % printable(sequence))

    # Keys are a mix of integers and tuples. Order integers before tuples
    # (the order Python 2 gives) with a key that also works on Python 3,
    # where the two types cannot be compared directly.
    ordered_pairs = sorted(
        equivalent_emoji.items(),
        key=lambda pair: (isinstance(pair[0], tuple), pair[0]))
    for first, second in ordered_pairs:
        assert coverage[first] == coverage[second], (
            '%s and %s should map to the same glyph.' % (
                printable(first),
                printable(second)))

    # Group the sequences by glyph in a single pass instead of rescanning
    # the whole coverage map once per glyph.
    sequences_by_glyph = collections.defaultdict(list)
    for seq, glyph in coverage.items():
        sequences_by_glyph[glyph].append(seq)

    for glyph, maps_to_glyph in sequences_by_glyph.items():
        if len(maps_to_glyph) <= 1:
            continue
        # There are more than one sequences mapping to the same glyph. We
        # need to make sure they were expected to be equivalent.
        equivalent_seqs = set()
        for seq in maps_to_glyph:
            equivalent_seq = seq
            while equivalent_seq in equivalent_emoji:
                equivalent_seq = equivalent_emoji[equivalent_seq]
            equivalent_seqs.add(equivalent_seq)
        assert len(equivalent_seqs) == 1, (
            'The sequences %s should not result in the same glyph %s' % (
                printable(equivalent_seqs),
                glyph))
Roozbeh Pournader8cd1b1b2016-07-25 14:04:34 -0700305
Roozbeh Pournaderfa1facc2016-03-16 13:53:47 -0700306
def check_emoji_defaults(default_emoji):
    """Check which fonts in the fallback chain may carry emoji characters.

    Fonts other than the emoji font must not support any member of
    default_emoji. Fonts listed after the emoji font that declare no
    script are skipped. Every 'Emoji' character that is not in
    default_emoji must appear in some font listed before the emoji font,
    except for characters whose DerivedAge is 7.0.
    """
    text_chars_not_seen = _emoji_properties['Emoji'] - default_emoji
    past_emoji_font = False
    for record in _fallback_chain:
        is_emoji_font = 'Zsye' in record.scripts
        if is_emoji_font:
            past_emoji_font = True
        # No need to check the emoji font itself. For later fonts, we only
        # check them if they have a script defined, since the defined
        # script may get them to a higher score even if they appear after
        # the emoji font.
        if is_emoji_font or (past_emoji_font and not record.scripts):
            continue

        # Check default emoji-style characters
        assert_font_supports_none_of_chars(record.font, sorted(default_emoji))

        # Mark default text-style characters appearing in fonts above the
        # emoji font as seen
        if not past_emoji_font:
            text_chars_not_seen.difference_update(get_best_cmap(record.font))

    # Noto does not have monochrome glyphs for Unicode 7.0 wingdings and
    # webdings yet.
    text_chars_not_seen.difference_update(_chars_by_age['7.0'])
    assert not text_chars_not_seen, (
        'Text style version of some emoji characters are missing: ' +
            repr(text_chars_not_seen))
Roozbeh Pournaderfa1facc2016-03-16 13:53:47 -0700335
336
def parse_unicode_datafile(file_path, reverse=False):
    """Parse a semicolon-separated Unicode data file.

    Everything after a '#' is a comment and blank lines are skipped. The
    first field of a line is a single code point, a 'start..end' range or a
    whitespace-separated sequence of code points (all hexadecimal); the
    second field is the property value.

    Args:
        file_path: path of the data file.
        reverse: when true, return a dictionary that maps the property
            values to sets of characters, useful for some binary
            properties. Otherwise return a dictionary that maps characters
            to the property values, assuming there's only one property in
            the file.

    Returns:
        The dictionary described above. Sequences are represented as tuples
        of integers, single characters as integers.

    Raises:
        AssertionError: if reverse is false and a character or sequence is
            listed more than once.
    """
    if reverse:
        output_dict = collections.defaultdict(set)
    else:
        output_dict = {}
    with open(file_path) as datafile:
        for line in datafile:
            line = line.partition('#')[0].strip()
            if not line:
                continue

            chars, prop = line.split(';')[:2]
            chars = chars.strip()
            prop = prop.strip()

            # Split on any run of whitespace so that repeated spaces do not
            # produce empty pieces.
            code_points = chars.split()
            if len(code_points) > 1:  # character sequence
                additions = [tuple(int(ch, 16) for ch in code_points)]
            elif '..' in chars:  # character range
                char_start, char_end = chars.split('..')
                # range() rather than xrange() so this runs on Python 3.
                additions = range(int(char_start, 16), int(char_end, 16) + 1)
            else:  # single character
                additions = [int(chars, 16)]

            if reverse:
                output_dict[prop].update(additions)
            else:
                for addition in additions:
                    assert addition not in output_dict
                    output_dict[addition] = prop
    return output_dict
375
376
def parse_standardized_variants(file_path):
    """Read the text-style and emoji-style variation sequences from a file.

    Each non-comment line must have exactly three ';'-separated fields, the
    first holding a base and a variation selector in hexadecimal and the
    second a description. Lines with any other description are ignored.

    Returns:
        A (text_set, emoji_set) pair of sets of (base, selector) tuples.
    """
    text_set = set()
    emoji_set = set()
    target_for = {'text style': text_set, 'emoji style': emoji_set}
    with open(file_path) as datafile:
        for raw_line in datafile:
            content = raw_line.partition('#')[0].strip()
            if not content:
                continue
            sequence, description, _ = content.split(';')
            code_points = sequence.strip().split(' ')
            pair = (int(code_points[0], 16), int(code_points[1], 16))
            target = target_for.get(description.strip())
            if target is not None:
                target.add(pair)
    return text_set, emoji_set
397
398
def parse_ucd(ucd_path):
    """Load the Unicode data files in ucd_path into module-level tables.

    Sets _emoji_properties and _chars_by_age (property value -> set of
    characters), _text_variation_sequences and _emoji_variation_sequences
    (sets of (base, selector) pairs), and _emoji_sequences and
    _emoji_zwj_sequences (sequence -> property value).
    """
    global _emoji_properties, _chars_by_age
    global _text_variation_sequences, _emoji_variation_sequences
    global _emoji_sequences, _emoji_zwj_sequences

    def data_file(file_name):
        return path.join(ucd_path, file_name)

    _emoji_properties = parse_unicode_datafile(
        data_file('emoji-data.txt'), reverse=True)
    _chars_by_age = parse_unicode_datafile(
        data_file('DerivedAge.txt'), reverse=True)
    _text_variation_sequences, _emoji_variation_sequences = (
        parse_standardized_variants(data_file('StandardizedVariants.txt')))
    _emoji_sequences = parse_unicode_datafile(
        data_file('emoji-sequences.txt'))
    _emoji_zwj_sequences = parse_unicode_datafile(
        data_file('emoji-zwj-sequences.txt'))
414
Doug Felt4970cda2016-07-08 17:42:15 -0700415
def flag_sequence(territory_code):
    """Return the regional-indicator sequence for a territory code.

    Each letter is shifted so that 'A' maps to U+1F1E6.
    """
    offset = 0x1F1E6 - ord('A')
    return tuple(ord(letter) + offset for letter in territory_code)


UNSUPPORTED_FLAGS = frozenset(
    flag_sequence(territory) for territory in (
        'BL', 'BQ', 'DG',
        'EA', 'EH', 'FK',
        'GF', 'GP', 'GS',
        'MF', 'MQ', 'NC',
        'PM', 'RE', 'TF',
        'UN', 'WF', 'XK',
        'YT',
    ))

# Flags that are expected to share a glyph with another territory's flag.
EQUIVALENT_FLAGS = {
    flag_sequence(territory): flag_sequence(same_as)
    for territory, same_as in (
        ('BV', 'NO'),
        ('CP', 'FR'),
        ('HM', 'AU'),
        ('SJ', 'NO'),
        ('UM', 'US'),
    )
}

COMBINING_KEYCAP = 0x20E3

# Characters that Android defaults to emoji style, different from the
# recommendations in UTR #51
ANDROID_DEFAULT_EMOJI = frozenset([
    0x2600,  # BLACK SUN WITH RAYS
    0x2601,  # CLOUD
    0x260E,  # BLACK TELEPHONE
    0x261D,  # WHITE UP POINTING INDEX
    0x263A,  # WHITE SMILING FACE
    0x2660,  # BLACK SPADE SUIT
    0x2663,  # BLACK CLUB SUIT
    0x2665,  # BLACK HEART SUIT
    0x2666,  # BLACK DIAMOND SUIT
    0x270C,  # VICTORY HAND
    0x2744,  # SNOWFLAKE
    0x2764,  # HEAVY BLACK HEART
])

# Legacy code points mapped to the sequences they stand for.
# Flags occupy the consecutive code points 0xFE4E5..0xFE4EE.
LEGACY_ANDROID_EMOJI = {
    0xFE4E5 + position: flag_sequence(territory)
    for position, territory in enumerate(
        ('JP', 'US', 'FR', 'DE', 'IT', 'GB', 'ES', 'RU', 'CN', 'KR'))
}
# Keycaps: '#' is 0xFE82C; '1'..'9' then '0' occupy 0xFE82E..0xFE837.
LEGACY_ANDROID_EMOJI[0xFE82C] = (ord('#'), COMBINING_KEYCAP)
LEGACY_ANDROID_EMOJI.update({
    0xFE82E + position: (ord(digit), COMBINING_KEYCAP)
    for position, digit in enumerate('1234567890')
})

ZWJ_IDENTICALS = {
    # KISS
    (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468): 0x1F48F,
    # COUPLE WITH HEART
    (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F468): 0x1F491,
    # FAMILY
    (0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466): 0x1F46A,
}
488
Doug Felt4970cda2016-07-08 17:42:15 -0700489
def is_fitzpatrick_modifier(cp):
    """Return True if cp is in the range U+1F3FB..U+1F3FF."""
    return 0x1F3FB <= cp <= 0x1F3FF


def reverse_emoji(seq):
    """Return seq reversed, with skin-tone modifiers kept after their base.

    Args:
        seq: a sequence of integer code points.

    Returns:
        A tuple holding the code points in reverse order, except that each
        Fitzpatrick modifier still follows the character it followed in
        seq.
    """
    rev = list(reversed(seq))
    # if there are fitzpatrick modifiers in the sequence, keep them after
    # the emoji they modify
    i = 1
    while i < len(rev):
        if is_fitzpatrick_modifier(rev[i - 1]):
            rev[i - 1], rev[i] = rev[i], rev[i - 1]
            # The modifier now sits at index i. Step over it so that it is
            # not swapped again and dragged towards the end of the list.
            i += 2
        else:
            i += 1
    return tuple(rev)
Doug Felt4970cda2016-07-08 17:42:15 -0700502
503
def compute_expected_emoji():
    """Compute the emoji the emoji font is expected to support.

    Reads the module-level tables filled in by parse_ucd().

    Returns:
        A tuple (all_emoji, default_emoji, equivalent_emoji):
          all_emoji: set of every character and sequence the emoji font
              should support.
          default_emoji: set of characters and sequences that default to
              emoji style.
          equivalent_emoji: dict mapping a character or sequence to another
              one that should be rendered with the same glyph.
    """
    equivalent_emoji = {}
    sequence_pieces = set()
    all_sequences = set()
    all_sequences.update(_emoji_variation_sequences)

    # add zwj sequences not in the current emoji-zwj-sequences.txt
    adjusted_emoji_zwj_sequences = dict(_emoji_zwj_sequences)
    # single parent families
    additional_emoji_zwj = (
        (0x1F468, 0x200D, 0x1F466),
        (0x1F468, 0x200D, 0x1F467),
        (0x1F468, 0x200D, 0x1F466, 0x200D, 0x1F466),
        (0x1F468, 0x200D, 0x1F467, 0x200D, 0x1F466),
        (0x1F468, 0x200D, 0x1F467, 0x200D, 0x1F467),
        (0x1F469, 0x200D, 0x1F466),
        (0x1F469, 0x200D, 0x1F467),
        (0x1F469, 0x200D, 0x1F466, 0x200D, 0x1F466),
        (0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F466),
        (0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F467),
    )
    # sequences formed from man and woman and optional fitzpatrick modifier
    modified_extensions = (
        0x2696,
        0x2708,
        0x1F3A8,
        0x1F680,
        0x1F692,
    )
    for seq in additional_emoji_zwj:
        adjusted_emoji_zwj_sequences[seq] = 'Emoji_ZWJ_Sequence'
    for ext in modified_extensions:
        for base in (0x1F468, 0x1F469):
            seq = (base, 0x200D, ext)
            adjusted_emoji_zwj_sequences[seq] = 'Emoji_ZWJ_Sequence'
            for modifier in range(0x1F3FB, 0x1F400):
                seq = (base, modifier, 0x200D, ext)
                adjusted_emoji_zwj_sequences[seq] = 'Emoji_ZWJ_Sequence'

    for sequence in _emoji_sequences:
        sequence = tuple(ch for ch in sequence if ch != EMOJI_VS)
        all_sequences.add(sequence)
        sequence_pieces.update(sequence)

    for sequence in adjusted_emoji_zwj_sequences:
        sequence = tuple(ch for ch in sequence if ch != EMOJI_VS)
        all_sequences.add(sequence)
        sequence_pieces.update(sequence)
        # Add reverse of all emoji ZWJ sequences, which are added to the
        # fonts as a workaround to get the sequences work in RTL text.
        reversed_seq = reverse_emoji(sequence)
        all_sequences.add(reversed_seq)
        equivalent_emoji[reversed_seq] = sequence

    # Add all two-letter flag sequences, as even the unsupported ones should
    # resolve to a flag tofu.
    all_letters = [chr(code) for code in range(ord('A'), ord('Z') + 1)]
    all_two_letter_codes = itertools.product(all_letters, repeat=2)
    all_flags = {flag_sequence(code) for code in all_two_letter_codes}
    all_sequences.update(all_flags)
    tofu_flags = UNSUPPORTED_FLAGS | (all_flags - set(_emoji_sequences))

    legacy_code_points = set(LEGACY_ANDROID_EMOJI)
    all_emoji = (
        _emoji_properties['Emoji'] |
        all_sequences |
        sequence_pieces |
        legacy_code_points)
    default_emoji = (
        _emoji_properties['Emoji_Presentation'] |
        ANDROID_DEFAULT_EMOJI |
        all_sequences |
        legacy_code_points)

    # Every tofu flag is declared equivalent to the smallest one.
    first_tofu_flag = min(tofu_flags)
    for flag in tofu_flags:
        if flag != first_tofu_flag:
            equivalent_emoji[flag] = first_tofu_flag
    equivalent_emoji.update(EQUIVALENT_FLAGS)
    equivalent_emoji.update(LEGACY_ANDROID_EMOJI)
    equivalent_emoji.update(ZWJ_IDENTICALS)
    for seq in _emoji_variation_sequences:
        equivalent_emoji[seq] = seq[0]

    return all_emoji, default_emoji, equivalent_emoji
Roozbeh Pournaderfa1facc2016-03-16 13:53:47 -0700589
590
def main():
    """Run the checks selected on the command line.

    Arguments (sys.argv):
      1: target output directory holding fonts/, etc/fonts.xml and
         usr/hyphen-data.
      2: 'true' to also run the emoji checks.
      3: directory of the Unicode data files; read only when the emoji
         checks are enabled.
    """
    global _fonts_dir
    target_out = sys.argv[1]
    _fonts_dir = path.join(target_out, 'fonts')

    parse_fonts_xml(path.join(target_out, 'etc', 'fonts.xml'))
    check_hyphens(path.join(target_out, 'usr', 'hyphen-data'))

    if sys.argv[2] != 'true':
        return

    parse_ucd(sys.argv[3])
    all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji()
    check_emoji_coverage(all_emoji, equivalent_emoji)
    check_emoji_defaults(default_emoji)


if __name__ == '__main__':
    main()