Ezio Melotti | ccc9e61 | 2012-10-23 15:46:33 +0200 | [diff] [blame] | 1 | #!/usr/bin/env python3 |
| 2 | """ |
| 3 | Utility for parsing HTML5 entity definitions available from: |
| 4 | |
| 5 | http://dev.w3.org/html5/spec/entities.json |
| 6 | |
| 7 | Written by Ezio Melotti and Iuliia Proskurnia. |
| 8 | |
| 9 | """ |
| 10 | |
| 11 | import os |
| 12 | import sys |
| 13 | import json |
| 14 | from urllib.request import urlopen |
| 15 | from html.entities import html5 |
| 16 | |
# URL of the JSON file holding the HTML5 entity definitions; it is
# downloaded by get_json() when the script runs.
# NOTE(review): this is a plain-http W3C draft URL -- confirm it still
# resolves (the WHATWG HTML spec also publishes an entities.json).
entities_url = 'http://dev.w3.org/html5/spec/entities.json'
| 18 | |
def get_json(url):
    """Fetch the JSON document at *url* and return the decoded object.

    The response body is read in full, decoded as UTF-8 and parsed with
    json.loads().
    """
    with urlopen(url) as response:
        raw = response.read()
    return json.loads(raw.decode('utf-8'))
| 24 | |
def create_dict(entities):
    """Build the html5 dict from the decoded JSON object.

    *entities* maps entity names to mappings that have a 'characters'
    item.  The result maps each name, with any leading '&' stripped, to
    the value of that 'characters' item.
    """
    return {
        entity.lstrip('&'): info['characters']
        for entity, info in entities.items()
    }
| 31 | |
def compare_dicts(old, new):
    """Print the differences between the *old* and *new* entity dicts.

    Three sections are reported, each only when it is non-empty and each
    sorted by entity name: names only present in *new* (added), names
    only present in *old* (removed), and names present in both whose
    values differ (modified).
    """
    old_names = old.keys()
    new_names = new.keys()

    only_new = new_names - old_names
    if only_new:
        print('{} entitie(s) have been added:'.format(len(only_new)))
        for entity in sorted(only_new):
            print(' {!r}: {!r}'.format(entity, new[entity]))

    only_old = old_names - new_names
    if only_old:
        print('{} entitie(s) have been removed:'.format(len(only_old)))
        for entity in sorted(only_old):
            print(' {!r}: {!r}'.format(entity, old[entity]))

    # (name, old value, new value) for every entity whose value changed
    modified = {
        (entity, old[entity], new[entity])
        for entity in old_names & new_names
        if old[entity] != new[entity]
    }
    if modified:
        print('{} entitie(s) have been modified:'.format(len(modified)))
        for entry in sorted(modified):
            print(' {!r}: {!r} -> {!r}'.format(*entry))
| 52 | |
def write_items(entities, file=sys.stdout):
    """Write *entities* to *file* as the source of a dict named html5.

    One 'name: value' line is written per item, with the value rendered
    through ascii().  The keys are ordered case-insensitively; keys that
    differ only in case are written uppercase first, so the result looks
    like: ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
    """
    # Sorting on the pair (lowercased key, key) gives the case-insensitive
    # order and breaks ties with the plain case-sensitive comparison, in
    # which uppercase characters come before lowercase ones.
    ordered = sorted(entities, key=lambda name: (name.lower(), name))
    print('html5 = {', file=file)
    for name in ordered:
        print(' {!r}: {!a},'.format(name, entities[name]), file=file)
    print('}', file=file)
| 69 | |
| 70 | |
if __name__ == '__main__':
    # Usage:
    #   without args  print a diff between html.entities.html5 and new_html5
    #   --create      print the new html5 dict
    #   --patch       patch the Lib/html/entities.py file
    new_html5 = create_dict(get_json(entities_url))
    if '--create' in sys.argv:
        print('# map the HTML5 named character references to the '
              'equivalent Unicode character(s)')
        print('# Generated by {}. Do not edit manually.'.format(__file__))
        write_items(new_html5)
    elif '--patch' in sys.argv:
        fname = 'Lib/html/entities.py'
        temp_fname = fname + '.temp'
        # Python source files are UTF-8 by default, so read and write the
        # module explicitly as UTF-8 instead of relying on the locale.
        with open(fname, encoding='utf-8') as f1, \
                open(temp_fname, 'w', encoding='utf-8') as f2:
            skip = False
            for line in f1:
                if line.startswith('html5 = {'):
                    # emit the regenerated dict in place of the old one
                    write_items(new_html5, file=f2)
                    skip = True
                    continue
                if skip:
                    # skip the old items until the }
                    if line.startswith('}'):
                        skip = False
                    continue
                f2.write(line)
        # Swap the patched copy in with a single call so that the original
        # file is never removed before its replacement is in place.
        os.replace(temp_fname, fname)
    else:
        if html5 == new_html5:
            print('The current dictionary is updated.')
        else:
            compare_dicts(html5, new_html5)
            # The file rewritten by --patch is entities.py (see fname above).
            print('Run "./python {0} --patch" to update Lib/html/entities.py '
                  'or "./python {0} --create" to see the generated '
                  'dictionary.'.format(__file__))