blob: c011328b0101bf4267d874de009690c97d77ec09 [file] [log] [blame]
Ezio Melotticcc9e612012-10-23 15:46:33 +02001#!/usr/bin/env python3
2"""
3Utility for parsing HTML5 entity definitions available from:
4
5 http://dev.w3.org/html5/spec/entities.json
6
7Written by Ezio Melotti and Iuliia Proskurnia.
8
9"""
10
11import os
12import sys
13import json
14from urllib.request import urlopen
15from html.entities import html5
16
# Location of the JSON file with the HTML5 entity definitions; it is
# downloaded and parsed when this file is run as a script.
entities_url = 'http://dev.w3.org/html5/spec/entities.json'
18
def get_json(url):
    """Fetch *url* and return its body parsed as JSON.

    The whole response is read, decoded as UTF-8 and handed to
    ``json.loads``; the resulting Python object is returned.
    """
    with urlopen(url) as response:
        raw = response.read()
    return json.loads(raw.decode('utf-8'))
24
def create_dict(entities):
    """Build the html5 mapping from the decoded entities JSON object.

    Each key of *entities* has its leading ``&`` characters stripped and is
    mapped to the ``'characters'`` field of the corresponding entry.
    """
    return {
        reference.lstrip('&'): info['characters']
        for reference, info in entities.items()
    }
31
def compare_dicts(old, new):
    """Print a report of how *new* differs from *old*.

    Up to three sections are printed, each only when it is non-empty and
    each listed in sorted order: names found only in *new* (added), names
    found only in *old* (removed), and names found in both whose values
    differ (modified, shown as old -> new).
    """
    old_names = old.keys()
    new_names = new.keys()

    only_in_new = sorted(new_names - old_names)
    if only_in_new:
        print('{} entitie(s) have been added:'.format(len(only_in_new)))
        for key in only_in_new:
            print(' {!r}: {!r}'.format(key, new[key]))

    only_in_old = sorted(old_names - new_names)
    if only_in_old:
        print('{} entitie(s) have been removed:'.format(len(only_in_old)))
        for key in only_in_old:
            print(' {!r}: {!r}'.format(key, old[key]))

    # Names are unique, so sorting the (name, old, new) triples orders
    # the report by name.
    modified = {
        (key, old[key], new[key])
        for key in old_names & new_names
        if old[key] != new[key]
    }
    if modified:
        print('{} entitie(s) have been modified:'.format(len(modified)))
        for triple in sorted(modified):
            print(' {!r}: {!r} -> {!r}'.format(*triple))
52
def write_items(entities, file=sys.stdout):
    """Write *entities* to *file* as the source of an ``html5 = {...}`` dict.

    One ``name: value`` line is emitted per item, with the value rendered
    through ``{!a}`` so that it is written with ASCII-only escapes.
    """
    # Order the names case-insensitively; names that differ only by case
    # are tie-broken by their exact spelling, which puts the uppercase
    # variant first, e.g. ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...].
    # This yields the same order as a case-sensitive sort followed by a
    # stable sort with key=str.lower.
    ordered = sorted(entities, key=lambda key: (str.lower(key), key))
    print('html5 = {', file=file)
    for key in ordered:
        entry = ' {!r}: {!a},'.format(key, entities[key])
        print(entry, file=file)
    print('}', file=file)
69
70
if __name__ == '__main__':
    # Command-line behaviour:
    #   (no option)  print a diff between html.entities.html5 and new_html5
    #   --create     print the new html5 dict
    #   --patch      patch the Lib/html/entities.py file
    new_html5 = create_dict(get_json(entities_url))
    if '--create' in sys.argv:
        print('# map the HTML5 named character references to the '
              'equivalent Unicode character(s)')
        print('# Generated by {}. Do not edit manually.'.format(__file__))
        write_items(new_html5)
    elif '--patch' in sys.argv:
        fname = 'Lib/html/entities.py'
        temp_fname = fname + '.temp'
        # Python source files are UTF-8 by default, so read and write the
        # module with an explicit encoding instead of the locale's one.
        with open(fname, encoding='utf-8') as f1, \
                open(temp_fname, 'w', encoding='utf-8') as f2:
            skip = False
            for line in f1:
                if line.startswith('html5 = {'):
                    # Emit the regenerated dict in place of the old one.
                    write_items(new_html5, file=f2)
                    skip = True
                    continue
                if skip:
                    # skip the old items until the closing }
                    if line.startswith('}'):
                        skip = False
                    continue
                f2.write(line)
        # os.replace swaps the file in a single step and overwrites an
        # existing destination on every platform, so there is no window
        # in which entities.py has been removed but not yet recreated.
        os.replace(temp_fname, fname)
    else:
        if html5 == new_html5:
            print('The current dictionary is updated.')
        else:
            compare_dicts(html5, new_html5)
            # The hint names the file that --patch actually rewrites.
            print('Run "./python {0} --patch" to update Lib/html/entities.py '
                  'or "./python {0} --create" to see the generated '
                  'dictionary.'.format(__file__))