Guido van Rossum | a8b37ad | 1999-08-19 16:00:41 +0000 | [diff] [blame] | 1 | #!/usr/local/bin/python |
| 2 | """ Utility for parsing HTML entity definitions available from: |
| 3 | |
| 4 | http://www.w3.org/ as e.g. |
| 5 | http://www.w3.org/TR/REC-html40/HTMLlat1.ent |
| 6 | |
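| 7 | Input is read from the file named by the first argument (default: stdin); output is written to the file named by the second argument (default: stdout) |
| 8 | in the form of a Python snippet defining a dictionary "entitydefs" mapping each literal |
| 9 | entity name to a character or numeric entity. |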
| 10 | |
Tim Peters | 70c4378 | 2001-01-17 08:48:39 +0000 | [diff] [blame] | 11 | Marc-Andre Lemburg, mal@lemburg.com, 1999. |
Guido van Rossum | a8b37ad | 1999-08-19 16:00:41 +0000 | [diff] [blame] | 12 | Use as you like. NO WARRANTIES. |
| 13 | |
| 14 | """ |
| 15 | import re,sys |
| 16 | import TextTools |
| 17 | |
# Matches one declaration of the form
#     <!ENTITY name CDATA "value" -- comment -->
# Groups: (1) entity name, (2) quoted value, (3) comment text, which may
# span several lines; spaces directly before the closing "-->" are excluded.
# Raw string so that "\w" reaches the regex engine instead of being treated
# as an (invalid) string escape.
entityRE = re.compile(
    r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')

def parse(text,pos=0,endpos=None):
    """Collect the entity declarations found in text[pos:endpos].

    text   -- string containing <!ENTITY ...> declarations
    pos    -- index at which scanning starts (default 0)
    endpos -- index at which scanning stops (default: len(text))

    Returns a dictionary mapping each entity name to a
    (charcode, comment) tuple of strings, exactly as captured by
    entityRE.  If a name is declared more than once, the last
    declaration in the scanned range wins.
    """
    if endpos is None:
        endpos = len(text)
    d = {}
    # finditer() resumes after the end of each match, which is what the
    # explicit search loop did; honouring `pos` here (rather than
    # resetting it to 0) lets callers scan a sub-range of the text.
    for m in entityRE.finditer(text, pos, endpos):
        name, charcode, comment = m.groups()
        d[name] = charcode, comment
    return d
| 34 | |
def writefile(f,defs):
    """Write defs to f as Python source defining a dict "entitydefs".

    f    -- object with a write() method accepting strings
    defs -- dictionary mapping entity name to a (charcode, comment)
            tuple, as returned by parse()

    One line is written per entity, sorted by name.  A numeric
    reference ("&#NNN;") below 256 is written as a one-character
    string literal using an octal escape; every other value is written
    as the repr() of the original string.  The comment is appended as
    a trailing "#" comment with all runs of whitespace (including line
    breaks) collapsed to single spaces.
    """
    f.write("entitydefs = {\n")
    for name, (charcode, comment) in sorted(defs.items()):
        if charcode[:2] == '&#':
            # Drop the leading "&#" and the last character (expected to
            # be the terminating ";") to get the decimal code.
            code = int(charcode[2:-1])
            if code < 256:
                # Raw string: the backslash must appear literally in the
                # generated source, e.g. '\240' for code 160.
                charcode = r"'\%o'" % code
            else:
                charcode = repr(charcode)
        else:
            charcode = repr(charcode)
        # Collapse whitespace with the standard library only, so the
        # comment always fits on the single output line.
        comment = ' '.join(comment.split())
        f.write(" '%s':\t%s, \t# %s\n" % (name,charcode,comment))
    f.write('\n}\n')
| 51 | |
# Command-line entry point:
#     parseentities.py [infile [outfile]]
# A missing infile means stdin; a missing outfile means stdout.
if __name__ == '__main__':
    if len(sys.argv) > 1:
        infile = open(sys.argv[1])
    else:
        infile = sys.stdin
    try:
        if len(sys.argv) > 2:
            outfile = open(sys.argv[2],'w')
        else:
            outfile = sys.stdout
        try:
            text = infile.read()
            defs = parse(text)
            writefile(outfile,defs)
        finally:
            # Close only what was opened here; the standard streams
            # belong to the interpreter.
            if outfile is not sys.stdout:
                outfile.close()
    finally:
        if infile is not sys.stdin:
            infile.close()