| #!/usr/local/bin/python | 
 | """ Utility for parsing HTML entity definitions available from: | 
 |  | 
 |       http://www.w3.org/ as e.g. | 
 |       http://www.w3.org/TR/REC-html40/HTMLlat1.ent | 
 |  | 
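    Input is read from the file named by the first command line
    argument (default: stdin); output is written to the file named by
    the second command line argument (default: stdout) in the form of
    a Python snippet defining a dictionary "entitydefs" mapping each
    literal entity name to a character or numeric entity.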
 |  | 
 |     Marc-Andre Lemburg, mal@lemburg.com, 1999. | 
 |     Use as you like. NO WARRANTIES. | 
 |  | 
 | """ | 
 | import re,sys | 
 | import TextTools | 
 |  | 
 | entityRE = re.compile('<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->') | 
 |  | 
 | def parse(text,pos=0,endpos=None): | 
 |  | 
 |     pos = 0 | 
 |     if endpos is None: | 
 |         endpos = len(text) | 
 |     d = {} | 
 |     while 1: | 
 |         m = entityRE.search(text,pos,endpos) | 
 |         if not m: | 
 |             break | 
 |         name,charcode,comment = m.groups() | 
 |         d[name] = charcode,comment | 
 |         pos = m.end() | 
 |     return d | 
 |  | 
 | def writefile(f,defs): | 
 |  | 
 |     f.write("entitydefs = {\n") | 
 |     items = defs.items() | 
 |     items.sort() | 
 |     for name,(charcode,comment) in items: | 
 |         if charcode[:2] == '&#': | 
 |             code = int(charcode[2:-1]) | 
 |             if code < 256: | 
 |                 charcode = "'\%o'" % code | 
 |             else: | 
 |                 charcode = repr(charcode) | 
 |         else: | 
 |             charcode = repr(charcode) | 
 |         comment = TextTools.collapse(comment) | 
 |         f.write("    '%s':\t%s,  \t# %s\n" % (name,charcode,comment)) | 
 |     f.write('\n}\n') | 
 |  | 
 | if __name__ == '__main__': | 
 |     if len(sys.argv) > 1: | 
 |         infile = open(sys.argv[1]) | 
 |     else: | 
 |         infile = sys.stdin | 
 |     if len(sys.argv) > 2: | 
 |         outfile = open(sys.argv[2],'w') | 
 |     else: | 
 |         outfile = sys.stdout | 
 |     text = infile.read() | 
 |     defs = parse(text) | 
 |     writefile(outfile,defs) |