Tools/scripts/parseentities.py - platform/external/python/cpython2 - Gitiles

 #!/usr/local/bin/python
 """ Utility for parsing HTML entity definitions available from:

       http://www.w3.org/ as e.g.
       http://www.w3.org/TR/REC-html40/HTMLlat1.ent

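     Input is read from the file named by the first command line argument
     (default: stdin); output is written to the file named by the second
     argument (default: stdout) in the form of a Python snippet defining a
     dictionary "entitydefs" mapping literal entity name to character or
     numeric entity.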

     Marc-Andre Lemburg, mal@lemburg.com, 1999.
     Use as you like. NO WARRANTIES.

 """
 import re,sys
 import TextTools

 # Matches one entity declaration of the form
 #   <!ENTITY name CDATA "value" -- comment -->
 # and captures name, value and the (possibly multi-line) comment.
 # Written as a raw string so that \w and \n are handed to the regex
 # engine as-is instead of relying on unknown string escapes.
 entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')

 def parse(text,pos=0,endpos=None):
     """ Collect the entity declarations found in text between pos and endpos.

         text is the content of an entity definition file.  pos and endpos
         delimit the region that is searched; endpos defaults to the end of
         text.

         Returns a dictionary mapping entity name to a tuple
         (charcode, comment), both taken verbatim from the declaration.
         If a name is declared more than once, the last declaration wins.

     """
     # Note: pos is honoured as passed in; it must not be reset here or a
     # caller-supplied start offset would be silently ignored.
     if endpos is None:
         endpos = len(text)
     d = {}
     # finditer() yields the successive non-overlapping matches, i.e. the
     # same sequence as repeatedly searching from the end of the last match.
     for m in entityRE.finditer(text,pos,endpos):
         name,charcode,comment = m.groups()
         d[name] = charcode,comment
     return d

 def writefile(f,defs):
     """ Write defs to f as Python source defining a dictionary "entitydefs".

         f is any object providing a write() method.  defs maps entity
         name to a tuple (charcode, comment).  Entries are written sorted
         by entity name, one per line, with the comment appended after "#".

         A charcode of the form "&#NNN;" with NNN below 256 is written as
         a string literal holding the octal escape of NNN; every other
         charcode is written as its repr().

     """
     f.write("entitydefs = {\n")
     # sorted() rather than items() followed by list.sort(): the result of
     # dict.items() is only a sortable list on Python 2.
     for name,(charcode,comment) in sorted(defs.items()):
         if charcode[:2] == '&#':
             # Drop the leading "&#" and the trailing ";" to get the number.
             code = int(charcode[2:-1])
             if code < 256:
                 # Raw string: the backslash belongs to the generated
                 # literal, it is not an escape in this one.
                 charcode = r"'\%o'" % code
             else:
                 charcode = repr(charcode)
         else:
             charcode = repr(charcode)
         # Fold line breaks and runs of whitespace into single blanks so the
         # comment stays on one line; done with str methods so that no
         # third-party text helper is needed for it.
         comment = ' '.join(comment.split())
         f.write("    '%s':\t%s,  \t# %s\n" % (name,charcode,comment))
     f.write('\n}\n')

 if __name__ == '__main__':
     # Usage: parseentities.py [infile [outfile]]
     # A missing infile means stdin, a missing outfile means stdout.
     if len(sys.argv) > 1:
         infile = open(sys.argv[1])
     else:
         infile = sys.stdin
     try:
         text = infile.read()
     finally:
         # Close only what was opened here; leave the standard streams alone.
         if infile is not sys.stdin:
             infile.close()
     defs = parse(text)
     # The output file is opened only after the input has been read and
     # parsed, so it is not truncated when reading or parsing fails.
     if len(sys.argv) > 2:
         outfile = open(sys.argv[2],'w')
     else:
         outfile = sys.stdout
     try:
         writefile(outfile,defs)
     finally:
         if outfile is not sys.stdout:
             outfile.close()
	#!/usr/local/bin/python
	""" Utility for parsing HTML entity definitions available from:

	http://www.w3.org/ as e.g.
	http://www.w3.org/TR/REC-html40/HTMLlat1.ent

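	Input is read from the file named by the first command line argument
	(default: stdin); output is written to the file named by the second
	argument (default: stdout) in the form of a Python snippet defining a
	dictionary "entitydefs" mapping literal entity name to character or
	numeric entity.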

	Marc-Andre Lemburg, mal@lemburg.com, 1999.
	Use as you like. NO WARRANTIES.

	"""
	import re,sys
	import TextTools

	# Matches one entity declaration of the form
	#   <!ENTITY name CDATA "value" -- comment -->
	# and captures name, value and the (possibly multi-line) comment.
	# The alternation in the comment group is "any character or newline";
	# the bar must not be backslash-escaped, or it would only match a
	# literal "|".  Raw string so \w and \n reach the regex engine as-is.
	entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')

	def parse(text,pos=0,endpos=None):
	    """ Collect the entity declarations found in text between pos and endpos.

	        text is the content of an entity definition file.  pos and endpos
	        delimit the region that is searched; endpos defaults to the end of
	        text.

	        Returns a dictionary mapping entity name to a tuple
	        (charcode, comment), both taken verbatim from the declaration.
	        If a name is declared more than once, the last declaration wins.

	    """
	    # Note: pos is honoured as passed in; it must not be reset here or a
	    # caller-supplied start offset would be silently ignored.
	    if endpos is None:
	        endpos = len(text)
	    d = {}
	    # finditer() yields the successive non-overlapping matches, i.e. the
	    # same sequence as repeatedly searching from the end of the last match.
	    for m in entityRE.finditer(text,pos,endpos):
	        name,charcode,comment = m.groups()
	        d[name] = charcode,comment
	    return d

	def writefile(f,defs):
	    """ Write defs to f as Python source defining a dictionary "entitydefs".

	        f is any object providing a write() method.  defs maps entity
	        name to a tuple (charcode, comment).  Entries are written sorted
	        by entity name, one per line, with the comment appended after "#".

	        A charcode of the form "&#NNN;" with NNN below 256 is written as
	        a string literal holding the octal escape of NNN; every other
	        charcode is written as its repr().

	    """
	    f.write("entitydefs = {\n")
	    # sorted() rather than items() followed by list.sort(): the result of
	    # dict.items() is only a sortable list on Python 2.
	    for name,(charcode,comment) in sorted(defs.items()):
	        if charcode[:2] == '&#':
	            # Drop the leading "&#" and the trailing ";" to get the number.
	            code = int(charcode[2:-1])
	            if code < 256:
	                # Raw string: the backslash belongs to the generated
	                # literal, it is not an escape in this one.
	                charcode = r"'\%o'" % code
	            else:
	                charcode = repr(charcode)
	        else:
	            charcode = repr(charcode)
	        # Fold line breaks and runs of whitespace into single blanks so the
	        # comment stays on one line; done with str methods so that no
	        # third-party text helper is needed for it.
	        comment = ' '.join(comment.split())
	        f.write(" '%s':\t%s, \t# %s\n" % (name,charcode,comment))
	    f.write('\n}\n')

	if __name__ == '__main__':
	    # Usage: parseentities.py [infile [outfile]]
	    # A missing infile means stdin, a missing outfile means stdout.
	    if len(sys.argv) > 1:
	        infile = open(sys.argv[1])
	    else:
	        infile = sys.stdin
	    try:
	        text = infile.read()
	    finally:
	        # Close only what was opened here; leave the standard streams alone.
	        if infile is not sys.stdin:
	            infile.close()
	    defs = parse(text)
	    # The output file is opened only after the input has been read and
	    # parsed, so it is not truncated when reading or parsing fails.
	    if len(sys.argv) > 2:
	        outfile = open(sys.argv[2],'w')
	    else:
	        outfile = sys.stdout
	    try:
	        writefile(outfile,defs)
	    finally:
	        if outfile is not sys.stdout:
	            outfile.close()