improved the script to extracts comments from the gtk-doc DocBook output

* doc/libxml2-api.xml doc/parsedecl.py: improved the script to
  extracts comments from the gtk-doc DocBook output (a bit
  convoluted but seems to work).
Daniel
diff --git a/doc/parsedecl.py b/doc/parsedecl.py
index 0243347..1811b48 100755
--- a/doc/parsedecl.py
+++ b/doc/parsedecl.py
@@ -8,12 +8,13 @@
 import sys
 import string
 
-macros = []
-structs = []
-typedefs = []
+macros = {}
+variables = {}
+structs = {}
+typedefs = {}
 enums = {}
 functions = {}
-private_functions = {}
+user_functions = {}
 ret_types = {}
 types = {}
 
@@ -22,6 +23,11 @@
 identifiers_file = {}
 identifiers_type = {}
 
+##################################################################
+#
+#          Parsing: libxml-decl.txt
+#
+##################################################################
 def mormalizeTypeSpaces(raw, function):
     global types
 
@@ -96,15 +102,23 @@
 def parseMacro():
     global input
     global macros
+    global variables
 
+    var = 1
     line = input.readline()[:-1]
     while line != "</MACRO>":
         if line[0:6] == "<NAME>" and line[-7:] == "</NAME>":
 	    name = line[6:-7]
+	elif string.find(line, "#define") >= 0:
+	    var = 0
 	line = input.readline()[:-1]
 
-    macros.append(name)
-    identifiers_type[name] = "macro"
+    if var == 1:
+	variables[name] = ''
+	identifiers_type[name] = "variable"
+    else:
+	macros[name] = ''
+	identifiers_type[name] = "macro"
 
 def parseStruct():
     global input
@@ -116,7 +130,7 @@
 	    name = line[6:-7]
 	line = input.readline()[:-1]
 
-    structs.append(name)
+    structs[name] = ''
     identifiers_type[name] = "struct"
 
 def parseTypedef():
@@ -129,7 +143,7 @@
 	    name = line[6:-7]
 	line = input.readline()[:-1]
 
-    typedefs.append(name)
+    typedefs[name] = ''
     identifiers_type[name] = "typedef"
 
 def parseEnum():
@@ -164,12 +178,12 @@
 			identifiers_type[token] = "const"
 	line = input.readline()[:-1]
         
-    enums[name] = consts
+    enums[name] = [consts, '']
     identifiers_type[name] = "enum"
 
 def parseStaticFunction():
     global input
-    global private_functions
+    global user_functions
 
     line = input.readline()[:-1]
     type = None
@@ -184,8 +198,8 @@
 	line = input.readline()[:-1]
 
     args = extractArgs(signature, name)
-    private_functions[name] = (type , args)
-    identifiers_type[name] = "private_func"
+    user_functions[name] = [type , args, '']
+    identifiers_type[name] = "functype"
 
 def parseFunction():
     global input
@@ -204,35 +218,9 @@
 	line = input.readline()[:-1]
 
     args = extractArgs(signature, name)
-    functions[name] = (type , args)
+    functions[name] = [type , args, '']
     identifiers_type[name] = "function"
 
-def parseSection():
-    global input
-    global sections
-    global files
-    global identifiers_file
-
-    tokens = []
-    line = input.readline()[:-1]
-    while line != "</SECTION>":
-        if line[0:6] == "<FILE>" and line[-7:] == "</FILE>":
-	    name = line[6:-7]
-	elif len(line) > 0:
-	    tokens.append(line)
-	line = input.readline()[:-1]
-
-    sections.append(name)
-    files[name] = tokens
-    for token in tokens:
-        identifiers_file[token] = name
-	#
-	# Small transitivity for enum values
-	#
-	if enums.has_key(token):
-	    for const in enums[token]:
-	        identifiers_file[const] = name
-
 print "Parsing: libxml-decl.txt"
 input = open('libxml-decl.txt')
 while 1:
@@ -256,16 +244,49 @@
         print "unhandled %s" % (line)
 
 print "Parsed: %d macros. %d structs, %d typedefs, %d enums" % (
-          len(macros), len(structs), len(typedefs), len(enums))
+          len(macros.keys()), len(structs.keys()), len(typedefs.keys()),
+	  len(enums))
 c = 0
 for enum in enums.keys():
-    consts = enums[enum]
+    consts = enums[enum][0]
     c = c + len(consts)
-print "        %d constants, %d functions and %d private functions" % (
-          c, len(functions.keys()), len(private_functions.keys()))
+print "        %d variables, %d constants, %d functions and %d functypes" % (
+          len(variables.keys()), c, len(functions.keys()),
+	  len(user_functions.keys()))
 print "The functions manipulates %d different types" % (len(types.keys()))
 print "The functions returns %d different types" % (len(ret_types.keys()))
 
+##################################################################
+#
+#          Parsing: libxml-decl-list.txt
+#
+##################################################################
+def parseSection():
+    global input
+    global sections
+    global files
+    global identifiers_file
+
+    tokens = []
+    line = input.readline()[:-1]
+    while line != "</SECTION>":
+        if line[0:6] == "<FILE>" and line[-7:] == "</FILE>":
+	    name = line[6:-7]
+	elif len(line) > 0:
+	    tokens.append(line)
+	line = input.readline()[:-1]
+
+    sections.append(name)
+    files[name] = tokens
+    for token in tokens:
+        identifiers_file[token] = name
+	#
+	# Small transitivity for enum values
+	#
+	if enums.has_key(token):
+	    for const in enums[token][0]:
+	        identifiers_file[const] = name
+
 print "Parsing: libxml-decl-list.txt"
 input = open('libxml-decl-list.txt')
 while 1:
@@ -279,6 +300,200 @@
         print "unhandled %s" % (line)
 
 print "Parsed: %d files %d identifiers" % (len(files), len(identifiers_file.keys()))
+##################################################################
+#
+#          Parsing: xml/*.xml
+#          To enrich the existing info with extracted comments
+#
+##################################################################
+
+nbcomments = 0
+
+def insertComment(name, title, value):
+    global nbcomments
+
+    if functions.has_key(name):
+        functions[name][2] = value
+    elif typedefs.has_key(name):
+        typedefs[name] = value
+    elif macros.has_key(name):
+        macros[name] = value
+    elif variables.has_key(name):
+        variables[name] = value
+    elif structs.has_key(name):
+        structs[name] = value
+    elif enums.has_key(name):
+        enums[name][1] = value
+    elif user_functions.has_key(name):
+        user_functions[name] = value
+    else:
+        print "lost comment %s: %s" % (name, value)
+	return
+    nbcomments = nbcomments + 1
+
+import os
+import xmllib
+try:
+    import sgmlop
+except ImportError:
+    sgmlop = None # accelerator not available
+
+debug = 0
+
+if sgmlop:
+    class FastParser:
+	"""sgmlop based XML parser.  this is typically 15x faster
+	   than SlowParser..."""
+
+	def __init__(self, target):
+
+	    # setup callbacks
+	    self.finish_starttag = target.start
+	    self.finish_endtag = target.end
+	    self.handle_data = target.data
+
+	    # activate parser
+	    self.parser = sgmlop.XMLParser()
+	    self.parser.register(self)
+	    self.feed = self.parser.feed
+	    self.entity = {
+		"amp": "&", "gt": ">", "lt": "<",
+		"apos": "'", "quot": '"'
+		}
+
+	def close(self):
+	    try:
+		self.parser.close()
+	    finally:
+		self.parser = self.feed = None # nuke circular reference
+
+	def handle_entityref(self, entity):
+	    # <string> entity
+	    try:
+		self.handle_data(self.entity[entity])
+	    except KeyError:
+		self.handle_data("&%s;" % entity)
+
+else:
+    FastParser = None
+
+
+class SlowParser(xmllib.XMLParser):
+    """slow but safe standard parser, based on the XML parser in
+       Python's standard library."""
+
+    def __init__(self, target):
+	self.unknown_starttag = target.start
+	self.handle_data = target.data
+	self.unknown_endtag = target.end
+	xmllib.XMLParser.__init__(self)
+
+def getparser(target = None):
+    # get the fastest available parser, and attach it to an
+    # unmarshalling object.  return both objects.
+    if target == None:
+	target = docParser()
+    if FastParser:
+	return FastParser(target), target
+    return SlowParser(target), target
+
+class docParser:
+    def __init__(self):
+        self._methodname = None
+	self._data = []
+	self.id = None
+	self.title = None
+	self.descr = None
+	self.string = None
+
+    def close(self):
+        if debug:
+	    print "close"
+
+    def getmethodname(self):
+        return self._methodname
+
+    def data(self, text):
+        if debug:
+	    print "data %s" % text
+        self._data.append(text)
+
+    def start(self, tag, attrs):
+        if debug:
+	    print "start %s, %s" % (tag, attrs)
+	if tag == 'refsect2':
+	    self.id = None
+	    self.title = None
+	    self.descr = None
+	    self.string = None
+	elif tag == 'para':
+	    self._data = []
+	elif tag == 'title':
+	    self._data = []
+	elif tag == 'anchor' and self.id == None:
+	    if attrs.has_key('id'):
+	        self.id = attrs['id']
+		self.id = string.replace(self.id, '-CAPS', '')
+		self.id = string.replace(self.id, '-', '_')
+
+    def end(self, tag):
+        if debug:
+	    print "end %s" % tag
+	if tag == 'refsect2':
+	    insertComment(self.id, self.title, self.string)
+	elif tag == 'para':
+	    if self.string == None:
+		str = ''
+		for c in self._data:
+		    str = str + c
+		str = string.replace(str, '\n', ' ')
+		str = string.replace(str, '\r', ' ')
+		str = string.replace(str, '    ', ' ')
+		str = string.replace(str, '   ', ' ')
+		str = string.replace(str, '  ', ' ')
+		while len(str) >= 1 and str[0] == ' ':
+		    str=str[1:]
+		self.string = str
+	    self._data = []
+	elif tag == 'title':
+	    str = ''
+	    for c in self._data:
+	        str = str + c
+	    str = string.replace(str, '\n', ' ')
+	    str = string.replace(str, '\r', ' ')
+	    str = string.replace(str, '    ', ' ')
+	    str = string.replace(str, '   ', ' ')
+	    str = string.replace(str, '  ', ' ')
+	    while len(str) >= 1 and str[0] == ' ':
+		str=str[1:]
+	    self.title = str
+
+xmlfiles = 0
+filenames = os.listdir("xml")
+for filename in filenames:
+    try:
+        f = open("xml/" + filename, 'r')
+    except IOError, msg:
+        print file, ":", msg
+	continue
+    data = f.read()
+    (parser, target)  = getparser()
+    parser.feed(data)
+    parser.close()
+    xmlfiles = xmlfiles + 1
+
+print "Parsed: %d XML files collexting %d comments" % (xmlfiles, nbcomments)
+
+##################################################################
+#
+#          Saving: libxml2-api.xml
+#
+##################################################################
+
+def escape(raw):
+    raw = string.replace(raw, '<', '&lt;')
+    raw = string.replace(raw, '>', '&gt;')
+    return raw
 
 print "Saving XML description libxml2-api.xml"
 output = open("libxml2-api.xml", "w")
@@ -292,14 +507,16 @@
 output.write("  </files>\n")
 
 output.write("  <symbols>\n")
-symbols=macros
-for i in structs: symbols.append(i)
-for i in typedefs: symbols.append(i)
+symbols=macros.keys()
+for i in structs.keys(): symbols.append(i)
+for i in variables.keys(): variables.append(i)
+for i in typedefs.keys(): symbols.append(i)
 for i in enums.keys():
     symbols.append(i)
-    for j in enums[i]:
+    for j in enums[i][0]:
         symbols.append(j)
 for i in functions.keys(): symbols.append(i)
+for i in user_functions.keys(): symbols.append(i)
 symbols.sort()
 prev = None
 for i in symbols:
@@ -321,12 +538,39 @@
 	    output.write(" file='%s'" % (file))
 	if type == "function":
 	   output.write(">\n");
-	   (ret, args) = functions[i]
+	   (ret, args, doc) = functions[i]
+	   if doc != None and doc != '':
+	       output.write("      <info>%s</info>\n" % (escape(doc)))
 	   output.write("      <return type='%s'/>\n" % (ret))
 	   for arg in args:
 	       output.write("      <arg name='%s' type='%s'/>\n" % (
 	                    arg[1], arg[0]))
 	   output.write("    </%s>\n" % (type));
+	elif type == 'macro':
+	   if macros[i] != None and macros[i] != '':
+	       output.write(" info='%s'/>\n" % (escape(macros[i])))
+	   else:
+	       output.write("/>\n");
+	elif type == 'struct':
+	   if structs[i] != None and structs[i] != '':
+	       output.write(" info='%s'/>\n" % (escape(structs[i])))
+	   else:
+	       output.write("/>\n");
+	elif type == 'functype':
+	   if user_functions[i] != None and user_functions[i] != '':
+	       output.write(" info='%s'/>\n" % (escape(user_functions[i])))
+	   else:
+	       output.write("/>\n");
+	elif type == 'variable':
+	   if variables[i] != None and variables[i] != '':
+	       output.write(" info='%s'/>\n" % (escape(variables[i])))
+	   else:
+	       output.write("/>\n");
+	elif type == 'typedef':
+	   if typedefs[i] != None and typedefs[i] != '':
+	       output.write(" info='%s'/>\n" % (escape(typedefs[i])))
+	   else:
+	       output.write("/>\n");
 	else:
 	   output.write("/>\n");
     else: