Added HTML page indexing (Daniel). * doc/index.py: added HTML page indexing. Daniel

commit: 141d04ba749f93490561b8253794b99490b801eb [log] [tgz]
author: Daniel Veillard <veillard@src.gnome.org> Sun Oct 06 21:51:18 2002 +0000
committer: Daniel Veillard <veillard@src.gnome.org> Sun Oct 06 21:51:18 2002 +0000
tree: 7725c29fa8fcdf1a4779a947cd9b8f78e1092364
parent: dc6d4abae736cfdf09dfb7ee38bafe47708d328f [diff] [blame]
diff --git a/doc/index.py b/doc/index.py
index 4e32419..2cca7c0 100755
--- a/doc/index.py
+++ b/doc/index.py

@@ -42,6 +42,13 @@
 import os
 
 #
+# We are not interested in parsing errors here
+#
+def callback(ctx, str):
+    return
+libxml2.registerErrorHandler(callback, None)
+
+#
 # The dictionnary of tables required and the SQL command needed
 # to create them
 #
@@ -60,6 +67,19 @@
 	   KEY name (name),
 	   KEY symbol (symbol),
 	   UNIQUE KEY ID (name, symbol))""",
+  "wordsHTML" : """CREATE TABLE wordsHTML (
+           name varchar(50) NOT NULL,
+	   resource varchar(255) NOT NULL,
+	   section varchar(255),
+	   id varchar(50),
+           relevance int,
+	   KEY name (name),
+	   KEY resource (resource),
+	   UNIQUE KEY ref (name, resource))""",
+  "pages" : """CREATE TABLE pages (
+           resource varchar(255) NOT NULL,
+	   title varchar(255) NOT NULL,
+	   UNIQUE KEY name (resource))""",
   "Queries" : """CREATE TABLE Queries (
            ID int(11) NOT NULL auto_increment,
 	   Value varchar(50) NOT NULL,
@@ -237,6 +257,74 @@
 def addFunctype(name, module, desc = ""):
     return updateSymbol(name, module, 'functype', desc)
 
+def addPage(resource, title):
+    global DB
+
+    if DB == None:
+        openMySQL()
+    if DB == None:
+        return -1
+    if resource == None:
+        return -1
+
+    c = DB.cursor()
+    try:
+	ret = c.execute(
+	    """INSERT INTO pages (resource, title) VALUES ('%s','%s')""" %
+                    (resource, title))
+    except:
+        try:
+	    ret = c.execute(
+		"""UPDATE pages SET title='%s' WHERE resource='%s'""" %
+                    (title, resource))
+        except:
+	    print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
+	    print """UPDATE pages SET title='%s' WHERE resource='%s'""" % (title, resource)
+	    print sys.exc_type, sys.exc_value
+	    return -1
+	     
+    return ret
+
+def updateWordHTML(name, resource, desc, id, relevance):
+    global DB
+
+    if DB == None:
+        openMySQL()
+    if DB == None:
+        return -1
+    if name == None:
+        return -1
+    if resource == None:
+        return -1
+    if id == None:
+        id = ""
+    if desc == None:
+        desc = ""
+    else:
+	try:
+	    desc = string.replace(desc, "'", " ")
+	    desc = desc[0:99]
+	except:
+	    desc = ""
+
+    c = DB.cursor()
+    try:
+	ret = c.execute(
+"""INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES ('%s','%s', '%s', '%s', '%d')""" %
+                    (name, resource, desc, id, relevance))
+    except:
+        try:
+	    ret = c.execute(
+"""UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" %
+                    (desc, id, relevance, name, resource))
+        except:
+	    print "Update symbol (%s, %s, %d) failed command" % (name, resource, relevance)
+	    print """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource)
+	    print sys.exc_type, sys.exc_value
+	    return -1
+	     
+    return ret
+        
 #########################################################################
 #									#
 #                  Word dictionnary and analysis routines		#
@@ -244,6 +332,7 @@
 #########################################################################
 
 wordsDict = {}
+wordsDictHTML = {}
 
 def splitIdentifier(str):
     ret = []
@@ -303,6 +392,65 @@
 
     return ret
 
+def addWordHTML(word, resource, id, section, relevance):
+    global wordsDictHTML
+
+    if word == None or len(word) < 3:
+        return -1
+    if resource == None or section == None:
+        return -1
+
+    if wordsDictHTML.has_key(word):
+        d = wordsDictHTML[word]
+	if d == None:
+	    return 0
+	if len(d) > 15:
+	    wordsDictHTML[word] = None
+	    return 0
+	try:
+	    (r,i,s) = d[resource]
+	    if i != None:
+	        id = i
+	    if s != None:
+	        section = s
+	    relevance = relevance + r
+	except:
+	    pass
+    else:
+        wordsDictHTML[word] = {}
+    wordsDictHTML[word][resource] = (relevance, id, section)
+    return relevance
+    
+def addStringHTML(str, resource, id, section, relevance):
+    if str == None or len(str) < 3:
+        return -1
+    ret = 0
+    str = string.replace(str, ".", " ")
+    str = string.replace(str, ",", " ")
+    str = string.replace(str, "'", " ")
+    str = string.replace(str, '"', " ")
+    str = string.replace(str, ";", " ")
+    str = string.replace(str, "-", " ")
+    str = string.replace(str, "(", " ")
+    str = string.replace(str, ")", " ")
+    str = string.replace(str, "{", " ")
+    str = string.replace(str, "}", " ")
+    str = string.replace(str, "<", " ")
+    str = string.replace(str, ">", " ")
+    str = string.replace(str, "/", " ")
+    str = string.replace(str, "*", " ")
+    str = string.replace(str, ":", " ")
+    str = string.replace(str, "\n", " ")
+    str = string.replace(str, "\r", " ")
+    str = string.replace(str, "\xc2", " ")
+    str = string.replace(str, "\xa0", " ")
+    l = string.split(str)
+    for word in l:
+	if len(word) > 2:
+	    ret = ret + addWordHTML(word, resource, id, section, relevance)
+
+    return ret
+
 
 #########################################################################
 #									#
@@ -563,6 +711,83 @@
 
 #########################################################################
 #									#
+#                  Web pages parsing and analysis			#
+#									#
+#########################################################################
+
+import glob
+
+def analyzeHTMLPara(doc, resource, p, section, id):
+    words = 0
+    try:
+	content = p.content
+	words = words + addStringHTML(content, resource, id, section, 5)
+    except:
+        return -1
+    return words
+
+def analyzeHTMLPre(doc, resource, p, section, id):
+    words = 0
+    try:
+	content = p.content
+	words = words + addStringHTML(content, resource, id, section, 5)
+    except:
+        return -1
+    return words
+
+def analyzeHTML(doc, resource):
+    para = 0;
+    ctxt = doc.xpathNewContext()
+    try:
+	res = ctxt.xpathEval("//head/title")
+	title = res[0].content
+    except:
+        title = "Page %s" % (resource)
+    addPage(resource, title)
+    try:
+	items = ctxt.xpathEval("//h1 | //h2 | //h3 | //p | //pre")
+	section = title
+	id = ""
+	for item in items:
+	    if item.name == 'h1' or item.name == 'h2' or item.name == 'h3':
+	        section = item.content
+		if item.prop("id"):
+		    id = item.prop("id")
+		elif item.prop("name"):
+		    id = item.prop("name")
+	    elif item.name == 'p':
+	        analyzeHTMLPara(doc, resource, item, section, id)
+		para = para + 1
+	    elif item.name == 'pre':
+	        analyzeHTMLPre(doc, resource, item, section, id)
+		para = para + 1
+	    else:
+	        print "Page %s, unexpected %s element" % (resource, item.name)
+    except:
+        print "Page %s: problem analyzing" % (resource)
+	print sys.exc_type, sys.exc_value
+
+    return para
+
+def analyzeHTMLPages():
+    ret = 0
+    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
+    for html in HTMLfiles:
+	if html[0:3] == "API":
+	    continue
+	if html == "xml.html":
+	    continue
+	try:
+	    doc = libxml2.htmlParseFile(html, None)
+	    res = analyzeHTML(doc, html)
+	    print "Parsed %s : %d paragraphs" % (html, res)
+	    ret = ret + 1
+	except:
+	    print "could not parse %s" % (html)
+    return ret
+
+#########################################################################
+#									#
 #          Main code: open the DB, the API XML and analyze it		#
 #									#
 #########################################################################
@@ -573,6 +798,23 @@
     print sys.exc_type, sys.exc_value
     sys.exit(1)
 
+ret = analyzeHTMLPages()
+print "Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret)
+
+i = 0
+skipped = 0
+for word in wordsDictHTML.keys():
+    refs = wordsDictHTML[word]
+    if refs  == None:
+        skipped = skipped + 1
+        continue;
+    for resource in refs.keys():
+        (relevance, id, section) = refs[resource]
+        updateWordHTML(word, resource, section, id, relevance)
+	i = i + 1
+
+print "Found %d associations in HTML pages" % (i)
+
 try:
     doc = loadAPI(API)
     ret = analyzeAPI(doc)
commit	141d04ba749f93490561b8253794b99490b801eb	[log] [tgz]
author	Daniel Veillard <veillard@src.gnome.org>	Sun Oct 06 21:51:18 2002 +0000
committer	Daniel Veillard <veillard@src.gnome.org>	Sun Oct 06 21:51:18 2002 +0000
tree	7725c29fa8fcdf1a4779a947cd9b8f78e1092364
parent	dc6d4abae736cfdf09dfb7ee38bafe47708d328f [diff] [blame]