Debugging of strange results and tuning, Daniel

commit: a6287a463c7f9e3c4b6db7a2b12f9c84ad0b2020 [log] [tgz]
author: Daniel Veillard <veillard@src.gnome.org> Mon Oct 07 13:17:22 2002 +0000
committer: Daniel Veillard <veillard@src.gnome.org> Mon Oct 07 13:17:22 2002 +0000
tree: 6076635ffb2cf5a6770e17586236426d4c6c083b
parent: 9b00613d0583e2bca7589257bc32984d5b358a10 [diff] [blame]
diff --git a/doc/index.py b/doc/index.py
index e3b8588..409e90d 100755
--- a/doc/index.py
+++ b/doc/index.py

@@ -54,22 +54,22 @@
 #
 TABLES={
   "symbols" : """CREATE TABLE symbols (
-           name varchar(255) NOT NULL,
-	   module varchar(255) NOT NULL,
+           name varchar(255) BINARY NOT NULL,
+	   module varchar(255) BINARY NOT NULL,
            type varchar(25) NOT NULL,
 	   descr varchar(255),
 	   UNIQUE KEY name (name),
 	   KEY module (module))""",
   "words" : """CREATE TABLE words (
-           name varchar(50) NOT NULL,
-	   symbol varchar(255) NOT NULL,
+           name varchar(50) BINARY NOT NULL,
+	   symbol varchar(255) BINARY NOT NULL,
            relevance int,
 	   KEY name (name),
 	   KEY symbol (symbol),
 	   UNIQUE KEY ID (name, symbol))""",
   "wordsHTML" : """CREATE TABLE wordsHTML (
-           name varchar(50) NOT NULL,
-	   resource varchar(255) NOT NULL,
+           name varchar(50) BINARY NOT NULL,
+	   resource varchar(255) BINARY NOT NULL,
 	   section varchar(255),
 	   id varchar(50),
            relevance int,
@@ -77,8 +77,8 @@
 	   KEY resource (resource),
 	   UNIQUE KEY ref (name, resource))""",
   "pages" : """CREATE TABLE pages (
-           resource varchar(255) NOT NULL,
-	   title varchar(255) NOT NULL,
+           resource varchar(255) BINARY NOT NULL,
+	   title varchar(255) BINARY NOT NULL,
 	   UNIQUE KEY name (resource))""",
   "Queries" : """CREATE TABLE Queries (
            ID int(11) NOT NULL auto_increment,
@@ -403,9 +403,7 @@
     if wordsDictHTML.has_key(word):
         d = wordsDictHTML[word]
 	if d == None:
-	    return 0
-	if len(d) > 15:
-	    wordsDictHTML[word] = None
+	    print "skipped %s" % (word)
 	    return 0
 	try:
 	    (r,i,s) = d[resource]
@@ -418,7 +416,8 @@
 	    pass
     else:
         wordsDictHTML[word] = {}
-    wordsDictHTML[word][resource] = (relevance, id, section)
+    d = wordsDictHTML[word];
+    d[resource] = (relevance, id, section)
     return relevance
     
 def addStringHTML(str, resource, id, section, relevance):
@@ -440,6 +439,8 @@
     str = string.replace(str, "/", " ")
     str = string.replace(str, "*", " ")
     str = string.replace(str, ":", " ")
+    str = string.replace(str, "#", " ")
+    str = string.replace(str, "!", " ")
     str = string.replace(str, "\n", " ")
     str = string.replace(str, "\r", " ")
     str = string.replace(str, "\xc2", " ")
@@ -447,7 +448,14 @@
     l = string.split(str)
     for word in l:
 	if len(word) > 2:
-	    ret = ret + addWordHTML(word, resource, id, section, relevance)
+	    try:
+		r = addWordHTML(word, resource, id, section, relevance)
+		if r <= 0:
+		    print "addWordHTML failed: %s %s" % (word, resource)
+		ret = ret + r
+	    except:
+		print "addWordHTML failed: %s %s %d" % (word, resource, relevance)
+		print sys.exc_type, sys.exc_value
 
     return ret
 
@@ -776,7 +784,7 @@
 	    elif item.type == 'text':
 	        analyzeHTMLText(doc, resource, item, section, id)
 		para = para + 1
-	    elif item.name == 'text':
+	    elif item.name == 'p':
 	        analyzeHTMLPara(doc, resource, item, section, id)
 		para = para + 1
 	    elif item.name == 'pre':
commit	a6287a463c7f9e3c4b6db7a2b12f9c84ad0b2020	[log] [tgz]
author	Daniel Veillard <veillard@src.gnome.org>	Mon Oct 07 13:17:22 2002 +0000
committer	Daniel Veillard <veillard@src.gnome.org>	Mon Oct 07 13:17:22 2002 +0000
tree	6076635ffb2cf5a6770e17586236426d4c6c083b
parent	9b00613d0583e2bca7589257bc32984d5b358a10 [diff] [blame]