blob: 409e90d7e4b38487b07fc584fde281f2e1ba1d3e [file] [log] [blame]
Daniel Veillard3371ff82002-10-01 13:37:48 +00001#!/usr/bin/python -u
2#
3# imports the API description and fills up a database with
4# name relevance to modules, functions or web pages
5#
Daniel Veillard2c77cd72002-10-01 13:54:14 +00006# Operation needed:
7# =================
8#
9# install mysqld, the python wrappers for mysql and libxml2, start mysqld
10# Change the root passwd of mysql:
11# mysqladmin -u root password new_password
12# Create the new database xmlsoft
13# mysqladmin -p create xmlsoft
# Create a database user 'veillard' and give him password access
15# change veillard and abcde with the right user name and passwd
16# mysql -p
17# password:
18# mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
19# IDENTIFIED BY 'abcde' WITH GRANT OPTION;
20#
21# As the user check the access:
22# mysql -p xmlsoft
23# Enter password:
24# Welcome to the MySQL monitor....
25# mysql> use xmlsoft
26# Database changed
27# mysql> quit
28# Bye
29#
# Then run the script in the doc subdir, it will create the symbols and
# word tables and populate them with information extracted from
# the libxml2-api.xml API description, and make them accessible read-only
# by nobody@localhost, the user Apache is expected to run as
34#
35# On the Apache configuration, make sure you have php support enabled
36#
37
Daniel Veillard3371ff82002-10-01 13:37:48 +000038import MySQLdb
39import libxml2
40import sys
41import string
42import os
43
44#
Daniel Veillard141d04b2002-10-06 21:51:18 +000045# We are not interested in parsing errors here
46#
def callback(ctx, str):
    """Discard a libxml2 error report.

    Both arguments are ignored and nothing is returned, so installing this
    function as the error handler silences parser diagnostics.
    """
    pass
# Install the no-op callback defined above as the libxml2 error handler;
# None is passed as the second argument of registerErrorHandler().
libxml2.registerErrorHandler(callback, None)
50
51#
# The dictionary of tables required and the SQL command needed
# to create them
54#
# Maps each required table name to the CREATE TABLE statement that
# createTable() executes when checkTables() finds the table missing.
TABLES={
  # One row per API symbol (unique on name); written by updateSymbol().
  "symbols" : """CREATE TABLE symbols (
           name varchar(255) BINARY NOT NULL,
           module varchar(255) BINARY NOT NULL,
           type varchar(25) NOT NULL,
           descr varchar(255),
           UNIQUE KEY name (name),
           KEY module (module))""",
  # Word -> symbol associations with a relevance score (unique per
  # (name, symbol) pair); written by updateWord().
  "words" : """CREATE TABLE words (
           name varchar(50) BINARY NOT NULL,
           symbol varchar(255) BINARY NOT NULL,
           relevance int,
           KEY name (name),
           KEY symbol (symbol),
           UNIQUE KEY ID (name, symbol))""",
  # Word -> HTML page associations (unique per (name, resource) pair);
  # written by updateWordHTML().
  "wordsHTML" : """CREATE TABLE wordsHTML (
           name varchar(50) BINARY NOT NULL,
           resource varchar(255) BINARY NOT NULL,
           section varchar(255),
           id varchar(50),
           relevance int,
           KEY name (name),
           KEY resource (resource),
           UNIQUE KEY ref (name, resource))""",
  # One row per indexed HTML page with its title; written by addPage().
  "pages" : """CREATE TABLE pages (
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY name (resource))""",
  # Not written by this script; checkTables() grants nobody@localhost
  # INSERT, SELECT and UPDATE on it.
  "Queries" : """CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
           Value varchar(50) NOT NULL,
           Count int(11) NOT NULL,
           UNIQUE KEY id (ID,Value(35)),
           INDEX (ID))""",
}
90
91#
92# The XML API description file to parse
93#
# Path of the XML API description; the main code passes it to loadAPI().
API="libxml2-api.xml"
# Module-wide database connection: None until openMySQL() stores the
# object returned by MySQLdb.connect() here.
DB=None
96
97#########################################################################
98# #
99# MySQL database interfaces #
100# #
101#########################################################################
def createTable(db, name):
    """(Re)create one of the tables described in TABLES.

    Any existing table called `name` is dropped, then the CREATE statement
    stored in TABLES[name] is executed.

    db   -- open database connection (must provide cursor())
    name -- key of the wanted table in TABLES

    Returns the result of the CREATE execute() call, or -1 when an argument
    is None, the table has no definition, or the creation fails.
    """
    if db is None or name is None:
        return -1
    # Check for a definition *before* dropping anything: the name is spliced
    # into the DROP statement, and dropping a table that cannot be recreated
    # afterwards would only lose data.
    if name not in TABLES:
        print("Failed to create table %s" % (name))
        return -1
    c = db.cursor()

    ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
    if ret == 1:
        print("Removed table %s" % (name))
    print("Creating table %s" % (name))
    try:
        ret = c.execute(TABLES[name])
    except Exception:
        print("Failed to create table %s" % (name))
        return -1
    return ret
121
def checkTables(db):
    """Make sure every table listed in TABLES exists, creating missing ones.

    Also tries to grant nobody@localhost read access to xmlsoft.* and
    insert/select/update access to xmlsoft.Queries.

    db -- open database connection (must provide cursor())

    Returns 0, or -1 when db is None.
    """
    if db is None:
        return -1
    c = db.cursor()
    nbtables = c.execute("show tables")
    print("Found %d tables" % (nbtables))
    # Collect the names of the tables that already exist.
    existing = {}
    for row in c.fetchall():
        existing[row[0]] = 1

    for table in TABLES.keys():
        if table not in existing:
            print("table %s missing" % (table))
            createTable(db, table)
    print("checkTables finished")

    # make sure apache can access the tables read-only
    try:
        c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        c.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
    except Exception:
        # Best effort only: a failure here must not abort the indexing run,
        # but it should not pass unnoticed either.
        print("Could not grant access to nobody@localhost: %s %s" %
              sys.exc_info()[:2])
    return 0
151
def openMySQL(db="xmlsoft", passwd=None):
    """Connect to the MySQL database and store the connection in DB.

    db     -- name of the database to open
    passwd -- password; when None it is read from the MySQL_PASS
              environment variable

    Exits the process with status 1 when no password is available.
    Returns -1 if no connection object was obtained, otherwise the result
    of checkTables() on the new connection.
    """
    global DB

    if passwd is None:
        # Only a missing variable is an expected condition here, so look it
        # up explicitly instead of hiding every possible error.
        passwd = os.environ.get("MySQL_PASS")
        if passwd is None:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB)
167
def updateWord(name, symbol, relevance):
    """Store the relevance of word `name` for `symbol` in the words table.

    An INSERT is tried first; if it fails the existing row is UPDATEd
    instead.  The database is opened on demand when DB is still None.

    Returns the result of the successful execute() call, or -1 when there
    is no database, an argument is None, or both statements fail.
    """
    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if name is None or symbol is None:
        return -1

    c = DB.cursor()
    # Values are passed as parameters so that quotes or backslashes in a
    # word can neither break nor alter the statement.
    try:
        ret = c.execute(
            "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
            (name, symbol, relevance))
    except Exception:
        update = "UPDATE words SET relevance = %s where name = %s and symbol = %s"
        params = (relevance, name, symbol)
        try:
            ret = c.execute(update, params)
        except Exception:
            print("Update word (%s, %s, %s) failed command" % (name, symbol, relevance))
            print("%s %r" % (update, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
197
def updateSymbol(name, module, type, desc):
    """Record a symbol in the symbols table and index its own name.

    The symbol name is first registered as a word for itself with relevance
    50 (via updateWord).  The description is reduced to its first sentence
    (text before the first '.'), with single quotes replaced by spaces and
    at most 99 characters kept; anything that is not a string becomes "".
    An INSERT is tried first; if it fails the existing row is UPDATEd.

    Returns the result of the successful execute() call, or -1 when there
    is no database, name/module/type is None, or both statements fail.
    """
    updateWord(name, name, 50)
    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if name is None or module is None or type is None:
        return -1

    # The quote replacement is kept so stored descriptions stay identical to
    # what earlier runs produced, even though parameters no longer need it.
    try:
        desc = desc.replace("'", " ").split(".")[0][0:99]
    except (AttributeError, TypeError):
        desc = ""

    c = DB.cursor()
    # Values are passed as parameters so that quotes or backslashes cannot
    # break or alter the statement.
    try:
        ret = c.execute(
            "INSERT INTO symbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
            (name, module, type, desc))
    except Exception:
        update = "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s"
        params = (module, type, desc, name)
        try:
            ret = c.execute(update, params)
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, module, type))
            print("%s %r" % (update, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
238
def addFunction(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'function'."""
    return updateSymbol(name, module, type='function', desc=desc)

def addMacro(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'macro'."""
    return updateSymbol(name, module, type='macro', desc=desc)

def addEnum(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'enum'."""
    return updateSymbol(name, module, type='enum', desc=desc)

def addStruct(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'struct'."""
    return updateSymbol(name, module, type='struct', desc=desc)

def addConst(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'const'."""
    return updateSymbol(name, module, type='const', desc=desc)

def addType(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'type'."""
    return updateSymbol(name, module, type='type', desc=desc)

def addFunctype(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'functype'."""
    return updateSymbol(name, module, type='functype', desc=desc)
259
def addPage(resource, title):
    """Record an HTML page and its title in the pages table.

    An INSERT is tried first; if it fails the existing row for `resource`
    gets its title UPDATEd.  The database is opened on demand when DB is
    still None.

    Returns the result of the successful execute() call, or -1 when there
    is no database, resource is None, or both statements fail.
    """
    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if resource is None:
        return -1

    c = DB.cursor()
    # Values are passed as parameters: page titles routinely contain quotes.
    try:
        ret = c.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        update = "UPDATE pages SET title=%s WHERE resource=%s"
        params = (title, resource)
        try:
            ret = c.execute(update, params)
        except Exception:
            # Report the values this function actually has; the previous
            # message referenced names that do not exist in this scope.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("%s %r" % (update, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
287
def updateWordHTML(name, resource, desc, id, relevance):
    """Store the relevance of word `name` for an HTML page in wordsHTML.

    name      -- the word
    resource  -- the page the word was found in
    desc      -- section title; None becomes "", single quotes are replaced
                 by spaces and at most 99 characters are kept
    id        -- anchor of the section; None becomes ""
    relevance -- score stored with the association

    An INSERT is tried first; if it fails the existing row is UPDATEd.
    Returns the result of the successful execute() call, or -1 when there
    is no database, name or resource is None, or both statements fail.
    """
    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if name is None or resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        # The quote replacement is kept so stored section titles stay
        # identical to what earlier runs produced.
        try:
            desc = desc.replace("'", " ")[0:99]
        except (AttributeError, TypeError):
            desc = ""

    c = DB.cursor()
    # Values are passed as parameters so that quotes or backslashes cannot
    # break or alter the statement; relevance is sent as a number.
    try:
        ret = c.execute(
            "INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES (%s, %s, %s, %s, %s)",
            (name, resource, desc, id, relevance))
    except Exception:
        update = "UPDATE wordsHTML SET section=%s, id=%s, relevance=%s where name=%s and resource=%s"
        params = (desc, id, relevance, name, resource)
        try:
            ret = c.execute(update, params)
        except Exception:
            print("Update wordHTML (%s, %s, %s) failed command" % (name, resource, relevance))
            print("%s %r" % (update, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
327
Daniel Veillard3371ff82002-10-01 13:37:48 +0000328#########################################################################
329# #
#          Word dictionary and analysis routines                        #
331# #
332#########################################################################
333
# word -> {(module, symbol): relevance}, filled by addWord(); a word's entry
# is replaced by None once it has more than 500 associations.
wordsDict = {}
# word -> {resource: (relevance, id, section)}, filled by addWordHTML().
wordsDictHTML = {}
Daniel Veillard3371ff82002-10-01 13:37:48 +0000336
def splitIdentifier(str):
    """Split an identifier into its lower-cased component words.

    A word starts at any ASCII letter and extends over a run of upper-case
    letters followed by a run of lower-case letters; digits directly after
    a word are dropped and every other character is a separator.

    For example "xmlParseDoc" gives ["xml", "parse", "doc"] and
    "UTF8Toisolat1" gives ["utf", "toisolat"].
    """
    ret = []
    pos = 0
    end = len(str)
    # Scan by index instead of re-slicing the string for every character.
    while pos < end:
        cur = str[pos].lower()
        pos += 1
        if cur < 'a' or cur > 'z':
            continue
        start = pos
        while pos < end and 'A' <= str[pos] <= 'Z':
            pos += 1
        while pos < end and 'a' <= str[pos] <= 'z':
            pos += 1
        cur = cur + str[start:pos].lower()
        # Trailing digits belong to no word.
        while pos < end and '0' <= str[pos] <= '9':
            pos += 1
        ret.append(cur)
    return ret
354
def addWord(word, module, symbol, relevance):
    """Accumulate the relevance of `word` for (module, symbol) in wordsDict.

    Returns -1 when the word is None or shorter than 3 characters, or when
    module or symbol is None.  Returns 0 when the word has been discarded:
    a word that already has more than 500 associations is replaced by None
    and ignored from then on.  Otherwise returns the accumulated relevance
    now stored for the association.
    """
    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    key = (module, symbol)
    if word in wordsDict:
        refs = wordsDict[word]
        if refs is None:
            return 0
        if len(refs) > 500:
            # Too common to be useful: drop the word for good.
            wordsDict[word] = None
            return 0
        relevance = relevance + refs.get(key, 0)
    else:
        refs = wordsDict[word] = {}
    refs[key] = relevance
    return relevance
377
def addString(str, module, symbol, relevance):
    """Index every word of a text for (module, symbol) through addWord().

    The characters . , ' " ; - are treated as separators and words of
    fewer than 3 characters are skipped.

    Returns -1 when the text is None or shorter than 3 characters,
    otherwise the sum of the values returned by addWord().
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for sep in ".,'\";-":
        str = str.replace(sep, " ")
    for word in str.split():
        if len(word) > 2:
            # Use the caller's relevance; it used to be ignored in favour
            # of a fixed 5.
            ret = ret + addWord(word, module, symbol, relevance)

    return ret
394
def addWordHTML(word, resource, id, section, relevance):
    """Accumulate the relevance of `word` for an HTML page in wordsDictHTML.

    When the word is already recorded for `resource`, the relevance is
    added to the stored one and the stored id and section are kept unless
    they are None.

    Returns -1 when the word is None or shorter than 3 characters, or when
    resource or section is None; 0 when the word's entry is None (skipped);
    otherwise the accumulated relevance now stored.
    """
    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1

    if word in wordsDictHTML:
        refs = wordsDictHTML[word]
        if refs is None:
            print("skipped %s" % (word))
            return 0
        if resource in refs:
            (r, i, s) = refs[resource]
            # The first anchor and section recorded for the page win.
            if i is not None:
                id = i
            if s is not None:
                section = s
            relevance = relevance + r
    else:
        refs = wordsDictHTML[word] = {}
    refs[resource] = (relevance, id, section)
    return relevance
422
def addStringHTML(str, resource, id, section, relevance):
    """Index every word of an HTML text fragment through addWordHTML().

    Punctuation, brackets, line breaks and the characters "\\xc2" and
    "\\xa0" are treated as separators; words of fewer than 3 characters
    are skipped.  A word that addWordHTML() rejects or that raises is
    reported on stdout and indexing continues with the next word.

    Returns -1 when the text is None or shorter than 3 characters,
    otherwise the sum of the values returned by addWordHTML().
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for sep in (".", ",", "'", '"', ";", "-", "(", ")", "{", "}", "<", ">",
                "/", "*", ":", "#", "!", "\n", "\r", "\xc2", "\xa0"):
        str = str.replace(sep, " ")
    for word in str.split():
        if len(word) > 2:
            try:
                r = addWordHTML(word, resource, id, section, relevance)
                if r <= 0:
                    print("addWordHTML failed: %s %s" % (word, resource))
                ret = ret + r
            except Exception:
                print("addWordHTML failed: %s %s %d" % (word, resource, relevance))
                print("%s %s" % sys.exc_info()[:2])

    return ret
461
Daniel Veillard3371ff82002-10-01 13:37:48 +0000462
463#########################################################################
464# #
465# XML API description analysis #
466# #
467#########################################################################
468
def loadAPI(filename):
    """Parse `filename` with libxml2.parseFile() and return the document.

    A "loaded ..." line is printed once parseFile() has returned.
    """
    parsed = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return parsed
473
def foundExport(file, symbol):
    """Register `symbol`, exported by `file`, as a function and index it.

    Each component word of the symbol name is added with relevance 10.
    Returns 1, or 0 when either argument is None.
    """
    if file is None or symbol is None:
        return 0
    addFunction(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)
    return 1
484
def analyzeAPIFile(top):
    """Process the children of one <file> element of the API description.

    Every <exports> child is handed to foundExport() together with the
    file's "name" attribute; text nodes are skipped and any other element
    is reported on stdout.

    Returns the sum of the values returned by foundExport().
    """
    count = 0
    name = top.prop("name")
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "exports":
                count = count + foundExport(name, cur.prop("symbol"))
            else:
                # Report both the offending element and the enclosing file;
                # the message has two placeholders.
                print("unexpected element %s in API doc <file name='%s'>" %
                      (cur.name, name))
        cur = cur.next
    return count
499
def analyzeAPIFiles(top):
    """Process the children of the <files> element of the API description.

    Every <file> child is handed to analyzeAPIFile(); text nodes are
    skipped and any other element is reported on stdout.

    Returns the sum of the values returned by analyzeAPIFile().
    """
    total = 0
    node = top.children
    while node is not None:
        if node.type != 'text':
            if node.name == "file":
                total = total + analyzeAPIFile(node)
            else:
                print("unexpected element %s in API doc <files>" % (node.name))
        node = node.next
    return total
514
def analyzeAPIEnum(top):
    """Register the enum described by element `top` and index its name.

    The "file" and "name" attributes give the module and the symbol; each
    component word of the symbol is added with relevance 10.
    Returns 1, or 0 when either attribute is missing.
    """
    module = top.prop("file")
    if module is not None:
        symbol = top.prop("name")
        if symbol is not None:
            addEnum(symbol, module)
            for part in splitIdentifier(symbol):
                addWord(part, module, symbol, 10)
            return 1
    return 0
529
def analyzeAPIConst(top):
    """Register the constant described by element `top` and index its name.

    The "file" and "name" attributes give the module and the symbol; each
    component word of the symbol is added with relevance 10.
    Returns 1, or 0 when either attribute is missing.
    """
    module = top.prop("file")
    if module is not None:
        symbol = top.prop("name")
        if symbol is not None:
            addConst(symbol, module)
            for part in splitIdentifier(symbol):
                addWord(part, module, symbol, 10)
            return 1
    return 0
544
def analyzeAPIType(top):
    """Register the type described by element `top` and index its name.

    The "file" and "name" attributes give the module and the symbol; each
    component word of the symbol is added with relevance 10.
    Returns 1, or 0 when either attribute is missing.
    """
    module = top.prop("file")
    if module is not None:
        symbol = top.prop("name")
        if symbol is not None:
            addType(symbol, module)
            for part in splitIdentifier(symbol):
                addWord(part, module, symbol, 10)
            return 1
    return 0
558
def analyzeAPIFunctype(top):
    """Register the function type described by `top` and index its name.

    The "file" and "name" attributes give the module and the symbol; each
    component word of the symbol is added with relevance 10.
    Returns 1, or 0 when either attribute is missing.
    """
    module = top.prop("file")
    if module is not None:
        symbol = top.prop("name")
        if symbol is not None:
            addFunctype(symbol, module)
            for part in splitIdentifier(symbol):
                addWord(part, module, symbol, 10)
            return 1
    return 0
572
def analyzeAPIStruct(top):
    """Register the struct described by element `top` and index it.

    The "file" and "name" attributes give the module and the symbol.  Each
    component word of the symbol is added with relevance 10 and, when an
    "info" attribute is present, each of its words longer than 2
    characters is added with relevance 5.
    Returns 1, or 0 when the "file" or "name" attribute is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addStruct(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    info = top.prop("info")
    if info is not None:
        for token in info.split():
            if len(token) > 2:
                addWord(token, module, symbol, 5)
    return 1
593
def analyzeAPIMacro(top):
    """Register the macro described by element `top` and index it.

    The "file" and "name" attributes give the module and the symbol; the
    description is the content of the first <info> child element.  Each
    component word of the symbol is added with relevance 10 and each word
    of the description longer than 2 characters with relevance 5.

    Returns 1 when a description was found.  Returns 0 when the "file" or
    "name" attribute is missing, and also when there is no <info> child
    (the macro is still registered in that case, and a message printed).
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    # Look for the first non-text child called "info".
    info = None
    node = top.children
    while node is not None:
        if node.type != 'text' and node.name == "info":
            info = node.content
            break
        node = node.next

    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    if info is None:
        addMacro(symbol, module)
        print("Macro %s description has no <info>" % (symbol))
        return 0

    addMacro(symbol, module, info)
    for token in info.split():
        if len(token) > 2:
            addWord(token, module, symbol, 5)
    return 1
628
def analyzeAPIFunction(top):
    """Register the function described by element `top` and index it.

    The "file" and "name" attributes give the module and the symbol.  The
    child elements are used as follows:
      <info>   -- its content is the function description
      <return> -- its "info" attribute is indexed with relevance 7
      <arg>    -- its "info" attribute is indexed with relevance 5 and its
                  "name" attribute is added as a word with relevance 7
    The description, when present, is indexed with relevance 5 and each
    component word of the symbol name is added with relevance 10.

    Returns 1, or 0 when the "file" or "name" attribute is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    info = None
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "info":
                info = cur.content
            elif cur.name == "return":
                rinfo = cur.prop("info")
                if rinfo is not None:
                    addString(rinfo, file, symbol, 7)
            elif cur.name == "arg":
                ainfo = cur.prop("info")
                # Test the argument's own description, not the one of the
                # <return> element (which may not even have been seen yet).
                if ainfo is not None:
                    addString(ainfo, file, symbol, 5)
                name = cur.prop("name")
                if name is not None:
                    addWord(name, file, symbol, 7)
        cur = cur.next
    if info is None:
        print("Function %s description has no <info>" % (symbol))
        addFunction(symbol, file, "")
    else:
        addFunction(symbol, file, info)
        addString(info, file, symbol, 5)

    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    return 1
669
def analyzeAPISymbols(top):
    """Process the children of the <symbols> element of the API description.

    Each child element is dispatched on its name to the matching
    analyzeAPI* function (macro, function, const, typedef, struct, enum,
    functype); text nodes are skipped and any other element is reported
    on stdout.

    Returns the sum of the values returned by the analyzeAPI* functions.
    """
    count = 0
    cur = top.children

    while cur is not None:
        if cur.type != 'text':
            if cur.name == "macro":
                count = count + analyzeAPIMacro(cur)
            elif cur.name == "function":
                count = count + analyzeAPIFunction(cur)
            elif cur.name == "const":
                count = count + analyzeAPIConst(cur)
            elif cur.name == "typedef":
                count = count + analyzeAPIType(cur)
            elif cur.name == "struct":
                count = count + analyzeAPIStruct(cur)
            elif cur.name == "enum":
                count = count + analyzeAPIEnum(cur)
            elif cur.name == "functype":
                count = count + analyzeAPIFunctype(cur)
            else:
                # Name the element really being scanned (<symbols>).
                print("unexpected element %s in API doc <symbols>" % (cur.name))
        cur = cur.next
    return count
696
def analyzeAPI(doc):
    """Walk a parsed API description and index its <symbols> section.

    The root element must be called "api".  Among its children, <symbols>
    is handed to analyzeAPISymbols(), <files> and text nodes are skipped,
    and any other element is reported on stdout.

    Returns the sum of the values returned by analyzeAPISymbols(), or -1
    when doc is None or the root element has another name.
    """
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1

    total = 0
    node = root.children
    while node is not None:
        if node.type != 'text':
            tag = node.name
            if tag == "symbols":
                total = total + analyzeAPISymbols(node)
            elif tag != "files":
                # <files> is accepted but not analysed (analyzeAPIFiles is
                # deliberately not called here).
                print("unexpected element %s in API doc" % (tag))
        node = node.next
    return total
719
720#########################################################################
721# #
Daniel Veillard141d04b2002-10-06 21:51:18 +0000722# Web pages parsing and analysis #
723# #
724#########################################################################
725
726import glob
727
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of node `p` for page `resource`.

    The node's content is passed to addStringHTML() with relevance 5,
    together with the current section title and anchor id.  `doc` is not
    used.

    Returns the value of addStringHTML(), or -1 if reading the content or
    indexing it raises an error.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Ordinary failures only: interrupts and exit requests propagate.
        return -1
736
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of node `p` for page `resource`.

    The node's content is passed to addStringHTML() with relevance 5,
    together with the current section title and anchor id.  `doc` is not
    used.

    Returns the value of addStringHTML(), or -1 if reading the content or
    indexing it raises an error.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Ordinary failures only: interrupts and exit requests propagate.
        return -1
745
def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the content of node `p` for page `resource`.

    The node's content is passed to addStringHTML() with relevance 5,
    together with the current section title and anchor id.  `doc` is not
    used.

    Returns the value of addStringHTML(), or -1 if reading the content or
    indexing it raises an error.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Ordinary failures only: interrupts and exit requests propagate.
        return -1
754
# NOTE(review): this five-argument analyzeHTML is rebound by the
# two-argument analyzeHTML(doc, resource) defined right after it, so it can
# never be called under this name -- confirm whether it can be removed.
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of node `p` for page `resource`.

    The node's content is passed to addStringHTML() with relevance 5,
    together with the current section title and anchor id.  `doc` is not
    used.

    Returns the value of addStringHTML(), or -1 if reading the content or
    indexing it raises an error.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Ordinary failures only: interrupts and exit requests propagate.
        return -1
763
def analyzeHTML(doc, resource):
    """Index one parsed HTML page.

    The page is registered with addPage() under the content of its
    //head/title element, or "Page <resource>" when that cannot be read.
    The h1/h2/h3 elements and the text nodes of the document are then
    walked in the order returned by the XPath query: a heading sets the
    current section title (and the anchor, from its "id" or "name"
    attribute), a text node is indexed under the current section.

    Returns the number of nodes handed to the analyzeHTML* indexers.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    # The XPath context is not released automatically; free it whatever
    # happens below.
    try:
        try:
            res = ctxt.xpathEval("//head/title")
            title = res[0].content
        except Exception:
            title = "Page %s" % (resource)
        addPage(resource, title)
        try:
            items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
            section = title
            id = ""
            for item in items:
                if item.name in ('h1', 'h2', 'h3'):
                    section = item.content
                    # NOTE(review): a heading without id/name keeps the
                    # anchor of the previous heading -- confirm intended.
                    if item.prop("id"):
                        id = item.prop("id")
                    elif item.prop("name"):
                        id = item.prop("name")
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, id)
                    para = para + 1
                # The query above selects only headings and text nodes, so
                # the two branches below are not reached with it.
                elif item.name == 'p':
                    analyzeHTMLPara(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, id)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource, item.name))
        except Exception:
            print("Page %s: problem analyzing" % (resource))
            print("%s %s" % sys.exc_info()[:2])
    finally:
        ctxt.xpathFreeContext()

    return para
800
def analyzeHTMLPages():
    """Parse and index the HTML pages found under the current directory.

    The files matching *.html and tutorial/*.html are processed, except
    those whose name starts with "API" and "xml.html".  A page that cannot
    be parsed or analysed is reported on stdout and skipped.

    Returns the number of pages processed successfully.
    """
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html[0:3] == "API":
            continue
        if html == "xml.html":
            continue
        try:
            doc = libxml2.htmlParseFile(html, None)
            # Release the parsed tree once the page has been analysed,
            # whether or not the analysis succeeded.
            try:
                res = analyzeHTML(doc, html)
            finally:
                doc.freeDoc()
            print("Parsed %s : %d paragraphs" % (html, res))
            ret = ret + 1
        except Exception:
            print("could not parse %s" % (html))
    return ret
817
818#########################################################################
819# #
Daniel Veillard3371ff82002-10-01 13:37:48 +0000820# Main code: open the DB, the API XML and analyze it #
821# #
822#########################################################################
# Open the database first: nothing can be stored without it.
try:
    openMySQL()
except Exception:
    print("Failed to open the database")
    print("%s %s" % sys.exc_info()[:2])
    sys.exit(1)

# Index the HTML pages in memory, then store every word/page association.
ret = analyzeHTMLPages()
print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret))

i = 0
skipped = 0
for word in wordsDictHTML.keys():
    refs = wordsDictHTML[word]
    if refs is None:
        skipped = skipped + 1
        continue
    for resource in refs.keys():
        (relevance, id, section) = refs[resource]
        updateWordHTML(word, resource, section, id, relevance)
        i = i + 1

print("Found %d associations in HTML pages" % (i))

# Index the XML API description in memory.
try:
    doc = loadAPI(API)
    # Free the document even when the analysis fails part-way.
    try:
        ret = analyzeAPI(doc)
        print("Analyzed %d blocs" % (ret))
    finally:
        doc.freeDoc()
except Exception:
    print("Failed to parse and analyze %s" % (API))
    print("%s %s" % sys.exc_info()[:2])
    sys.exit(1)

# Store every word/symbol association; words whose entry is None were
# discarded by addWord() and are only counted.
print("Indexed %d words" % (len(wordsDict)))
i = 0
skipped = 0
for word in wordsDict.keys():
    refs = wordsDict[word]
    if refs is None:
        skipped = skipped + 1
        continue
    for (module, symbol) in refs.keys():
        updateWord(word, symbol, refs[(module, symbol)])
        i = i + 1

print("Found %d associations, skipped %d words" % (i, skipped))
869print "Found %d associations, skipped %d words" % (i, skipped)