blob: 128d11a71b7315a0f19a56119b0c87888ba1a083 [file] [log] [blame]
Daniel Veillard3371ff82002-10-01 13:37:48 +00001#!/usr/bin/python -u
2#
3# imports the API description and fills up a database with
4# name relevance to modules, functions or web pages
5#
Daniel Veillard2c77cd72002-10-01 13:54:14 +00006# Operation needed:
7# =================
8#
9# install mysqld, the python wrappers for mysql and libxml2, start mysqld
10# Change the root passwd of mysql:
11# mysqladmin -u root password new_password
12# Create the new database xmlsoft
13# mysqladmin -p create xmlsoft
# Create a database user 'veillard' and give him password access
15# change veillard and abcde with the right user name and passwd
16# mysql -p
17# password:
18# mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
19# IDENTIFIED BY 'abcde' WITH GRANT OPTION;
20#
21# As the user check the access:
22# mysql -p xmlsoft
23# Enter password:
24# Welcome to the MySQL monitor....
25# mysql> use xmlsoft
26# Database changed
27# mysql> quit
28# Bye
29#
# Then run the script in the doc subdir, it will create the symbols and
# word tables and populate them with information extracted from
# the libxml2-api.xml API description, and make them accessible read-only
# by nobody@localhost, the user expected to be Apache's one
34#
35# On the Apache configuration, make sure you have php support enabled
36#
37
Daniel Veillard3371ff82002-10-01 13:37:48 +000038import MySQLdb
39import libxml2
40import sys
41import string
42import os
43
44#
Daniel Veillard141d04b2002-10-06 21:51:18 +000045# We are not interested in parsing errors here
46#
def callback(ctx, str):
    """Error handler that ignores whatever libxml2 reports.

    ctx -- the context value given at registration time (unused)
    str -- the error message (unused)
    """
    return None


# We are not interested in parsing errors here: install the no-op handler.
libxml2.registerErrorHandler(callback, None)
50
51#
# The dictionary of the required tables and the SQL commands needed
# to create them
54#
# Maps each required table name to the CREATE TABLE statement building it.
# createTable() executes these statements verbatim.
TABLES = {
    "symbols": """CREATE TABLE symbols (
        name varchar(255) BINARY NOT NULL,
        module varchar(255) BINARY NOT NULL,
        type varchar(25) NOT NULL,
        descr varchar(255),
        UNIQUE KEY name (name),
        KEY module (module))""",
    "words": """CREATE TABLE words (
        name varchar(50) BINARY NOT NULL,
        symbol varchar(255) BINARY NOT NULL,
        relevance int,
        KEY name (name),
        KEY symbol (symbol),
        UNIQUE KEY ID (name, symbol))""",
    "wordsHTML": """CREATE TABLE wordsHTML (
        name varchar(50) BINARY NOT NULL,
        resource varchar(255) BINARY NOT NULL,
        section varchar(255),
        id varchar(50),
        relevance int,
        KEY name (name),
        KEY resource (resource),
        UNIQUE KEY ref (name, resource))""",
    "wordsArchive": """CREATE TABLE wordsArchive (
        name varchar(50) BINARY NOT NULL,
        ID int(11) NOT NULL,
        relevance int,
        KEY name (name),
        UNIQUE KEY ref (name, ID))""",
    "pages": """CREATE TABLE pages (
        resource varchar(255) BINARY NOT NULL,
        title varchar(255) BINARY NOT NULL,
        UNIQUE KEY name (resource))""",
    "archives": """CREATE TABLE archives (
        ID int(11) NOT NULL auto_increment,
        resource varchar(255) BINARY NOT NULL,
        title varchar(255) BINARY NOT NULL,
        UNIQUE KEY id (ID,resource(255)),
        INDEX (ID),
        INDEX (resource))""",
    "Queries": """CREATE TABLE Queries (
        ID int(11) NOT NULL auto_increment,
        Value varchar(50) NOT NULL,
        Count int(11) NOT NULL,
        UNIQUE KEY id (ID,Value(35)),
        INDEX (ID))""",
}
103
104#
105# The XML API description file to parse
106#
# Name of the XML API description to parse, and the shared database
# connection (None until openMySQL() has connected).
API = "libxml2-api.xml"
DB = None
109
110#########################################################################
111# #
112# MySQL database interfaces #
113# #
114#########################################################################
def createTable(db, name):
    """Drop and recreate one of the tables described in TABLES.

    db   -- an open database connection providing cursor()
    name -- key of the table in TABLES

    Returns the result of the CREATE statement's execute() call, or -1
    when an argument is missing or the creation fails.
    """
    global TABLES

    if db is None or name is None:
        return -1
    c = db.cursor()

    ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
    if ret == 1:
        print("Removed table %s" % (name))
    print("Creating table %s" % (name))
    try:
        ret = c.execute(TABLES[name])
    except Exception:
        # covers both an unknown table name and a database error
        print("Failed to create table %s" % (name))
        return -1
    return ret
134
def checkTables(db):
    """Create any table of TABLES that the database does not have yet.

    Once the tables exist, SELECT access on xmlsoft.* and
    INSERT/SELECT/UPDATE access on xmlsoft.Queries are granted to
    nobody@localhost; a failure of those grants is reported but is
    not fatal.

    db -- an open database connection providing cursor()

    Returns 0, or -1 when no connection is given.
    """
    global TABLES

    if db is None:
        return -1
    c = db.cursor()
    nbtables = c.execute("show tables")
    print("Found %d tables" % (nbtables))
    tables = {}
    for row in c.fetchall():
        tables[row[0]] = {}

    for table in TABLES.keys():
        if table not in tables:
            print("table %s missing" % (table))
            createTable(db, table)
    print("checkTables finished")

    # make sure apache can access the tables read-only
    try:
        c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        c.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
    except Exception:
        # best effort only: keep going, but do not hide the problem
        print("Warning: failed to grant access to nobody@localhost: %s" %
              (sys.exc_info()[1]))
    return 0
164
def openMySQL(db="xmlsoft", passwd=None):
    """Connect to the MySQL database and store the connection in DB.

    db     -- name of the database to open
    passwd -- password to use; when None it is read from the MySQL_PASS
              environment variable, and the program exits with status 1
              if that variable is not set

    Returns the result of checkTables() on the new connection, or -1 if
    no connection object was obtained.
    """
    global DB

    if passwd is None:
        try:
            passwd = os.environ["MySQL_PASS"]
        except KeyError:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB)
180
def updateWord(name, symbol, relevance):
    """Store the relevance of word `name` for `symbol` in the words table.

    An INSERT is attempted first; if it fails (typically because the
    (name, symbol) pair already exists) the row is UPDATEd instead.
    Values are passed as query parameters so that the driver quotes them.

    Returns the execute() result, or -1 on missing arguments, missing
    database connection or failure of both statements.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or symbol is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
            (name, symbol, relevance))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE words SET relevance = %s where name = %s and symbol = %s",
                (relevance, name, symbol))
        except Exception:
            print("Update word (%s, %s, %s) failed command" % (name, symbol, relevance))
            print("UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'" % (relevance, name, symbol))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
210
def updateSymbol(name, module, type, desc):
    """Record a symbol, its module, its kind and a short description.

    The symbol name is first indexed as a word for itself with relevance
    50.  The description is reduced to its first sentence (text before
    the first '.'), with single quotes blanked, truncated to 99
    characters; anything that is not a string becomes "".

    The row is INSERTed into the symbols table, falling back to an
    UPDATE when the insert fails.  Values are passed as query parameters.

    Returns the execute() result, or -1 on missing arguments, missing
    database connection or failure of both statements.
    """
    global DB

    updateWord(name, name, 50)
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or module is None or type is None:
        return -1

    try:
        desc = desc.replace("'", " ")
        desc = desc.split(".")[0]
        desc = desc[0:99]
    except Exception:
        desc = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO symbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
            (name, module, type, desc))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s",
                (module, type, desc, name))
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, module, type))
            print("""UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
251
def addFunction(name, module, desc = ""):
    """Register `name` of `module` as a symbol of kind 'function'."""
    kind = 'function'
    return updateSymbol(name, module, kind, desc)
254
def addMacro(name, module, desc = ""):
    """Register `name` of `module` as a symbol of kind 'macro'."""
    kind = 'macro'
    return updateSymbol(name, module, kind, desc)
257
def addEnum(name, module, desc = ""):
    """Register `name` of `module` as a symbol of kind 'enum'."""
    kind = 'enum'
    return updateSymbol(name, module, kind, desc)
260
def addStruct(name, module, desc = ""):
    """Register `name` of `module` as a symbol of kind 'struct'."""
    kind = 'struct'
    return updateSymbol(name, module, kind, desc)
263
def addConst(name, module, desc = ""):
    """Register `name` of `module` as a symbol of kind 'const'."""
    kind = 'const'
    return updateSymbol(name, module, kind, desc)
266
def addType(name, module, desc = ""):
    """Register `name` of `module` as a symbol of kind 'type'."""
    kind = 'type'
    return updateSymbol(name, module, kind, desc)
269
def addFunctype(name, module, desc = ""):
    """Register `name` of `module` as a symbol of kind 'functype'."""
    kind = 'functype'
    return updateSymbol(name, module, kind, desc)
272
def addPage(resource, title):
    """Record the title of the page `resource` in the pages table.

    An INSERT is attempted first, then an UPDATE of the existing row if
    the insert fails.  Values are passed as query parameters.

    Returns the execute() result, or -1 on a missing resource, missing
    database connection or failure of both statements.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if resource is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE pages SET title=%s WHERE resource=%s",
                (title, resource))
        except Exception:
            # report with the values this function actually has
            print("Update page (%s, %s) failed command" % (resource, title))
            print("""UPDATE pages SET title='%s' WHERE resource='%s'""" % (title, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
300
def updateWordHTML(name, resource, desc, id, relevance):
    """Store the relevance of word `name` for the HTML page `resource`.

    desc -- section title; single quotes are blanked and the text is
            truncated to 99 characters (None or a non-string gives "")
    id   -- anchor id within the page (None gives "")

    The row is INSERTed into wordsHTML, falling back to an UPDATE when
    the insert fails.  Values are passed as query parameters.

    Returns the execute() result, or -1 on missing arguments, missing
    database connection or failure of both statements.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        try:
            desc = desc.replace("'", " ")
            desc = desc[0:99]
        except Exception:
            desc = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES (%s, %s, %s, %s, %s)",
            (name, resource, desc, id, relevance))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE wordsHTML SET section=%s, id=%s, relevance=%s where name=%s and resource=%s",
                (desc, id, relevance, name, resource))
        except Exception:
            print("Update symbol (%s, %s, %d) failed command" % (name, resource, relevance))
            print("""UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
Daniel Veillard01e87d22002-10-08 16:55:06 +0000340
def checkXMLMsgArchive(url):
    """Look up the ID under which `url` is stored in the archives table.

    The URL is passed as a query parameter, not pasted into the SQL.

    Returns the ID of the first matching row, or -1 when the URL is
    unknown, missing, or the lookup fails.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1

    c = DB.cursor()
    try:
        c.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = c.fetchone()
    except Exception:
        return -1
    if row is None:
        return -1

    return row[0]
362
def addXMLMsgArchive(url, title):
    """Insert a message URL into the archives table and return its ID.

    title -- message title; single quotes are blanked and the text is
             truncated to 99 characters (None gives "")

    After the INSERT the ID is read back with a SELECT on the URL.
    Values are passed as query parameters.

    Returns the ID as an int, or -1 when the URL is missing, there is
    no database connection, or a statement fails.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1
    if title is None:
        title = ""
    else:
        title = title.replace("'", " ")
        title = title[0:99]

    c = DB.cursor()
    cmd = "INSERT INTO archives (resource, title) VALUES (%s, %s)"
    try:
        c.execute(cmd, (url, title))
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        c.execute(cmd, (url,))
        row = c.fetchone()
        if row is None:
            print("addXMLMsgArchive failed to get the ID: %s" % (url))
            return -1
    except Exception:
        print("addXMLMsgArchive failed command: %s" % (cmd))
        return -1

    return int(row[0])
393
def updateWordArchive(name, id, relevance):
    """Store the relevance of word `name` for the archived message `id`.

    The row is INSERTed into wordsArchive, falling back to an UPDATE
    when the insert fails.  Values are passed as query parameters.

    Returns the execute() result, or -1 on missing arguments, missing
    database connection or failure of both statements.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or id is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO wordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
            (name, id, relevance))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE wordsArchive SET relevance=%s where name=%s and ID=%s",
                (relevance, name, id))
        except Exception:
            print("Update word archive (%s, %d, %d) failed command" % (name, id, relevance))
            print("""UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" % (relevance, name, id))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
423
Daniel Veillard3371ff82002-10-01 13:37:48 +0000424#########################################################################
425# #
426# Word dictionnary and analysis routines #
427# #
428#########################################################################
429
Daniel Veillard01e87d22002-10-08 16:55:06 +0000430#
# top 100 English words, without those shorter than 3 letters, plus our own set
432#
# Words that are never indexed; only the keys matter, every value is 0.
dropWords = {}
for _dropped in """
    the this can man had him only
    and not been other even are was
    new most but when some made from
    who could after that will time also
    have more these did two many
    they may before for which out then
    must one through with you said
    first back were what any years his
    her where all its now much she
    about such your there into like
    would than our well their them over
    down
    net www bad Okay bin cur
""".split():
    dropWords[_dropped] = 0
del _dropped
448
# In-memory indexes filled by the analysis routines:
#   wordsDict:        word -> {(module, symbol): relevance}
#   wordsDictHTML:    word -> {resource: (relevance, id, section)}
#   wordsDictArchive: word -> {archive ID: relevance}
wordsDict = dict()
wordsDictHTML = dict()
wordsDictArchive = dict()
452
def cleanupWordsString(str):
    """Return `str` with punctuation and line breaks turned into spaces.

    Each listed character is replaced by a single space so that the
    result can be split into words on whitespace.
    """
    separators = (".", "!", "?", ",", "'", '"', ";", "-", "(", ")",
                  "{", "}", "<", ">", "=", "/", "*", ":", "#", "\\",
                  "\n", "\r", "\xc2", "\xa0")
    for mark in separators:
        str = str.replace(mark, " ")
    return str
479
def cleanupDescrString(str):
    """Return `str` as a single line with normalized spacing.

    Line breaks and the "\\xc2" / "\\xa0" characters are turned into
    spaces, then every run of whitespace is collapsed to one space and
    leading/trailing whitespace is dropped.
    """
    for mark in ("\n", "\r", "\xc2", "\xa0"):
        str = str.replace(mark, " ")
    # join the split words, not the characters of the string itself
    return " ".join(str.split())
Daniel Veillard3371ff82002-10-01 13:37:48 +0000488
def splitIdentifier(str):
    """Split an identifier into its lowercase component words.

    A word starts at a letter, continues with a run of uppercase letters
    (lowercased in the result) followed by a run of lowercase letters;
    digits directly after the word are dropped.  Every other character
    acts as a separator.
    """
    words = []
    pos = 0
    end = len(str)
    while pos < end:
        first = str[pos].lower()
        pos = pos + 1
        if not ('a' <= first <= 'z'):
            continue
        upper_start = pos
        while pos < end and 'A' <= str[pos] <= 'Z':
            pos = pos + 1
        lower_start = pos
        while pos < end and 'a' <= str[pos] <= 'z':
            pos = pos + 1
        word = first + str[upper_start:lower_start].lower() + str[lower_start:pos]
        # trailing digits belong to the word but are not kept
        while pos < end and '0' <= str[pos] <= '9':
            pos = pos + 1
        words.append(word)
    return words
506
def addWord(word, module, symbol, relevance):
    """Add `relevance` to the score of `word` for (module, symbol).

    Words shorter than 3 characters are rejected; words listed in
    dropWords or whose first character code is above 0x80 are ignored.
    A word that already has more than 500 references is disabled (its
    entry becomes None) and ignored from then on.

    Returns the accumulated relevance, 0 when the word is ignored, or
    -1 when an argument is missing or the word is too short.
    """
    global wordsDict

    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    key = (module, symbol)
    if word in wordsDict:
        d = wordsDict[word]
        if d is None:
            return 0
        if len(d) > 500:
            wordsDict[word] = None
            return 0
        relevance = relevance + d.get(key, 0)
    else:
        d = wordsDict[word] = {}
    d[key] = relevance
    return relevance
534
def addString(str, module, symbol, relevance):
    """Index every word of `str` for (module, symbol).

    The text is cleaned with cleanupWordsString(), split on whitespace,
    and each word longer than 2 characters is passed to addWord() with
    the caller's `relevance`.

    Returns the sum of the addWord() results, or -1 when the string is
    missing or shorter than 3 characters.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            # honour the requested weight instead of a fixed value
            ret = ret + addWord(word, module, symbol, relevance)

    return ret
546
def addWordHTML(word, resource, id, section, relevance):
    """Add `relevance` to the score of `word` for the page `resource`.

    Words shorter than 3 characters are rejected; words listed in
    dropWords or whose first character code is above 0x80 are ignored.
    When the page already has an entry for the word, its stored id and
    section are kept if they are not None, and the relevance is summed.

    Returns the accumulated relevance, 0 when the word is ignored, or
    -1 when an argument is missing or the word is too short.
    """
    global wordsDictHTML

    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    if word in wordsDictHTML:
        d = wordsDictHTML[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        if resource in d:
            (r, i, s) = d[resource]
            if i is not None:
                id = i
            if s is not None:
                section = s
            relevance = relevance + r
    else:
        d = wordsDictHTML[word] = {}
    d[resource] = (relevance, id, section)
    return relevance
580
def addStringHTML(str, resource, id, section, relevance):
    """Index every word of `str` for the page `resource`.

    The text is cleaned with cleanupWordsString(), split on whitespace,
    and each word longer than 2 characters is passed to addWordHTML().
    Failures are reported and do not count towards the result.

    Returns the sum of the successful addWordHTML() results, or -1 when
    the string is missing or shorter than 3 characters.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            try:
                r = addWordHTML(word, resource, id, section, relevance)
                if r < 0:
                    print("addWordHTML failed: %s %s" % (word, resource))
                else:
                    ret = ret + r
            except Exception:
                print("addWordHTML failed: %s %s %d" % (word, resource, relevance))
                print("%s %s" % sys.exc_info()[:2])

    return ret
599
def addWordArchive(word, id, relevance):
    """Add `relevance` to the score of `word` for the archived message `id`.

    Words shorter than 3 characters are rejected; words listed in
    dropWords or whose first character code is above 0x80 are ignored.

    Returns the accumulated relevance, 0 when the word is ignored, or
    -1 when the word is too short or the id is None or -1.
    """
    global wordsDictArchive

    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDictArchive:
        d = wordsDictArchive[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        relevance = relevance + d.get(id, 0)
    else:
        d = wordsDictArchive[word] = {}
    d[id] = relevance
    return relevance
627
def addStringArchive(str, id, relevance):
    """Index every word of `str` for the archived message `id`.

    The text is cleaned with cleanupWordsString(), split on whitespace,
    and each word longer than 2 characters is passed to
    addWordArchive().  Failures are reported and do not count towards
    the result.

    Returns the sum of the successful addWordArchive() results, or -1
    when the string is missing or shorter than 3 characters.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            try:
                r = addWordArchive(word, id, relevance)
                if r < 0:
                    print("addWordArchive failed: %s %s" % (word, id))
                else:
                    ret = ret + r
            except Exception:
                print("addWordArchive failed: %s %s %d" % (word, id, relevance))
                print("%s %s" % sys.exc_info()[:2])
    return ret
Daniel Veillard3371ff82002-10-01 13:37:48 +0000647
648#########################################################################
649# #
650# XML API description analysis #
651# #
652#########################################################################
653
def loadAPI(filename):
    """Parse the XML file `filename` with libxml2 and return the document."""
    api_doc = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return api_doc
658
def foundExport(file, symbol):
    """Register an exported symbol of `file` as a function.

    The symbol is stored with addFunction() and each component of its
    name is indexed with relevance 10.

    Returns 1, or 0 when either argument is None.
    """
    if file is None or symbol is None:
        return 0
    addFunction(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)
    return 1
669
def analyzeAPIFile(top):
    """Process the <exports> children of one <file> element.

    top -- the <file> node; its "name" property is the module name

    Text nodes are skipped, every <exports> child is handed to
    foundExport(), and any other element is reported.

    Returns the number of exports registered.
    """
    count = 0
    name = top.prop("name")
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "exports":
                count = count + foundExport(name, cur.prop("symbol"))
            else:
                # report the offending element and the file it was found in
                print("unexpected element %s in API doc <file name='%s'>" % (cur.name, name))
        cur = cur.next
    return count
684
def analyzeAPIFiles(top):
    """Process every <file> child of the <files> element `top`.

    Text nodes are skipped and any element other than <file> is
    reported.  Returns the total count given by analyzeAPIFile().
    """
    total = 0
    node = top.children

    while node is not None:
        if node.type != 'text':
            if node.name == "file":
                total = total + analyzeAPIFile(node)
            else:
                print("unexpected element %s in API doc <files>" % (node.name))
        node = node.next
    return total
699
def analyzeAPIEnum(top):
    """Register the <enum> element `top` and index its name.

    Returns 1, or 0 when the "file" or "name" property is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addEnum(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    return 1
714
def analyzeAPIConst(top):
    """Register the <const> element `top` and index its name.

    Returns 1, or 0 when the "file" or "name" property is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addConst(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    return 1
729
def analyzeAPIType(top):
    """Register the <typedef> element `top` and index its name.

    Returns 1, or 0 when the "file" or "name" property is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addType(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    return 1
743
def analyzeAPIFunctype(top):
    """Register the <functype> element `top` and index its name.

    Returns 1, or 0 when the "file" or "name" property is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addFunctype(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    return 1
757
def analyzeAPIStruct(top):
    """Register the <struct> element `top` and index its name and info.

    The components of the name are indexed with relevance 10 and the
    words of the optional "info" property (longer than 2 characters)
    with relevance 5.

    Returns 1, or 0 when the "file" or "name" property is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addStruct(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    info = top.prop("info")
    if info is not None:
        for word in info.split():
            if len(word) > 2:
                addWord(word, module, symbol, 5)
    return 1
778
def analyzeAPIMacro(top):
    """Register the <macro> element `top` and index its name and info.

    The description is the content of the first <info> child.  The
    components of the name are indexed with relevance 10 and the words
    of the description (longer than 2 characters) with relevance 5.

    Returns 1 when a description was found, 0 otherwise (including when
    the "file" or "name" property is missing).
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    # look for the first <info> child, ignoring text nodes
    info = None
    node = top.children
    while node is not None:
        if node.type != 'text' and node.name == "info":
            info = node.content
            break
        node = node.next

    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    if info is None:
        addMacro(symbol, module)
        print("Macro %s description has no <info>" % (symbol))
        return 0

    addMacro(symbol, module, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, module, symbol, 5)
    return 1
813
def analyzeAPIFunction(top):
    """Register the <function> element `top` and index its description.

    Indexed text and the relevance used for it:
      - "info" property of the <return> child: 7
      - "info" property of each <arg> child: 5
      - "name" property of each <arg> child: 7
      - content of the <info> child: 5
      - components of the function name: 10

    Returns 1, or 0 when the "file" or "name" property is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    info = None
    cur = top.children
    while cur is not None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "info":
            info = cur.content
        elif cur.name == "return":
            rinfo = cur.prop("info")
            if rinfo is not None:
                addString(rinfo, file, symbol, 7)
        elif cur.name == "arg":
            ainfo = cur.prop("info")
            # test the argument's own info, not the return value's
            if ainfo is not None:
                addString(ainfo, file, symbol, 5)
            name = cur.prop("name")
            if name is not None:
                addWord(name, file, symbol, 7)
        cur = cur.next
    if info is None:
        print("Function %s description has no <info>" % (symbol))
        addFunction(symbol, file, "")
    else:
        addFunction(symbol, file, info)
        addString(info, file, symbol, 5)

    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    return 1
854
def analyzeAPISymbols(top):
    """Dispatch every child of the <symbols> element to its analyzer.

    Text nodes are skipped and unknown elements are reported.
    Returns the sum of the values returned by the analyzers.
    """
    total = 0
    node = top.children

    while node is not None:
        if node.type != 'text':
            kind = node.name
            if kind == "macro":
                total = total + analyzeAPIMacro(node)
            elif kind == "function":
                total = total + analyzeAPIFunction(node)
            elif kind == "const":
                total = total + analyzeAPIConst(node)
            elif kind == "typedef":
                total = total + analyzeAPIType(node)
            elif kind == "struct":
                total = total + analyzeAPIStruct(node)
            elif kind == "enum":
                total = total + analyzeAPIEnum(node)
            elif kind == "functype":
                total = total + analyzeAPIFunctype(node)
            else:
                print("unexpected element %s in API doc <files>" % (node.name))
        node = node.next
    return total
881
def analyzeAPI(doc):
    """Walk a parsed API description and index its <symbols> section.

    The root element must be <api>.  <files> children are currently
    ignored, <symbols> children go to analyzeAPISymbols(), and anything
    else is reported.

    NOTE: a zero-argument function with the same name is defined later
    in this file.

    Returns the number of symbols analyzed, or -1 when `doc` is None or
    the root element is not <api>.
    """
    total = 0
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1
    node = root.children
    while node is not None:
        if node.type != 'text':
            if node.name == "files":
                pass
#               total = total + analyzeAPIFiles(node)
            elif node.name == "symbols":
                total = total + analyzeAPISymbols(node)
            else:
                print("unexpected element %s in API doc" % (node.name))
        node = node.next
    return total
904
905#########################################################################
906# #
Daniel Veillard141d04b2002-10-06 21:51:18 +0000907# Web pages parsing and analysis #
908# #
909#########################################################################
910
911import glob
912
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of the text node `p` of page `resource`.

    The content is passed to addStringHTML() with relevance 5, tagged
    with the current `section` title and anchor `id`.  `doc` is unused.

    Returns the addStringHTML() result, or -1 if anything fails.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
921
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of the paragraph node `p` of page `resource`.

    The content is passed to addStringHTML() with relevance 5, tagged
    with the current `section` title and anchor `id`.  `doc` is unused.

    Returns the addStringHTML() result, or -1 if anything fails.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
930
def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the content of the <pre> node `p` of page `resource`.

    The content is passed to addStringHTML() with relevance 5, tagged
    with the current `section` title and anchor `id`.  `doc` is unused.

    Returns the addStringHTML() result, or -1 if anything fails.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
939
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of the node `p` of page `resource`.

    The content is passed to addStringHTML() with relevance 5, tagged
    with the current `section` title and anchor `id`.  `doc` is unused.

    NOTE: the name analyzeHTML is defined again later in this file, so
    this five-argument variant is not the one reachable by that name
    once the module has been loaded.

    Returns the addStringHTML() result, or -1 if anything fails.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
948
def analyzeHTMLPage(doc, resource):
    """Index the headings and text of one parsed HTML page.

    The page title (//head/title, or "Page <resource>" when missing) is
    recorded with addPage().  The h1/h2/h3 elements and text nodes are
    then visited in the order returned by the XPath query: a heading
    sets the current section title and anchor id, and every text node is
    indexed under the current section.

    This function used to be called analyzeHTML, but that name is taken
    by the zero-argument entry point defined later in this file, which
    made the per-page analysis unreachable; hence the distinct name.

    doc      -- the parsed HTML document
    resource -- the page file name

    Returns the number of nodes indexed.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        res = ctxt.xpathEval("//head/title")
        title = res[0].content
    except Exception:
        title = "Page %s" % (resource)
    addPage(resource, title)
    try:
        items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
        section = title
        id = ""
        for item in items:
            if item.name in ('h1', 'h2', 'h3'):
                section = item.content
                if item.prop("id"):
                    id = item.prop("id")
                elif item.prop("name"):
                    id = item.prop("name")
            elif item.type == 'text':
                analyzeHTMLText(doc, resource, item, section, id)
                para = para + 1
            elif item.name == 'p':
                analyzeHTMLPara(doc, resource, item, section, id)
                para = para + 1
            elif item.name == 'pre':
                analyzeHTMLPre(doc, resource, item, section, id)
                para = para + 1
            else:
                print("Page %s, unexpected %s element" % (resource, item.name))
    except Exception:
        print("Page %s: problem analyzing" % (resource))
        print("%s %s" % sys.exc_info()[:2])

    return para


def analyzeHTMLPages():
    """Parse and index the HTML pages of the documentation.

    All *.html files of the current directory and of tutorial/ are
    processed, except those whose name starts with "API" and xml.html.

    Returns the number of pages successfully analyzed.
    """
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html[0:3] == "API":
            continue
        if html == "xml.html":
            continue
        try:
            doc = libxml2.htmlParseFile(html, None)
            try:
                res = analyzeHTMLPage(doc, html)
            finally:
                # the document is not needed once its words are indexed
                doc.freeDoc()
            print("Parsed %s : %d paragraphs" % (html, res))
            ret = ret + 1
        except Exception:
            print("could not parse %s" % (html))
    return ret
1002
1003#########################################################################
1004# #
Daniel Veillard01e87d22002-10-08 16:55:06 +00001005# Mail archives parsing and analysis #
1006# #
1007#########################################################################
1008
1009import time
1010
def getXMLDateArchive(t = None):
    """Return the URL of the by-date index of a month of the xml list.

    t -- a timestamp inside the wanted month (seconds since the epoch,
         interpreted as UTC); defaults to the current time
    """
    if t is None:
        t = time.time()
    stamp = time.gmtime(t)
    return "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (
        stamp[0], time.strftime("%B", stamp))
1019
def scanXMLMsgArchive(url, title, force = 0):
    """Fetch one archived message and index its title and body.

    url   -- address of the message page
    title -- subject of the message
    force -- when 0, a message already present in the archives table is
             skipped; otherwise it is indexed again

    The title words get relevance 20 and the text found inside <pre>
    elements relevance 5.

    Returns 1 when the message was indexed, 0 otherwise.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    try:
        print("Loading %s" % (url))
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    addStringArchive(title, ID, 20)
    ctxt = doc.xpathNewContext()
    for text in ctxt.xpathEval("//pre//text()"):
        addStringArchive(text.content, ID, 5)

    return 1
1049
def scanXMLDateArchive(t = None, force = 0):
    """Index the messages linked from one monthly by-date archive page.

    t     -- timestamp selecting the month (see getXMLDateArchive())
    force -- passed on to scanXMLMsgArchive()

    The wordsDictArchive index is reset first.  Every link whose href
    starts with "msg" is followed; a leading "Re: " and then a leading
    "[xml] " are stripped from the link text to form the title.  A
    message that cannot be processed is reported and skipped.

    Returns the number of messages indexed, or -1 when the archive page
    cannot be parsed.
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1
    ctxt = doc.xpathNewContext()
    newmsg = 0
    for anchor in ctxt.xpathEval("//a[@href]"):
        href = anchor.prop("href")
        if href is None or href[0:3] != "msg":
            continue
        try:
            msg = libxml2.buildURI(href, url)
            title = anchor.content
            if title is not None and title[0:4] == 'Re: ':
                title = title[4:]
            if title is not None and title[0:6] == '[xml] ':
                title = title[6:]
            newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
        except Exception:
            # keep going with the other messages, but say what went wrong
            print("Failed to index message %s" % (href))
            print("%s %s" % sys.exc_info()[:2])

    return newmsg
1087
1088
1089#########################################################################
1090# #
Daniel Veillard3371ff82002-10-01 13:37:48 +00001091# Main code: open the DB, the API XML and analyze it #
1092# #
1093#########################################################################
# Connect to the database as soon as the module is run; give up with
# exit status 1 if that is not possible.
try:
    openMySQL()
except:
    failure = sys.exc_info()
    print("Failed to open the database")
    print("%s %s" % (failure[0], failure[1]))
    sys.exit(1)
1100
def analyzeArchives(t = None, force = 0):
    """Index one month of mail archives and store the result.

    t     -- timestamp selecting the month (see getXMLDateArchive())
    force -- passed on to scanXMLDateArchive()

    After the scan, every (word, archive ID, relevance) association
    collected in wordsDictArchive is written with updateWordArchive().
    """
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    for word, refs in wordsDictArchive.items():
        if refs is None:
            continue
        for id, relevance in refs.items():
            updateWordArchive(word, id, relevance)
            i = i + 1

    print("Found %d associations in archive pages" % (i))
1120
def analyzeHTML():
    """Index the HTML pages and store the result in the database.

    After analyzeHTMLPages() has run, every association collected in
    wordsDictHTML is written with updateWordHTML().
    """
    global wordsDictHTML

    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))

    stored = 0
    for word, refs in wordsDictHTML.items():
        if refs is None:
            continue
        for resource, (relevance, id, section) in refs.items():
            updateWordHTML(word, resource, section, id, relevance)
            stored = stored + 1

    print("Found %d associations in HTML pages" % (stored))
1140
# The function below reuses the name analyzeAPI.  Keep a handle on the
# document-level analyzer defined earlier in this file before the name
# is rebound, otherwise the call inside would reach the new function.
_analyzeAPIDoc = analyzeAPI

def analyzeAPI():
    """Parse the API description and store its word index.

    The file named by API is loaded and analyzed; the program exits
    with status 1 if that fails.  Every association collected in
    wordsDict is then written with updateWord().
    """
    global wordsDict

    try:
        doc = loadAPI(API)
        ret = _analyzeAPIDoc(doc)
        print("Analyzed %d blocs" % (ret))
        doc.freeDoc()
    except Exception:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    i = 0
    skipped = 0
    for word, refs in wordsDict.items():
        if refs is None:
            # words disabled by addWord() for having too many references
            skipped = skipped + 1
            continue
        for (module, symbol), relevance in refs.items():
            updateWord(word, symbol, relevance)
            i = i + 1

    print("Found %d associations, skipped %d words" % (i, skipped))
1167
def usage():
    """Print the command line synopsis and exit with status 1."""
    synopsis = "Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]"
    print(synopsis)
    sys.exit(1)
1171
def _indexArchiveMonth(stamp, force):
    """Index the mail archive of the month given as "<year>-<month name>"."""
    try:
        T = time.strptime(stamp, "%Y-%B")
        # aim ten days past the parsed date rather than at its first second
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except Exception:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])


def main():
    """Run the indexing steps requested on the command line.

    Options are processed in order:
      --force                index archive messages even if already known
                             (affects the options that follow it)
      --archive              index the current month of the mail archive
      --archive-year year    index the twelve months of `year`
      --archive-month month  index one month, given as "<year>-<month name>"
      --API                  index the API description
      --docs                 index the HTML pages

    Without arguments, with an unknown option, or when an option lacks
    its value, the usage message is printed and the program exits.
    """
    args = sys.argv[1:]
    if not args:
        usage()
    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg in ('--archive-year', '--archive-month'):
            i = i + 1
            if i >= len(args):
                # the option was given without its value
                usage()
            if arg == '--archive-year':
                year = args[i]
                months = ["January", "February", "March", "April", "May",
                          "June", "July", "August", "September", "October",
                          "November", "December"]
                for month in months:
                    _indexArchiveMonth("%s-%s" % (year, month), force)
            else:
                _indexArchiveMonth(args[i], force)
        elif arg == '--API':
            analyzeAPI()
        elif arg == '--docs':
            analyzeHTML()
        else:
            usage()
        i = i + 1


if __name__ == "__main__":
    main()