blob: 600871b39d4714f085f7882c25e3d55cdcd1acf9 [file] [log] [blame]
Daniel Veillard3371ff82002-10-01 13:37:48 +00001#!/usr/bin/python -u
2#
3# imports the API description and fills up a database with
4# name relevance to modules, functions or web pages
5#
Daniel Veillard2c77cd72002-10-01 13:54:14 +00006# Operation needed:
7# =================
8#
9# install mysqld, the python wrappers for mysql and libxml2, start mysqld
10# Change the root passwd of mysql:
11# mysqladmin -u root password new_password
12# Create the new database xmlsoft
13# mysqladmin -p create xmlsoft
# Create a database user 'veillard' and give him password access
15# change veillard and abcde with the right user name and passwd
16# mysql -p
17# password:
18# mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
19# IDENTIFIED BY 'abcde' WITH GRANT OPTION;
20#
21# As the user check the access:
22# mysql -p xmlsoft
23# Enter password:
24# Welcome to the MySQL monitor....
25# mysql> use xmlsoft
26# Database changed
27# mysql> quit
28# Bye
29#
# Then run the script in the doc subdir; it will create the symbols and
# words tables, populate them with information extracted from
# the libxml2-api.xml API description, and make them accessible read-only
# to nobody@localhost, the user Apache is expected to run as
34#
35# On the Apache configuration, make sure you have php support enabled
36#
37
Daniel Veillard3371ff82002-10-01 13:37:48 +000038import MySQLdb
39import libxml2
40import sys
41import string
42import os
43
44#
Daniel Veillard141d04b2002-10-06 21:51:18 +000045# We are not interested in parsing errors here
46#
def callback(ctx, str):
    """Error handler that discards the message: parse errors are ignored."""
    return None
# Hand libxml2 the do-nothing callback defined above as its error handler.
libxml2.registerErrorHandler(callback, None)
50
51#
# The dictionary of tables required and the SQL commands needed
53# to create them
54#
# Maps every required table name to the CREATE TABLE statement building it.
# createTable() executes the statement stored under a name and checkTables()
# walks the keys to find tables that are missing.
TABLES={
    # Written by updateSymbol(): one row per API symbol.
    "symbols" : """CREATE TABLE symbols (
    name varchar(255) BINARY NOT NULL,
    module varchar(255) BINARY NOT NULL,
    type varchar(25) NOT NULL,
    descr varchar(255),
    UNIQUE KEY name (name),
    KEY module (module))""",
    # Written by updateWord(): relevance of a word for a symbol.
    "words" : """CREATE TABLE words (
    name varchar(50) BINARY NOT NULL,
    symbol varchar(255) BINARY NOT NULL,
    relevance int,
    KEY name (name),
    KEY symbol (symbol),
    UNIQUE KEY ID (name, symbol))""",
    # Written by updateWordHTML(): relevance of a word for an HTML resource.
    "wordsHTML" : """CREATE TABLE wordsHTML (
    name varchar(50) BINARY NOT NULL,
    resource varchar(255) BINARY NOT NULL,
    section varchar(255),
    id varchar(50),
    relevance int,
    KEY name (name),
    KEY resource (resource),
    UNIQUE KEY ref (name, resource))""",
    # Written by updateWordArchive(): relevance of a word for an archive ID.
    "wordsArchive" : """CREATE TABLE wordsArchive (
    name varchar(50) BINARY NOT NULL,
    ID int(11) NOT NULL,
    relevance int,
    KEY name (name),
    UNIQUE KEY ref (name, ID))""",
    # Written by addPage(): title of each indexed HTML resource.
    "pages" : """CREATE TABLE pages (
    resource varchar(255) BINARY NOT NULL,
    title varchar(255) BINARY NOT NULL,
    UNIQUE KEY name (resource))""",
    # Written by addXMLMsgArchive(): one auto-numbered row per archived message.
    "archives" : """CREATE TABLE archives (
    ID int(11) NOT NULL auto_increment,
    resource varchar(255) BINARY NOT NULL,
    title varchar(255) BINARY NOT NULL,
    UNIQUE KEY id (ID,resource(255)),
    INDEX (ID),
    INDEX (resource))""",
    # Not written by this script; checkTables() grants nobody@localhost
    # INSERT/SELECT/UPDATE on it.
    "Queries" : """CREATE TABLE Queries (
    ID int(11) NOT NULL auto_increment,
    Value varchar(50) NOT NULL,
    Count int(11) NOT NULL,
    UNIQUE KEY id (ID,Value(35)),
    INDEX (ID))""",
}
103
104#
105# The XML API description file to parse
106#
# File name of the XML API description (passed to loadAPI()).
API="libxml2-api.xml"
# Shared MySQL connection; None until openMySQL() has connected.
DB=None
109
110#########################################################################
111# #
112# MySQL database interfaces #
113# #
114#########################################################################
def createTable(db, name):
    """Drop table `name` if it exists, then create it from TABLES[name].

    Returns the result of executing the CREATE statement, or -1 when an
    argument is None or the creation fails.
    """
    global TABLES

    if db is None or name is None:
        return -1

    cursor = db.cursor()
    dropped = cursor.execute("DROP TABLE IF EXISTS %s" % (name))
    if dropped == 1:
        print("Removed table %s" % (name))
    print("Creating table %s" % (name))
    try:
        return cursor.execute(TABLES[name])
    except:
        print("Failed to create table %s" % (name))
        return -1
134
def checkTables(db):
    """Verify that every table listed in TABLES exists and can be counted.

    Missing tables are created with createTable(); when counting the rows
    of a table fails a "repair table" is issued and the count retried.
    Finally the grants for nobody@localhost are (re)applied, ignoring any
    failure. Returns 0, or -1 when db is None.
    """
    global TABLES

    if db is None:
        return -1

    cursor = db.cursor()
    nbtables = cursor.execute("show tables")
    print("Found %d tables" % (nbtables))

    # Names of the tables the server reported.
    existing = {}
    for _ in range(nbtables):
        row = cursor.fetchone()
        existing[row[0]] = {}

    for table in TABLES.keys():
        if table not in existing:
            print("table %s missing" % (table))
            createTable(db, table)
        try:
            cursor.execute("SELECT count(*) from %s" % table)
            row = cursor.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
        except:
            print("Troubles with table %s : repairing" % (table))
            repaired = cursor.execute("repair table %s" % table)
            print("repairing returned %d" % (repaired))
            cursor.execute("SELECT count(*) from %s" % table)
            row = cursor.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    print("checkTables finished")

    # make sure apache can access the tables read-only
    try:
        cursor.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        cursor.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
    except:
        pass
    return 0
def openMySQL(db="xmlsoft", passwd=None):
    """Connect to MySQL, store the connection in DB and check the tables.

    Without an explicit passwd the MySQL_PASS environment variable is used;
    if it is not set the script exits with status 1. Returns the value of
    checkTables(), or -1 if the connection object is None.
    """
    global DB

    if passwd is None:
        passwd = os.environ.get("MySQL_PASS")
        if passwd is None:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB)
191
def updateWord(name, symbol, relevance):
    """Store the relevance of word `name` for `symbol` in the words table.

    An INSERT is tried first; when it fails (for instance because the
    (name, symbol) pair already exists) the existing row is updated
    instead. Values are passed as query parameters so the driver does the
    quoting rather than the text being pasted into the SQL.

    Returns the execute() result of the statement that succeeded, or -1
    when there is no connection, an argument is None, or both statements
    fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or symbol is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
            (name, symbol, relevance))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE words SET relevance = %s where name = %s and symbol = %s",
                (relevance, name, symbol))
        except Exception:
            print("Update word (%s, %s, %s) failed command" % (name, symbol, relevance))
            print("UPDATE words SET relevance = %s where name = '%s' and symbol = '%s'" % (relevance, name, symbol))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
221
def updateSymbol(name, module, type, desc):
    """Insert or refresh the symbols row describing `name`.

    The symbol name is first indexed as a word for itself with relevance
    50. The description is stripped of single quotes, reduced to the text
    before its first '.', and cut to 99 characters; a description that
    cannot be processed that way is stored as "". Values are passed as
    query parameters so the driver does the quoting.

    Returns the execute() result of the INSERT (or of the fallback UPDATE),
    or -1 when there is no connection, a required argument is None, or both
    statements fail.
    """
    global DB

    updateWord(name, name, 50)
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or module is None or type is None:
        return -1

    try:
        desc = desc.replace("'", " ")
        desc = desc.split(".")[0]
        desc = desc[0:99]
    except Exception:
        desc = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO symbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
            (name, module, type, desc))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s",
                (module, type, desc, name))
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, module, type))
            print("""UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
262
def addFunction(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'function'."""
    kind = 'function'
    return updateSymbol(name, module, kind, desc)
265
def addMacro(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'macro'."""
    kind = 'macro'
    return updateSymbol(name, module, kind, desc)
268
def addEnum(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'enum'."""
    kind = 'enum'
    return updateSymbol(name, module, kind, desc)
271
def addStruct(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'struct'."""
    kind = 'struct'
    return updateSymbol(name, module, kind, desc)
274
def addConst(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'const'."""
    kind = 'const'
    return updateSymbol(name, module, kind, desc)
277
def addType(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'type'."""
    kind = 'type'
    return updateSymbol(name, module, kind, desc)
280
def addFunctype(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'functype'."""
    kind = 'functype'
    return updateSymbol(name, module, kind, desc)
283
def addPage(resource, title):
    """Insert or refresh the pages row giving the title of `resource`.

    An INSERT is tried first and an UPDATE of the existing row is the
    fallback. Values are passed as query parameters, so a title containing
    quotes no longer breaks the statement.

    Returns the execute() result of the statement that succeeded, or -1
    when there is no connection, resource is None, or both statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if resource is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE pages SET title=%s WHERE resource=%s",
                (title, resource))
        except Exception:
            # The original message referenced names that do not exist in
            # this function and so raised NameError instead of reporting.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("""UPDATE pages SET title='%s' WHERE resource='%s'""" % (title, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
311
def updateWordHTML(name, resource, desc, id, relevance):
    """Store the relevance of word `name` for HTML page `resource`.

    `desc` is saved in the section column after replacing single quotes by
    spaces and cutting it to 99 characters (None or an unusable value
    becomes ""); a None `id` becomes "". An INSERT is tried first with an
    UPDATE as fallback, both with driver-quoted parameters.

    Returns the execute() result of the statement that succeeded, or -1
    when there is no connection, name or resource is None, or both
    statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        try:
            desc = desc.replace("'", " ")
            desc = desc[0:99]
        except Exception:
            desc = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES (%s, %s, %s, %s, %s)",
            (name, resource, desc, id, relevance))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE wordsHTML SET section=%s, id=%s, relevance=%s where name=%s and resource=%s",
                (desc, id, relevance, name, resource))
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, resource, relevance))
            print("""UPDATE wordsHTML SET section='%s', id='%s', relevance='%s' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
Daniel Veillard01e87d22002-10-08 16:55:06 +0000351
def checkXMLMsgArchive(url):
    """Return the archives ID recorded for `url`, or -1.

    -1 is returned when there is no connection, url is None, the query
    fails or no row matches. The URL is passed as a query parameter so the
    driver quotes it.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1

    c = DB.cursor()
    try:
        c.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = c.fetchone()
    except Exception:
        return -1
    if row is None:
        return -1

    return row[0]
373
def addXMLMsgArchive(url, title):
    """Insert `url` into the archives table and return its ID as an int.

    The title has single quotes replaced by spaces and is cut to 99
    characters (None becomes ""). Values are passed as query parameters so
    the driver quotes them. Returns -1 when there is no connection, url is
    None, a statement fails or the new row cannot be read back.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1
    if title is None:
        title = ""
    else:
        title = title.replace("'", " ")
        title = title[0:99]

    c = DB.cursor()
    cmd = "INSERT INTO archives (resource, title) VALUES (%s,%s)"
    args = (url, title)
    try:
        c.execute(cmd, args)
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        args = (url,)
        c.execute(cmd, args)
        row = c.fetchone()
        if row is None:
            print("addXMLMsgArchive failed to get the ID: %s" % (url))
            return -1
    except Exception:
        # Show the statement with its values filled in, for diagnosis only.
        shown = cmd.replace("%s", "'%s'") % args
        print("addXMLMsgArchive failed command: %s" % (shown))
        return -1

    return int(row[0])
404
def updateWordArchive(name, id, relevance):
    """Store the relevance of word `name` for the archive entry `id`.

    An INSERT is tried first and an UPDATE of the existing row is the
    fallback, both with driver-quoted parameters.

    Returns the execute() result of the statement that succeeded, or -1
    when there is no connection, name or id is None, or both statements
    fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or id is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO wordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
            (name, id, relevance))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE wordsArchive SET relevance=%s where name=%s and ID=%s",
                (relevance, name, id))
        except Exception:
            print("Update word archive (%s, %s, %s) failed command" % (name, id, relevance))
            print("""UPDATE wordsArchive SET relevance='%s' where name='%s' and ID='%s'""" % (relevance, name, id))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
434
Daniel Veillard3371ff82002-10-01 13:37:48 +0000435#########################################################################
436# #
#                 Word dictionary and analysis routines                 #
438# #
439#########################################################################
440
Daniel Veillard01e87d22002-10-08 16:55:06 +0000441#
# top 100 English words, minus those shorter than 3 letters, plus our own set
443#
# Words that are never indexed: addWord(), addWordHTML() and
# addWordArchive() return 0 for any word found among these keys. Only the
# keys matter; the 0 values are never read by the code in this file.
# Lookups are exact-match, so 'Okay' only drops the capitalised spelling.
dropWords = {
    'the':0, 'this':0, 'can':0, 'man':0, 'had':0, 'him':0, 'only':0,
    'and':0, 'not':0, 'been':0, 'other':0, 'even':0, 'are':0, 'was':0,
    'new':0, 'most':0, 'but':0, 'when':0, 'some':0, 'made':0, 'from':0,
    'who':0, 'could':0, 'after':0, 'that':0, 'will':0, 'time':0, 'also':0,
    'have':0, 'more':0, 'these':0, 'did':0, 'was':0, 'two':0, 'many':0,
    'they':0, 'may':0, 'before':0, 'for':0, 'which':0, 'out':0, 'then':0,
    'must':0, 'one':0, 'through':0, 'with':0, 'you':0, 'said':0,
    'first':0, 'back':0, 'were':0, 'what':0, 'any':0, 'years':0, 'his':0,
    'her':0, 'where':0, 'all':0, 'its':0, 'now':0, 'much':0, 'she':0,
    'about':0, 'such':0, 'your':0, 'there':0, 'into':0, 'like':0, 'may':0,
    'would':0, 'than':0, 'our':0, 'well':0, 'their':0, 'them':0, 'over':0,
    'down':0,
    # Own additions to the list.
    'net':0, 'www':0, 'bad':0, 'Okay':0, 'bin':0, 'cur':0,
}
459
# word -> {(module, symbol): relevance}; addWord() replaces the inner dict
# by None once a word has more than 500 references.
wordsDict = {}
# word -> {resource: (relevance, id, section)}, filled by addWordHTML().
wordsDictHTML = {}
# word -> {archive ID: relevance}, filled by addWordArchive().
wordsDictArchive = {}
463
def cleanupWordsString(str):
    """Return `str` with each separator character replaced by one space.

    The separators are common punctuation, quotes, brackets, the backslash,
    line breaks and the two characters "\\xc2" and "\\xa0".
    """
    separators = (
        ".", "!", "?", ",", "'", '"', ";", "-", "(", ")", "{", "}",
        "<", ">", "=", "/", "*", ":", "#", "\\", "\n", "\r",
        "\xc2", "\xa0",
    )
    for sep in separators:
        str = str.replace(sep, " ")
    return str
490
def cleanupDescrString(str):
    """Return `str` as a single line with runs of whitespace collapsed.

    Line breaks and the characters "\\xc2" and "\\xa0" are first turned into
    spaces, then the words are re-joined with single spaces. The original
    joined the characters of the string instead of the list of words, which
    inserted a space between every character.
    """
    for ch in ("\n", "\r", "\xc2", "\xa0"):
        str = str.replace(ch, " ")
    return " ".join(str.split())
Daniel Veillard3371ff82002-10-01 13:37:48 +0000499
def splitIdentifier(str):
    """Split an identifier into a list of lower-cased word fragments.

    A fragment starts at a letter, absorbs the run of upper-case letters
    that follows, then the run of lower-case letters; digits right after a
    fragment are dropped. Any other character outside a fragment is skipped.
    """
    ret = []
    pos = 0
    end = len(str)
    while pos < end:
        cur = str[pos].lower()
        pos += 1
        if not ('a' <= cur <= 'z'):
            continue
        # Run of upper-case letters, stored lower-cased.
        start = pos
        while pos < end and 'A' <= str[pos] <= 'Z':
            pos += 1
        cur += str[start:pos].lower()
        # Run of lower-case letters.
        start = pos
        while pos < end and 'a' <= str[pos] <= 'z':
            pos += 1
        cur += str[start:pos]
        # Trailing digits are discarded.
        while pos < end and '0' <= str[pos] <= '9':
            pos += 1
        ret.append(cur)
    return ret
517
def addWord(word, module, symbol, relevance):
    """Add `relevance` to the score of `word` for (module, symbol) in wordsDict.

    Returns the accumulated relevance; -1 for a word shorter than three
    characters or a None argument; 0 when the word is in dropWords, starts
    with a character whose code is above 0x80, or has been disabled after
    exceeding 500 references.
    """
    global wordsDict

    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    key = (module, symbol)
    if word not in wordsDict:
        wordsDict[word] = {}
    else:
        refs = wordsDict[word]
        if refs is None:
            return 0
        if len(refs) > 500:
            # Too common to be useful: disable the word for good.
            wordsDict[word] = None
            return 0
        try:
            relevance = relevance + refs[key]
        except:
            pass
    wordsDict[word][key] = relevance
    return relevance
545
def addString(str, module, symbol, relevance):
    """Index every word of `str` for (module, symbol) with `relevance`.

    The text is cleaned with cleanupWordsString() and split on whitespace;
    words of fewer than three characters are skipped. The original passed
    a fixed 5 to addWord(), silently ignoring the caller's relevance.

    Returns the sum of the addWord() results, or -1 when str is None or
    shorter than three characters.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            ret = ret + addWord(word, module, symbol, relevance)

    return ret
557
def addWordHTML(word, resource, id, section, relevance):
    """Add `relevance` to the score of `word` for `resource` in wordsDictHTML.

    When the word is already recorded for the resource, the stored id and
    section are kept (unless they are None) and the relevances are summed.

    Returns the accumulated relevance; -1 for a word shorter than three
    characters or a None resource/section; 0 when the word is in dropWords,
    starts with a character whose code is above 0x80, or is disabled.
    """
    global wordsDictHTML

    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    if word not in wordsDictHTML:
        wordsDictHTML[word] = {}
    else:
        known = wordsDictHTML[word]
        if known is None:
            print("skipped %s" % (word))
            return 0
        try:
            (r, i, s) = known[resource]
            if i != None:
                id = i
            if s != None:
                section = s
            relevance = relevance + r
        except:
            pass
    wordsDictHTML[word][resource] = (relevance, id, section)
    return relevance
591
def addStringHTML(str, resource, id, section, relevance):
    """Index every word of `str` for an HTML `resource` via addWordHTML().

    Words of fewer than three characters are skipped. A negative result
    from addWordHTML() is reported but still added to the total; an
    exception is reported and the word skipped.

    Returns the sum of the addWordHTML() results, or -1 when str is None
    or shorter than three characters.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordHTML(word, resource, id, section, relevance)
            if r < 0:
                print("addWordHTML failed: %s %s" % (word, resource))
            total = total + r
        except:
            print("addWordHTML failed: %s %s %d" % (word, resource, relevance))
            print("%s %s" % sys.exc_info()[:2])

    return total
610
def addWordArchive(word, id, relevance):
    """Add `relevance` to the score of `word` for archive `id`.

    The score is kept in wordsDictArchive. Returns the accumulated
    relevance; -1 for a word shorter than three characters or an id that
    is None or -1; 0 when the word is in dropWords, starts with a character
    whose code is above 0x80, or is disabled.
    """
    global wordsDictArchive

    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word not in wordsDictArchive:
        wordsDictArchive[word] = {}
    else:
        known = wordsDictArchive[word]
        if known is None:
            print("skipped %s" % (word))
            return 0
        try:
            relevance = relevance + known[id]
        except:
            pass
    wordsDictArchive[word][id] = relevance
    return relevance
638
def addStringArchive(str, id, relevance):
    """Index every word of `str` for archive `id` via addWordArchive().

    Words of fewer than three characters are skipped. Negative results and
    exceptions are reported and left out of the total.

    Returns the sum of the non-negative addWordArchive() results, or -1
    when str is None or shorter than three characters.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordArchive(word, id, relevance)
            if r < 0:
                print("addWordArchive failed: %s %s" % (word, id))
            else:
                total = total + r
        except:
            print("addWordArchive failed: %s %s %d" % (word, id, relevance))
            print("%s %s" % sys.exc_info()[:2])
    return total
Daniel Veillard3371ff82002-10-01 13:37:48 +0000658
659#########################################################################
660# #
661# XML API description analysis #
662# #
663#########################################################################
664
def loadAPI(filename):
    """Parse `filename` with libxml2, report it and return the document."""
    parsed = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return parsed
669
def foundExport(file, symbol):
    """Register `symbol` exported by `file` as a function and index its name.

    Each fragment of the identifier is indexed with relevance 10.
    Returns 1, or 0 when either argument is None.
    """
    if file is None or symbol is None:
        return 0
    addFunction(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1
680
def analyzeAPIFile(top):
    """Process the <exports> children of one <file> element.

    Text nodes are skipped; any other child is reported as unexpected. The
    original report had two placeholders but a single argument and raised
    TypeError instead of printing.

    Returns the number of exports registered by foundExport().
    """
    count = 0
    name = top.prop("name")
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "exports":
                count = count + foundExport(name, cur.prop("symbol"))
            else:
                print("unexpected element %s in API doc <file name='%s'>" % (cur.name, name))
        cur = cur.next
    return count
695
def analyzeAPIFiles(top):
    """Process every <file> child of the <files> element `top`.

    Text nodes are skipped and other children reported as unexpected.
    Returns the sum of the analyzeAPIFile() results.
    """
    count = 0
    node = top.children
    while node is not None:
        if node.type != 'text':
            if node.name == "file":
                count = count + analyzeAPIFile(node)
            else:
                print("unexpected element %s in API doc <files>" % (node.name))
        node = node.next
    return count
710
def analyzeAPIEnum(top):
    """Register the enum described by element `top` and index its name.

    Returns 1, or 0 when the "file" or "name" attribute is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addEnum(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    return 1
725
def analyzeAPIConst(top):
    """Register the constant described by element `top` and index its name.

    Returns 1, or 0 when the "file" or "name" attribute is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addConst(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    return 1
740
def analyzeAPIType(top):
    """Register the typedef described by element `top` and index its name.

    Returns 1, or 0 when the "file" or "name" attribute is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addType(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1
754
def analyzeAPIFunctype(top):
    """Register the function type described by `top` and index its name.

    Returns 1, or 0 when the "file" or "name" attribute is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addFunctype(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1
768
def analyzeAPIStruct(top):
    """Register the struct described by element `top` and index its words.

    Fragments of the name are indexed with relevance 10 and words of the
    optional "info" attribute (three characters or more) with relevance 5.
    Returns 1, or 0 when the "file" or "name" attribute is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addStruct(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    info = top.prop("info")
    if info is not None:
        for word in info.split():
            if len(word) > 2:
                addWord(word, file, symbol, 5)
    return 1
789
def analyzeAPIMacro(top):
    """Register the macro described by element `top` and index its words.

    The description is the content of the first <info> child. Fragments of
    the name are indexed with relevance 10 and description words of three
    characters or more with relevance 5.

    Returns 1; 0 when the "file" or "name" attribute is missing, or when
    there is no <info> child (the macro is still registered then).
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    # Find the first <info> child, ignoring text nodes.
    info = None
    node = top.children
    while node is not None:
        if node.type != 'text' and node.name == "info":
            info = node.content
            break
        node = node.next

    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    if info is None:
        addMacro(symbol, file)
        print("Macro %s description has no <info>" % (symbol))
        return 0

    addMacro(symbol, file, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, file, symbol, 5)
    return 1
824
def analyzeAPIFunction(top):
    """Register the function described by element `top` and index its words.

    The children are scanned for <info> (the description), <return> (whose
    "info" attribute is indexed with relevance 7) and <arg> (whose "info"
    attribute is indexed with relevance 5 and whose "name" attribute is
    indexed as a word with relevance 7). The original tested the return
    info when handling an <arg>, so argument descriptions were skipped, or
    an unbound variable was read, whenever no <return> preceded the <arg>.

    Returns 1, or 0 when the "file" or "name" attribute is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    info = None
    cur = top.children
    while cur is not None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "info":
            info = cur.content
        elif cur.name == "return":
            rinfo = cur.prop("info")
            if rinfo is not None:
                addString(rinfo, file, symbol, 7)
        elif cur.name == "arg":
            ainfo = cur.prop("info")
            if ainfo is not None:
                addString(ainfo, file, symbol, 5)
            name = cur.prop("name")
            if name is not None:
                addWord(name, file, symbol, 7)
        cur = cur.next

    if info is None:
        print("Function %s description has no <info>" % (symbol))
        addFunction(symbol, file, "")
    else:
        addFunction(symbol, file, info)
        addString(info, file, symbol, 5)

    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    return 1
865
def analyzeAPISymbols(top):
    """Dispatch every child of the <symbols> element to its analyzer.

    Text nodes are skipped. An unknown child is reported; the report now
    names <symbols> instead of <files>, which was a copy-paste leftover.

    Returns the sum of the analyzers' results.
    """
    count = 0
    cur = top.children

    while cur is not None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "macro":
            count = count + analyzeAPIMacro(cur)
        elif cur.name == "function":
            count = count + analyzeAPIFunction(cur)
        elif cur.name == "const":
            count = count + analyzeAPIConst(cur)
        elif cur.name == "typedef":
            count = count + analyzeAPIType(cur)
        elif cur.name == "struct":
            count = count + analyzeAPIStruct(cur)
        elif cur.name == "enum":
            count = count + analyzeAPIEnum(cur)
        elif cur.name == "functype":
            count = count + analyzeAPIFunctype(cur)
        else:
            print("unexpected element %s in API doc <symbols>" % (cur.name))
        cur = cur.next
    return count
892
def analyzeAPI(doc):
    """Analyze the <symbols> sections of the parsed API description `doc`.

    <files> children are recognised but not analysed; any other element is
    reported as unexpected. Returns the sum of the analyzeAPISymbols()
    results, or -1 when doc is None or its root is not <api>.
    """
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1

    count = 0
    node = root.children
    while node is not None:
        if node.type != 'text':
            if node.name == "symbols":
                count = count + analyzeAPISymbols(node)
            elif node.name != "files":
                print("unexpected element %s in API doc" % (node.name))
        node = node.next
    return count
915
916#########################################################################
917# #
Daniel Veillard141d04b2002-10-06 21:51:18 +0000918# Web pages parsing and analysis #
919# #
920#########################################################################
921
922import glob
923
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of text node `p` for `resource` with relevance 5.

    Returns the addStringHTML() result, or -1 if anything goes wrong.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except:
        return -1
932
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of paragraph `p` for `resource` with relevance 5.

    Returns the addStringHTML() result, or -1 if anything goes wrong.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except:
        return -1
941
def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the content of <pre> node `p` for `resource` with relevance 5.

    Returns the addStringHTML() result, or -1 if anything goes wrong.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except:
        return -1
950
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of node `p` for `resource` with relevance 5.

    NOTE: the two-argument analyzeHTML() defined right after this one
    rebinds the name, so this version is not reachable once the module
    has been loaded.

    Returns the addStringHTML() result, or -1 if anything goes wrong.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except:
        return -1
959
def analyzeHTML(doc, resource):
    """Index the text of the parsed HTML document `doc` under `resource`.

    The page is registered with the content of its <title> (or
    "Page <resource>" when none can be read). Headings h1-h3 set the
    current section and anchor id; every text node is indexed under the
    current section.

    The XPath context is now released in all cases; the libxml2 bindings
    do not free it automatically.

    Returns the number of nodes analysed.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        try:
            res = ctxt.xpathEval("//head/title")
            title = res[0].content
        except Exception:
            title = "Page %s" % (resource)
        addPage(resource, title)
        try:
            items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
            section = title
            id = ""
            for item in items:
                if item.name == 'h1' or item.name == 'h2' or item.name == 'h3':
                    section = item.content
                    if item.prop("id"):
                        id = item.prop("id")
                    elif item.prop("name"):
                        id = item.prop("name")
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'p':
                    analyzeHTMLPara(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, id)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource, item.name))
        except Exception:
            print("Page %s: problem analyzing" % (resource))
            print("%s %s" % sys.exc_info()[:2])
    finally:
        ctxt.xpathFreeContext()

    return para
996
def analyzeHTMLPages():
    """Parse and index the HTML pages of the current directory.

    Covers *.html and tutorial/*.html, skipping files whose name starts
    with "API" and xml.html. Each parsed document is now freed once it has
    been analysed; the libxml2 bindings do not free it automatically.

    Returns the number of pages analysed without error.
    """
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html[0:3] == "API" or html == "xml.html":
            continue
        try:
            doc = libxml2.htmlParseFile(html, None)
            try:
                res = analyzeHTML(doc, html)
            finally:
                doc.freeDoc()
            print("Parsed %s : %d paragraphs" % (html, res))
            ret = ret + 1
        except Exception:
            print("could not parse %s" % (html))
    return ret
1013
1014#########################################################################
1015# #
Daniel Veillard01e87d22002-10-08 16:55:06 +00001016# Mail archives parsing and analysis #
1017# #
1018#########################################################################
1019
1020import time
1021
def getXMLDateArchive(t = None):
    """Return the URL of the xml list's by-date archive index for a month.

    `t` is a time in seconds since the epoch, interpreted in UTC; the
    current time is used when it is None.
    """
    if t is None:
        t = time.time()
    stamp = time.gmtime(t)
    return "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (
        stamp[0], time.strftime("%B", stamp))
1030
def scanXMLMsgArchive(url, title, force = 0):
    """Fetch one archived message and index the words of its title and body.

    A message already present in the archives table is skipped unless
    `force` is non-zero. Title words get relevance 20 and the text found
    under <pre> elements relevance 5. The document and the XPath context
    are now freed afterwards; the libxml2 bindings do not free them
    automatically.

    Returns 1 when the message was indexed, 0 otherwise.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    try:
        print("Loading %s" % (url))
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    try:
        addStringArchive(title, ID, 20)
        ctxt = doc.xpathNewContext()
        try:
            for text in ctxt.xpathEval("//pre//text()"):
                addStringArchive(text.content, ID, 5)
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return 1
1060
def scanXMLDateArchive(t = None, force = 0):
    """Scan the monthly archive index for time `t` and index its messages.

    wordsDictArchive is reset first. Every link whose href starts with
    "msg" is resolved against the index URL and handed to
    scanXMLMsgArchive(), with a leading "Re: " and "[xml] " removed from
    the link text used as title. A failure on one message is reported and
    does not stop the scan. The index document and the XPath context are
    now freed afterwards; the libxml2 bindings do not free them
    automatically.

    Returns the number of messages indexed, or -1 when the index page
    cannot be parsed.
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1

    newmsg = 0
    try:
        ctxt = doc.xpathNewContext()
        try:
            for anchor in ctxt.xpathEval("//a[@href]"):
                href = anchor.prop("href")
                if href is None or href[0:3] != "msg":
                    continue
                try:
                    msg = libxml2.buildURI(href, url)
                    title = anchor.content
                    if title is not None and title[0:4] == 'Re: ':
                        title = title[4:]
                    if title is not None and title[0:6] == '[xml] ':
                        title = title[6:]
                    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
                except Exception:
                    # Best effort: report the message and keep scanning.
                    print("Failed to scan %s" % (href))
                    print("%s %s" % sys.exc_info()[:2])
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return newmsg
1098
1099
1100#########################################################################
1101# #
Daniel Veillard3371ff82002-10-01 13:37:48 +00001102# Main code: open the DB, the API XML and analyze it #
1103# #
1104#########################################################################
# Open the database as soon as the module is loaded; on any failure report
# the exception and exit with status 1.
try:
    openMySQL()
except:
    print "Failed to open the database"
    print sys.exc_type, sys.exc_value
    sys.exit(1)
1111
def analyzeArchives(t = None, force = 0):
    """Index one mail archive page and store the word associations found.

    Runs scanXMLDateArchive(t, force), which refills the global
    wordsDictArchive, then calls updateWordArchive(word, id, relevance)
    for every (id, relevance) pair recorded under each word. Words whose
    entry is None are ignored.

    t     -- forwarded to scanXMLDateArchive() to select the archive page
    force -- forwarded to scanXMLDateArchive()

    Returns None. Nothing is stored when the scan fails (returns -1).
    """
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    if ret < 0:
        # the scan already reported the failure; there is nothing to
        # store, and "-1 archive pages" would only be misleading
        return
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    for word, refs in wordsDictArchive.items():
        if refs is None:
            continue
        for msgid, relevance in refs.items():
            updateWordArchive(word, msgid, relevance)
            i = i + 1

    print("Found %d associations in archive pages" % (i))
1131
def analyzeHTMLTop():
    """Index the HTML pages and store the word associations found.

    Calls analyzeHTMLPages(), then walks the global wordsDictHTML. Each
    non-None entry maps a resource to a (relevance, id, section) tuple,
    which is passed to updateWordHTML(). Prints the totals.
    """
    global wordsDictHTML

    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))

    found = 0
    for word, refs in wordsDictHTML.items():
        if refs is None:
            # nothing recorded for this word
            continue
        for resource, (relevance, ident, section) in refs.items():
            updateWordHTML(word, resource, section, ident, relevance)
            found += 1

    print("Found %d associations in HTML pages" % (found))
1151
def analyzeAPITop():
    """Index the API description and store the word associations found.

    Loads the document named by the global API with loadAPI(), runs
    analyzeAPI() on it and frees it. Any failure in that phase is
    reported and ends the process with exit status 1. Then every
    (module, symbol) key recorded under each word of the global wordsDict
    is passed to updateWord() with its relevance. Words whose entry is
    None are counted as skipped.
    """
    global wordsDict
    global API

    try:
        apidoc = loadAPI(API)
        blocs = analyzeAPI(apidoc)
        print("Analyzed %d blocs" % (blocs))
        apidoc.freeDoc()
    except:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    found = 0
    skipped = 0
    for word, refs in wordsDict.items():
        if refs is None:
            skipped += 1
            continue
        for (module, symbol), relevance in refs.items():
            updateWord(word, symbol, relevance)
            found += 1

    print("Found %d associations, skipped %d words" % (found, skipped))
1179
def usage():
    """Print the command line synopsis and exit with status 1."""
    synopsis = "Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]"
    print(synopsis)
    sys.exit(1)
1183
def _indexMonthArchive(spec, force):
    """Index the mail archive of one month given as "<year>-<month name>".

    spec  -- string parsed with the "%Y-%B" format, e.g. "2002-October"
    force -- forwarded to analyzeArchives()

    Best effort: any error (bad spec, failure while indexing) is printed
    and the function returns normally so the caller can continue.
    """
    # imported here because the import block at the top of the file does
    # not list the time module
    import time

    try:
        T = time.strptime(spec, "%Y-%B")
        # strptime yields the first day of the month; the timestamp used
        # is ten days later (presumably to stay safely inside the month)
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except Exception:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])

def main():
    """Process the command line options, left to right.

    --force               set the force flag for the options that FOLLOW it
    --archive             index the default archive page
    --archive-year year   index the twelve monthly archives of that year
    --archive-month spec  index one month, spec being "<year>-<month name>"
    --API                 index the API description
    --docs                index the HTML pages

    With no argument, an unknown option, or an option missing its value,
    usage() is called, which prints the synopsis and exits.
    """
    months = ("January", "February", "March", "April", "May", "June",
              "July", "August", "September", "October", "November",
              "December")

    args = sys.argv[1:]
    if not args:
        usage()
        return

    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year' or arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # the option is the last argument and its value is missing
                usage()
                return
            if arg == '--archive-year':
                for month in months:
                    _indexMonthArchive("%s-%s" % (args[i], month), force)
            else:
                _indexMonthArchive(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1
Daniel Veillard3371ff82002-10-01 13:37:48 +00001228
# Process the command line only when executed as a script
if __name__ == '__main__':
    main()