blob: 578f288b37236313f793f69e018954a41d646324 [file] [log] [blame]
Daniel Veillard3371ff82002-10-01 13:37:48 +00001#!/usr/bin/python -u
2#
3# imports the API description and fills up a database with
4# name relevance to modules, functions or web pages
5#
Daniel Veillard2c77cd72002-10-01 13:54:14 +00006# Operation needed:
7# =================
8#
9# install mysqld, the python wrappers for mysql and libxml2, start mysqld
10# Change the root passwd of mysql:
11# mysqladmin -u root password new_password
12# Create the new database xmlsoft
13# mysqladmin -p create xmlsoft
# Create a database user 'veillard' and give him password access
15# change veillard and abcde with the right user name and passwd
16# mysql -p
17# password:
18# mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
19# IDENTIFIED BY 'abcde' WITH GRANT OPTION;
20#
21# As the user check the access:
22# mysql -p xmlsoft
23# Enter password:
24# Welcome to the MySQL monitor....
25# mysql> use xmlsoft
26# Database changed
27# mysql> quit
28# Bye
29#
30# Then run the script in the doc subdir, it will create the symbols and
# word tables and populate them with information extracted from
# the libxml2-api.xml API description, and make them accessible read-only
# by nobody@localhost, the user Apache is expected to run as
34#
35# On the Apache configuration, make sure you have php support enabled
36#
37
Daniel Veillard3371ff82002-10-01 13:37:48 +000038import MySQLdb
39import libxml2
40import sys
41import string
42import os
43
44#
Daniel Veillard141d04b2002-10-06 21:51:18 +000045# We are not interested in parsing errors here
46#
def callback(ctx, str):
    """Discard a libxml2 error report.

    Parsing errors are of no interest here, so both the registration
    context `ctx` and the message text `str` are ignored.
    """
    pass


# Route every libxml2 error message to the no-op handler above.
libxml2.registerErrorHandler(callback, None)
50
51#
# The dictionary of tables required and the SQL commands needed
# to create them
54#
# Maps each required table name to the CREATE TABLE statement that builds it.
TABLES = {
    # One row per API symbol (see updateSymbol()).
    "symbols": """CREATE TABLE symbols (
           name varchar(255) BINARY NOT NULL,
           module varchar(255) BINARY NOT NULL,
           type varchar(25) NOT NULL,
           descr varchar(255),
           UNIQUE KEY name (name),
           KEY module (module))""",
    # Word -> API symbol relevance (see updateWord()).
    "words": """CREATE TABLE words (
           name varchar(50) BINARY NOT NULL,
           symbol varchar(255) BINARY NOT NULL,
           relevance int,
           KEY name (name),
           KEY symbol (symbol),
           UNIQUE KEY ID (name, symbol))""",
    # Word -> HTML page relevance (see updateWordHTML()).
    "wordsHTML": """CREATE TABLE wordsHTML (
           name varchar(50) BINARY NOT NULL,
           resource varchar(255) BINARY NOT NULL,
           section varchar(255),
           id varchar(50),
           relevance int,
           KEY name (name),
           KEY resource (resource),
           UNIQUE KEY ref (name, resource))""",
    # Word -> archived message relevance (see updateWordArchive()).
    "wordsArchive": """CREATE TABLE wordsArchive (
           name varchar(50) BINARY NOT NULL,
           ID int(11) NOT NULL,
           relevance int,
           KEY name (name),
           UNIQUE KEY ref (name, ID))""",
    # HTML pages and their titles (see addPage()).
    "pages": """CREATE TABLE pages (
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY name (resource))""",
    # Archived messages (see addXMLMsgArchive()).
    "archives": """CREATE TABLE archives (
           ID int(11) NOT NULL auto_increment,
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY id (ID,resource(255)),
           INDEX (ID),
           INDEX (resource))""",
    # Not written by the code visible in this file; checkTables() grants
    # nobody@localhost INSERT/SELECT/UPDATE on it.
    "Queries": """CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
           Value varchar(50) NOT NULL,
           Count int(11) NOT NULL,
           UNIQUE KEY id (ID,Value(35)),
           INDEX (ID))""",
    # Same layout as Queries.
    "AllQueries": """CREATE TABLE AllQueries (
           ID int(11) NOT NULL auto_increment,
           Value varchar(50) NOT NULL,
           Count int(11) NOT NULL,
           UNIQUE KEY id (ID,Value(35)),
           INDEX (ID))""",
}
109
110#
111# The XML API description file to parse
112#
# Name of the XML API description file.
API = "libxml2-api.xml"
# Module-wide MySQL connection; stays None until openMySQL() sets it.
DB = None
115
116#########################################################################
117# #
118# MySQL database interfaces #
119# #
120#########################################################################
def createTable(db, name):
    """Drop and re-create the table `name` as described in TABLES.

    Returns -1 when `db` or `name` is None or when the CREATE statement
    fails; otherwise returns whatever the cursor's execute() returned for
    the CREATE statement.
    """
    global TABLES

    if db is None or name is None:
        return -1

    cursor = db.cursor()
    dropped = cursor.execute("DROP TABLE IF EXISTS %s" % (name))
    if dropped == 1:
        print("Removed table %s" % (name))

    print("Creating table %s" % (name))
    try:
        return cursor.execute(TABLES[name])
    except:
        print("Failed to create table %s" % (name))
        return -1
140
def checkTables(db, verbose = 1):
    """Make sure every table listed in TABLES exists and can be counted.

    Missing tables are created with createTable().  Each table is then
    queried with SELECT count(*); if that raises, a "repair table" is
    issued and the count is retried.  Finally read access is granted to
    nobody@localhost (errors during the GRANTs are ignored).

    Returns -1 when `db` is None, 0 otherwise.
    """
    global TABLES

    if db is None:
        return -1

    cursor = db.cursor()
    found = cursor.execute("show tables")
    if verbose:
        print("Found %d tables" % (found))

    # Names of the tables currently present in the database.
    existing = {}
    for _ in range(found):
        existing[cursor.fetchone()[0]] = {}

    for table in TABLES.keys():
        if table not in existing:
            print("table %s missing" % (table))
            createTable(db, table)
        count_query = "SELECT count(*) from %s" % table
        try:
            cursor.execute(count_query)
            row = cursor.fetchone()
            if verbose:
                print("Table %s contains %d records" % (table, row[0]))
        except:
            print("Troubles with table %s : repairing" % (table))
            status = cursor.execute("repair table %s" % table)
            print("repairing returned %d" % (status))
            cursor.execute(count_query)
            row = cursor.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    if verbose:
        print("checkTables finished")

    # make sure apache can access the tables read-only
    try:
        cursor.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        cursor.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
    except:
        pass
    return 0
184
def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
    """Connect to MySQL, store the connection in DB and check the tables.

    When `passwd` is None the password is read from the MySQL_PASS
    environment variable; if that is not set either, a message is printed
    and the process exits with status 1.

    Returns -1 if no connection object was obtained, otherwise the result
    of checkTables().
    """
    global DB

    if passwd is None:
        passwd = os.environ.get("MySQL_PASS")
        if passwd is None:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB, verbose)
200
def updateWord(name, symbol, relevance):
    """Store the relevance of word `name` for `symbol` in the words table.

    An INSERT is tried first; if it raises (presumably because the
    (name, symbol) pair already exists -- the table has a UNIQUE key on
    it) the existing row is updated instead.

    Values are passed to the driver as bound parameters rather than being
    pasted into the SQL text, so quotes or backslashes in the indexed
    words cannot break or alter the statement.

    Returns -1 when no database is available, when `name` or `symbol` is
    None, or when both statements fail; otherwise the value returned by
    the successful execute() call.
    """
    global DB

    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if name is None or symbol is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
            (name, symbol, relevance))
    except Exception:
        update = "UPDATE words SET relevance = %s where name = %s and symbol = %s"
        args = (relevance, name, symbol)
        try:
            ret = c.execute(update, args)
        except Exception:
            print("Update word (%s, %s, %s) failed command" % (name, symbol, relevance))
            print("%s with arguments %r" % (update, args))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
230
def updateSymbol(name, module, type, desc):
    """Insert or update the row describing symbol `name` in the symbols table.

    The symbol name is first indexed as a word for itself with relevance
    50.  The description is reduced to its first sentence (text before the
    first '.'), with single quotes replaced by spaces, cut to 99
    characters; a description that cannot be processed becomes "".

    Values are passed to the driver as bound parameters rather than being
    pasted into the SQL text.

    Returns -1 when no database is available, when `name`, `module` or
    `type` is None, or when both INSERT and UPDATE fail; otherwise the
    value returned by the successful execute() call.
    """
    global DB

    updateWord(name, name, 50)
    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if name is None or module is None or type is None:
        return -1

    try:
        desc = desc.replace("'", " ").split(".")[0][0:99]
    except Exception:
        # e.g. desc is None
        desc = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO symbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
            (name, module, type, desc))
    except Exception:
        update = "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s"
        args = (module, type, desc, name)
        try:
            ret = c.execute(update, args)
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, module, type))
            print("%s with arguments %r" % (update, args))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
271
def addFunction(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'function'."""
    kind = 'function'
    return updateSymbol(name, module, kind, desc)
274
def addMacro(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'macro'."""
    kind = 'macro'
    return updateSymbol(name, module, kind, desc)
277
def addEnum(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'enum'."""
    kind = 'enum'
    return updateSymbol(name, module, kind, desc)
280
def addStruct(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'struct'."""
    kind = 'struct'
    return updateSymbol(name, module, kind, desc)
283
def addConst(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'const'."""
    kind = 'const'
    return updateSymbol(name, module, kind, desc)
286
def addType(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'type'."""
    kind = 'type'
    return updateSymbol(name, module, kind, desc)
289
def addFunctype(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'functype'."""
    kind = 'functype'
    return updateSymbol(name, module, kind, desc)
292
def addPage(resource, title):
    """Insert or update the title of page `resource` in the pages table.

    An INSERT is tried first; if it raises, the title of the existing row
    is updated instead.  Values are passed to the driver as bound
    parameters rather than being pasted into the SQL text.

    Returns -1 when no database is available, when `resource` is None, or
    when both statements fail; otherwise the value returned by the
    successful execute() call.
    """
    global DB

    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if resource is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        update = "UPDATE pages SET title=%s WHERE resource=%s"
        args = (title, resource)
        try:
            ret = c.execute(update, args)
        except Exception:
            # The previous message formatted `name` and `module`, which do
            # not exist in this function, so the handler itself raised a
            # NameError instead of reporting the failure.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("%s with arguments %r" % (update, args))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
320
def updateWordHTML(name, resource, desc, id, relevance):
    """Store the relevance of word `name` for HTML page `resource`.

    `desc` is stored in the `section` column after replacing single quotes
    with spaces and cutting it to 99 characters; None (or a value that
    cannot be processed) becomes "".  A None `id` becomes "".

    An INSERT into wordsHTML is tried first; if it raises, the existing
    (name, resource) row is updated instead.  Values are passed to the
    driver as bound parameters rather than being pasted into the SQL text.

    Returns -1 when no database is available, when `name` or `resource`
    is None, or when both statements fail; otherwise the value returned
    by the successful execute() call.
    """
    global DB

    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if name is None or resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        try:
            desc = desc.replace("'", " ")[0:99]
        except Exception:
            desc = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO wordsHTML (name, resource, section, id, relevance) "
            "VALUES (%s, %s, %s, %s, %s)",
            (name, resource, desc, id, relevance))
    except Exception:
        update = ("UPDATE wordsHTML SET section=%s, id=%s, relevance=%s "
                  "where name=%s and resource=%s")
        args = (desc, id, relevance, name, resource)
        try:
            ret = c.execute(update, args)
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, resource, relevance))
            print("%s with arguments %r" % (update, args))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
Daniel Veillard01e87d22002-10-08 16:55:06 +0000360
def checkXMLMsgArchive(url):
    """Return the archives.ID recorded for `url`, or -1.

    -1 is returned when no database is available, when `url` is None, when
    the query raises, or when no row matches.  The URL is passed to the
    driver as a bound parameter rather than being pasted into the SQL
    text.
    """
    global DB

    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if url is None:
        return -1

    c = DB.cursor()
    try:
        c.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = c.fetchone()
    except Exception:
        return -1
    if row is None:
        return -1

    return row[0]
382
def addXMLMsgArchive(url, title):
    """Insert `url` into the archives table and return its ID as an int.

    A None `title` is stored as ""; otherwise single quotes are replaced
    with spaces and the title is cut to 99 characters.  After the INSERT
    the ID is read back with a SELECT on the resource.  Values are passed
    to the driver as bound parameters rather than being pasted into the
    SQL text.

    Returns -1 when no database is available, when `url` is None, when a
    statement raises, or when the row cannot be read back.
    """
    global DB

    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if url is None:
        return -1
    if title is None:
        title = ""
    else:
        title = title.replace("'", " ")[0:99]

    c = DB.cursor()
    # `cmd` always names the statement being executed so the failure
    # message below reports the right one.
    cmd = "INSERT INTO archives (resource, title) VALUES (%s, %s)"
    try:
        c.execute(cmd, (url, title))
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        c.execute(cmd, (url,))
        row = c.fetchone()
        if row is None:
            print("addXMLMsgArchive failed to get the ID: %s" % (url))
            return -1
    except Exception:
        print("addXMLMsgArchive failed command: %s" % (cmd))
        return -1

    return int(row[0])
413
def updateWordArchive(name, id, relevance):
    """Store the relevance of word `name` for archived message `id`.

    An INSERT into wordsArchive is tried first; if it raises, the existing
    (name, ID) row is updated instead.  Values are passed to the driver as
    bound parameters rather than being pasted into the SQL text.

    Returns -1 when no database is available, when `name` or `id` is
    None, or when both statements fail; otherwise the value returned by
    the successful execute() call.
    """
    global DB

    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if name is None or id is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO wordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
            (name, id, relevance))
    except Exception:
        update = "UPDATE wordsArchive SET relevance=%s where name=%s and ID=%s"
        args = (relevance, name, id)
        try:
            ret = c.execute(update, args)
        except Exception:
            print("Update word archive (%s, %s, %s) failed command" % (name, id, relevance))
            print("%s with arguments %r" % (update, args))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
443
Daniel Veillard3371ff82002-10-01 13:37:48 +0000444#########################################################################
445# #
446# Word dictionnary and analysis routines #
447# #
448#########################################################################
449
Daniel Veillard01e87d22002-10-08 16:55:06 +0000450#
# top 100 English words, excluding those shorter than 3 letters, plus our own set
452#
# Words that are never indexed.  Only membership is tested, every value is 0.
dropWords = dict.fromkeys((
    'the', 'this', 'can', 'man', 'had', 'him', 'only',
    'and', 'not', 'been', 'other', 'even', 'are', 'was',
    'new', 'most', 'but', 'when', 'some', 'made', 'from',
    'who', 'could', 'after', 'that', 'will', 'time', 'also',
    'have', 'more', 'these', 'did', 'two', 'many',
    'they', 'may', 'before', 'for', 'which', 'out', 'then',
    'must', 'one', 'through', 'with', 'you', 'said',
    'first', 'back', 'were', 'what', 'any', 'years', 'his',
    'her', 'where', 'all', 'its', 'now', 'much', 'she',
    'about', 'such', 'your', 'there', 'into', 'like',
    'would', 'than', 'our', 'well', 'their', 'them', 'over',
    'down',
    # own set
    'net', 'www', 'bad', 'Okay', 'bin', 'cur',
), 0)
468
# In-memory word indexes filled by the add*() routines below.
# word -> {(module, symbol): relevance}; None once a word has too many entries
wordsDict = {}
# word -> {resource: (relevance, id, section)}
wordsDictHTML = {}
# word -> {archive id: relevance}
wordsDictArchive = {}
472
def cleanupWordsString(str):
    """Return `str` with punctuation and line breaks replaced by spaces.

    Every occurrence of one of the separator characters below becomes a
    single space, so the result can be split on whitespace into words.
    """
    separators = (
        ".", "!", "?", ",", "'", '"', ";",
        "(", ")", "{", "}", "<", ">", "=", "/", "*", ":", "#", "\\",
        "\n", "\r", "\xc2", "\xa0",
    )
    text = str
    for sep in separators:
        text = text.replace(sep, " ")
    return text
498
def cleanupDescrString(str):
    """Return `str` as a single line with normalised whitespace.

    Single quotes, line breaks and the characters "\\xc2" / "\\xa0" are
    replaced by spaces, then runs of whitespace are collapsed to one space
    and leading/trailing whitespace is removed.
    """
    text = str
    for ch in ("'", "\n", "\r", "\xc2", "\xa0"):
        text = text.replace(ch, " ")
    # The previous code split the text into words but then joined the
    # *string* itself, which put a space between every character.
    return " ".join(text.split())
Daniel Veillard3371ff82002-10-01 13:37:48 +0000508
def splitIdentifier(str):
    """Split an identifier into a list of lower-case words.

    A word starts at any letter and consists of that letter, then a run of
    upper-case ASCII letters, then a run of lower-case ASCII letters.
    Digits directly following a word are dropped; any other character is
    skipped.
    """
    words = []
    pos = 0
    end = len(str)
    while pos < end:
        token = str[pos].lower()
        pos += 1
        if token < 'a' or token > 'z':
            continue
        while pos < end and 'A' <= str[pos] <= 'Z':
            token += str[pos].lower()
            pos += 1
        while pos < end and 'a' <= str[pos] <= 'z':
            token += str[pos]
            pos += 1
        while pos < end and '0' <= str[pos] <= '9':
            pos += 1
        words.append(token)
    return words
526
def addWord(word, module, symbol, relevance):
    """Accumulate the relevance of `word` for (module, symbol) in wordsDict.

    Returns -1 for a None or shorter-than-3 word or a None module/symbol,
    0 when the word is ignored (listed in dropWords, first character code
    above 0x80, or already disabled), otherwise the accumulated relevance.
    A word whose entry holds more than 500 pairs is disabled by setting
    its entry to None.
    """
    global wordsDict

    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    key = (module, symbol)
    if word in wordsDict:
        entries = wordsDict[word]
        if entries is None:
            return 0
        if len(entries) > 500:
            wordsDict[word] = None
            return 0
        try:
            relevance = relevance + entries[key]
        except:
            pass
    else:
        wordsDict[word] = {}
    wordsDict[word][key] = relevance
    return relevance
554
def addString(str, module, symbol, relevance):
    """Index every word of `str` for (module, symbol) via addWord().

    The text is cleaned with cleanupWordsString() and split on whitespace;
    words longer than two characters are passed to addWord() with the
    caller's `relevance`.

    Returns -1 for a None or shorter-than-3 string, otherwise the sum of
    the addWord() results.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            # The relevance used to be hard-coded to 5 here, silently
            # ignoring the value supplied by the caller.
            total = total + addWord(word, module, symbol, relevance)

    return total
566
def addWordHTML(word, resource, id, section, relevance):
    """Accumulate the relevance of `word` for page `resource` in wordsDictHTML.

    Returns -1 for a None or shorter-than-3 word or a None
    resource/section, 0 when the word is ignored (listed in dropWords,
    first character code above 0x80, or its entry is None), otherwise the
    accumulated relevance.  When the page already has an entry for the
    word, its non-None id and section are kept in preference to the new
    ones.
    """
    global wordsDictHTML

    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    if word in wordsDictHTML:
        pages = wordsDictHTML[word]
        if pages is None:
            print("skipped %s" % (word))
            return 0
        try:
            (old_relevance, old_id, old_section) = pages[resource]
            if old_id is not None:
                id = old_id
            if old_section is not None:
                section = old_section
            relevance = relevance + old_relevance
        except:
            pass
    else:
        pages = wordsDictHTML[word] = {}
    pages[resource] = (relevance, id, section)
    return relevance
600
def addStringHTML(str, resource, id, section, relevance):
    """Index every word of `str` for page `resource` via addWordHTML().

    Returns -1 for a None or shorter-than-3 string, otherwise the sum of
    the addWordHTML() results (negative results are reported but still
    added).  An exception raised for one word is reported and the
    remaining words are still processed.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            score = addWordHTML(word, resource, id, section, relevance)
            if score < 0:
                print("addWordHTML failed: %s %s" % (word, resource))
            total = total + score
        except:
            print("addWordHTML failed: %s %s %d" % (word, resource, relevance))
            print("%s %s" % sys.exc_info()[:2])

    return total
619
def addWordArchive(word, id, relevance):
    """Accumulate the relevance of `word` for archive `id` in wordsDictArchive.

    Returns -1 for a None or shorter-than-3 word or an `id` of None/-1,
    0 when the word is ignored (listed in dropWords, first character code
    above 0x80, or its entry is None), otherwise the accumulated
    relevance.
    """
    global wordsDictArchive

    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDictArchive:
        entries = wordsDictArchive[word]
        if entries is None:
            print("skipped %s" % (word))
            return 0
        try:
            relevance = relevance + entries[id]
        except:
            pass
    else:
        entries = wordsDictArchive[word] = {}
    entries[id] = relevance
    return relevance
647
def addStringArchive(str, id, relevance):
    """Index every word of `str` for archive `id` via addWordArchive().

    Returns -1 for a None or shorter-than-3 string, otherwise the sum of
    the non-negative addWordArchive() results.  Negative results and
    exceptions are reported and skipped.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            score = addWordArchive(word, id, relevance)
            if score < 0:
                print("addWordArchive failed: %s %s" % (word, id))
            else:
                total = total + score
        except:
            print("addWordArchive failed: %s %s %d" % (word, id, relevance))
            print("%s %s" % sys.exc_info()[:2])
    return total
Daniel Veillard3371ff82002-10-01 13:37:48 +0000667
668#########################################################################
669# #
670# XML API description analysis #
671# #
672#########################################################################
673
def loadAPI(filename):
    """Parse `filename` with libxml2.parseFile() and return the document."""
    document = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return document
678
def foundExport(file, symbol):
    """Record `symbol`, exported by `file`, as a function and index its name.

    Returns 0 when either argument is None, 1 otherwise.
    """
    if file is None or symbol is None:
        return 0
    addFunction(symbol, file)
    # Each word making up the identifier gets relevance 10.
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1
689
def analyzeAPIFile(top):
    """Process the <exports> children of one <file> element.

    `top` is the <file> node; each non-text child named "exports" is
    handed to foundExport() together with the file's "name" attribute.
    Other elements are reported.

    Returns the sum of the foundExport() results.
    """
    count = 0
    name = top.prop("name")
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "exports":
                count = count + foundExport(name, cur.prop("symbol"))
            else:
                # The message has two placeholders; it used to be given the
                # file name only, which raised a TypeError.
                print("unexpected element %s in API doc <file name='%s'>" % (cur.name, name))
        cur = cur.next
    return count
704
def analyzeAPIFiles(top):
    """Process every <file> child of the <files> element `top`.

    Text nodes are skipped, other elements are reported.  Returns the sum
    of the analyzeAPIFile() results.
    """
    total = 0
    node = top.children
    while node is not None:
        if node.type != 'text':
            if node.name == "file":
                total = total + analyzeAPIFile(node)
            else:
                print("unexpected element %s in API doc <files>" % (node.name))
        node = node.next
    return total
719
def analyzeAPIEnum(top):
    """Record the enum described by node `top` and index its name.

    Returns 0 when the node has no "file" or no "name" attribute,
    1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addEnum(symbol, module)
    for word in splitIdentifier(symbol):
        addWord(word, module, symbol, 10)

    return 1
734
def analyzeAPIConst(top):
    """Record the constant described by node `top` and index its name.

    Returns 0 when the node has no "file" or no "name" attribute,
    1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addConst(symbol, module)
    for word in splitIdentifier(symbol):
        addWord(word, module, symbol, 10)

    return 1
749
def analyzeAPIType(top):
    """Record the type described by node `top` and index its name.

    Returns 0 when the node has no "file" or no "name" attribute,
    1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addType(symbol, module)
    for word in splitIdentifier(symbol):
        addWord(word, module, symbol, 10)
    return 1
763
def analyzeAPIFunctype(top):
    """Record the function type described by node `top` and index its name.

    Returns 0 when the node has no "file" or no "name" attribute,
    1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addFunctype(symbol, module)
    for word in splitIdentifier(symbol):
        addWord(word, module, symbol, 10)
    return 1
777
def analyzeAPIStruct(top):
    """Record the struct described by node `top` and index its words.

    The words of the identifier get relevance 10; words longer than two
    characters from the optional "info" attribute get relevance 5.

    Returns 0 when the node has no "file" or no "name" attribute,
    1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addStruct(symbol, module)
    for word in splitIdentifier(symbol):
        addWord(word, module, symbol, 10)

    info = top.prop("info")
    if info is not None:
        for word in info.replace("'", " ").strip().split():
            if len(word) > 2:
                addWord(word, module, symbol, 5)
    return 1
800
def analyzeAPIMacro(top):
    """Record the macro described by node `top` and index its words.

    The description is the content of the first <info> child element.
    The words of the identifier get relevance 10, description words longer
    than two characters get relevance 5.

    Returns 0 when the node has no "file" or "name" attribute or when it
    has no <info> child (the macro is still recorded in that last case),
    1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    # Locate the first <info> child element, ignoring text nodes.
    info = None
    node = top.children
    while node is not None:
        if node.type != 'text' and node.name == "info":
            info = node.content
            break
        node = node.next

    for word in splitIdentifier(symbol):
        addWord(word, module, symbol, 10)

    if info is None:
        addMacro(symbol, module)
        print("Macro %s description has no <info>" % (symbol))
        return 0

    info = info.replace("'", " ").strip()
    addMacro(symbol, module, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, module, symbol, 5)
    return 1
839
def analyzeAPIFunction(top):
    """Record the function described by node `top` and index its words.

    Child elements are handled as follows: <info> supplies the description
    (the last one seen wins), the "info" attribute of <return> is indexed
    through addString() with relevance 7, the "info" attribute of each
    <arg> through addString() with relevance 5 and each argument name
    through addWord() with relevance 7.  The words of the identifier get
    relevance 10.

    Returns 0 when the node has no "file" or "name" attribute,
    1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    info = None
    node = top.children
    while node is not None:
        if node.type == 'text':
            node = node.next
            continue
        tag = node.name
        if tag == "info":
            info = node.content
        elif tag == "return":
            rinfo = node.prop("info")
            if rinfo is not None:
                addString(rinfo.replace("'", " ").strip(), module, symbol, 7)
        elif tag == "arg":
            ainfo = node.prop("info")
            if ainfo is not None:
                addString(ainfo.replace("'", " ").strip(), module, symbol, 5)
            argname = node.prop("name")
            if argname is not None:
                addWord(argname.replace("'", " ").strip(), module, symbol, 7)
        node = node.next

    if info is None:
        print("Function %s description has no <info>" % (symbol))
        addFunction(symbol, module, "")
    else:
        info = info.replace("'", " ").strip()
        addFunction(symbol, module, info)
        addString(info, module, symbol, 5)

    for word in splitIdentifier(symbol):
        addWord(word, module, symbol, 10)

    return 1
890
def analyzeAPISymbols(top):
    """Dispatch every child of the <symbols> element `top` to its analyzer.

    Text nodes are skipped.  macro, function, const, typedef, struct, enum
    and functype elements go to the matching analyzeAPI*() routine; any
    other element is reported.

    Returns the sum of the analyzer results.
    """
    count = 0
    cur = top.children

    while cur is not None:
        if cur.type == 'text':
            cur = cur.next
            continue
        kind = cur.name
        if kind == "macro":
            count = count + analyzeAPIMacro(cur)
        elif kind == "function":
            count = count + analyzeAPIFunction(cur)
        elif kind == "const":
            count = count + analyzeAPIConst(cur)
        elif kind == "typedef":
            count = count + analyzeAPIType(cur)
        elif kind == "struct":
            count = count + analyzeAPIStruct(cur)
        elif kind == "enum":
            count = count + analyzeAPIEnum(cur)
        elif kind == "functype":
            count = count + analyzeAPIFunctype(cur)
        else:
            # This diagnostic used to name <files>, copied from
            # analyzeAPIFiles(); the children examined here belong to
            # <symbols>.
            print("unexpected element %s in API doc <symbols>" % (kind))
        cur = cur.next
    return count
917
def analyzeAPI(doc):
    """Walk the API description document and index its <symbols> section.

    Returns -1 when `doc` is None or its root element is not <api>,
    otherwise the sum of the analyzeAPISymbols() results.  The <files>
    section is recognised but not processed.
    """
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1

    total = 0
    node = root.children
    while node is not None:
        if node.type != 'text':
            if node.name == "files":
                # analyzeAPIFiles() is deliberately not called here.
                pass
            elif node.name == "symbols":
                total = total + analyzeAPISymbols(node)
            else:
                print("unexpected element %s in API doc" % (node.name))
        node = node.next
    return total
940
941#########################################################################
942# #
Daniel Veillard141d04b2002-10-06 21:51:18 +0000943# Web pages parsing and analysis #
944# #
945#########################################################################
946
947import glob
948
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of text node `p` from page `resource`.

    The content is passed to addStringHTML() with relevance 5 and that
    call's result is returned; -1 is returned if anything raises.
    `doc` is not used.
    """
    try:
        text = p.content
        indexed = addStringHTML(text, resource, id, section, 5)
    except:
        return -1
    return indexed
957
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of paragraph node `p` from page `resource`.

    The content is passed to addStringHTML() with relevance 5 and that
    call's result is returned; -1 is returned if anything raises.
    `doc` is not used.
    """
    try:
        text = p.content
        indexed = addStringHTML(text, resource, id, section, 5)
    except:
        return -1
    return indexed
966
def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the content of <pre> node `p` from page `resource`.

    The content is passed to addStringHTML() with relevance 5 and that
    call's result is returned; -1 is returned if anything raises.
    `doc` is not used.
    """
    try:
        text = p.content
        indexed = addStringHTML(text, resource, id, section, 5)
    except:
        return -1
    return indexed
975
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of node `p` from page `resource`.

    The content is passed to addStringHTML() with relevance 5 and that
    call's result is returned; -1 is returned if anything raises.
    `doc` is not used.

    NOTE(review): the two-argument analyzeHTML(doc, resource) defined
    right after this function rebinds the same name, so this version is
    not reachable under that name once the module has loaded.
    """
    try:
        text = p.content
        indexed = addStringHTML(text, resource, id, section, 5)
    except:
        return -1
    return indexed
984
def analyzeHTML(doc, resource):
    """Index the text of the parsed HTML page `doc` stored as `resource`.

    The page is registered with addPage() under its <title> (or
    "Page <resource>" when no title can be read).  Headings h1/h2/h3 set
    the current section and, when they carry an "id" or "name" attribute,
    the current anchor; every text node is indexed under that section and
    anchor.

    Returns the number of nodes indexed.  Problems while walking the page
    are reported and the count reached so far is returned.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        title = ctxt.xpathEval("//head/title")[0].content
    except:
        title = "Page %s" % (resource)
    addPage(resource, title)
    try:
        section = title
        anchor = ""
        for item in ctxt.xpathEval("//h1 | //h2 | //h3 | //text()"):
            if item.name in ('h1', 'h2', 'h3'):
                section = item.content
                # Prefer the id attribute, then name; otherwise keep the
                # previous anchor.
                anchor = item.prop("id") or item.prop("name") or anchor
            elif item.type == 'text':
                analyzeHTMLText(doc, resource, item, section, anchor)
                para = para + 1
            elif item.name == 'p':
                analyzeHTMLPara(doc, resource, item, section, anchor)
                para = para + 1
            elif item.name == 'pre':
                analyzeHTMLPre(doc, resource, item, section, anchor)
                para = para + 1
            else:
                print("Page %s, unexpected %s element" % (resource, item.name))
    except:
        print("Page %s: problem analyzing" % (resource))
        print("%s %s" % sys.exc_info()[:2])

    return para
1021
def analyzeHTMLPages():
    """Index the HTML pages of the current directory and of tutorial/.

    Files whose name starts with "API" and the file xml.html are skipped.
    Each page is parsed as XML first and, if that raises, with the HTML
    parser.

    Returns the number of pages analyzed without an exception.
    """
    analyzed = 0
    for html in glob.glob("*.html") + glob.glob("tutorial/*.html"):
        if html[0:3] == "API" or html == "xml.html":
            continue
        try:
            doc = libxml2.parseFile(html)
        except:
            doc = libxml2.htmlParseFile(html, None)
        try:
            res = analyzeHTML(doc, html)
            print("Parsed %s : %d paragraphs" % (html, res))
            analyzed = analyzed + 1
        except:
            print("could not parse %s" % (html))
    return analyzed
1041
1042#########################################################################
1043# #
Daniel Veillard01e87d22002-10-08 16:55:06 +00001044# Mail archives parsing and analysis #
1045# #
1046#########################################################################
1047
1048import time
1049
def getXMLDateArchive(t = None):
    """Return the URL of the by-date index of the xml list archive.

    t is a time in seconds since the epoch; when omitted the current
    time is used.  The month is selected from the UTC date of t and the
    URL has the form
    http://mail.gnome.org/archives/xml/<year>-<MonthName>/date.html
    """
    if t is None:
        t = time.time()
    stamp = time.gmtime(t)
    # stamp[0] is the year; %B expands to the full month name
    return "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (
        stamp[0], time.strftime("%B", stamp))
1058
def scanXMLMsgArchive(url, title, force = 0):
    """Fetch one archived mail message and index its title and text.

    url is the address of the message page and title its subject.  The
    message is skipped when checkXMLMsgArchive(url) returns an ID other
    than -1 (i.e. the URL is already known) unless force is non-zero;
    an unknown URL is first registered with addXMLMsgArchive().

    The title is passed to addStringArchive() with weight 20 and every
    text node found under a <pre> element with weight 5.

    Returns 1 when the message was indexed, 0 when it was skipped,
    could not be registered or could not be parsed.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    print("Loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    # Neither the document nor the XPath context is released
    # automatically by the bindings; free both explicitly, otherwise
    # every scanned message stays in memory for the whole run.
    try:
        addStringArchive(title, ID, 20)
        ctxt = doc.xpathNewContext()
        try:
            for text in ctxt.xpathEval("//pre//text()"):
                addStringArchive(text.content, ID, 5)
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return 1
1088
def scanXMLDateArchive(t = None, force = 0):
    """Scan one monthly by-date archive index and index its messages.

    t selects the month (see getXMLDateArchive(); None means now) and
    force is passed through to scanXMLMsgArchive().  The global
    wordsDictArchive is reset to an empty dictionary before scanning.

    Every anchor whose href starts with "msg" is treated as a message
    link; a leading "Re: " and then a leading "[xml] " are stripped
    from its text to build the title.  A failure on one message is
    reported on stdout and does not stop the scan.

    Returns the sum of the scanXMLMsgArchive() results (the number of
    messages indexed), or -1 when the index page cannot be parsed.
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1

    newmsg = 0
    # The document and the XPath context must be freed explicitly
    try:
        ctxt = doc.xpathNewContext()
        try:
            for anchor in ctxt.xpathEval("//a[@href]"):
                href = anchor.prop("href")
                if href is None or href[0:3] != "msg":
                    continue
                try:
                    # Resolve the relative link against the index URL
                    msg = libxml2.buildURI(href, url)
                    title = anchor.content
                    if title is not None and title[0:4] == 'Re: ':
                        title = title[4:]
                    if title is not None and title[0:6] == '[xml] ':
                        title = title[6:]
                    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
                except Exception:
                    # Best effort: report the failure and go on with the
                    # next message instead of hiding it.
                    print("Failed to index message %s:" % (href))
                    print("%s %s" % sys.exc_info()[:2])
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return newmsg
1126
1127
1128#########################################################################
1129# #
Daniel Veillard3371ff82002-10-01 13:37:48 +00001130# Main code: open the DB, the API XML and analyze it #
1131# #
1132#########################################################################
def analyzeArchives(t = None, force = 0):
    """Index one month of the mail archive and store the word references.

    t and force are passed unchanged to scanXMLDateArchive(), which
    fills the global wordsDictArchive.  Each (word, id, relevance)
    association found in that dictionary is then passed to
    updateWordArchive(); words whose entry is None are ignored.

    Progress is reported on stdout; nothing is returned.
    """
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    for word in wordsDictArchive.keys():
        refs = wordsDictArchive[word]
        if refs is None:
            continue
        for id in refs.keys():
            relevance = refs[id]
            updateWordArchive(word, id, relevance)
            i = i + 1

    # These associations come from the mail archives, not the HTML pages
    print("Found %d associations in archive pages" % (i))
1152
def analyzeHTMLTop():
    """Analyze the HTML pages and store the word references found.

    Runs analyzeHTMLPages(), then passes every association held in the
    global wordsDictHTML to updateWordHTML().  Each entry of
    wordsDictHTML maps a resource to a (relevance, id, section) tuple;
    words whose entry is None are ignored.

    Progress is reported on stdout; nothing is returned.
    """
    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))

    found = 0
    for word, refs in wordsDictHTML.items():
        if refs is None:
            continue
        for resource, entry in refs.items():
            (relevance, id, section) = entry
            updateWordHTML(word, resource, section, id, relevance)
            found = found + 1

    print("Found %d associations in HTML pages" % (found))
1172
def analyzeAPITop():
    """Load and analyze the API description, then store the word references.

    The file named by the global API is loaded with loadAPI() and
    processed by analyzeAPI(); on any failure the error is printed and
    the program exits with status 1.  Every association held in the
    global wordsDict, keyed by (module, symbol), is then passed to
    updateWord(); words whose entry is None are counted as skipped.

    Progress is reported on stdout; nothing is returned.
    """
    try:
        doc = loadAPI(API)
        blocs = analyzeAPI(doc)
        print("Analyzed %d blocs" % (blocs))
        doc.freeDoc()
    except:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    found = 0
    skipped = 0
    for word, refs in wordsDict.items():
        if refs is None:
            skipped = skipped + 1
            continue
        # Only the symbol half of the key is stored with the word
        for (module, symbol), relevance in refs.items():
            updateWord(word, symbol, relevance)
            found = found + 1

    print("Found %d associations, skipped %d words" % (found, skipped))
1200
def usage():
    """Print the command line synopsis on stdout and exit with status 1."""
    synopsis = "Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]"
    print(synopsis)
    # Same effect as sys.exit(1)
    raise SystemExit(1)
1204
def _indexArchiveMonth(spec, force):
    """Index the mail archive of the month named by spec ("YYYY-MonthName")."""
    try:
        T = time.strptime(spec, "%Y-%B")
        # strptime() yields the first day of the month; move ten days
        # forward, presumably so that the UTC date used to build the
        # archive URL falls in that month whatever the local timezone.
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except Exception:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])

def main():
    """Open the database and run the actions requested on the command line.

    Arguments are processed in order:
      --force                 force reindexing for the archive actions
                              which FOLLOW it on the command line
      --archive               index the mail archive of the current month
      --archive-year year     index the twelve monthly archives of year
      --archive-month month   index one month, given as YYYY-MonthName
      --API                   index the API description
      --docs                  index the HTML pages

    The program exits with status 1 when the database cannot be opened.
    An empty command line, an unknown argument or an option given
    without its value prints the usage through usage().
    """
    try:
        openMySQL()
    except Exception:
        print("Failed to open the database")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    args = sys.argv[1:]
    if not args:
        usage()

    months = ["January" , "February", "March", "April", "May",
              "June", "July", "August", "September", "October",
              "November", "December"]
    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year' or arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # The option is the last argument: its value is missing
                usage()
            elif arg == '--archive-year':
                for month in months:
                    _indexArchiveMonth("%s-%s" % (args[i], month), force)
            else:
                _indexArchiveMonth(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1
Daniel Veillard3371ff82002-10-01 13:37:48 +00001256
# Run the indexer only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()