Blame - doc/index.py - fp2-dev/platform/external/libxml2

blob: 600871b39d4714f085f7882c25e3d55cdcd1acf9 [file] [log] [blame]

Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	1	#!/usr/bin/python -u
				2	#
				3	# imports the API description and fills up a database with
				4	# name relevance to modules, functions or web pages
				5	#
Daniel Veillard	2c77cd7	2002-10-01 13:54:14 +0000	[diff] [blame]	6	# Operation needed:
				7	# =================
				8	#
				9	# install mysqld, the python wrappers for mysql and libxml2, start mysqld
				10	# Change the root passwd of mysql:
				11	# mysqladmin -u root password new_password
				12	# Create the new database xmlsoft
				13	# mysqladmin -p create xmlsoft
				14	# Create a database user 'veillard' and give him password access
				15	# change veillard and abcde with the right user name and passwd
				16	# mysql -p
				17	# password:
				18	# mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
				19	# IDENTIFIED BY 'abcde' WITH GRANT OPTION;
				20	#
				21	# As the user check the access:
				22	# mysql -p xmlsoft
				23	# Enter password:
				24	# Welcome to the MySQL monitor....
				25	# mysql> use xmlsoft
				26	# Database changed
				27	# mysql> quit
				28	# Bye
				29	#
				30	# Then run the script in the doc subdir, it will create the symbols and
				31	# word tables and populate them with information extracted from
				32	# the libxml2-api.xml API description, and make them accessible read-only
				33	# by nobody@localhost, the user Apache is expected to run as
				34	#
				35	# On the Apache configuration, make sure you have php support enabled
				36	#
				37
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	38	import MySQLdb
				39	import libxml2
				40	import sys
				41	import string
				42	import os
				43
				44	#
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	45	# We are not interested in parsing errors here
				46	#
				47	def callback(ctx, str):
				48	return
				49	libxml2.registerErrorHandler(callback, None)
				50
				51	#
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	52	# The dictionary of tables required and the SQL command needed
				53	# to create them
				54	#
				55	TABLES={
				56	"symbols" : """CREATE TABLE symbols (
Daniel Veillard	a6287a4	2002-10-07 13:17:22 +0000	[diff] [blame]	57	name varchar(255) BINARY NOT NULL,
				58	module varchar(255) BINARY NOT NULL,
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	59	type varchar(25) NOT NULL,
				60	descr varchar(255),
				61	UNIQUE KEY name (name),
				62	KEY module (module))""",
				63	"words" : """CREATE TABLE words (
Daniel Veillard	a6287a4	2002-10-07 13:17:22 +0000	[diff] [blame]	64	name varchar(50) BINARY NOT NULL,
				65	symbol varchar(255) BINARY NOT NULL,
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	66	relevance int,
				67	KEY name (name),
				68	KEY symbol (symbol),
				69	UNIQUE KEY ID (name, symbol))""",
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	70	"wordsHTML" : """CREATE TABLE wordsHTML (
Daniel Veillard	a6287a4	2002-10-07 13:17:22 +0000	[diff] [blame]	71	name varchar(50) BINARY NOT NULL,
				72	resource varchar(255) BINARY NOT NULL,
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	73	section varchar(255),
				74	id varchar(50),
				75	relevance int,
				76	KEY name (name),
				77	KEY resource (resource),
				78	UNIQUE KEY ref (name, resource))""",
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	79	"wordsArchive" : """CREATE TABLE wordsArchive (
				80	name varchar(50) BINARY NOT NULL,
				81	ID int(11) NOT NULL,
				82	relevance int,
				83	KEY name (name),
				84	UNIQUE KEY ref (name, ID))""",
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	85	"pages" : """CREATE TABLE pages (
Daniel Veillard	a6287a4	2002-10-07 13:17:22 +0000	[diff] [blame]	86	resource varchar(255) BINARY NOT NULL,
				87	title varchar(255) BINARY NOT NULL,
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	88	UNIQUE KEY name (resource))""",
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	89	"archives" : """CREATE TABLE archives (
				90	ID int(11) NOT NULL auto_increment,
				91	resource varchar(255) BINARY NOT NULL,
				92	title varchar(255) BINARY NOT NULL,
				93	UNIQUE KEY id (ID,resource(255)),
				94	INDEX (ID),
				95	INDEX (resource))""",
Daniel Veillard	dc6d4ab	2002-10-04 15:58:34 +0000	[diff] [blame]	96	"Queries" : """CREATE TABLE Queries (
				97	ID int(11) NOT NULL auto_increment,
				98	Value varchar(50) NOT NULL,
				99	Count int(11) NOT NULL,
				100	UNIQUE KEY id (ID,Value(35)),
				101	INDEX (ID))""",
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	102	}
				103
				104	#
				105	# The XML API description file to parse
				106	#
				107	API="libxml2-api.xml"
				108	DB=None
				109
				110	#########################################################################
				111	# #
				112	# MySQL database interfaces #
				113	# #
				114	#########################################################################
				115	def createTable(db, name):
				116	global TABLES
				117
				118	if db == None:
				119	return -1
				120	if name == None:
				121	return -1
				122	c = db.cursor()
				123
				124	ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
				125	if ret == 1:
				126	print "Removed table %s" % (name)
				127	print "Creating table %s" % (name)
				128	try:
				129	ret = c.execute(TABLES[name])
				130	except:
				131	print "Failed to create table %s" % (name)
				132	return -1
				133	return ret
				134
				135	def checkTables(db):
				136	global TABLES
				137
				138	if db == None:
				139	return -1
				140	c = db.cursor()
				141	nbtables = c.execute("show tables")
				142	print "Found %d tables" % (nbtables)
				143	tables = {}
				144	i = 0
				145	while i < nbtables:
				146	l = c.fetchone()
				147	name = l[0]
				148	tables[name] = {}
				149	i = i + 1
				150
				151	for table in TABLES.keys():
				152	if not tables.has_key(table):
				153	print "table %s missing" % (table)
				154	createTable(db, table)
Daniel Veillard	321be0c	2002-10-08 21:26:42 +0000	[diff] [blame^]	155	try:
				156	ret = c.execute("SELECT count(*) from %s" % table);
				157	row = c.fetchone()
				158	print "Table %s contains %d records" % (table, row[0])
				159	except:
				160	print "Troubles with table %s : repairing" % (table)
				161	ret = c.execute("repair table %s" % table);
				162	print "repairing returned %d" % (ret)
				163	ret = c.execute("SELECT count(*) from %s" % table);
				164	row = c.fetchone()
				165	print "Table %s contains %d records" % (table, row[0])
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	166	print "checkTables finished"
				167
				168	# make sure apache can access the tables read-only
				169	try:
				170	ret = c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
Daniel Veillard	dc6d4ab	2002-10-04 15:58:34 +0000	[diff] [blame]	171	ret = c.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	172	except:
				173	pass
				174	return 0
				175
				176	def openMySQL(db="xmlsoft", passwd=None):
				177	global DB
				178
				179	if passwd == None:
Daniel Veillard	538d3b9	2002-10-01 14:04:56 +0000	[diff] [blame]	180	try:
				181	passwd = os.environ["MySQL_PASS"]
				182	except:
				183	print "No password available, set environment MySQL_PASS"
				184	sys.exit(1)
				185
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	186	DB = MySQLdb.connect(passwd=passwd, db=db)
				187	if DB == None:
				188	return -1
				189	ret = checkTables(DB)
				190	return ret
				191
				192	def updateWord(name, symbol, relevance):
				193	global DB
				194
				195	if DB == None:
				196	openMySQL()
				197	if DB == None:
				198	return -1
				199	if name == None:
				200	return -1
				201	if symbol == None:
				202	return -1
				203
				204	c = DB.cursor()
				205	try:
				206	ret = c.execute(
				207	"""INSERT INTO words (name, symbol, relevance) VALUES ('%s','%s', %d)""" %
				208	(name, symbol, relevance))
				209	except:
				210	try:
				211	ret = c.execute(
				212	"""UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'""" %
				213	(relevance, name, symbol))
				214	except:
				215	print "Update word (%s, %s, %s) failed command" % (name, symbol, relevance)
				216	print "UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'" % (relevance, name, symbol)
				217	print sys.exc_type, sys.exc_value
				218	return -1
				219
				220	return ret
				221
				222	def updateSymbol(name, module, type, desc):
				223	global DB
				224
				225	updateWord(name, name, 50)
				226	if DB == None:
				227	openMySQL()
				228	if DB == None:
				229	return -1
				230	if name == None:
				231	return -1
				232	if module == None:
				233	return -1
				234	if type == None:
				235	return -1
				236
				237	try:
				238	desc = string.replace(desc, "'", " ")
				239	l = string.split(desc, ".")
				240	desc = l[0]
				241	desc = desc[0:99]
				242	except:
				243	desc = ""
				244
				245	c = DB.cursor()
				246	try:
				247	ret = c.execute(
				248	"""INSERT INTO symbols (name, module, type, descr) VALUES ('%s','%s', '%s', '%s')""" %
				249	(name, module, type, desc))
				250	except:
				251	try:
				252	ret = c.execute(
				253	"""UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" %
				254	(module, type, desc, name))
				255	except:
				256	print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
				257	print """UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name)
				258	print sys.exc_type, sys.exc_value
				259	return -1
				260
				261	return ret
				262
				263	def addFunction(name, module, desc = ""):
				264	return updateSymbol(name, module, 'function', desc)
				265
				266	def addMacro(name, module, desc = ""):
				267	return updateSymbol(name, module, 'macro', desc)
				268
				269	def addEnum(name, module, desc = ""):
				270	return updateSymbol(name, module, 'enum', desc)
				271
				272	def addStruct(name, module, desc = ""):
				273	return updateSymbol(name, module, 'struct', desc)
				274
				275	def addConst(name, module, desc = ""):
				276	return updateSymbol(name, module, 'const', desc)
				277
				278	def addType(name, module, desc = ""):
				279	return updateSymbol(name, module, 'type', desc)
				280
				281	def addFunctype(name, module, desc = ""):
				282	return updateSymbol(name, module, 'functype', desc)
				283
def addPage(resource, title):
    """Insert or refresh the pages table row for *resource*.

    Returns the value of the successful execute() call, or -1 when no
    connection is available, when resource is None, or when both the
    INSERT and the UPDATE fail.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if resource is None:
        return -1

    c = DB.cursor()
    try:
        # Titles routinely contain apostrophes; passing the values as
        # parameters lets the driver escape them.
        ret = c.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        cmd = "UPDATE pages SET title=%s WHERE resource=%s"
        args = (title, resource)
        try:
            ret = c.execute(cmd, args)
        except Exception:
            # The old message formatted name/module/type, which do not
            # exist in this function and raised NameError here.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("%s with %r" % (cmd, args))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
				311
				312	def updateWordHTML(name, resource, desc, id, relevance):
				313	global DB
				314
				315	if DB == None:
				316	openMySQL()
				317	if DB == None:
				318	return -1
				319	if name == None:
				320	return -1
				321	if resource == None:
				322	return -1
				323	if id == None:
				324	id = ""
				325	if desc == None:
				326	desc = ""
				327	else:
				328	try:
				329	desc = string.replace(desc, "'", " ")
				330	desc = desc[0:99]
				331	except:
				332	desc = ""
				333
				334	c = DB.cursor()
				335	try:
				336	ret = c.execute(
				337	"""INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES ('%s','%s', '%s', '%s', '%d')""" %
				338	(name, resource, desc, id, relevance))
				339	except:
				340	try:
				341	ret = c.execute(
				342	"""UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" %
				343	(desc, id, relevance, name, resource))
				344	except:
				345	print "Update symbol (%s, %s, %d) failed command" % (name, resource, relevance)
				346	print """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource)
				347	print sys.exc_type, sys.exc_value
				348	return -1
				349
				350	return ret
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	351
				352	def checkXMLMsgArchive(url):
				353	global DB
				354
				355	if DB == None:
				356	openMySQL()
				357	if DB == None:
				358	return -1
				359	if url == None:
				360	return -1
				361
				362	c = DB.cursor()
				363	try:
				364	ret = c.execute(
				365	"""SELECT ID FROM archives WHERE resource='%s'""" % (url))
				366	row = c.fetchone()
				367	if row == None:
				368	return -1
				369	except:
				370	return -1
				371
				372	return row[0]
				373
				374	def addXMLMsgArchive(url, title):
				375	global DB
				376
				377	if DB == None:
				378	openMySQL()
				379	if DB == None:
				380	return -1
				381	if url == None:
				382	return -1
				383	if title == None:
				384	title = ""
				385	else:
				386	title = string.replace(title, "'", " ")
				387	title = title[0:99]
				388
				389	c = DB.cursor()
				390	try:
				391	cmd = """INSERT INTO archives (resource, title) VALUES ('%s','%s')""" % (url, title)
				392	ret = c.execute(cmd)
				393	cmd = """SELECT ID FROM archives WHERE resource='%s'""" % (url)
				394	ret = c.execute(cmd)
				395	row = c.fetchone()
				396	if row == None:
				397	print "addXMLMsgArchive failed to get the ID: %s" % (url)
				398	return -1
				399	except:
				400	print "addXMLMsgArchive failed command: %s" % (cmd)
				401	return -1
				402
				403	return((int)(row[0]))
				404
				405	def updateWordArchive(name, id, relevance):
				406	global DB
				407
				408	if DB == None:
				409	openMySQL()
				410	if DB == None:
				411	return -1
				412	if name == None:
				413	return -1
				414	if id == None:
				415	return -1
				416
				417	c = DB.cursor()
				418	try:
				419	ret = c.execute(
				420	"""INSERT INTO wordsArchive (name, id, relevance) VALUES ('%s', '%d', '%d')""" %
				421	(name, id, relevance))
				422	except:
				423	try:
				424	ret = c.execute(
				425	"""UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" %
				426	(relevance, name, id))
				427	except:
				428	print "Update word archive (%s, %d, %d) failed command" % (name, id, relevance)
				429	print """UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" % (relevance, name, id)
				430	print sys.exc_type, sys.exc_value
				431	return -1
				432
				433	return ret
				434
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	435	#########################################################################
				436	# #
				437	# Word dictionary and analysis routines #
				438	# #
				439	#########################################################################
				440
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	441	#
				442	# top 100 English words, minus those shorter than 3 letters, plus our own set
				443	#
				444	dropWords = {
				445	'the':0, 'this':0, 'can':0, 'man':0, 'had':0, 'him':0, 'only':0,
				446	'and':0, 'not':0, 'been':0, 'other':0, 'even':0, 'are':0, 'was':0,
				447	'new':0, 'most':0, 'but':0, 'when':0, 'some':0, 'made':0, 'from':0,
				448	'who':0, 'could':0, 'after':0, 'that':0, 'will':0, 'time':0, 'also':0,
				449	'have':0, 'more':0, 'these':0, 'did':0, 'was':0, 'two':0, 'many':0,
				450	'they':0, 'may':0, 'before':0, 'for':0, 'which':0, 'out':0, 'then':0,
				451	'must':0, 'one':0, 'through':0, 'with':0, 'you':0, 'said':0,
				452	'first':0, 'back':0, 'were':0, 'what':0, 'any':0, 'years':0, 'his':0,
				453	'her':0, 'where':0, 'all':0, 'its':0, 'now':0, 'much':0, 'she':0,
				454	'about':0, 'such':0, 'your':0, 'there':0, 'into':0, 'like':0, 'may':0,
				455	'would':0, 'than':0, 'our':0, 'well':0, 'their':0, 'them':0, 'over':0,
				456	'down':0,
				457	'net':0, 'www':0, 'bad':0, 'Okay':0, 'bin':0, 'cur':0,
				458	}
				459
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	460	wordsDict = {}
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	461	wordsDictHTML = {}
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	462	wordsDictArchive = {}
				463
				464	def cleanupWordsString(str):
				465	str = string.replace(str, ".", " ")
				466	str = string.replace(str, "!", " ")
				467	str = string.replace(str, "?", " ")
				468	str = string.replace(str, ",", " ")
				469	str = string.replace(str, "'", " ")
				470	str = string.replace(str, '"', " ")
				471	str = string.replace(str, ";", " ")
				472	str = string.replace(str, "-", " ")
				473	str = string.replace(str, "(", " ")
				474	str = string.replace(str, ")", " ")
				475	str = string.replace(str, "{", " ")
				476	str = string.replace(str, "}", " ")
				477	str = string.replace(str, "<", " ")
				478	str = string.replace(str, ">", " ")
				479	str = string.replace(str, "=", " ")
				480	str = string.replace(str, "/", " ")
				481	str = string.replace(str, "*", " ")
				482	str = string.replace(str, ":", " ")
				483	str = string.replace(str, "#", " ")
				484	str = string.replace(str, "\\", " ")
				485	str = string.replace(str, "\n", " ")
				486	str = string.replace(str, "\r", " ")
				487	str = string.replace(str, "\xc2", " ")
				488	str = string.replace(str, "\xa0", " ")
				489	return str
				490
				491	def cleanupDescrString(str):
				492	str = string.replace(str, "\n", " ")
				493	str = string.replace(str, "\r", " ")
				494	str = string.replace(str, "\xc2", " ")
				495	str = string.replace(str, "\xa0", " ")
				496	l = string.split(str)
				497	str = string.join(str)
				498	return str
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	499
				500	def splitIdentifier(str):
				501	ret = []
				502	while str != "":
				503	cur = string.lower(str[0])
				504	str = str[1:]
				505	if ((cur < 'a') or (cur > 'z')):
				506	continue
				507	while (str != "") and (str[0] >= 'A') and (str[0] <= 'Z'):
				508	cur = cur + string.lower(str[0])
				509	str = str[1:]
				510	while (str != "") and (str[0] >= 'a') and (str[0] <= 'z'):
				511	cur = cur + str[0]
				512	str = str[1:]
				513	while (str != "") and (str[0] >= '0') and (str[0] <= '9'):
				514	str = str[1:]
				515	ret.append(cur)
				516	return ret
				517
				518	def addWord(word, module, symbol, relevance):
				519	global wordsDict
				520
				521	if word == None or len(word) < 3:
				522	return -1
				523	if module == None or symbol == None:
				524	return -1
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	525	if dropWords.has_key(word):
				526	return 0
				527	if ord(word[0]) > 0x80:
				528	return 0
				529
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	530	if wordsDict.has_key(word):
				531	d = wordsDict[word]
				532	if d == None:
				533	return 0
				534	if len(d) > 500:
				535	wordsDict[word] = None
				536	return 0
				537	try:
				538	relevance = relevance + d[(module, symbol)]
				539	except:
				540	pass
				541	else:
				542	wordsDict[word] = {}
				543	wordsDict[word][(module, symbol)] = relevance
				544	return relevance
				545
				546	def addString(str, module, symbol, relevance):
				547	if str == None or len(str) < 3:
				548	return -1
				549	ret = 0
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	550	str = cleanupWordsString(str)
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	551	l = string.split(str)
				552	for word in l:
				553	if len(word) > 2:
				554	ret = ret + addWord(word, module, symbol, 5)
				555
				556	return ret
				557
def addWordHTML(word, resource, id, section, relevance):
    """Add *relevance* to the score of *word* for the page *resource*.

    The first id and section recorded for a (word, resource) pair are
    kept on later calls unless they were None.
    Returns -1 when word is None or shorter than 3 characters, or when
    resource or section is None; 0 when the word is in dropWords,
    starts with a character whose code is above 0x80, or has been
    disabled; otherwise the accumulated relevance stored in
    wordsDictHTML.
    """
    global wordsDictHTML

    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    refs = wordsDictHTML.setdefault(word, {})
    if refs is None:
        print("skipped %s" % (word))
        return 0
    if resource in refs:
        (old_relevance, old_id, old_section) = refs[resource]
        if old_id is not None:
            id = old_id
        if old_section is not None:
            section = old_section
        relevance = relevance + old_relevance
    refs[resource] = (relevance, id, section)
    return relevance
				591
				592	def addStringHTML(str, resource, id, section, relevance):
				593	if str == None or len(str) < 3:
				594	return -1
				595	ret = 0
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	596	str = cleanupWordsString(str)
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	597	l = string.split(str)
				598	for word in l:
				599	if len(word) > 2:
Daniel Veillard	a6287a4	2002-10-07 13:17:22 +0000	[diff] [blame]	600	try:
				601	r = addWordHTML(word, resource, id, section, relevance)
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	602	if r < 0:
Daniel Veillard	a6287a4	2002-10-07 13:17:22 +0000	[diff] [blame]	603	print "addWordHTML failed: %s %s" % (word, resource)
				604	ret = ret + r
				605	except:
				606	print "addWordHTML failed: %s %s %d" % (word, resource, relevance)
				607	print sys.exc_type, sys.exc_value
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	608
				609	return ret
				610
def addWordArchive(word, id, relevance):
    """Add *relevance* to the score of *word* for archive entry *id*.

    Returns -1 when word is None or shorter than 3 characters, or when
    id is None or -1; 0 when the word is in dropWords, starts with a
    character whose code is above 0x80, or has been disabled;
    otherwise the accumulated relevance stored in wordsDictArchive.
    """
    global wordsDictArchive

    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    hits = wordsDictArchive.setdefault(word, {})
    if hits is None:
        print("skipped %s" % (word))
        return 0
    if id in hits:
        relevance = relevance + hits[id]
    hits[id] = relevance
    return relevance
				638
				639	def addStringArchive(str, id, relevance):
				640	if str == None or len(str) < 3:
				641	return -1
				642	ret = 0
				643	str = cleanupWordsString(str)
				644	l = string.split(str)
				645	for word in l:
				646	i = len(word)
				647	if i > 2:
				648	try:
				649	r = addWordArchive(word, id, relevance)
				650	if r < 0:
				651	print "addWordArchive failed: %s %s" % (word, id)
				652	else:
				653	ret = ret + r
				654	except:
				655	print "addWordArchive failed: %s %s %d" % (word, id, relevance)
				656	print sys.exc_type, sys.exc_value
				657	return ret
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	658
				659	#########################################################################
				660	# #
				661	# XML API description analysis #
				662	# #
				663	#########################################################################
				664
				665	def loadAPI(filename):
				666	doc = libxml2.parseFile(filename)
				667	print "loaded %s" % (filename)
				668	return doc
				669
				670	def foundExport(file, symbol):
				671	if file == None:
				672	return 0
				673	if symbol == None:
				674	return 0
				675	addFunction(symbol, file)
				676	l = splitIdentifier(symbol)
				677	for word in l:
				678	addWord(word, file, symbol, 10)
				679	return 1
				680
				681	def analyzeAPIFile(top):
				682	count = 0
				683	name = top.prop("name")
				684	cur = top.children
				685	while cur != None:
				686	if cur.type == 'text':
				687	cur = cur.next
				688	continue
				689	if cur.name == "exports":
				690	count = count + foundExport(name, cur.prop("symbol"))
				691	else:
				692	print "unexpected element %s in API doc <file name='%s'>" % (name)
				693	cur = cur.next
				694	return count
				695
				696	def analyzeAPIFiles(top):
				697	count = 0
				698	cur = top.children
				699
				700	while cur != None:
				701	if cur.type == 'text':
				702	cur = cur.next
				703	continue
				704	if cur.name == "file":
				705	count = count + analyzeAPIFile(cur)
				706	else:
				707	print "unexpected element %s in API doc <files>" % (cur.name)
				708	cur = cur.next
				709	return count
				710
				711	def analyzeAPIEnum(top):
				712	file = top.prop("file")
				713	if file == None:
				714	return 0
				715	symbol = top.prop("name")
				716	if symbol == None:
				717	return 0
				718
				719	addEnum(symbol, file)
				720	l = splitIdentifier(symbol)
				721	for word in l:
				722	addWord(word, file, symbol, 10)
				723
				724	return 1
				725
				726	def analyzeAPIConst(top):
				727	file = top.prop("file")
				728	if file == None:
				729	return 0
				730	symbol = top.prop("name")
				731	if symbol == None:
				732	return 0
				733
				734	addConst(symbol, file)
				735	l = splitIdentifier(symbol)
				736	for word in l:
				737	addWord(word, file, symbol, 10)
				738
				739	return 1
				740
				741	def analyzeAPIType(top):
				742	file = top.prop("file")
				743	if file == None:
				744	return 0
				745	symbol = top.prop("name")
				746	if symbol == None:
				747	return 0
				748
				749	addType(symbol, file)
				750	l = splitIdentifier(symbol)
				751	for word in l:
				752	addWord(word, file, symbol, 10)
				753	return 1
				754
				755	def analyzeAPIFunctype(top):
				756	file = top.prop("file")
				757	if file == None:
				758	return 0
				759	symbol = top.prop("name")
				760	if symbol == None:
				761	return 0
				762
				763	addFunctype(symbol, file)
				764	l = splitIdentifier(symbol)
				765	for word in l:
				766	addWord(word, file, symbol, 10)
				767	return 1
				768
				769	def analyzeAPIStruct(top):
				770	file = top.prop("file")
				771	if file == None:
				772	return 0
				773	symbol = top.prop("name")
				774	if symbol == None:
				775	return 0
				776
				777	addStruct(symbol, file)
				778	l = splitIdentifier(symbol)
				779	for word in l:
				780	addWord(word, file, symbol, 10)
				781
				782	info = top.prop("info")
				783	if info != None:
				784	l = string.split(info)
				785	for word in l:
				786	if len(word) > 2:
				787	addWord(word, file, symbol, 5)
				788	return 1
				789
				790	def analyzeAPIMacro(top):
				791	file = top.prop("file")
				792	if file == None:
				793	return 0
				794	symbol = top.prop("name")
				795	if symbol == None:
				796	return 0
				797
				798	info = None
				799	cur = top.children
				800	while cur != None:
				801	if cur.type == 'text':
				802	cur = cur.next
				803	continue
				804	if cur.name == "info":
				805	info = cur.content
				806	break
				807	cur = cur.next
				808
				809	l = splitIdentifier(symbol)
				810	for word in l:
				811	addWord(word, file, symbol, 10)
				812
				813	if info == None:
				814	addMacro(symbol, file)
				815	print "Macro %s description has no <info>" % (symbol)
				816	return 0
				817
				818	addMacro(symbol, file, info)
				819	l = string.split(info)
				820	for word in l:
				821	if len(word) > 2:
				822	addWord(word, file, symbol, 5)
				823	return 1
				824
				825	def analyzeAPIFunction(top):
				826	file = top.prop("file")
				827	if file == None:
				828	return 0
				829	symbol = top.prop("name")
				830	if symbol == None:
				831	return 0
				832
				833	info = None
				834	cur = top.children
				835	while cur != None:
				836	if cur.type == 'text':
				837	cur = cur.next
				838	continue
				839	if cur.name == "info":
				840	info = cur.content
				841	elif cur.name == "return":
				842	rinfo = cur.prop("info")
				843	if rinfo != None:
				844	addString(rinfo, file, symbol, 7)
				845	elif cur.name == "arg":
				846	ainfo = cur.prop("info")
				847	if rinfo != None:
				848	addString(ainfo, file, symbol, 5)
				849	name = cur.prop("name")
				850	if name != None:
				851	addWord(name, file, symbol, 7)
				852	cur = cur.next
				853	if info == None:
				854	print "Function %s description has no <info>" % (symbol)
				855	addFunction(symbol, file, "")
				856	else:
				857	addFunction(symbol, file, info)
				858	addString(info, file, symbol, 5)
				859
				860	l = splitIdentifier(symbol)
				861	for word in l:
				862	addWord(word, file, symbol, 10)
				863
				864	return 1
				865
				866	def analyzeAPISymbols(top):
				867	count = 0
				868	cur = top.children
				869
				870	while cur != None:
				871	if cur.type == 'text':
				872	cur = cur.next
				873	continue
				874	if cur.name == "macro":
				875	count = count + analyzeAPIMacro(cur)
				876	elif cur.name == "function":
				877	count = count + analyzeAPIFunction(cur)
				878	elif cur.name == "const":
				879	count = count + analyzeAPIConst(cur)
				880	elif cur.name == "typedef":
				881	count = count + analyzeAPIType(cur)
				882	elif cur.name == "struct":
				883	count = count + analyzeAPIStruct(cur)
				884	elif cur.name == "enum":
				885	count = count + analyzeAPIEnum(cur)
				886	elif cur.name == "functype":
				887	count = count + analyzeAPIFunctype(cur)
				888	else:
				889	print "unexpected element %s in API doc <files>" % (cur.name)
				890	cur = cur.next
				891	return count
				892
				893	def analyzeAPI(doc):
				894	count = 0
				895	if doc == None:
				896	return -1
				897	root = doc.getRootElement()
				898	if root.name != "api":
				899	print "Unexpected root name"
				900	return -1
				901	cur = root.children
				902	while cur != None:
				903	if cur.type == 'text':
				904	cur = cur.next
				905	continue
				906	if cur.name == "files":
				907	pass
				908	# count = count + analyzeAPIFiles(cur)
				909	elif cur.name == "symbols":
				910	count = count + analyzeAPISymbols(cur)
				911	else:
				912	print "unexpected element %s in API doc" % (cur.name)
				913	cur = cur.next
				914	return count
				915
				916	#########################################################################
				917	# #
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	918	# Web pages parsing and analysis #
				919	# #
				920	#########################################################################
				921
				922	import glob
				923
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of text node *p* for the page *resource*.

    Returns the value of addStringHTML() for the node content with
    relevance 5, or -1 if anything goes wrong.  doc is not used.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except:
        return -1
				932
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of paragraph node *p* for the page *resource*.

    Returns the value of addStringHTML() for the node content with
    relevance 5, or -1 if anything goes wrong.  doc is not used.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except:
        return -1
				941
				942	def analyzeHTMLPre(doc, resource, p, section, id):
				943	words = 0
				944	try:
				945	content = p.content
				946	words = words + addStringHTML(content, resource, id, section, 5)
				947	except:
				948	return -1
				949	return words
				950
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of node p for the HTML page `resource`.

    NOTE: a second analyzeHTML(doc, resource) is defined further down in
    this module and rebinds the name, so this five-argument variant is not
    reachable by name once the module has been loaded.

    The content of p is handed to addStringHTML() together with the page
    name, the current anchor (id) and section title.  doc is not used.

    Returns whatever addStringHTML() reports, or -1 if anything at all
    goes wrong (every exception is swallowed).
    """
    indexed = 0
    try:
        indexed += addStringHTML(p.content, resource, id, section, 5)
    except:
        return -1
    else:
        return indexed
				959
def analyzeHTML(doc, resource):
    """Register one parsed HTML page and index the text it contains.

    doc is the parsed HTML document and resource the page name it was
    loaded from.  The page is recorded through addPage() using the content
    of <head><title>, or "Page <resource>" when no title can be read.
    Headings (h1/h2/h3) and text nodes are then visited in the order the
    XPath query returns them: a heading updates the current section title
    and anchor, a text node is indexed under the current section/anchor.

    Returns the number of nodes handed to the indexing helpers.  Problems
    while walking the page are reported on stdout and do not raise; the
    count accumulated so far is still returned.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    # The XPath context has to be released explicitly; the nested
    # try/finally does so on every exit path.
    try:
        try:
            res = ctxt.xpathEval("//head/title")
            title = res[0].content
        except:
            title = "Page %s" % (resource)
        addPage(resource, title)
        try:
            # Union of the headings and every text node.
            items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
            section = title
            anchor = ""
            for item in items:
                if item.name in ('h1', 'h2', 'h3'):
                    section = item.content
                    # A heading carrying neither id nor name keeps the
                    # anchor of the previous heading.
                    if item.prop("id"):
                        anchor = item.prop("id")
                    elif item.prop("name"):
                        anchor = item.prop("name")
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, anchor)
                    para = para + 1
                # <p> and <pre> are not selected by the query above; the
                # branches are kept in case the query is widened again.
                elif item.name == 'p':
                    analyzeHTMLPara(doc, resource, item, section, anchor)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, anchor)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource, item.name))
        except:
            print("Page %s: problem analyzing" % (resource))
            exc_type, exc_value = sys.exc_info()[:2]
            print("%s %s" % (exc_type, exc_value))
    finally:
        ctxt.xpathFreeContext()

    return para
				996
				997	def analyzeHTMLPages():
				998	ret = 0
				999	HTMLfiles = glob.glob(".html") + glob.glob("tutorial/.html")
				1000	for html in HTMLfiles:
				1001	if html[0:3] == "API":
				1002	continue
				1003	if html == "xml.html":
				1004	continue
				1005	try:
				1006	doc = libxml2.htmlParseFile(html, None)
				1007	res = analyzeHTML(doc, html)
				1008	print "Parsed %s : %d paragraphs" % (html, res)
				1009	ret = ret + 1
				1010	except:
				1011	print "could not parse %s" % (html)
				1012	return ret
				1013
				1014	#########################################################################
				1015	# #
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	1016	# Mail archives parsing and analysis #
				1017	# #
				1018	#########################################################################
				1019
				1020	import time
				1021
				1022	def getXMLDateArchive(t = None):
				1023	if t == None:
				1024	t = time.time()
				1025	T = time.gmtime(t)
				1026	month = time.strftime("%B", T)
				1027	year = T[0]
				1028	url = "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (year, month)
				1029	return url
				1030
				1031	def scanXMLMsgArchive(url, title, force = 0):
				1032	if url == None or title == None:
				1033	return 0
				1034
				1035	ID = checkXMLMsgArchive(url)
				1036	if force == 0 and ID != -1:
				1037	return 0
				1038
				1039	if ID == -1:
				1040	ID = addXMLMsgArchive(url, title)
				1041	if ID == -1:
				1042	return 0
				1043
				1044	try:
				1045	print "Loading %s" % (url)
				1046	doc = libxml2.htmlParseFile(url, None);
				1047	except:
				1048	doc = None
				1049	if doc == None:
				1050	print "Failed to parse %s" % (url)
				1051	return 0
				1052
				1053	addStringArchive(title, ID, 20)
				1054	ctxt = doc.xpathNewContext()
				1055	texts = ctxt.xpathEval("//pre//text()")
				1056	for text in texts:
				1057	addStringArchive(text.content, ID, 5)
				1058
				1059	return 1
				1060
				1061	def scanXMLDateArchive(t = None, force = 0):
Daniel Veillard	d7960a8	2002-10-08 19:13:50 +0000	[diff] [blame]	1062	global wordsDictArchive
				1063
				1064	wordsDictArchive = {}
				1065
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	1066	url = getXMLDateArchive(t)
				1067	print "loading %s" % (url)
				1068	try:
				1069	doc = libxml2.htmlParseFile(url, None);
				1070	except:
				1071	doc = None
				1072	if doc == None:
				1073	print "Failed to parse %s" % (url)
				1074	return -1
				1075	ctxt = doc.xpathNewContext()
				1076	anchors = ctxt.xpathEval("//a[@href]")
				1077	links = 0
				1078	newmsg = 0
				1079	for anchor in anchors:
				1080	href = anchor.prop("href")
				1081	if href == None or href[0:3] != "msg":
				1082	continue
				1083	try:
				1084	links = links + 1
				1085
				1086	msg = libxml2.buildURI(href, url)
				1087	title = anchor.content
				1088	if title != None and title[0:4] == 'Re: ':
				1089	title = title[4:]
				1090	if title != None and title[0:6] == '[xml] ':
				1091	title = title[6:]
				1092	newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
				1093
				1094	except:
				1095	pass
				1096
				1097	return newmsg
				1098
				1099
				1100	#########################################################################
				1101	# #
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	1102	# Main code: open the DB, the API XML and analyze it #
				1103	# #
				1104	#########################################################################
				1105	try:
				1106	openMySQL()
				1107	except:
				1108	print "Failed to open the database"
				1109	print sys.exc_type, sys.exc_value
				1110	sys.exit(1)
				1111
def analyzeArchives(t = None, force = 0):
    """Scan one month of the mail archive and store the word index.

    t and force are handed to scanXMLDateArchive(), which fills the global
    wordsDictArchive.  Every (word, message id, relevance) triple found
    there is then written with updateWordArchive().  Words whose entry is
    None are ignored.  Progress is reported on stdout; nothing is returned.
    """
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    for word in wordsDictArchive.keys():
        refs = wordsDictArchive[word]
        if refs is None:
            continue
        for id in refs.keys():
            relevance = refs[id]
            updateWordArchive(word, id, relevance)
            i = i + 1

    # These associations come from the mail archive, not from HTML pages.
    print("Found %d associations in archive pages" % (i))
				1131
def analyzeHTMLTop():
    """Index the HTML documentation pages and store the word index.

    analyzeHTMLPages() fills the global wordsDictHTML; each entry maps a
    page to a (relevance, anchor, section) tuple, which is written with
    updateWordHTML().  Words whose entry is None are ignored.  Progress is
    reported on stdout; nothing is returned.
    """
    global wordsDictHTML

    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))

    stored = 0
    for word in wordsDictHTML.keys():
        refs = wordsDictHTML[word]
        if refs is not None:
            for resource in refs.keys():
                relevance, anchor, section = refs[resource]
                updateWordHTML(word, resource, section, anchor, relevance)
                stored += 1

    print("Found %d associations in HTML pages" % (stored))
				1151
def analyzeAPITop():
    """Load the API description, analyze it and store the word index.

    The document named by the global API is loaded with loadAPI() and
    walked by analyzeAPI(), then freed.  Any failure in that phase is
    reported and terminates the script with exit status 1.

    Afterwards every entry of the global wordsDict, keyed by
    (module, symbol), is written with updateWord().  Words whose entry is
    None are counted as skipped.  Progress is reported on stdout; nothing
    is returned.
    """
    global wordsDict
    global API

    try:
        doc = loadAPI(API)
        blocs = analyzeAPI(doc)
        print("Analyzed %d blocs" % (blocs))
        doc.freeDoc()
    except:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % (sys.exc_type, sys.exc_value))
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    stored = 0
    ignored = 0
    for word in wordsDict.keys():
        refs = wordsDict[word]
        if refs is None:
            ignored += 1
        else:
            for key in refs.keys():
                (module, symbol) = key
                updateWord(word, symbol, refs[key])
                stored += 1

    print("Found %d associations, skipped %d words" % (stored, ignored))
				1179
				1180	def usage():
Daniel Veillard	f08d400	2002-10-08 17:17:11 +0000	[diff] [blame]	1181	print "Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]"
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	1182	sys.exit(1)
				1183
def _indexMonthArchive(spec, force):
    """Index the mail archive of the month given as "<year>-<month name>"."""
    try:
        T = time.strptime(spec, "%Y-%B")
        # strptime yields the first day of the month; moving ten days in
        # presumably keeps the timestamp inside that month whatever the
        # offset between local time (mktime) and UTC (getXMLDateArchive).
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except:
        print("Failed to index month archive:")
        exc_type, exc_value = sys.exc_info()[:2]
        print("%s %s" % (exc_type, exc_value))

def main():
    """Command-line driver: run the actions named by the options in order.

    Options are processed left to right, so --force only affects the
    archive options that follow it.  --archive-year takes a year and
    indexes its twelve months; --archive-month takes "<year>-<month name>".
    An unknown option, a missing option value or an empty command line
    prints the usage text and exits.
    """
    args = sys.argv[1:]
    if not args:
        usage()
        return

    months = ["January", "February", "March", "April", "May",
              "June", "July", "August", "September", "October",
              "November", "December"]
    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year' or arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # The option needs a value; show the usage text rather
                # than failing with an IndexError.
                usage()
                return
            if arg == '--archive-year':
                for month in months:
                    _indexMonthArchive("%s-%s" % (args[i], month), force)
            else:
                _indexMonthArchive(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	1228
# Run the command-line driver only when executed as a script.
if __name__ == "__main__":
    main()