Blame - doc/index.py - fp2-dev/platform/external/libxml2

blob: 128d11a71b7315a0f19a56119b0c87888ba1a083 [file] [log] [blame]

Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	1	#!/usr/bin/python -u
				2	#
				3	# imports the API description and fills up a database with
				4	# name relevance to modules, functions or web pages
				5	#
Daniel Veillard	2c77cd7	2002-10-01 13:54:14 +0000	[diff] [blame]	6	# Operation needed:
				7	# =================
				8	#
				9	# install mysqld, the python wrappers for mysql and libxml2, start mysqld
				10	# Change the root passwd of mysql:
				11	# mysqladmin -u root password new_password
				12	# Create the new database xmlsoft
				13	# mysqladmin -p create xmlsoft
				14	# Create a database user 'veillard' and give him password access
				15	# change veillard and abcde with the right user name and passwd
				16	# mysql -p
				17	# password:
				18	# mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
				19	# IDENTIFIED BY 'abcde' WITH GRANT OPTION;
				20	#
				21	# As the user check the access:
				22	# mysql -p xmlsoft
				23	# Enter password:
				24	# Welcome to the MySQL monitor....
				25	# mysql> use xmlsoft
				26	# Database changed
				27	# mysql> quit
				28	# Bye
				29	#
				30	# Then run the script in the doc subdir, it will create the symbols and
				31	# word tables and populate them with information extracted from
				32	# the libxml2-api.xml API description, and make them accessible read-only
				33	# by nobody@localhost, the user Apache is expected to run as
				34	#
				35	# On the Apache configuration, make sure you have php support enabled
				36	#
				37
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	38	import MySQLdb
				39	import libxml2
				40	import sys
				41	import string
				42	import os
				43
				44	#
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	45	# We are not interested in parsing errors here
				46	#
				47	def callback(ctx, str):
				48	return
# Install callback() as the libxml2 error handler; callback() returns
# without doing anything, so parsing errors are not reported.
libxml2.registerErrorHandler(callback, None)
				50
				51	#
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	52	# The dictionary of required tables and the SQL commands needed
				53	# to create them
				54	#
# Maps the name of each required table to the CREATE TABLE statement
# that createTable() executes to build it.
TABLES={
# symbols: one row per API symbol (name, module, type, short description)
"symbols" : """CREATE TABLE symbols (
name varchar(255) BINARY NOT NULL,
module varchar(255) BINARY NOT NULL,
type varchar(25) NOT NULL,
descr varchar(255),
UNIQUE KEY name (name),
KEY module (module))""",
# words: relevance of a word for a given API symbol
"words" : """CREATE TABLE words (
name varchar(50) BINARY NOT NULL,
symbol varchar(255) BINARY NOT NULL,
relevance int,
KEY name (name),
KEY symbol (symbol),
UNIQUE KEY ID (name, symbol))""",
# wordsHTML: relevance of a word for a given HTML resource
"wordsHTML" : """CREATE TABLE wordsHTML (
name varchar(50) BINARY NOT NULL,
resource varchar(255) BINARY NOT NULL,
section varchar(255),
id varchar(50),
relevance int,
KEY name (name),
KEY resource (resource),
UNIQUE KEY ref (name, resource))""",
# wordsArchive: relevance of a word for a given archives.ID
"wordsArchive" : """CREATE TABLE wordsArchive (
name varchar(50) BINARY NOT NULL,
ID int(11) NOT NULL,
relevance int,
KEY name (name),
UNIQUE KEY ref (name, ID))""",
# pages: title of each indexed HTML resource
"pages" : """CREATE TABLE pages (
resource varchar(255) BINARY NOT NULL,
title varchar(255) BINARY NOT NULL,
UNIQUE KEY name (resource))""",
# archives: auto-numbered archive resources with their title
"archives" : """CREATE TABLE archives (
ID int(11) NOT NULL auto_increment,
resource varchar(255) BINARY NOT NULL,
title varchar(255) BINARY NOT NULL,
UNIQUE KEY id (ID,resource(255)),
INDEX (ID),
INDEX (resource))""",
# Queries: the only table on which checkTables() also grants INSERT and
# UPDATE to nobody@localhost
"Queries" : """CREATE TABLE Queries (
ID int(11) NOT NULL auto_increment,
Value varchar(50) NOT NULL,
Count int(11) NOT NULL,
UNIQUE KEY id (ID,Value(35)),
INDEX (ID))""",
}
				103
				104	#
				105	# The XML API description file to parse
				106	#
# File name of the XML API description.
API="libxml2-api.xml"
# Module-wide MySQL connection; None until openMySQL() assigns it.
DB=None
				109
				110	#########################################################################
				111	# #
				112	# MySQL database interfaces #
				113	# #
				114	#########################################################################
				115	def createTable(db, name):
				116	global TABLES
				117
				118	if db == None:
				119	return -1
				120	if name == None:
				121	return -1
				122	c = db.cursor()
				123
				124	ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
				125	if ret == 1:
				126	print "Removed table %s" % (name)
				127	print "Creating table %s" % (name)
				128	try:
				129	ret = c.execute(TABLES[name])
				130	except:
				131	print "Failed to create table %s" % (name)
				132	return -1
				133	return ret
				134
				135	def checkTables(db):
				136	global TABLES
				137
				138	if db == None:
				139	return -1
				140	c = db.cursor()
				141	nbtables = c.execute("show tables")
				142	print "Found %d tables" % (nbtables)
				143	tables = {}
				144	i = 0
				145	while i < nbtables:
				146	l = c.fetchone()
				147	name = l[0]
				148	tables[name] = {}
				149	i = i + 1
				150
				151	for table in TABLES.keys():
				152	if not tables.has_key(table):
				153	print "table %s missing" % (table)
				154	createTable(db, table)
				155	print "checkTables finished"
				156
				157	# make sure apache can access the tables read-only
				158	try:
				159	ret = c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
Daniel Veillard	dc6d4ab	2002-10-04 15:58:34 +0000	[diff] [blame]	160	ret = c.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	161	except:
				162	pass
				163	return 0
				164
				165	def openMySQL(db="xmlsoft", passwd=None):
				166	global DB
				167
				168	if passwd == None:
Daniel Veillard	538d3b9	2002-10-01 14:04:56 +0000	[diff] [blame]	169	try:
				170	passwd = os.environ["MySQL_PASS"]
				171	except:
				172	print "No password available, set environment MySQL_PASS"
				173	sys.exit(1)
				174
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	175	DB = MySQLdb.connect(passwd=passwd, db=db)
				176	if DB == None:
				177	return -1
				178	ret = checkTables(DB)
				179	return ret
				180
				181	def updateWord(name, symbol, relevance):
				182	global DB
				183
				184	if DB == None:
				185	openMySQL()
				186	if DB == None:
				187	return -1
				188	if name == None:
				189	return -1
				190	if symbol == None:
				191	return -1
				192
				193	c = DB.cursor()
				194	try:
				195	ret = c.execute(
				196	"""INSERT INTO words (name, symbol, relevance) VALUES ('%s','%s', %d)""" %
				197	(name, symbol, relevance))
				198	except:
				199	try:
				200	ret = c.execute(
				201	"""UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'""" %
				202	(relevance, name, symbol))
				203	except:
				204	print "Update word (%s, %s, %s) failed command" % (name, symbol, relevance)
				205	print "UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'" % (relevance, name, symbol)
				206	print sys.exc_type, sys.exc_value
				207	return -1
				208
				209	return ret
				210
				211	def updateSymbol(name, module, type, desc):
				212	global DB
				213
				214	updateWord(name, name, 50)
				215	if DB == None:
				216	openMySQL()
				217	if DB == None:
				218	return -1
				219	if name == None:
				220	return -1
				221	if module == None:
				222	return -1
				223	if type == None:
				224	return -1
				225
				226	try:
				227	desc = string.replace(desc, "'", " ")
				228	l = string.split(desc, ".")
				229	desc = l[0]
				230	desc = desc[0:99]
				231	except:
				232	desc = ""
				233
				234	c = DB.cursor()
				235	try:
				236	ret = c.execute(
				237	"""INSERT INTO symbols (name, module, type, descr) VALUES ('%s','%s', '%s', '%s')""" %
				238	(name, module, type, desc))
				239	except:
				240	try:
				241	ret = c.execute(
				242	"""UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" %
				243	(module, type, desc, name))
				244	except:
				245	print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
				246	print """UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name)
				247	print sys.exc_type, sys.exc_value
				248	return -1
				249
				250	return ret
				251
				252	def addFunction(name, module, desc = ""):
				253	return updateSymbol(name, module, 'function', desc)
				254
				255	def addMacro(name, module, desc = ""):
				256	return updateSymbol(name, module, 'macro', desc)
				257
				258	def addEnum(name, module, desc = ""):
				259	return updateSymbol(name, module, 'enum', desc)
				260
				261	def addStruct(name, module, desc = ""):
				262	return updateSymbol(name, module, 'struct', desc)
				263
				264	def addConst(name, module, desc = ""):
				265	return updateSymbol(name, module, 'const', desc)
				266
				267	def addType(name, module, desc = ""):
				268	return updateSymbol(name, module, 'type', desc)
				269
				270	def addFunctype(name, module, desc = ""):
				271	return updateSymbol(name, module, 'functype', desc)
				272
def addPage(resource, title):
    """Insert or update the title of an HTML resource in the pages table.

    Returns the value of the successful execute() call, or -1 when the
    database is unavailable, resource is None, or both statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if resource is None:
        return -1

    c = DB.cursor()
    # The title comes straight from the parsed page and may contain
    # quotes, so it is handed to the driver as a parameter.
    try:
        ret = c.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except (MySQLdb.Error, MySQLdb.Warning):
        try:
            ret = c.execute(
                "UPDATE pages SET title=%s WHERE resource=%s",
                (title, resource))
        except (MySQLdb.Error, MySQLdb.Warning):
            # The old message formatted name/module/type, which do not
            # exist in this function, and so raised NameError.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("UPDATE pages SET title='%s' WHERE resource='%s'" % (title, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
				300
				301	def updateWordHTML(name, resource, desc, id, relevance):
				302	global DB
				303
				304	if DB == None:
				305	openMySQL()
				306	if DB == None:
				307	return -1
				308	if name == None:
				309	return -1
				310	if resource == None:
				311	return -1
				312	if id == None:
				313	id = ""
				314	if desc == None:
				315	desc = ""
				316	else:
				317	try:
				318	desc = string.replace(desc, "'", " ")
				319	desc = desc[0:99]
				320	except:
				321	desc = ""
				322
				323	c = DB.cursor()
				324	try:
				325	ret = c.execute(
				326	"""INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES ('%s','%s', '%s', '%s', '%d')""" %
				327	(name, resource, desc, id, relevance))
				328	except:
				329	try:
				330	ret = c.execute(
				331	"""UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" %
				332	(desc, id, relevance, name, resource))
				333	except:
				334	print "Update symbol (%s, %s, %d) failed command" % (name, resource, relevance)
				335	print """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource)
				336	print sys.exc_type, sys.exc_value
				337	return -1
				338
				339	return ret
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	340
				341	def checkXMLMsgArchive(url):
				342	global DB
				343
				344	if DB == None:
				345	openMySQL()
				346	if DB == None:
				347	return -1
				348	if url == None:
				349	return -1
				350
				351	c = DB.cursor()
				352	try:
				353	ret = c.execute(
				354	"""SELECT ID FROM archives WHERE resource='%s'""" % (url))
				355	row = c.fetchone()
				356	if row == None:
				357	return -1
				358	except:
				359	return -1
				360
				361	return row[0]
				362
				363	def addXMLMsgArchive(url, title):
				364	global DB
				365
				366	if DB == None:
				367	openMySQL()
				368	if DB == None:
				369	return -1
				370	if url == None:
				371	return -1
				372	if title == None:
				373	title = ""
				374	else:
				375	title = string.replace(title, "'", " ")
				376	title = title[0:99]
				377
				378	c = DB.cursor()
				379	try:
				380	cmd = """INSERT INTO archives (resource, title) VALUES ('%s','%s')""" % (url, title)
				381	ret = c.execute(cmd)
				382	cmd = """SELECT ID FROM archives WHERE resource='%s'""" % (url)
				383	ret = c.execute(cmd)
				384	row = c.fetchone()
				385	if row == None:
				386	print "addXMLMsgArchive failed to get the ID: %s" % (url)
				387	return -1
				388	except:
				389	print "addXMLMsgArchive failed command: %s" % (cmd)
				390	return -1
				391
				392	return((int)(row[0]))
				393
				394	def updateWordArchive(name, id, relevance):
				395	global DB
				396
				397	if DB == None:
				398	openMySQL()
				399	if DB == None:
				400	return -1
				401	if name == None:
				402	return -1
				403	if id == None:
				404	return -1
				405
				406	c = DB.cursor()
				407	try:
				408	ret = c.execute(
				409	"""INSERT INTO wordsArchive (name, id, relevance) VALUES ('%s', '%d', '%d')""" %
				410	(name, id, relevance))
				411	except:
				412	try:
				413	ret = c.execute(
				414	"""UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" %
				415	(relevance, name, id))
				416	except:
				417	print "Update word archive (%s, %d, %d) failed command" % (name, id, relevance)
				418	print """UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" % (relevance, name, id)
				419	print sys.exc_type, sys.exc_value
				420	return -1
				421
				422	return ret
				423
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	424	#########################################################################
				425	# #
				426	# Word dictionary and analysis routines #
				427	# #
				428	#########################################################################
				429
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	430	#
				431	# top 100 English words, minus those shorter than 3 letters, plus our own set
				432	#
				433	dropWords = {
				434	'the':0, 'this':0, 'can':0, 'man':0, 'had':0, 'him':0, 'only':0,
				435	'and':0, 'not':0, 'been':0, 'other':0, 'even':0, 'are':0, 'was':0,
				436	'new':0, 'most':0, 'but':0, 'when':0, 'some':0, 'made':0, 'from':0,
				437	'who':0, 'could':0, 'after':0, 'that':0, 'will':0, 'time':0, 'also':0,
				438	'have':0, 'more':0, 'these':0, 'did':0, 'was':0, 'two':0, 'many':0,
				439	'they':0, 'may':0, 'before':0, 'for':0, 'which':0, 'out':0, 'then':0,
				440	'must':0, 'one':0, 'through':0, 'with':0, 'you':0, 'said':0,
				441	'first':0, 'back':0, 'were':0, 'what':0, 'any':0, 'years':0, 'his':0,
				442	'her':0, 'where':0, 'all':0, 'its':0, 'now':0, 'much':0, 'she':0,
				443	'about':0, 'such':0, 'your':0, 'there':0, 'into':0, 'like':0, 'may':0,
				444	'would':0, 'than':0, 'our':0, 'well':0, 'their':0, 'them':0, 'over':0,
				445	'down':0,
				446	'net':0, 'www':0, 'bad':0, 'Okay':0, 'bin':0, 'cur':0,
				447	}
				448
# In-memory indexes filled by addWord(), addWordHTML() and addWordArchive().
# word -> {(module, symbol): relevance}; addWord() replaces the inner
# dictionary with None once it holds more than 500 entries.
wordsDict = {}
# word -> {resource: (relevance, id, section)}
wordsDictHTML = {}
# word -> {archive id: relevance}
wordsDictArchive = {}
				452
				453	def cleanupWordsString(str):
				454	str = string.replace(str, ".", " ")
				455	str = string.replace(str, "!", " ")
				456	str = string.replace(str, "?", " ")
				457	str = string.replace(str, ",", " ")
				458	str = string.replace(str, "'", " ")
				459	str = string.replace(str, '"', " ")
				460	str = string.replace(str, ";", " ")
				461	str = string.replace(str, "-", " ")
				462	str = string.replace(str, "(", " ")
				463	str = string.replace(str, ")", " ")
				464	str = string.replace(str, "{", " ")
				465	str = string.replace(str, "}", " ")
				466	str = string.replace(str, "<", " ")
				467	str = string.replace(str, ">", " ")
				468	str = string.replace(str, "=", " ")
				469	str = string.replace(str, "/", " ")
				470	str = string.replace(str, "*", " ")
				471	str = string.replace(str, ":", " ")
				472	str = string.replace(str, "#", " ")
				473	str = string.replace(str, "\\", " ")
				474	str = string.replace(str, "\n", " ")
				475	str = string.replace(str, "\r", " ")
				476	str = string.replace(str, "\xc2", " ")
				477	str = string.replace(str, "\xa0", " ")
				478	return str
				479
				480	def cleanupDescrString(str):
				481	str = string.replace(str, "\n", " ")
				482	str = string.replace(str, "\r", " ")
				483	str = string.replace(str, "\xc2", " ")
				484	str = string.replace(str, "\xa0", " ")
				485	l = string.split(str)
				486	str = string.join(str)
				487	return str
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	488
				489	def splitIdentifier(str):
				490	ret = []
				491	while str != "":
				492	cur = string.lower(str[0])
				493	str = str[1:]
				494	if ((cur < 'a') or (cur > 'z')):
				495	continue
				496	while (str != "") and (str[0] >= 'A') and (str[0] <= 'Z'):
				497	cur = cur + string.lower(str[0])
				498	str = str[1:]
				499	while (str != "") and (str[0] >= 'a') and (str[0] <= 'z'):
				500	cur = cur + str[0]
				501	str = str[1:]
				502	while (str != "") and (str[0] >= '0') and (str[0] <= '9'):
				503	str = str[1:]
				504	ret.append(cur)
				505	return ret
				506
				507	def addWord(word, module, symbol, relevance):
				508	global wordsDict
				509
				510	if word == None or len(word) < 3:
				511	return -1
				512	if module == None or symbol == None:
				513	return -1
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	514	if dropWords.has_key(word):
				515	return 0
				516	if ord(word[0]) > 0x80:
				517	return 0
				518
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	519	if wordsDict.has_key(word):
				520	d = wordsDict[word]
				521	if d == None:
				522	return 0
				523	if len(d) > 500:
				524	wordsDict[word] = None
				525	return 0
				526	try:
				527	relevance = relevance + d[(module, symbol)]
				528	except:
				529	pass
				530	else:
				531	wordsDict[word] = {}
				532	wordsDict[word][(module, symbol)] = relevance
				533	return relevance
				534
				535	def addString(str, module, symbol, relevance):
				536	if str == None or len(str) < 3:
				537	return -1
				538	ret = 0
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	539	str = cleanupWordsString(str)
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	540	l = string.split(str)
				541	for word in l:
				542	if len(word) > 2:
				543	ret = ret + addWord(word, module, symbol, 5)
				544
				545	return ret
				546
def addWordHTML(word, resource, id, section, relevance):
    """Add `relevance` to the score of `word` for an HTML resource.

    When the word is already known for that resource, the stored id and
    section are kept (unless they are None) and the relevances are added.

    Returns the accumulated relevance, -1 when the word is None or
    shorter than 3 characters or resource/section is None, and 0 when
    the word is ignored (listed in dropWords, starting with a character
    above 0x80, or marked as skipped).
    """
    global wordsDictHTML

    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    if word in wordsDictHTML:
        d = wordsDictHTML[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        # Test for the entry explicitly instead of trapping every
        # exception the lookup and the arithmetic might raise.
        if resource in d:
            (r, i, s) = d[resource]
            if i is not None:
                id = i
            if s is not None:
                section = s
            relevance = relevance + r
    else:
        d = wordsDictHTML[word] = {}
    d[resource] = (relevance, id, section)
    return relevance
				580
				581	def addStringHTML(str, resource, id, section, relevance):
				582	if str == None or len(str) < 3:
				583	return -1
				584	ret = 0
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	585	str = cleanupWordsString(str)
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	586	l = string.split(str)
				587	for word in l:
				588	if len(word) > 2:
Daniel Veillard	a6287a4	2002-10-07 13:17:22 +0000	[diff] [blame]	589	try:
				590	r = addWordHTML(word, resource, id, section, relevance)
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	591	if r < 0:
Daniel Veillard	a6287a4	2002-10-07 13:17:22 +0000	[diff] [blame]	592	print "addWordHTML failed: %s %s" % (word, resource)
				593	ret = ret + r
				594	except:
				595	print "addWordHTML failed: %s %s %d" % (word, resource, relevance)
				596	print sys.exc_type, sys.exc_value
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	597
				598	return ret
				599
def addWordArchive(word, id, relevance):
    """Add `relevance` to the score of `word` for the archive entry `id`.

    Returns the accumulated relevance, -1 when the word is None or
    shorter than 3 characters or id is None or -1, and 0 when the word
    is ignored (listed in dropWords, starting with a character above
    0x80, or marked as skipped).
    """
    global wordsDictArchive

    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDictArchive:
        d = wordsDictArchive[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        # A missing entry contributes nothing; no blanket except needed.
        relevance = relevance + d.get(id, 0)
    else:
        d = wordsDictArchive[word] = {}
    d[id] = relevance
    return relevance
				627
				628	def addStringArchive(str, id, relevance):
				629	if str == None or len(str) < 3:
				630	return -1
				631	ret = 0
				632	str = cleanupWordsString(str)
				633	l = string.split(str)
				634	for word in l:
				635	i = len(word)
				636	if i > 2:
				637	try:
				638	r = addWordArchive(word, id, relevance)
				639	if r < 0:
				640	print "addWordArchive failed: %s %s" % (word, id)
				641	else:
				642	ret = ret + r
				643	except:
				644	print "addWordArchive failed: %s %s %d" % (word, id, relevance)
				645	print sys.exc_type, sys.exc_value
				646	return ret
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	647
				648	#########################################################################
				649	# #
				650	# XML API description analysis #
				651	# #
				652	#########################################################################
				653
				654	def loadAPI(filename):
				655	doc = libxml2.parseFile(filename)
				656	print "loaded %s" % (filename)
				657	return doc
				658
				659	def foundExport(file, symbol):
				660	if file == None:
				661	return 0
				662	if symbol == None:
				663	return 0
				664	addFunction(symbol, file)
				665	l = splitIdentifier(symbol)
				666	for word in l:
				667	addWord(word, file, symbol, 10)
				668	return 1
				669
				670	def analyzeAPIFile(top):
				671	count = 0
				672	name = top.prop("name")
				673	cur = top.children
				674	while cur != None:
				675	if cur.type == 'text':
				676	cur = cur.next
				677	continue
				678	if cur.name == "exports":
				679	count = count + foundExport(name, cur.prop("symbol"))
				680	else:
				681	print "unexpected element %s in API doc <file name='%s'>" % (name)
				682	cur = cur.next
				683	return count
				684
				685	def analyzeAPIFiles(top):
				686	count = 0
				687	cur = top.children
				688
				689	while cur != None:
				690	if cur.type == 'text':
				691	cur = cur.next
				692	continue
				693	if cur.name == "file":
				694	count = count + analyzeAPIFile(cur)
				695	else:
				696	print "unexpected element %s in API doc <files>" % (cur.name)
				697	cur = cur.next
				698	return count
				699
				700	def analyzeAPIEnum(top):
				701	file = top.prop("file")
				702	if file == None:
				703	return 0
				704	symbol = top.prop("name")
				705	if symbol == None:
				706	return 0
				707
				708	addEnum(symbol, file)
				709	l = splitIdentifier(symbol)
				710	for word in l:
				711	addWord(word, file, symbol, 10)
				712
				713	return 1
				714
				715	def analyzeAPIConst(top):
				716	file = top.prop("file")
				717	if file == None:
				718	return 0
				719	symbol = top.prop("name")
				720	if symbol == None:
				721	return 0
				722
				723	addConst(symbol, file)
				724	l = splitIdentifier(symbol)
				725	for word in l:
				726	addWord(word, file, symbol, 10)
				727
				728	return 1
				729
				730	def analyzeAPIType(top):
				731	file = top.prop("file")
				732	if file == None:
				733	return 0
				734	symbol = top.prop("name")
				735	if symbol == None:
				736	return 0
				737
				738	addType(symbol, file)
				739	l = splitIdentifier(symbol)
				740	for word in l:
				741	addWord(word, file, symbol, 10)
				742	return 1
				743
				744	def analyzeAPIFunctype(top):
				745	file = top.prop("file")
				746	if file == None:
				747	return 0
				748	symbol = top.prop("name")
				749	if symbol == None:
				750	return 0
				751
				752	addFunctype(symbol, file)
				753	l = splitIdentifier(symbol)
				754	for word in l:
				755	addWord(word, file, symbol, 10)
				756	return 1
				757
				758	def analyzeAPIStruct(top):
				759	file = top.prop("file")
				760	if file == None:
				761	return 0
				762	symbol = top.prop("name")
				763	if symbol == None:
				764	return 0
				765
				766	addStruct(symbol, file)
				767	l = splitIdentifier(symbol)
				768	for word in l:
				769	addWord(word, file, symbol, 10)
				770
				771	info = top.prop("info")
				772	if info != None:
				773	l = string.split(info)
				774	for word in l:
				775	if len(word) > 2:
				776	addWord(word, file, symbol, 5)
				777	return 1
				778
				779	def analyzeAPIMacro(top):
				780	file = top.prop("file")
				781	if file == None:
				782	return 0
				783	symbol = top.prop("name")
				784	if symbol == None:
				785	return 0
				786
				787	info = None
				788	cur = top.children
				789	while cur != None:
				790	if cur.type == 'text':
				791	cur = cur.next
				792	continue
				793	if cur.name == "info":
				794	info = cur.content
				795	break
				796	cur = cur.next
				797
				798	l = splitIdentifier(symbol)
				799	for word in l:
				800	addWord(word, file, symbol, 10)
				801
				802	if info == None:
				803	addMacro(symbol, file)
				804	print "Macro %s description has no <info>" % (symbol)
				805	return 0
				806
				807	addMacro(symbol, file, info)
				808	l = string.split(info)
				809	for word in l:
				810	if len(word) > 2:
				811	addWord(word, file, symbol, 5)
				812	return 1
				813
				814	def analyzeAPIFunction(top):
				815	file = top.prop("file")
				816	if file == None:
				817	return 0
				818	symbol = top.prop("name")
				819	if symbol == None:
				820	return 0
				821
				822	info = None
				823	cur = top.children
				824	while cur != None:
				825	if cur.type == 'text':
				826	cur = cur.next
				827	continue
				828	if cur.name == "info":
				829	info = cur.content
				830	elif cur.name == "return":
				831	rinfo = cur.prop("info")
				832	if rinfo != None:
				833	addString(rinfo, file, symbol, 7)
				834	elif cur.name == "arg":
				835	ainfo = cur.prop("info")
				836	if rinfo != None:
				837	addString(ainfo, file, symbol, 5)
				838	name = cur.prop("name")
				839	if name != None:
				840	addWord(name, file, symbol, 7)
				841	cur = cur.next
				842	if info == None:
				843	print "Function %s description has no <info>" % (symbol)
				844	addFunction(symbol, file, "")
				845	else:
				846	addFunction(symbol, file, info)
				847	addString(info, file, symbol, 5)
				848
				849	l = splitIdentifier(symbol)
				850	for word in l:
				851	addWord(word, file, symbol, 10)
				852
				853	return 1
				854
				855	def analyzeAPISymbols(top):
				856	count = 0
				857	cur = top.children
				858
				859	while cur != None:
				860	if cur.type == 'text':
				861	cur = cur.next
				862	continue
				863	if cur.name == "macro":
				864	count = count + analyzeAPIMacro(cur)
				865	elif cur.name == "function":
				866	count = count + analyzeAPIFunction(cur)
				867	elif cur.name == "const":
				868	count = count + analyzeAPIConst(cur)
				869	elif cur.name == "typedef":
				870	count = count + analyzeAPIType(cur)
				871	elif cur.name == "struct":
				872	count = count + analyzeAPIStruct(cur)
				873	elif cur.name == "enum":
				874	count = count + analyzeAPIEnum(cur)
				875	elif cur.name == "functype":
				876	count = count + analyzeAPIFunctype(cur)
				877	else:
				878	print "unexpected element %s in API doc <files>" % (cur.name)
				879	cur = cur.next
				880	return count
				881
				882	def analyzeAPI(doc):
				883	count = 0
				884	if doc == None:
				885	return -1
				886	root = doc.getRootElement()
				887	if root.name != "api":
				888	print "Unexpected root name"
				889	return -1
				890	cur = root.children
				891	while cur != None:
				892	if cur.type == 'text':
				893	cur = cur.next
				894	continue
				895	if cur.name == "files":
				896	pass
				897	# count = count + analyzeAPIFiles(cur)
				898	elif cur.name == "symbols":
				899	count = count + analyzeAPISymbols(cur)
				900	else:
				901	print "unexpected element %s in API doc" % (cur.name)
				902	cur = cur.next
				903	return count
				904
				905	#########################################################################
				906	# #
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	907	# Web pages parsing and analysis #
				908	# #
				909	#########################################################################
				910
				911	import glob
				912
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of the text node `p` with relevance 5.

    Returns the value of addStringHTML(), or -1 if reading the content
    or indexing it raises an exception.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Still best effort, but KeyboardInterrupt and SystemExit are no
        # longer swallowed the way a bare except did.
        return -1
				921
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of the paragraph node `p` with relevance 5.

    Returns the value of addStringHTML(), or -1 if reading the content
    or indexing it raises an exception.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Still best effort, but KeyboardInterrupt and SystemExit are no
        # longer swallowed the way a bare except did.
        return -1
				930
				931	def analyzeHTMLPre(doc, resource, p, section, id):
				932	words = 0
				933	try:
				934	content = p.content
				935	words = words + addStringHTML(content, resource, id, section, 5)
				936	except:
				937	return -1
				938	return words
				939
# NOTE(review): a two-argument analyzeHTML(doc, resource) is defined right
# after this one and replaces it when the module is loaded, so this
# five-argument variant is effectively unreachable.
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of node `p` with relevance 5.

    Returns the value of addStringHTML(), or -1 if reading the content
    or indexing it raises an exception.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Still best effort, but KeyboardInterrupt and SystemExit are no
        # longer swallowed the way a bare except did.
        return -1
				948
def analyzeHTML(doc, resource):
    """Index one parsed HTML page.

    The page is registered with addPage() under its <title> (or
    "Page <resource>" when none can be read).  Then h1/h2/h3 headings
    update the current section and anchor id, and every text node is
    indexed under that section.

    Returns the number of text blocks handed to the indexer.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        try:
            res = ctxt.xpathEval("//head/title")
            title = res[0].content
        except Exception:
            title = "Page %s" % (resource)
        addPage(resource, title)
        try:
            # Plain XPath union: the previous expression contained literal
            # backslashes before each '|', which is not valid XPath.
            items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
            section = title
            id = ""
            for item in items:
                if item.name in ('h1', 'h2', 'h3'):
                    section = item.content
                    if item.prop("id"):
                        id = item.prop("id")
                    elif item.prop("name"):
                        id = item.prop("name")
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'p':
                    analyzeHTMLPara(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, id)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource, item.name))
        except Exception:
            print("Page %s: problem analyzing" % (resource))
            print("%s %s" % sys.exc_info()[:2])
    finally:
        # libxml2 XPath contexts are not released automatically.
        ctxt.xpathFreeContext()

    return para
				985
				986	def analyzeHTMLPages():
				987	ret = 0
				988	HTMLfiles = glob.glob(".html") + glob.glob("tutorial/.html")
				989	for html in HTMLfiles:
				990	if html[0:3] == "API":
				991	continue
				992	if html == "xml.html":
				993	continue
				994	try:
				995	doc = libxml2.htmlParseFile(html, None)
				996	res = analyzeHTML(doc, html)
				997	print "Parsed %s : %d paragraphs" % (html, res)
				998	ret = ret + 1
				999	except:
				1000	print "could not parse %s" % (html)
				1001	return ret
				1002
				1003	#########################################################################
				1004	# #
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	1005	# Mail archives parsing and analysis #
				1006	# #
				1007	#########################################################################
				1008
				1009	import time
				1010
				1011	def getXMLDateArchive(t = None):
				1012	if t == None:
				1013	t = time.time()
				1014	T = time.gmtime(t)
				1015	month = time.strftime("%B", T)
				1016	year = T[0]
				1017	url = "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (year, month)
				1018	return url
				1019
				1020	def scanXMLMsgArchive(url, title, force = 0):
				1021	if url == None or title == None:
				1022	return 0
				1023
				1024	ID = checkXMLMsgArchive(url)
				1025	if force == 0 and ID != -1:
				1026	return 0
				1027
				1028	if ID == -1:
				1029	ID = addXMLMsgArchive(url, title)
				1030	if ID == -1:
				1031	return 0
				1032
				1033	try:
				1034	print "Loading %s" % (url)
				1035	doc = libxml2.htmlParseFile(url, None);
				1036	except:
				1037	doc = None
				1038	if doc == None:
				1039	print "Failed to parse %s" % (url)
				1040	return 0
				1041
				1042	addStringArchive(title, ID, 20)
				1043	ctxt = doc.xpathNewContext()
				1044	texts = ctxt.xpathEval("//pre//text()")
				1045	for text in texts:
				1046	addStringArchive(text.content, ID, 5)
				1047
				1048	return 1
				1049
				1050	def scanXMLDateArchive(t = None, force = 0):
Daniel Veillard	d7960a8	2002-10-08 19:13:50 +0000	[diff] [blame^]	1051	global wordsDictArchive
				1052
				1053	wordsDictArchive = {}
				1054
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	1055	url = getXMLDateArchive(t)
				1056	print "loading %s" % (url)
				1057	try:
				1058	doc = libxml2.htmlParseFile(url, None);
				1059	except:
				1060	doc = None
				1061	if doc == None:
				1062	print "Failed to parse %s" % (url)
				1063	return -1
				1064	ctxt = doc.xpathNewContext()
				1065	anchors = ctxt.xpathEval("//a[@href]")
				1066	links = 0
				1067	newmsg = 0
				1068	for anchor in anchors:
				1069	href = anchor.prop("href")
				1070	if href == None or href[0:3] != "msg":
				1071	continue
				1072	try:
				1073	links = links + 1
				1074
				1075	msg = libxml2.buildURI(href, url)
				1076	title = anchor.content
				1077	if title != None and title[0:4] == 'Re: ':
				1078	title = title[4:]
				1079	if title != None and title[0:6] == '[xml] ':
				1080	title = title[6:]
				1081	newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
				1082
				1083	except:
				1084	pass
				1085
				1086	return newmsg
				1087
				1088
				1089	#########################################################################
				1090	# #
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	1091	# Main code: open the DB, the API XML and analyze it #
				1092	# #
				1093	#########################################################################
				1094	try:
				1095	openMySQL()
				1096	except:
				1097	print "Failed to open the database"
				1098	print sys.exc_type, sys.exc_value
				1099	sys.exit(1)
				1100
def analyzeArchives(t = None, force = 0):
    """Index one month of the mail archive and store the word associations.

    t     -- time in seconds since the epoch selecting the month, passed on
             to scanXMLDateArchive(); None means the current month
    force -- passed on to scanXMLDateArchive()

    After the scan, every (word, message id, relevance) triple collected in
    the global wordsDictArchive is written with updateWordArchive().  Words
    whose reference table is None are ignored.
    """
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    for word, refs in wordsDictArchive.items():
        if refs is None:
            continue
        for id, relevance in refs.items():
            updateWordArchive(word, id, relevance)
            i = i + 1

    # These associations come from the mail archive, not from HTML pages.
    print("Found %d associations in archive pages" % (i))
				1120
				1121	def analyzeHTML():
				1122	global wordsDictHTML
				1123
				1124	ret = analyzeHTMLPages()
				1125	print "Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret)
				1126
				1127	i = 0
				1128	skipped = 0
				1129	for word in wordsDictHTML.keys():
				1130	refs = wordsDictHTML[word]
				1131	if refs == None:
				1132	skipped = skipped + 1
				1133	continue;
				1134	for resource in refs.keys():
				1135	(relevance, id, section) = refs[resource]
				1136	updateWordHTML(word, resource, section, id, relevance)
				1137	i = i + 1
				1138
				1139	print "Found %d associations in HTML pages" % (i)
				1140
				1141	def analyzeAPI():
				1142	global wordsDict
				1143
				1144	try:
				1145	doc = loadAPI(API)
				1146	ret = analyzeAPI(doc)
				1147	print "Analyzed %d blocs" % (ret)
				1148	doc.freeDoc()
				1149	except:
				1150	print "Failed to parse and analyze %s" % (API)
				1151	print sys.exc_type, sys.exc_value
				1152	sys.exit(1)
				1153
				1154	print "Indexed %d words" % (len(wordsDict))
				1155	i = 0
				1156	skipped = 0
				1157	for word in wordsDict.keys():
				1158	refs = wordsDict[word]
				1159	if refs == None:
				1160	skipped = skipped + 1
				1161	continue;
				1162	for (module, symbol) in refs.keys():
				1163	updateWord(word, symbol, refs[(module, symbol)])
				1164	i = i + 1
				1165
				1166	print "Found %d associations, skipped %d words" % (i, skipped)
				1167
				1168	def usage():
Daniel Veillard	f08d400	2002-10-08 17:17:11 +0000	[diff] [blame]	1169	print "Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]"
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	1170	sys.exit(1)
				1171
def _indexArchiveMonth(stamp, force):
    """Index the mail archive of one "YEAR-MonthName" stamp, reporting failures."""
    try:
        T = time.strptime(stamp, "%Y-%B")
        # Aim ten days into the month rather than at its very first second;
        # presumably this keeps the local-time/UTC difference from selecting
        # the previous month.
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])

def main():
    """Run the indexing steps requested on the command line, in order.

    Options are processed left to right, so --force only affects the
    options that follow it:

      --force               re-index archive messages already known
      --archive             index the current month of the mail archive
      --archive-year YEAR   index the twelve months of YEAR
      --archive-month Y-M   index one month, e.g. 2002-October
      --API                 index the API description
      --docs                index the HTML pages

    No argument, an unknown option, or an option missing its value prints
    the usage message and exits with status 1.
    """
    args = sys.argv[1:]
    if not args:
        usage()

    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year':
            i = i + 1
            if i >= len(args):
                # Option given without its value.
                usage()
            year = args[i]
            months = ["January", "February", "March", "April", "May",
                      "June", "July", "August", "September", "October",
                      "November", "December"]
            for month in months:
                _indexArchiveMonth("%s-%s" % (year, month), force)
        elif arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # Option given without its value.
                usage()
            _indexArchiveMonth(args[i], force)
        elif arg == '--API':
            analyzeAPI()
        elif arg == '--docs':
            analyzeHTML()
        else:
            usage()
        i = i + 1

if __name__ == "__main__":
    main()