Blame - doc/index.py - platform/external/libxml2

blob: 1895d84dcbf37e594c22283c38775ea0a7f7b70a [file] [log] [blame]

Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	1	#!/usr/bin/python -u
				2	#
				3	# imports the API description and fills up a database with
				4	# name relevance to modules, functions or web pages
				5	#
Daniel Veillard	2c77cd7	2002-10-01 13:54:14 +0000	[diff] [blame]	6	# Operation needed:
				7	# =================
				8	#
				9	# install mysqld, the python wrappers for mysql and libxml2, start mysqld
				10	# Change the root passwd of mysql:
				11	# mysqladmin -u root password new_password
				12	# Create the new database xmlsoft
				13	# mysqladmin -p create xmlsoft
				14	# Create a database user 'veillard' and give him password access
				15	# change veillard and abcde with the right user name and passwd
				16	# mysql -p
				17	# password:
				18	# mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
				19	# IDENTIFIED BY 'abcde' WITH GRANT OPTION;
				20	#
				21	# As the user check the access:
				22	# mysql -p xmlsoft
				23	# Enter password:
				24	# Welcome to the MySQL monitor....
				25	# mysql> use xmlsoft
				26	# Database changed
				27	# mysql> quit
				28	# Bye
				29	#
				30	# Then run the script in the doc subdir, it will create the symbols and
				31	# word tables and populate them with information extracted from
				32	# the libxml2-api.xml API description, and make them accessible read-only
				33	# by nobody@localhost, the user Apache is expected to run as
				34	#
				35	# On the Apache configuration, make sure you have php support enabled
				36	#
				37
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	38	import MySQLdb
				39	import libxml2
				40	import sys
				41	import string
				42	import os
				43
				44	#
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	45	# We are not interested in parsing errors here
				46	#
				47	def callback(ctx, str):
				48	return
				49	libxml2.registerErrorHandler(callback, None)
				50
				51	#
Xin Li	a136fc2	2016-07-26 14:22:54 -0700	[diff] [blame]	52	# The dictionary of tables required and the SQL command needed
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	53	# to create them
				54	#
				55	TABLES={
				56	"symbols" : """CREATE TABLE symbols (
Daniel Veillard	a6287a4	2002-10-07 13:17:22 +0000	[diff] [blame]	57	name varchar(255) BINARY NOT NULL,
				58	module varchar(255) BINARY NOT NULL,
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	59	type varchar(25) NOT NULL,
				60	descr varchar(255),
				61	UNIQUE KEY name (name),
				62	KEY module (module))""",
				63	"words" : """CREATE TABLE words (
Daniel Veillard	a6287a4	2002-10-07 13:17:22 +0000	[diff] [blame]	64	name varchar(50) BINARY NOT NULL,
				65	symbol varchar(255) BINARY NOT NULL,
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	66	relevance int,
				67	KEY name (name),
				68	KEY symbol (symbol),
				69	UNIQUE KEY ID (name, symbol))""",
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	70	"wordsHTML" : """CREATE TABLE wordsHTML (
Daniel Veillard	a6287a4	2002-10-07 13:17:22 +0000	[diff] [blame]	71	name varchar(50) BINARY NOT NULL,
				72	resource varchar(255) BINARY NOT NULL,
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	73	section varchar(255),
				74	id varchar(50),
				75	relevance int,
				76	KEY name (name),
				77	KEY resource (resource),
				78	UNIQUE KEY ref (name, resource))""",
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	79	"wordsArchive" : """CREATE TABLE wordsArchive (
				80	name varchar(50) BINARY NOT NULL,
				81	ID int(11) NOT NULL,
				82	relevance int,
				83	KEY name (name),
				84	UNIQUE KEY ref (name, ID))""",
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	85	"pages" : """CREATE TABLE pages (
Daniel Veillard	a6287a4	2002-10-07 13:17:22 +0000	[diff] [blame]	86	resource varchar(255) BINARY NOT NULL,
				87	title varchar(255) BINARY NOT NULL,
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	88	UNIQUE KEY name (resource))""",
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	89	"archives" : """CREATE TABLE archives (
				90	ID int(11) NOT NULL auto_increment,
				91	resource varchar(255) BINARY NOT NULL,
				92	title varchar(255) BINARY NOT NULL,
				93	UNIQUE KEY id (ID,resource(255)),
				94	INDEX (ID),
				95	INDEX (resource))""",
Daniel Veillard	dc6d4ab	2002-10-04 15:58:34 +0000	[diff] [blame]	96	"Queries" : """CREATE TABLE Queries (
				97	ID int(11) NOT NULL auto_increment,
				98	Value varchar(50) NOT NULL,
				99	Count int(11) NOT NULL,
				100	UNIQUE KEY id (ID,Value(35)),
				101	INDEX (ID))""",
Daniel Veillard	24f6a07	2004-04-08 14:39:25 +0000	[diff] [blame]	102	"AllQueries" : """CREATE TABLE AllQueries (
				103	ID int(11) NOT NULL auto_increment,
				104	Value varchar(50) NOT NULL,
				105	Count int(11) NOT NULL,
				106	UNIQUE KEY id (ID,Value(35)),
				107	INDEX (ID))""",
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	108	}
				109
				110	#
				111	# The XML API description file to parse
				112	#
				113	API="libxml2-api.xml"
				114	DB=None
				115
				116	#########################################################################
				117	# #
				118	# MySQL database interfaces #
				119	# #
				120	#########################################################################
				121	def createTable(db, name):
				122	global TABLES
				123
				124	if db == None:
				125	return -1
				126	if name == None:
				127	return -1
				128	c = db.cursor()
				129
				130	ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
				131	if ret == 1:
				132	print "Removed table %s" % (name)
				133	print "Creating table %s" % (name)
				134	try:
				135	ret = c.execute(TABLES[name])
				136	except:
				137	print "Failed to create table %s" % (name)
				138	return -1
				139	return ret
				140
def checkTables(db, verbose = 1):
    """Make sure every table listed in TABLES exists and can be read.

    Missing tables are created through createTable(). Each table is then
    counted; if the count query fails, a "repair table" is issued and the
    count is retried. Finally read access is granted to nobody@localhost
    on a best-effort basis.

    Returns -1 when `db` is None, 0 otherwise.
    """
    if db is None:
        return -1
    c = db.cursor()
    nbtables = c.execute("show tables")
    if verbose:
        print("Found %d tables" % (nbtables))
    existing = set()
    for _ in range(nbtables):
        existing.add(c.fetchone()[0])

    for table in TABLES:
        if table not in existing:
            print("table %s missing" % (table))
            createTable(db, table)
        try:
            c.execute("SELECT count(*) from %s" % table)
            row = c.fetchone()
            if verbose:
                print("Table %s contains %d records" % (table, row[0]))
        except Exception:
            print("Troubles with table %s : repairing" % (table))
            ret = c.execute("repair table %s" % table)
            print("repairing returned %d" % (ret))
            c.execute("SELECT count(*) from %s" % table)
            row = c.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    if verbose:
        print("checkTables finished")

    # make sure apache can access the tables read-only
    try:
        c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        c.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
    except Exception:
        # Deliberately best-effort: a failed GRANT does not change the result.
        pass
    return 0
				184
def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
    """Connect to MySQL database `db`, store the connection in DB and check the tables.

    When `passwd` is None it is read from the MySQL_PASS environment
    variable; if that is unset a message is printed and the process
    exits with status 1.

    Returns -1 if no connection object was obtained, otherwise the
    result of checkTables().
    """
    global DB

    if passwd is None:
        try:
            passwd = os.environ["MySQL_PASS"]
        except KeyError:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB, verbose)
				200
				201	def updateWord(name, symbol, relevance):
				202	global DB
				203
				204	if DB == None:
				205	openMySQL()
				206	if DB == None:
				207	return -1
				208	if name == None:
				209	return -1
				210	if symbol == None:
				211	return -1
				212
				213	c = DB.cursor()
				214	try:
				215	ret = c.execute(
				216	"""INSERT INTO words (name, symbol, relevance) VALUES ('%s','%s', %d)""" %
				217	(name, symbol, relevance))
				218	except:
				219	try:
				220	ret = c.execute(
				221	"""UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'""" %
				222	(relevance, name, symbol))
				223	except:
				224	print "Update word (%s, %s, %s) failed command" % (name, symbol, relevance)
				225	print "UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'" % (relevance, name, symbol)
				226	print sys.exc_type, sys.exc_value
				227	return -1
				228
				229	return ret
				230
				231	def updateSymbol(name, module, type, desc):
				232	global DB
				233
				234	updateWord(name, name, 50)
				235	if DB == None:
				236	openMySQL()
				237	if DB == None:
				238	return -1
				239	if name == None:
				240	return -1
				241	if module == None:
				242	return -1
				243	if type == None:
				244	return -1
				245
				246	try:
				247	desc = string.replace(desc, "'", " ")
				248	l = string.split(desc, ".")
				249	desc = l[0]
				250	desc = desc[0:99]
				251	except:
				252	desc = ""
				253
				254	c = DB.cursor()
				255	try:
				256	ret = c.execute(
				257	"""INSERT INTO symbols (name, module, type, descr) VALUES ('%s','%s', '%s', '%s')""" %
				258	(name, module, type, desc))
				259	except:
				260	try:
				261	ret = c.execute(
				262	"""UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" %
				263	(module, type, desc, name))
				264	except:
				265	print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
				266	print """UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name)
				267	print sys.exc_type, sys.exc_value
				268	return -1
				269
				270	return ret
				271
				272	def addFunction(name, module, desc = ""):
				273	return updateSymbol(name, module, 'function', desc)
				274
				275	def addMacro(name, module, desc = ""):
				276	return updateSymbol(name, module, 'macro', desc)
				277
				278	def addEnum(name, module, desc = ""):
				279	return updateSymbol(name, module, 'enum', desc)
				280
				281	def addStruct(name, module, desc = ""):
				282	return updateSymbol(name, module, 'struct', desc)
				283
				284	def addConst(name, module, desc = ""):
				285	return updateSymbol(name, module, 'const', desc)
				286
				287	def addType(name, module, desc = ""):
				288	return updateSymbol(name, module, 'type', desc)
				289
				290	def addFunctype(name, module, desc = ""):
				291	return updateSymbol(name, module, 'functype', desc)
				292
def addPage(resource, title):
    """Insert `resource` with `title` into `pages`, or update its title.

    Opens the database through openMySQL() if it is not open yet.
    Returns -1 when the database is unavailable, when `resource` is None,
    or when both the INSERT and the fallback UPDATE fail; otherwise the
    value of the successful execute() call.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if resource is None:
        return -1

    c = DB.cursor()
    try:
        # Bound parameters keep quotes in titles from breaking the statement.
        ret = c.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        # The INSERT failed (for instance the page already exists): update instead.
        try:
            ret = c.execute(
                "UPDATE pages SET title=%s WHERE resource=%s",
                (title, resource))
        except Exception:
            # The old message formatted names that do not exist in this
            # function and raised NameError instead of reporting.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("""UPDATE pages SET title='%s' WHERE resource='%s'""" % (title, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
				320
				321	def updateWordHTML(name, resource, desc, id, relevance):
				322	global DB
				323
				324	if DB == None:
				325	openMySQL()
				326	if DB == None:
				327	return -1
				328	if name == None:
				329	return -1
				330	if resource == None:
				331	return -1
				332	if id == None:
				333	id = ""
				334	if desc == None:
				335	desc = ""
				336	else:
				337	try:
				338	desc = string.replace(desc, "'", " ")
				339	desc = desc[0:99]
				340	except:
				341	desc = ""
				342
				343	c = DB.cursor()
				344	try:
				345	ret = c.execute(
				346	"""INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES ('%s','%s', '%s', '%s', '%d')""" %
				347	(name, resource, desc, id, relevance))
				348	except:
				349	try:
				350	ret = c.execute(
				351	"""UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" %
				352	(desc, id, relevance, name, resource))
				353	except:
				354	print "Update symbol (%s, %s, %d) failed command" % (name, resource, relevance)
				355	print """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource)
				356	print sys.exc_type, sys.exc_value
				357	return -1
				358
				359	return ret
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	360
				361	def checkXMLMsgArchive(url):
				362	global DB
				363
				364	if DB == None:
				365	openMySQL()
				366	if DB == None:
				367	return -1
				368	if url == None:
				369	return -1
				370
				371	c = DB.cursor()
				372	try:
				373	ret = c.execute(
				374	"""SELECT ID FROM archives WHERE resource='%s'""" % (url))
				375	row = c.fetchone()
				376	if row == None:
				377	return -1
				378	except:
				379	return -1
				380
				381	return row[0]
				382
				383	def addXMLMsgArchive(url, title):
				384	global DB
				385
				386	if DB == None:
				387	openMySQL()
				388	if DB == None:
				389	return -1
				390	if url == None:
				391	return -1
				392	if title == None:
				393	title = ""
				394	else:
				395	title = string.replace(title, "'", " ")
				396	title = title[0:99]
				397
				398	c = DB.cursor()
				399	try:
				400	cmd = """INSERT INTO archives (resource, title) VALUES ('%s','%s')""" % (url, title)
				401	ret = c.execute(cmd)
				402	cmd = """SELECT ID FROM archives WHERE resource='%s'""" % (url)
				403	ret = c.execute(cmd)
				404	row = c.fetchone()
				405	if row == None:
				406	print "addXMLMsgArchive failed to get the ID: %s" % (url)
				407	return -1
				408	except:
				409	print "addXMLMsgArchive failed command: %s" % (cmd)
				410	return -1
				411
				412	return((int)(row[0]))
				413
				414	def updateWordArchive(name, id, relevance):
				415	global DB
				416
				417	if DB == None:
				418	openMySQL()
				419	if DB == None:
				420	return -1
				421	if name == None:
				422	return -1
				423	if id == None:
				424	return -1
				425
				426	c = DB.cursor()
				427	try:
				428	ret = c.execute(
				429	"""INSERT INTO wordsArchive (name, id, relevance) VALUES ('%s', '%d', '%d')""" %
				430	(name, id, relevance))
				431	except:
				432	try:
				433	ret = c.execute(
				434	"""UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" %
				435	(relevance, name, id))
				436	except:
				437	print "Update word archive (%s, %d, %d) failed command" % (name, id, relevance)
				438	print """UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" % (relevance, name, id)
				439	print sys.exc_type, sys.exc_value
				440	return -1
Xin Li	a136fc2	2016-07-26 14:22:54 -0700	[diff] [blame]	441
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	442	return ret
				443
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	444	#########################################################################
				445	# #
Xin Li	a136fc2	2016-07-26 14:22:54 -0700	[diff] [blame]	446	# Word dictionary and analysis routines #
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	447	# #
				448	#########################################################################
				449
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	450	#
				451	# the 100 most common English words, minus those shorter than 3 letters, plus our own additions
				452	#
				453	dropWords = {
				454	'the':0, 'this':0, 'can':0, 'man':0, 'had':0, 'him':0, 'only':0,
				455	'and':0, 'not':0, 'been':0, 'other':0, 'even':0, 'are':0, 'was':0,
				456	'new':0, 'most':0, 'but':0, 'when':0, 'some':0, 'made':0, 'from':0,
				457	'who':0, 'could':0, 'after':0, 'that':0, 'will':0, 'time':0, 'also':0,
				458	'have':0, 'more':0, 'these':0, 'did':0, 'was':0, 'two':0, 'many':0,
				459	'they':0, 'may':0, 'before':0, 'for':0, 'which':0, 'out':0, 'then':0,
				460	'must':0, 'one':0, 'through':0, 'with':0, 'you':0, 'said':0,
				461	'first':0, 'back':0, 'were':0, 'what':0, 'any':0, 'years':0, 'his':0,
				462	'her':0, 'where':0, 'all':0, 'its':0, 'now':0, 'much':0, 'she':0,
				463	'about':0, 'such':0, 'your':0, 'there':0, 'into':0, 'like':0, 'may':0,
				464	'would':0, 'than':0, 'our':0, 'well':0, 'their':0, 'them':0, 'over':0,
				465	'down':0,
				466	'net':0, 'www':0, 'bad':0, 'Okay':0, 'bin':0, 'cur':0,
				467	}
				468
# In-memory word indexes, filled by addWord(), addWordHTML() and
# addWordArchive() respectively.
wordsDict = {}
wordsDictHTML = {}
wordsDictArchive = {}
				472
				473	def cleanupWordsString(str):
				474	str = string.replace(str, ".", " ")
				475	str = string.replace(str, "!", " ")
				476	str = string.replace(str, "?", " ")
				477	str = string.replace(str, ",", " ")
				478	str = string.replace(str, "'", " ")
				479	str = string.replace(str, '"', " ")
				480	str = string.replace(str, ";", " ")
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	481	str = string.replace(str, "(", " ")
				482	str = string.replace(str, ")", " ")
				483	str = string.replace(str, "{", " ")
				484	str = string.replace(str, "}", " ")
				485	str = string.replace(str, "<", " ")
				486	str = string.replace(str, ">", " ")
				487	str = string.replace(str, "=", " ")
				488	str = string.replace(str, "/", " ")
				489	str = string.replace(str, "*", " ")
				490	str = string.replace(str, ":", " ")
				491	str = string.replace(str, "#", " ")
				492	str = string.replace(str, "\\", " ")
				493	str = string.replace(str, "\n", " ")
				494	str = string.replace(str, "\r", " ")
				495	str = string.replace(str, "\xc2", " ")
				496	str = string.replace(str, "\xa0", " ")
				497	return str
				498
				499	def cleanupDescrString(str):
Daniel Veillard	f007012	2002-10-09 14:24:17 +0000	[diff] [blame]	500	str = string.replace(str, "'", " ")
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	501	str = string.replace(str, "\n", " ")
				502	str = string.replace(str, "\r", " ")
				503	str = string.replace(str, "\xc2", " ")
				504	str = string.replace(str, "\xa0", " ")
				505	l = string.split(str)
				506	str = string.join(str)
				507	return str
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	508
				509	def splitIdentifier(str):
				510	ret = []
				511	while str != "":
				512	cur = string.lower(str[0])
				513	str = str[1:]
				514	if ((cur < 'a') or (cur > 'z')):
				515	continue
				516	while (str != "") and (str[0] >= 'A') and (str[0] <= 'Z'):
				517	cur = cur + string.lower(str[0])
				518	str = str[1:]
				519	while (str != "") and (str[0] >= 'a') and (str[0] <= 'z'):
				520	cur = cur + str[0]
				521	str = str[1:]
				522	while (str != "") and (str[0] >= '0') and (str[0] <= '9'):
				523	str = str[1:]
				524	ret.append(cur)
				525	return ret
				526
				527	def addWord(word, module, symbol, relevance):
				528	global wordsDict
				529
				530	if word == None or len(word) < 3:
				531	return -1
				532	if module == None or symbol == None:
				533	return -1
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	534	if dropWords.has_key(word):
				535	return 0
				536	if ord(word[0]) > 0x80:
				537	return 0
				538
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	539	if wordsDict.has_key(word):
				540	d = wordsDict[word]
				541	if d == None:
				542	return 0
				543	if len(d) > 500:
				544	wordsDict[word] = None
				545	return 0
				546	try:
				547	relevance = relevance + d[(module, symbol)]
				548	except:
				549	pass
				550	else:
				551	wordsDict[word] = {}
				552	wordsDict[word][(module, symbol)] = relevance
				553	return relevance
				554
				555	def addString(str, module, symbol, relevance):
				556	if str == None or len(str) < 3:
				557	return -1
				558	ret = 0
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	559	str = cleanupWordsString(str)
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	560	l = string.split(str)
				561	for word in l:
				562	if len(word) > 2:
				563	ret = ret + addWord(word, module, symbol, 5)
				564
				565	return ret
				566
def addWordHTML(word, resource, id, section, relevance):
    """Add `relevance` for `word` under `resource` in wordsDictHTML.

    Each entry is a (relevance, id, section) tuple. When an entry already
    exists for the resource, its non-None id and section are kept and the
    relevance values are added together. `section` is cleaned with
    cleanupDescrString() first.

    Returns -1 when `word` is None or shorter than 3 characters, or when
    `resource` or `section` is None. Returns 0 when the word is in
    dropWords, when its first character has a code above 0x80, or when
    its entry is None. Otherwise returns the accumulated relevance.
    """
    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    if word in wordsDictHTML:
        d = wordsDictHTML[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        previous = d.get(resource)
        if previous is not None:
            (r, i, s) = previous
            if i is not None:
                id = i
            if s is not None:
                section = s
            relevance = relevance + r
    else:
        d = wordsDictHTML[word] = {}
    d[resource] = (relevance, id, section)
    return relevance
				600
				601	def addStringHTML(str, resource, id, section, relevance):
				602	if str == None or len(str) < 3:
				603	return -1
				604	ret = 0
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	605	str = cleanupWordsString(str)
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	606	l = string.split(str)
				607	for word in l:
				608	if len(word) > 2:
Daniel Veillard	a6287a4	2002-10-07 13:17:22 +0000	[diff] [blame]	609	try:
				610	r = addWordHTML(word, resource, id, section, relevance)
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	611	if r < 0:
Daniel Veillard	a6287a4	2002-10-07 13:17:22 +0000	[diff] [blame]	612	print "addWordHTML failed: %s %s" % (word, resource)
				613	ret = ret + r
				614	except:
				615	print "addWordHTML failed: %s %s %d" % (word, resource, relevance)
				616	print sys.exc_type, sys.exc_value
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	617
				618	return ret
				619
def addWordArchive(word, id, relevance):
    """Add `relevance` for `word` under archive `id` in wordsDictArchive.

    Returns -1 when `word` is None or shorter than 3 characters, or when
    `id` is None or -1. Returns 0 when the word is in dropWords, when its
    first character has a code above 0x80, or when its entry is None.
    Otherwise returns the accumulated relevance stored for `id`.
    """
    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDictArchive:
        d = wordsDictArchive[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        relevance = relevance + d.get(id, 0)
    else:
        d = wordsDictArchive[word] = {}
    d[id] = relevance
    return relevance
				647
				648	def addStringArchive(str, id, relevance):
				649	if str == None or len(str) < 3:
				650	return -1
				651	ret = 0
				652	str = cleanupWordsString(str)
				653	l = string.split(str)
				654	for word in l:
				655	i = len(word)
				656	if i > 2:
				657	try:
				658	r = addWordArchive(word, id, relevance)
				659	if r < 0:
				660	print "addWordArchive failed: %s %s" % (word, id)
				661	else:
				662	ret = ret + r
				663	except:
				664	print "addWordArchive failed: %s %s %d" % (word, id, relevance)
				665	print sys.exc_type, sys.exc_value
				666	return ret
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	667
				668	#########################################################################
				669	# #
				670	# XML API description analysis #
				671	# #
				672	#########################################################################
				673
				674	def loadAPI(filename):
				675	doc = libxml2.parseFile(filename)
				676	print "loaded %s" % (filename)
				677	return doc
				678
				679	def foundExport(file, symbol):
				680	if file == None:
				681	return 0
				682	if symbol == None:
				683	return 0
				684	addFunction(symbol, file)
				685	l = splitIdentifier(symbol)
				686	for word in l:
				687	addWord(word, file, symbol, 10)
				688	return 1
				689
				690	def analyzeAPIFile(top):
				691	count = 0
				692	name = top.prop("name")
				693	cur = top.children
				694	while cur != None:
				695	if cur.type == 'text':
				696	cur = cur.next
				697	continue
				698	if cur.name == "exports":
				699	count = count + foundExport(name, cur.prop("symbol"))
				700	else:
				701	print "unexpected element %s in API doc <file name='%s'>" % (name)
				702	cur = cur.next
				703	return count
				704
				705	def analyzeAPIFiles(top):
				706	count = 0
				707	cur = top.children
				708
				709	while cur != None:
				710	if cur.type == 'text':
				711	cur = cur.next
				712	continue
				713	if cur.name == "file":
				714	count = count + analyzeAPIFile(cur)
				715	else:
				716	print "unexpected element %s in API doc <files>" % (cur.name)
				717	cur = cur.next
				718	return count
				719
				720	def analyzeAPIEnum(top):
				721	file = top.prop("file")
				722	if file == None:
				723	return 0
				724	symbol = top.prop("name")
				725	if symbol == None:
				726	return 0
				727
				728	addEnum(symbol, file)
				729	l = splitIdentifier(symbol)
				730	for word in l:
				731	addWord(word, file, symbol, 10)
				732
				733	return 1
				734
				735	def analyzeAPIConst(top):
				736	file = top.prop("file")
				737	if file == None:
				738	return 0
				739	symbol = top.prop("name")
				740	if symbol == None:
				741	return 0
				742
				743	addConst(symbol, file)
				744	l = splitIdentifier(symbol)
				745	for word in l:
				746	addWord(word, file, symbol, 10)
				747
				748	return 1
				749
				750	def analyzeAPIType(top):
				751	file = top.prop("file")
				752	if file == None:
				753	return 0
				754	symbol = top.prop("name")
				755	if symbol == None:
				756	return 0
				757
				758	addType(symbol, file)
				759	l = splitIdentifier(symbol)
				760	for word in l:
				761	addWord(word, file, symbol, 10)
				762	return 1
				763
				764	def analyzeAPIFunctype(top):
				765	file = top.prop("file")
				766	if file == None:
				767	return 0
				768	symbol = top.prop("name")
				769	if symbol == None:
				770	return 0
				771
				772	addFunctype(symbol, file)
				773	l = splitIdentifier(symbol)
				774	for word in l:
				775	addWord(word, file, symbol, 10)
				776	return 1
				777
				778	def analyzeAPIStruct(top):
				779	file = top.prop("file")
				780	if file == None:
				781	return 0
				782	symbol = top.prop("name")
				783	if symbol == None:
				784	return 0
				785
				786	addStruct(symbol, file)
				787	l = splitIdentifier(symbol)
				788	for word in l:
				789	addWord(word, file, symbol, 10)
				790
				791	info = top.prop("info")
				792	if info != None:
Daniel Veillard	f007012	2002-10-09 14:24:17 +0000	[diff] [blame]	793	info = string.replace(info, "'", " ")
				794	info = string.strip(info)
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	795	l = string.split(info)
				796	for word in l:
				797	if len(word) > 2:
				798	addWord(word, file, symbol, 5)
				799	return 1
				800
				801	def analyzeAPIMacro(top):
				802	file = top.prop("file")
				803	if file == None:
				804	return 0
				805	symbol = top.prop("name")
				806	if symbol == None:
				807	return 0
Daniel Veillard	f007012	2002-10-09 14:24:17 +0000	[diff] [blame]	808	symbol = string.replace(symbol, "'", " ")
				809	symbol = string.strip(symbol)
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	810
				811	info = None
				812	cur = top.children
				813	while cur != None:
				814	if cur.type == 'text':
				815	cur = cur.next
				816	continue
				817	if cur.name == "info":
				818	info = cur.content
				819	break
				820	cur = cur.next
				821
				822	l = splitIdentifier(symbol)
				823	for word in l:
				824	addWord(word, file, symbol, 10)
				825
				826	if info == None:
				827	addMacro(symbol, file)
				828	print "Macro %s description has no <info>" % (symbol)
				829	return 0
				830
Daniel Veillard	f007012	2002-10-09 14:24:17 +0000	[diff] [blame]	831	info = string.replace(info, "'", " ")
				832	info = string.strip(info)
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	833	addMacro(symbol, file, info)
				834	l = string.split(info)
				835	for word in l:
				836	if len(word) > 2:
				837	addWord(word, file, symbol, 5)
				838	return 1
				839
				840	def analyzeAPIFunction(top):
				841	file = top.prop("file")
				842	if file == None:
				843	return 0
				844	symbol = top.prop("name")
				845	if symbol == None:
				846	return 0
				847
Daniel Veillard	f007012	2002-10-09 14:24:17 +0000	[diff] [blame]	848	symbol = string.replace(symbol, "'", " ")
				849	symbol = string.strip(symbol)
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	850	info = None
				851	cur = top.children
				852	while cur != None:
				853	if cur.type == 'text':
				854	cur = cur.next
				855	continue
				856	if cur.name == "info":
				857	info = cur.content
				858	elif cur.name == "return":
				859	rinfo = cur.prop("info")
				860	if rinfo != None:
Daniel Veillard	f007012	2002-10-09 14:24:17 +0000	[diff] [blame]	861	rinfo = string.replace(rinfo, "'", " ")
				862	rinfo = string.strip(rinfo)
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	863	addString(rinfo, file, symbol, 7)
				864	elif cur.name == "arg":
				865	ainfo = cur.prop("info")
Daniel Veillard	f007012	2002-10-09 14:24:17 +0000	[diff] [blame]	866	if ainfo != None:
				867	ainfo = string.replace(ainfo, "'", " ")
				868	ainfo = string.strip(ainfo)
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	869	addString(ainfo, file, symbol, 5)
				870	name = cur.prop("name")
				871	if name != None:
Daniel Veillard	f007012	2002-10-09 14:24:17 +0000	[diff] [blame]	872	name = string.replace(name, "'", " ")
				873	name = string.strip(name)
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	874	addWord(name, file, symbol, 7)
				875	cur = cur.next
				876	if info == None:
				877	print "Function %s description has no <info>" % (symbol)
				878	addFunction(symbol, file, "")
				879	else:
Daniel Veillard	f007012	2002-10-09 14:24:17 +0000	[diff] [blame]	880	info = string.replace(info, "'", " ")
				881	info = string.strip(info)
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	882	addFunction(symbol, file, info)
				883	addString(info, file, symbol, 5)
				884
				885	l = splitIdentifier(symbol)
				886	for word in l:
				887	addWord(word, file, symbol, 10)
				888
				889	return 1
				890
				891	def analyzeAPISymbols(top):
				892	count = 0
				893	cur = top.children
				894
				895	while cur != None:
				896	if cur.type == 'text':
				897	cur = cur.next
				898	continue
				899	if cur.name == "macro":
				900	count = count + analyzeAPIMacro(cur)
				901	elif cur.name == "function":
				902	count = count + analyzeAPIFunction(cur)
				903	elif cur.name == "const":
				904	count = count + analyzeAPIConst(cur)
				905	elif cur.name == "typedef":
				906	count = count + analyzeAPIType(cur)
				907	elif cur.name == "struct":
				908	count = count + analyzeAPIStruct(cur)
				909	elif cur.name == "enum":
				910	count = count + analyzeAPIEnum(cur)
				911	elif cur.name == "functype":
				912	count = count + analyzeAPIFunctype(cur)
				913	else:
				914	print "unexpected element %s in API doc <files>" % (cur.name)
				915	cur = cur.next
				916	return count
				917
				918	def analyzeAPI(doc):
				919	count = 0
				920	if doc == None:
				921	return -1
				922	root = doc.getRootElement()
				923	if root.name != "api":
				924	print "Unexpected root name"
				925	return -1
				926	cur = root.children
				927	while cur != None:
				928	if cur.type == 'text':
				929	cur = cur.next
				930	continue
				931	if cur.name == "files":
				932	pass
				933	# count = count + analyzeAPIFiles(cur)
				934	elif cur.name == "symbols":
				935	count = count + analyzeAPISymbols(cur)
				936	else:
				937	print "unexpected element %s in API doc" % (cur.name)
				938	cur = cur.next
				939	return count
				940
				941	#########################################################################
				942	# #
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	943	# Web pages parsing and analysis #
				944	# #
				945	#########################################################################
				946
				947	import glob
				948
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of a text node found in an HTML page.

    The node content is passed to addStringHTML() together with the
    resource, anchor id and section.  Returns the value reported by
    addStringHTML(), or -1 if anything goes wrong along the way.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except:
        # Best effort: any failure is reported as -1 to the caller.
        return -1
				957
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of a paragraph element of an HTML page.

    Hands p.content to addStringHTML() for the given resource, anchor id
    and section.  Returns what addStringHTML() reports, or -1 when an
    exception is raised.
    """
    try:
        text = p.content
        added = addStringHTML(text, resource, id, section, 5)
        return 0 + added
    except:
        # Failures are not fatal for the page, just signalled with -1.
        return -1
				966
				967	def analyzeHTMLPre(doc, resource, p, section, id):
				968	words = 0
				969	try:
				970	content = p.content
				971	words = words + addStringHTML(content, resource, id, section, 5)
				972	except:
				973	return -1
				974	return words
				975
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of node p for the given page section.

    NOTE(review): this five-argument definition is rebound by the
    analyzeHTML(doc, resource) definition that follows it in this file,
    so it is not reachable under this name once the module is loaded.

    Returns the value from addStringHTML(), or -1 on any error.
    """
    try:
        count = addStringHTML(p.content, resource, id, section, 5)
        count = 0 + count
    except:
        return -1
    return count
				984
def analyzeHTML(doc, resource):
    """Register one parsed HTML page and index its text.

    The page is recorded with addPage() under the content of its
    //head/title element, or "Page <resource>" when no title can be
    read.  The h1/h2/h3 headings and all text nodes are then visited in
    the order returned by the XPath query: a heading starts a new
    section (and updates the current anchor from its "id" or "name"
    attribute), a text node is indexed under the current section.

    doc      -- parsed libxml2 document of the page
    resource -- name of the page, used as its key

    Returns the number of items handed to the analyzeHTML*() helpers.
    Problems during the walk are reported on stdout, not raised.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        try:
            res = ctxt.xpathEval("//head/title")
            title = res[0].content
        except:
            # No usable <title>: fall back to a synthetic one.
            title = "Page %s" % (resource)
        addPage(resource, title)
        try:
            # XPath union operator is a plain "|"; a backslash in front of
            # it makes the expression invalid and nothing gets indexed.
            items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
            section = title
            anchor = ""
            for item in items:
                if item.name in ('h1', 'h2', 'h3'):
                    section = item.content
                    if item.prop("id"):
                        anchor = item.prop("id")
                    elif item.prop("name"):
                        anchor = item.prop("name")
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, anchor)
                    para = para + 1
                # The two branches below are kept for queries that select
                # <p>/<pre> elements; the current query does not.
                elif item.name == 'p':
                    analyzeHTMLPara(doc, resource, item, section, anchor)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, anchor)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource,
                                                              item.name))
        except:
            print("Page %s: problem analyzing" % (resource))
            print("%s %s" % sys.exc_info()[:2])
    finally:
        # XPath contexts of the libxml2 binding must be freed explicitly.
        ctxt.xpathFreeContext()

    return para
				1021
				1022	def analyzeHTMLPages():
				1023	ret = 0
				1024	HTMLfiles = glob.glob(".html") + glob.glob("tutorial/.html")
				1025	for html in HTMLfiles:
				1026	if html[0:3] == "API":
				1027	continue
				1028	if html == "xml.html":
				1029	continue
				1030	try:
William M. Brack	008c06b	2003-09-01 22:17:39 +0000	[diff] [blame]	1031	doc = libxml2.parseFile(html)
				1032	except:
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	1033	doc = libxml2.htmlParseFile(html, None)
William M. Brack	008c06b	2003-09-01 22:17:39 +0000	[diff] [blame]	1034	try:
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	1035	res = analyzeHTML(doc, html)
				1036	print "Parsed %s : %d paragraphs" % (html, res)
				1037	ret = ret + 1
				1038	except:
				1039	print "could not parse %s" % (html)
				1040	return ret
				1041
				1042	#########################################################################
				1043	# #
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	1044	# Mail archives parsing and analysis #
				1045	# #
				1046	#########################################################################
				1047
				1048	import time
				1049
				1050	def getXMLDateArchive(t = None):
				1051	if t == None:
				1052	t = time.time()
				1053	T = time.gmtime(t)
				1054	month = time.strftime("%B", T)
				1055	year = T[0]
				1056	url = "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (year, month)
				1057	return url
				1058
				1059	def scanXMLMsgArchive(url, title, force = 0):
				1060	if url == None or title == None:
				1061	return 0
				1062
				1063	ID = checkXMLMsgArchive(url)
				1064	if force == 0 and ID != -1:
				1065	return 0
				1066
				1067	if ID == -1:
				1068	ID = addXMLMsgArchive(url, title)
				1069	if ID == -1:
				1070	return 0
				1071
				1072	try:
				1073	print "Loading %s" % (url)
				1074	doc = libxml2.htmlParseFile(url, None);
				1075	except:
				1076	doc = None
				1077	if doc == None:
				1078	print "Failed to parse %s" % (url)
				1079	return 0
				1080
				1081	addStringArchive(title, ID, 20)
				1082	ctxt = doc.xpathNewContext()
				1083	texts = ctxt.xpathEval("//pre//text()")
				1084	for text in texts:
				1085	addStringArchive(text.content, ID, 5)
				1086
				1087	return 1
				1088
				1089	def scanXMLDateArchive(t = None, force = 0):
Daniel Veillard	d7960a8	2002-10-08 19:13:50 +0000	[diff] [blame]	1090	global wordsDictArchive
				1091
				1092	wordsDictArchive = {}
				1093
Daniel Veillard	01e87d2	2002-10-08 16:55:06 +0000	[diff] [blame]	1094	url = getXMLDateArchive(t)
				1095	print "loading %s" % (url)
				1096	try:
				1097	doc = libxml2.htmlParseFile(url, None);
				1098	except:
				1099	doc = None
				1100	if doc == None:
				1101	print "Failed to parse %s" % (url)
				1102	return -1
				1103	ctxt = doc.xpathNewContext()
				1104	anchors = ctxt.xpathEval("//a[@href]")
				1105	links = 0
				1106	newmsg = 0
				1107	for anchor in anchors:
				1108	href = anchor.prop("href")
				1109	if href == None or href[0:3] != "msg":
				1110	continue
				1111	try:
				1112	links = links + 1
				1113
				1114	msg = libxml2.buildURI(href, url)
				1115	title = anchor.content
				1116	if title != None and title[0:4] == 'Re: ':
				1117	title = title[4:]
				1118	if title != None and title[0:6] == '[xml] ':
				1119	title = title[6:]
				1120	newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
				1121
				1122	except:
				1123	pass
				1124
				1125	return newmsg
				1126
				1127
				1128	#########################################################################
				1129	# #
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	1130	# Main code: open the DB, the API XML and analyze it #
				1131	# #
				1132	#########################################################################
def analyzeArchives(t = None, force = 0):
    """Index one month of the mailing-list archive and store the result.

    Runs scanXMLDateArchive(t, force), which fills the global
    wordsDictArchive, then calls updateWordArchive() for every
    (word, id, relevance) association found in that dictionary.
    Words whose entry is None are ignored.

    t     -- time selecting the month, passed to scanXMLDateArchive()
    force -- passed to scanXMLDateArchive()
    """
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive),
                                                    ret))

    i = 0
    # Iterate over snapshots so the loops do not depend on whether
    # updateWordArchive() touches the dictionaries.
    for word, refs in list(wordsDictArchive.items()):
        if refs is None:
            continue
        for id, relevance in list(refs.items()):
            updateWordArchive(word, id, relevance)
            i = i + 1

    # These associations come from the mail archive, not from HTML pages.
    print("Found %d associations in archive pages" % (i))
				1152
def analyzeHTMLTop():
    """Index the HTML documentation pages and store the associations.

    Calls analyzeHTMLPages(), then walks the global wordsDictHTML and
    calls updateWordHTML() for each (word, resource) entry, unpacking
    the stored (relevance, id, section) triple.  Words whose entry is
    None are ignored.  Progress is reported on stdout.
    """
    global wordsDictHTML

    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))

    associations = 0
    for word in list(wordsDictHTML.keys()):
        refs = wordsDictHTML[word]
        if refs is None:
            continue
        for resource in list(refs.keys()):
            relevance, id, section = refs[resource]
            updateWordHTML(word, resource, section, id, relevance)
            associations += 1

    print("Found %d associations in HTML pages" % (associations))
				1172
def analyzeAPITop():
    """Load the API description, analyze it and store the associations.

    The document named by the global API is loaded with loadAPI() and
    passed to analyzeAPI(); any failure there is reported and ends the
    program with exit status 1.  Afterwards every
    (word, (module, symbol)) entry of the global wordsDict is passed to
    updateWord(); words whose entry is None are counted as skipped.
    """
    global wordsDict, API

    try:
        doc = loadAPI(API)
        blocs = analyzeAPI(doc)
        print("Analyzed %d blocs" % (blocs))
        doc.freeDoc()
    except:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    found = 0
    skipped = 0
    for word in list(wordsDict.keys()):
        refs = wordsDict[word]
        if refs is None:
            skipped += 1
        else:
            for key in list(refs.keys()):
                module, symbol = key
                updateWord(word, symbol, refs[key])
                found += 1

    print("Found %d associations, skipped %d words" % (found, skipped))
				1200
				1201	def usage():
Daniel Veillard	f08d400	2002-10-08 17:17:11 +0000	[diff] [blame]	1202	print "Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]"
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	1203	sys.exit(1)
				1204
def _indexArchiveMonth(month, force):
    # Index the mail archive of one month given as "<year>-<month name>"
    # (time.strptime format "%Y-%B"); failures are reported, not raised.
    try:
        T = time.strptime(month, "%Y-%B")
        # Aim ten days into the month.  Presumably this keeps the
        # local-time value from mktime() inside the same month once
        # getXMLDateArchive() reads it back as UTC.
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])


def main():
    """Command-line entry point of the indexer.

    Opens the database, then processes the arguments from left to right:

      --force                 re-index archive messages already known
                              (affects only the options that follow it)
      --archive               index the current month of the mail archive
      --archive-year year     index the twelve months of that year
      --archive-month month   index one month, given as "<year>-<month name>"
      --API                   index the API description
      --docs                  index the HTML pages

    With no argument, an unknown argument, or an option missing its
    value, the usage message is printed and the program exits with
    status 1.  It also exits with status 1 when the database cannot be
    opened.
    """
    try:
        openMySQL()
    except:
        print("Failed to open the database")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    args = sys.argv[1:]
    if not args:
        usage()

    months = ("January", "February", "March", "April", "May", "June",
              "July", "August", "September", "October", "November",
              "December")
    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year' or arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # The option was given without its value: explain the
                # syntax instead of dying with an IndexError.
                usage()
            if arg == '--archive-year':
                for month in months:
                    _indexArchiveMonth("%s-%s" % (args[i], month), force)
            else:
                _indexArchiveMonth(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	1256
# Run the indexer only when this file is executed as a script; importing
# it as a module just defines the functions.
if __name__ == "__main__":
    main()