Blame - doc/index.py - fp2-dev/platform/external/libxml2

blob: 409e90d7e4b38487b07fc584fde281f2e1ba1d3e [file] [log] [blame]

Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	1	#!/usr/bin/python -u
				2	#
				3	# imports the API description and fills up a database with
				4	# name relevance to modules, functions or web pages
				5	#
Daniel Veillard	2c77cd7	2002-10-01 13:54:14 +0000	[diff] [blame]	6	# Operation needed:
				7	# =================
				8	#
				9	# install mysqld, the python wrappers for mysql and libxml2, start mysqld
				10	# Change the root passwd of mysql:
				11	# mysqladmin -u root password new_password
				12	# Create the new database xmlsoft
				13	# mysqladmin -p create xmlsoft
				14	# Create a database user 'veillard' and give him password access
				15	# change veillard and abcde with the right user name and passwd
				16	# mysql -p
				17	# password:
				18	# mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
				19	# IDENTIFIED BY 'abcde' WITH GRANT OPTION;
				20	#
				21	# As the user check the access:
				22	# mysql -p xmlsoft
				23	# Enter password:
				24	# Welcome to the MySQL monitor....
				25	# mysql> use xmlsoft
				26	# Database changed
				27	# mysql> quit
				28	# Bye
				29	#
				30	# Then run the script in the doc subdir, it will create the symbols and
				31	# word tables and populate them with information extracted from
				32	# the libxml2-api.xml API description, and make them accessible read-only
				33	# by nobody@localhost, the user expected to be Apache's one
				34	#
				35	# On the Apache configuration, make sure you have php support enabled
				36	#
				37
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	38	import MySQLdb
				39	import libxml2
				40	import sys
				41	import string
				42	import os
				43
				44	#
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	45	# We are not interested in parsing errors here
				46	#
				47	def callback(ctx, str):
				48	return
# Register the do-nothing callback above as libxml2's error handler
# (None is passed as the handler's context argument).
libxml2.registerErrorHandler(callback, None)
				50
				51	#
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	52	# The dictionary of tables required and the SQL command needed
				53	# to create them
				54	#
# Schema registry: maps the name of every table this script needs to the
# CREATE TABLE statement that builds it.  createTable() executes these
# statements and checkTables() uses the keys to find missing tables.
TABLES={
# symbols: one row per API symbol (unique name) with its module, type and
# a short description.
"symbols" : """CREATE TABLE symbols (
name varchar(255) BINARY NOT NULL,
module varchar(255) BINARY NOT NULL,
type varchar(25) NOT NULL,
descr varchar(255),
UNIQUE KEY name (name),
KEY module (module))""",
# words: relevance of a word for a symbol; (name, symbol) pairs are unique.
"words" : """CREATE TABLE words (
name varchar(50) BINARY NOT NULL,
symbol varchar(255) BINARY NOT NULL,
relevance int,
KEY name (name),
KEY symbol (symbol),
UNIQUE KEY ID (name, symbol))""",
# wordsHTML: relevance of a word for an HTML resource, plus the section and
# anchor id it was found under; (name, resource) pairs are unique.
"wordsHTML" : """CREATE TABLE wordsHTML (
name varchar(50) BINARY NOT NULL,
resource varchar(255) BINARY NOT NULL,
section varchar(255),
id varchar(50),
relevance int,
KEY name (name),
KEY resource (resource),
UNIQUE KEY ref (name, resource))""",
# pages: title of each indexed HTML resource.
"pages" : """CREATE TABLE pages (
resource varchar(255) BINARY NOT NULL,
title varchar(255) BINARY NOT NULL,
UNIQUE KEY name (resource))""",
# Queries: never written by this script; checkTables() grants
# INSERT/SELECT/UPDATE on it to nobody@localhost.
# NOTE(review): presumably filled by the web search front end - confirm.
"Queries" : """CREATE TABLE Queries (
ID int(11) NOT NULL auto_increment,
Value varchar(50) NOT NULL,
Count int(11) NOT NULL,
UNIQUE KEY id (ID,Value(35)),
INDEX (ID))""",
}
				90
				91	#
				92	# The XML API description file to parse
				93	#
# File name handed to loadAPI() by the main code at the end of the script.
API="libxml2-api.xml"
# Module-wide database connection; stays None until openMySQL() assigns it.
DB=None
				96
				97	#########################################################################
				98	# #
				99	# MySQL database interfaces #
				100	# #
				101	#########################################################################
				102	def createTable(db, name):
				103	global TABLES
				104
				105	if db == None:
				106	return -1
				107	if name == None:
				108	return -1
				109	c = db.cursor()
				110
				111	ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
				112	if ret == 1:
				113	print "Removed table %s" % (name)
				114	print "Creating table %s" % (name)
				115	try:
				116	ret = c.execute(TABLES[name])
				117	except:
				118	print "Failed to create table %s" % (name)
				119	return -1
				120	return ret
				121
				122	def checkTables(db):
				123	global TABLES
				124
				125	if db == None:
				126	return -1
				127	c = db.cursor()
				128	nbtables = c.execute("show tables")
				129	print "Found %d tables" % (nbtables)
				130	tables = {}
				131	i = 0
				132	while i < nbtables:
				133	l = c.fetchone()
				134	name = l[0]
				135	tables[name] = {}
				136	i = i + 1
				137
				138	for table in TABLES.keys():
				139	if not tables.has_key(table):
				140	print "table %s missing" % (table)
				141	createTable(db, table)
				142	print "checkTables finished"
				143
				144	# make sure apache can access the tables read-only
				145	try:
				146	ret = c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
Daniel Veillard	dc6d4ab	2002-10-04 15:58:34 +0000	[diff] [blame]	147	ret = c.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	148	except:
				149	pass
				150	return 0
				151
				152	def openMySQL(db="xmlsoft", passwd=None):
				153	global DB
				154
				155	if passwd == None:
Daniel Veillard	538d3b9	2002-10-01 14:04:56 +0000	[diff] [blame]	156	try:
				157	passwd = os.environ["MySQL_PASS"]
				158	except:
				159	print "No password available, set environment MySQL_PASS"
				160	sys.exit(1)
				161
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	162	DB = MySQLdb.connect(passwd=passwd, db=db)
				163	if DB == None:
				164	return -1
				165	ret = checkTables(DB)
				166	return ret
				167
				168	def updateWord(name, symbol, relevance):
				169	global DB
				170
				171	if DB == None:
				172	openMySQL()
				173	if DB == None:
				174	return -1
				175	if name == None:
				176	return -1
				177	if symbol == None:
				178	return -1
				179
				180	c = DB.cursor()
				181	try:
				182	ret = c.execute(
				183	"""INSERT INTO words (name, symbol, relevance) VALUES ('%s','%s', %d)""" %
				184	(name, symbol, relevance))
				185	except:
				186	try:
				187	ret = c.execute(
				188	"""UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'""" %
				189	(relevance, name, symbol))
				190	except:
				191	print "Update word (%s, %s, %s) failed command" % (name, symbol, relevance)
				192	print "UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'" % (relevance, name, symbol)
				193	print sys.exc_type, sys.exc_value
				194	return -1
				195
				196	return ret
				197
				198	def updateSymbol(name, module, type, desc):
				199	global DB
				200
				201	updateWord(name, name, 50)
				202	if DB == None:
				203	openMySQL()
				204	if DB == None:
				205	return -1
				206	if name == None:
				207	return -1
				208	if module == None:
				209	return -1
				210	if type == None:
				211	return -1
				212
				213	try:
				214	desc = string.replace(desc, "'", " ")
				215	l = string.split(desc, ".")
				216	desc = l[0]
				217	desc = desc[0:99]
				218	except:
				219	desc = ""
				220
				221	c = DB.cursor()
				222	try:
				223	ret = c.execute(
				224	"""INSERT INTO symbols (name, module, type, descr) VALUES ('%s','%s', '%s', '%s')""" %
				225	(name, module, type, desc))
				226	except:
				227	try:
				228	ret = c.execute(
				229	"""UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" %
				230	(module, type, desc, name))
				231	except:
				232	print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
				233	print """UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name)
				234	print sys.exc_type, sys.exc_value
				235	return -1
				236
				237	return ret
				238
				239	def addFunction(name, module, desc = ""):
				240	return updateSymbol(name, module, 'function', desc)
				241
				242	def addMacro(name, module, desc = ""):
				243	return updateSymbol(name, module, 'macro', desc)
				244
				245	def addEnum(name, module, desc = ""):
				246	return updateSymbol(name, module, 'enum', desc)
				247
				248	def addStruct(name, module, desc = ""):
				249	return updateSymbol(name, module, 'struct', desc)
				250
				251	def addConst(name, module, desc = ""):
				252	return updateSymbol(name, module, 'const', desc)
				253
				254	def addType(name, module, desc = ""):
				255	return updateSymbol(name, module, 'type', desc)
				256
				257	def addFunctype(name, module, desc = ""):
				258	return updateSymbol(name, module, 'functype', desc)
				259
def addPage(resource, title):
    """Store the title of HTML page *resource* in the pages table.

    Opens the database on first use.  An INSERT is tried first; if it
    fails, the title of the existing row is updated.

    Returns the result of the successful ``execute`` call, or -1 when no
    database is available, *resource* is None, or both statements fail.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if resource is None:
        return -1

    c = DB.cursor()
    # Titles are free text taken from the pages, so they are bound as query
    # parameters instead of being pasted into the SQL string.
    try:
        ret = c.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        # presumably the UNIQUE KEY on resource rejected the insert
        try:
            ret = c.execute(
                "UPDATE pages SET title=%s WHERE resource=%s",
                (title, resource))
        except Exception:
            # The old message formatted name/module/type, which do not exist
            # in this function and raised NameError inside the handler.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("""UPDATE pages SET title='%s' WHERE resource='%s'""" % (title, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
				287
				288	def updateWordHTML(name, resource, desc, id, relevance):
				289	global DB
				290
				291	if DB == None:
				292	openMySQL()
				293	if DB == None:
				294	return -1
				295	if name == None:
				296	return -1
				297	if resource == None:
				298	return -1
				299	if id == None:
				300	id = ""
				301	if desc == None:
				302	desc = ""
				303	else:
				304	try:
				305	desc = string.replace(desc, "'", " ")
				306	desc = desc[0:99]
				307	except:
				308	desc = ""
				309
				310	c = DB.cursor()
				311	try:
				312	ret = c.execute(
				313	"""INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES ('%s','%s', '%s', '%s', '%d')""" %
				314	(name, resource, desc, id, relevance))
				315	except:
				316	try:
				317	ret = c.execute(
				318	"""UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" %
				319	(desc, id, relevance, name, resource))
				320	except:
				321	print "Update symbol (%s, %s, %d) failed command" % (name, resource, relevance)
				322	print """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource)
				323	print sys.exc_type, sys.exc_value
				324	return -1
				325
				326	return ret
				327
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	328	#########################################################################
				329	# #
				330	# Word dictionary and analysis routines #
				331	# #
				332	#########################################################################
				333
# word -> {(module, symbol): relevance}; addWord() replaces the inner dict
# with None once a word has more than 500 entries.
wordsDict = {}
# word -> {resource: (relevance, id, section)}, filled by addWordHTML().
wordsDictHTML = {}
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	336
				337	def splitIdentifier(str):
				338	ret = []
				339	while str != "":
				340	cur = string.lower(str[0])
				341	str = str[1:]
				342	if ((cur < 'a') or (cur > 'z')):
				343	continue
				344	while (str != "") and (str[0] >= 'A') and (str[0] <= 'Z'):
				345	cur = cur + string.lower(str[0])
				346	str = str[1:]
				347	while (str != "") and (str[0] >= 'a') and (str[0] <= 'z'):
				348	cur = cur + str[0]
				349	str = str[1:]
				350	while (str != "") and (str[0] >= '0') and (str[0] <= '9'):
				351	str = str[1:]
				352	ret.append(cur)
				353	return ret
				354
				355	def addWord(word, module, symbol, relevance):
				356	global wordsDict
				357
				358	if word == None or len(word) < 3:
				359	return -1
				360	if module == None or symbol == None:
				361	return -1
				362	if wordsDict.has_key(word):
				363	d = wordsDict[word]
				364	if d == None:
				365	return 0
				366	if len(d) > 500:
				367	wordsDict[word] = None
				368	return 0
				369	try:
				370	relevance = relevance + d[(module, symbol)]
				371	except:
				372	pass
				373	else:
				374	wordsDict[word] = {}
				375	wordsDict[word][(module, symbol)] = relevance
				376	return relevance
				377
				378	def addString(str, module, symbol, relevance):
				379	if str == None or len(str) < 3:
				380	return -1
				381	ret = 0
				382	str = string.replace(str, ".", " ")
				383	str = string.replace(str, ",", " ")
				384	str = string.replace(str, "'", " ")
				385	str = string.replace(str, '"', " ")
				386	str = string.replace(str, ";", " ")
				387	str = string.replace(str, "-", " ")
				388	l = string.split(str)
				389	for word in l:
				390	if len(word) > 2:
				391	ret = ret + addWord(word, module, symbol, 5)
				392
				393	return ret
				394
def addWordHTML(word, resource, id, section, relevance):
    """Accumulate *relevance* for *word* on page *resource* in wordsDictHTML.

    When the pair was already recorded, the stored id and section win over
    the new ones (unless they are None) and the relevances are added.

    Returns the accumulated relevance, 0 when the word's entry is None, or
    -1 when an argument is missing or the word is shorter than 3
    characters.
    """
    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1

    if word in wordsDictHTML:
        d = wordsDictHTML[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        try:
            (r, i, s) = d[resource]
        except KeyError:
            # first occurrence of the word on this page: nothing to merge
            pass
        else:
            if i is not None:
                id = i
            if s is not None:
                section = s
            relevance = relevance + r
    else:
        d = wordsDictHTML[word] = {}
    d[resource] = (relevance, id, section)
    return relevance
				422
				423	def addStringHTML(str, resource, id, section, relevance):
				424	if str == None or len(str) < 3:
				425	return -1
				426	ret = 0
				427	str = string.replace(str, ".", " ")
				428	str = string.replace(str, ",", " ")
				429	str = string.replace(str, "'", " ")
				430	str = string.replace(str, '"', " ")
				431	str = string.replace(str, ";", " ")
				432	str = string.replace(str, "-", " ")
				433	str = string.replace(str, "(", " ")
				434	str = string.replace(str, ")", " ")
				435	str = string.replace(str, "{", " ")
				436	str = string.replace(str, "}", " ")
				437	str = string.replace(str, "<", " ")
				438	str = string.replace(str, ">", " ")
				439	str = string.replace(str, "/", " ")
				440	str = string.replace(str, "*", " ")
				441	str = string.replace(str, ":", " ")
Daniel Veillard	a6287a4	2002-10-07 13:17:22 +0000	[diff] [blame^]	442	str = string.replace(str, "#", " ")
				443	str = string.replace(str, "!", " ")
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	444	str = string.replace(str, "\n", " ")
				445	str = string.replace(str, "\r", " ")
				446	str = string.replace(str, "\xc2", " ")
				447	str = string.replace(str, "\xa0", " ")
				448	l = string.split(str)
				449	for word in l:
				450	if len(word) > 2:
Daniel Veillard	a6287a4	2002-10-07 13:17:22 +0000	[diff] [blame^]	451	try:
				452	r = addWordHTML(word, resource, id, section, relevance)
				453	if r <= 0:
				454	print "addWordHTML failed: %s %s" % (word, resource)
				455	ret = ret + r
				456	except:
				457	print "addWordHTML failed: %s %s %d" % (word, resource, relevance)
				458	print sys.exc_type, sys.exc_value
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	459
				460	return ret
				461
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	462
				463	#########################################################################
				464	# #
				465	# XML API description analysis #
				466	# #
				467	#########################################################################
				468
				469	def loadAPI(filename):
				470	doc = libxml2.parseFile(filename)
				471	print "loaded %s" % (filename)
				472	return doc
				473
				474	def foundExport(file, symbol):
				475	if file == None:
				476	return 0
				477	if symbol == None:
				478	return 0
				479	addFunction(symbol, file)
				480	l = splitIdentifier(symbol)
				481	for word in l:
				482	addWord(word, file, symbol, 10)
				483	return 1
				484
				485	def analyzeAPIFile(top):
				486	count = 0
				487	name = top.prop("name")
				488	cur = top.children
				489	while cur != None:
				490	if cur.type == 'text':
				491	cur = cur.next
				492	continue
				493	if cur.name == "exports":
				494	count = count + foundExport(name, cur.prop("symbol"))
				495	else:
				496	print "unexpected element %s in API doc <file name='%s'>" % (name)
				497	cur = cur.next
				498	return count
				499
				500	def analyzeAPIFiles(top):
				501	count = 0
				502	cur = top.children
				503
				504	while cur != None:
				505	if cur.type == 'text':
				506	cur = cur.next
				507	continue
				508	if cur.name == "file":
				509	count = count + analyzeAPIFile(cur)
				510	else:
				511	print "unexpected element %s in API doc <files>" % (cur.name)
				512	cur = cur.next
				513	return count
				514
				515	def analyzeAPIEnum(top):
				516	file = top.prop("file")
				517	if file == None:
				518	return 0
				519	symbol = top.prop("name")
				520	if symbol == None:
				521	return 0
				522
				523	addEnum(symbol, file)
				524	l = splitIdentifier(symbol)
				525	for word in l:
				526	addWord(word, file, symbol, 10)
				527
				528	return 1
				529
				530	def analyzeAPIConst(top):
				531	file = top.prop("file")
				532	if file == None:
				533	return 0
				534	symbol = top.prop("name")
				535	if symbol == None:
				536	return 0
				537
				538	addConst(symbol, file)
				539	l = splitIdentifier(symbol)
				540	for word in l:
				541	addWord(word, file, symbol, 10)
				542
				543	return 1
				544
				545	def analyzeAPIType(top):
				546	file = top.prop("file")
				547	if file == None:
				548	return 0
				549	symbol = top.prop("name")
				550	if symbol == None:
				551	return 0
				552
				553	addType(symbol, file)
				554	l = splitIdentifier(symbol)
				555	for word in l:
				556	addWord(word, file, symbol, 10)
				557	return 1
				558
				559	def analyzeAPIFunctype(top):
				560	file = top.prop("file")
				561	if file == None:
				562	return 0
				563	symbol = top.prop("name")
				564	if symbol == None:
				565	return 0
				566
				567	addFunctype(symbol, file)
				568	l = splitIdentifier(symbol)
				569	for word in l:
				570	addWord(word, file, symbol, 10)
				571	return 1
				572
				573	def analyzeAPIStruct(top):
				574	file = top.prop("file")
				575	if file == None:
				576	return 0
				577	symbol = top.prop("name")
				578	if symbol == None:
				579	return 0
				580
				581	addStruct(symbol, file)
				582	l = splitIdentifier(symbol)
				583	for word in l:
				584	addWord(word, file, symbol, 10)
				585
				586	info = top.prop("info")
				587	if info != None:
				588	l = string.split(info)
				589	for word in l:
				590	if len(word) > 2:
				591	addWord(word, file, symbol, 5)
				592	return 1
				593
				594	def analyzeAPIMacro(top):
				595	file = top.prop("file")
				596	if file == None:
				597	return 0
				598	symbol = top.prop("name")
				599	if symbol == None:
				600	return 0
				601
				602	info = None
				603	cur = top.children
				604	while cur != None:
				605	if cur.type == 'text':
				606	cur = cur.next
				607	continue
				608	if cur.name == "info":
				609	info = cur.content
				610	break
				611	cur = cur.next
				612
				613	l = splitIdentifier(symbol)
				614	for word in l:
				615	addWord(word, file, symbol, 10)
				616
				617	if info == None:
				618	addMacro(symbol, file)
				619	print "Macro %s description has no <info>" % (symbol)
				620	return 0
				621
				622	addMacro(symbol, file, info)
				623	l = string.split(info)
				624	for word in l:
				625	if len(word) > 2:
				626	addWord(word, file, symbol, 5)
				627	return 1
				628
				629	def analyzeAPIFunction(top):
				630	file = top.prop("file")
				631	if file == None:
				632	return 0
				633	symbol = top.prop("name")
				634	if symbol == None:
				635	return 0
				636
				637	info = None
				638	cur = top.children
				639	while cur != None:
				640	if cur.type == 'text':
				641	cur = cur.next
				642	continue
				643	if cur.name == "info":
				644	info = cur.content
				645	elif cur.name == "return":
				646	rinfo = cur.prop("info")
				647	if rinfo != None:
				648	addString(rinfo, file, symbol, 7)
				649	elif cur.name == "arg":
				650	ainfo = cur.prop("info")
				651	if rinfo != None:
				652	addString(ainfo, file, symbol, 5)
				653	name = cur.prop("name")
				654	if name != None:
				655	addWord(name, file, symbol, 7)
				656	cur = cur.next
				657	if info == None:
				658	print "Function %s description has no <info>" % (symbol)
				659	addFunction(symbol, file, "")
				660	else:
				661	addFunction(symbol, file, info)
				662	addString(info, file, symbol, 5)
				663
				664	l = splitIdentifier(symbol)
				665	for word in l:
				666	addWord(word, file, symbol, 10)
				667
				668	return 1
				669
				670	def analyzeAPISymbols(top):
				671	count = 0
				672	cur = top.children
				673
				674	while cur != None:
				675	if cur.type == 'text':
				676	cur = cur.next
				677	continue
				678	if cur.name == "macro":
				679	count = count + analyzeAPIMacro(cur)
				680	elif cur.name == "function":
				681	count = count + analyzeAPIFunction(cur)
				682	elif cur.name == "const":
				683	count = count + analyzeAPIConst(cur)
				684	elif cur.name == "typedef":
				685	count = count + analyzeAPIType(cur)
				686	elif cur.name == "struct":
				687	count = count + analyzeAPIStruct(cur)
				688	elif cur.name == "enum":
				689	count = count + analyzeAPIEnum(cur)
				690	elif cur.name == "functype":
				691	count = count + analyzeAPIFunctype(cur)
				692	else:
				693	print "unexpected element %s in API doc <files>" % (cur.name)
				694	cur = cur.next
				695	return count
				696
				697	def analyzeAPI(doc):
				698	count = 0
				699	if doc == None:
				700	return -1
				701	root = doc.getRootElement()
				702	if root.name != "api":
				703	print "Unexpected root name"
				704	return -1
				705	cur = root.children
				706	while cur != None:
				707	if cur.type == 'text':
				708	cur = cur.next
				709	continue
				710	if cur.name == "files":
				711	pass
				712	# count = count + analyzeAPIFiles(cur)
				713	elif cur.name == "symbols":
				714	count = count + analyzeAPISymbols(cur)
				715	else:
				716	print "unexpected element %s in API doc" % (cur.name)
				717	cur = cur.next
				718	return count
				719
				720	#########################################################################
				721	# #
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	722	# Web pages parsing and analysis #
				723	# #
				724	#########################################################################
				725
				726	import glob
				727
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of text node *p* found on page *resource*.

    The content is handed to addStringHTML() with relevance 5, together
    with the current *section* and anchor *id*.  *doc* is not used.

    Returns the addStringHTML() result, or -1 if anything goes wrong.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except:
        # catch-all kept on purpose: any failure is reported as -1
        return -1
				736
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of paragraph node *p* found on page *resource*.

    The content is handed to addStringHTML() with relevance 5, together
    with the current *section* and anchor *id*.  *doc* is not used.

    Returns the addStringHTML() result, or -1 if anything goes wrong.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except:
        # catch-all kept on purpose: any failure is reported as -1
        return -1
				745
				746	def analyzeHTMLPre(doc, resource, p, section, id):
				747	words = 0
				748	try:
				749	content = p.content
				750	words = words + addStringHTML(content, resource, id, section, 5)
				751	except:
				752	return -1
				753	return words
				754
# NOTE(review): the two-argument analyzeHTML(doc, resource) defined right
# after this function rebinds the same name, so this five-argument version
# cannot be reached through the name "analyzeHTML" once the module is loaded.
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of node *p* found on page *resource*.

    The content is handed to addStringHTML() with relevance 5, together
    with the current *section* and anchor *id*.  *doc* is not used.

    Returns the addStringHTML() result, or -1 if anything goes wrong.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except:
        # catch-all kept on purpose: any failure is reported as -1
        return -1
				763
def analyzeHTML(doc, resource):
    """Index the text of one parsed HTML page.

    The page title (content of //head/title, or "Page <resource>" when
    that cannot be read) is stored with addPage().  The document is then
    walked: each h1/h2/h3 heading becomes the current section and its
    "id" or "name" property the current anchor; each text node is indexed
    under that section and anchor.

    Returns the number of nodes handed to the text/paragraph analyzers.
    A failure during the walk is reported on stdout and the count reached
    so far is returned.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        try:
            res = ctxt.xpathEval("//head/title")
            title = res[0].content
        except Exception:
            title = "Page %s" % (resource)
        addPage(resource, title)
        try:
            # Union of headings and text nodes.  The expression used to
            # contain "\|", which is not XPath, so the evaluation failed
            # and no page text was ever indexed.
            items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
            section = title
            id = ""
            for item in items:
                if item.name in ('h1', 'h2', 'h3'):
                    section = item.content
                    anchor = item.prop("id") or item.prop("name")
                    if anchor:
                        id = anchor
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, id)
                    para = para + 1
                # The expression above selects only headings and text
                # nodes; the branches below are kept from the original.
                elif item.name == 'p':
                    analyzeHTMLPara(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, id)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource, item.name))
        except Exception:
            print("Page %s: problem analyzing" % (resource))
            print("%s %s" % sys.exc_info()[:2])
    finally:
        # the context is not released automatically by the bindings
        ctxt.xpathFreeContext()

    return para
				800
				801	def analyzeHTMLPages():
				802	ret = 0
				803	HTMLfiles = glob.glob(".html") + glob.glob("tutorial/.html")
				804	for html in HTMLfiles:
				805	if html[0:3] == "API":
				806	continue
				807	if html == "xml.html":
				808	continue
				809	try:
				810	doc = libxml2.htmlParseFile(html, None)
				811	res = analyzeHTML(doc, html)
				812	print "Parsed %s : %d paragraphs" % (html, res)
				813	ret = ret + 1
				814	except:
				815	print "could not parse %s" % (html)
				816	return ret
				817
				818	#########################################################################
				819	# #
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	820	# Main code: open the DB, the API XML and analyze it #
				821	# #
				822	#########################################################################
				823	try:
				824	openMySQL()
				825	except:
				826	print "Failed to open the database"
				827	print sys.exc_type, sys.exc_value
				828	sys.exit(1)
				829
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	830	ret = analyzeHTMLPages()
				831	print "Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret)
				832
				833	i = 0
				834	skipped = 0
				835	for word in wordsDictHTML.keys():
				836	refs = wordsDictHTML[word]
				837	if refs == None:
				838	skipped = skipped + 1
				839	continue;
				840	for resource in refs.keys():
				841	(relevance, id, section) = refs[resource]
				842	updateWordHTML(word, resource, section, id, relevance)
				843	i = i + 1
				844
				845	print "Found %d associations in HTML pages" % (i)
				846
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	847	try:
				848	doc = loadAPI(API)
				849	ret = analyzeAPI(doc)
				850	print "Analyzed %d blocs" % (ret)
				851	doc.freeDoc()
				852	except:
				853	print "Failed to parse and analyze %s" % (API)
				854	print sys.exc_type, sys.exc_value
				855	sys.exit(1)
				856
				857	print "Indexed %d words" % (len(wordsDict))
				858	i = 0
				859	skipped = 0
				860	for word in wordsDict.keys():
				861	refs = wordsDict[word]
				862	if refs == None:
				863	skipped = skipped + 1
				864	continue;
				865	for (module, symbol) in refs.keys():
				866	updateWord(word, symbol, refs[(module, symbol)])
				867	i = i + 1
				868
				869	print "Found %d associations, skipped %d words" % (i, skipped)