Blame - doc/index.py - fp2-dev/platform/external/libxml2

blob: e3b8588abe0c18da919181de61393dd7efdfa723 [file] [log] [blame]

Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	1	#!/usr/bin/python -u
				2	#
				3	# imports the API description and fills up a database with
				4	# name relevance to modules, functions or web pages
				5	#
Daniel Veillard	2c77cd7	2002-10-01 13:54:14 +0000	[diff] [blame]	6	# Operation needed:
				7	# =================
				8	#
				9	# install mysqld, the python wrappers for mysql and libxml2, start mysqld
				10	# Change the root passwd of mysql:
				11	# mysqladmin -u root password new_password
				12	# Create the new database xmlsoft
				13	# mysqladmin -p create xmlsoft
				14	# Create a database user 'veillard' and give him password access
				15	# change veillard and abcde with the right user name and passwd
				16	# mysql -p
				17	# password:
				18	# mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
				19	# IDENTIFIED BY 'abcde' WITH GRANT OPTION;
				20	#
				21	# As the user check the access:
				22	# mysql -p xmlsoft
				23	# Enter password:
				24	# Welcome to the MySQL monitor....
				25	# mysql> use xmlsoft
				26	# Database changed
				27	# mysql> quit
				28	# Bye
				29	#
				30	# Then run the script in the doc subdir, it will create the symbols and
				31	# word tables and populate them with information extracted from
				32	# the libxml2-api.xml API description, and make them accessible read-only
				33	# by nobody@localhost, the user expected to be Apache's one
				34	#
				35	# On the Apache configuration, make sure you have php support enabled
				36	#
				37
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	38	import MySQLdb
				39	import libxml2
				40	import sys
				41	import string
				42	import os
				43
				44	#
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	45	# We are not interested in parsing errors here
				46	#
				47	def callback(ctx, str):
				48	return
				49	libxml2.registerErrorHandler(callback, None)
				50
				51	#
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	52	# The dictionary of tables required and the SQL command needed
				53	# to create them
				54	#
# Maps each required table name to the CREATE TABLE statement that builds it.
# createTable() executes these statements and checkTables() iterates the keys.
TABLES={
  # One row per API symbol (filled by updateSymbol).
  "symbols" : """CREATE TABLE symbols (
name varchar(255) NOT NULL,
module varchar(255) NOT NULL,
type varchar(25) NOT NULL,
descr varchar(255),
UNIQUE KEY name (name),
KEY module (module))""",
  # Word -> symbol associations with a relevance score (filled by updateWord).
  "words" : """CREATE TABLE words (
name varchar(50) NOT NULL,
symbol varchar(255) NOT NULL,
relevance int,
KEY name (name),
KEY symbol (symbol),
UNIQUE KEY ID (name, symbol))""",
  # Word -> HTML page associations (filled by updateWordHTML).
  "wordsHTML" : """CREATE TABLE wordsHTML (
name varchar(50) NOT NULL,
resource varchar(255) NOT NULL,
section varchar(255),
id varchar(50),
relevance int,
KEY name (name),
KEY resource (resource),
UNIQUE KEY ref (name, resource))""",
  # One row per indexed HTML page (filled by addPage).
  "pages" : """CREATE TABLE pages (
resource varchar(255) NOT NULL,
title varchar(255) NOT NULL,
UNIQUE KEY name (resource))""",
  # Not written by this script; checkTables() grants nobody@localhost
  # INSERT/SELECT/UPDATE on it.
  "Queries" : """CREATE TABLE Queries (
ID int(11) NOT NULL auto_increment,
Value varchar(50) NOT NULL,
Count int(11) NOT NULL,
UNIQUE KEY id (ID,Value(35)),
INDEX (ID))""",
}
				90
				91	#
				92	# The XML API description file to parse
				93	#
# Name of the API description file handed to loadAPI() by the main code.
API="libxml2-api.xml"
# Database connection shared by the update helpers; assigned by openMySQL().
DB=None
				96
				97	#########################################################################
				98	# #
				99	# MySQL database interfaces #
				100	# #
				101	#########################################################################
				102	def createTable(db, name):
				103	global TABLES
				104
				105	if db == None:
				106	return -1
				107	if name == None:
				108	return -1
				109	c = db.cursor()
				110
				111	ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
				112	if ret == 1:
				113	print "Removed table %s" % (name)
				114	print "Creating table %s" % (name)
				115	try:
				116	ret = c.execute(TABLES[name])
				117	except:
				118	print "Failed to create table %s" % (name)
				119	return -1
				120	return ret
				121
				122	def checkTables(db):
				123	global TABLES
				124
				125	if db == None:
				126	return -1
				127	c = db.cursor()
				128	nbtables = c.execute("show tables")
				129	print "Found %d tables" % (nbtables)
				130	tables = {}
				131	i = 0
				132	while i < nbtables:
				133	l = c.fetchone()
				134	name = l[0]
				135	tables[name] = {}
				136	i = i + 1
				137
				138	for table in TABLES.keys():
				139	if not tables.has_key(table):
				140	print "table %s missing" % (table)
				141	createTable(db, table)
				142	print "checkTables finished"
				143
				144	# make sure apache can access the tables read-only
				145	try:
				146	ret = c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
Daniel Veillard	dc6d4ab	2002-10-04 15:58:34 +0000	[diff] [blame]	147	ret = c.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	148	except:
				149	pass
				150	return 0
				151
				152	def openMySQL(db="xmlsoft", passwd=None):
				153	global DB
				154
				155	if passwd == None:
Daniel Veillard	538d3b9	2002-10-01 14:04:56 +0000	[diff] [blame]	156	try:
				157	passwd = os.environ["MySQL_PASS"]
				158	except:
				159	print "No password available, set environment MySQL_PASS"
				160	sys.exit(1)
				161
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	162	DB = MySQLdb.connect(passwd=passwd, db=db)
				163	if DB == None:
				164	return -1
				165	ret = checkTables(DB)
				166	return ret
				167
				168	def updateWord(name, symbol, relevance):
				169	global DB
				170
				171	if DB == None:
				172	openMySQL()
				173	if DB == None:
				174	return -1
				175	if name == None:
				176	return -1
				177	if symbol == None:
				178	return -1
				179
				180	c = DB.cursor()
				181	try:
				182	ret = c.execute(
				183	"""INSERT INTO words (name, symbol, relevance) VALUES ('%s','%s', %d)""" %
				184	(name, symbol, relevance))
				185	except:
				186	try:
				187	ret = c.execute(
				188	"""UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'""" %
				189	(relevance, name, symbol))
				190	except:
				191	print "Update word (%s, %s, %s) failed command" % (name, symbol, relevance)
				192	print "UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'" % (relevance, name, symbol)
				193	print sys.exc_type, sys.exc_value
				194	return -1
				195
				196	return ret
				197
				198	def updateSymbol(name, module, type, desc):
				199	global DB
				200
				201	updateWord(name, name, 50)
				202	if DB == None:
				203	openMySQL()
				204	if DB == None:
				205	return -1
				206	if name == None:
				207	return -1
				208	if module == None:
				209	return -1
				210	if type == None:
				211	return -1
				212
				213	try:
				214	desc = string.replace(desc, "'", " ")
				215	l = string.split(desc, ".")
				216	desc = l[0]
				217	desc = desc[0:99]
				218	except:
				219	desc = ""
				220
				221	c = DB.cursor()
				222	try:
				223	ret = c.execute(
				224	"""INSERT INTO symbols (name, module, type, descr) VALUES ('%s','%s', '%s', '%s')""" %
				225	(name, module, type, desc))
				226	except:
				227	try:
				228	ret = c.execute(
				229	"""UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" %
				230	(module, type, desc, name))
				231	except:
				232	print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
				233	print """UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name)
				234	print sys.exc_type, sys.exc_value
				235	return -1
				236
				237	return ret
				238
				239	def addFunction(name, module, desc = ""):
				240	return updateSymbol(name, module, 'function', desc)
				241
				242	def addMacro(name, module, desc = ""):
				243	return updateSymbol(name, module, 'macro', desc)
				244
				245	def addEnum(name, module, desc = ""):
				246	return updateSymbol(name, module, 'enum', desc)
				247
				248	def addStruct(name, module, desc = ""):
				249	return updateSymbol(name, module, 'struct', desc)
				250
				251	def addConst(name, module, desc = ""):
				252	return updateSymbol(name, module, 'const', desc)
				253
				254	def addType(name, module, desc = ""):
				255	return updateSymbol(name, module, 'type', desc)
				256
				257	def addFunctype(name, module, desc = ""):
				258	return updateSymbol(name, module, 'functype', desc)
				259
def addPage(resource, title):
    """Record the title of an HTML page in the pages table.

    Opens the database through openMySQL() if no connection exists yet.
    An INSERT is tried first; when it fails an UPDATE of the existing row
    is tried instead.

    Returns the value of the successful execute() call, or -1 when there
    is no connection, when resource is None, or when both statements fail.
    """
    if DB is None:
        openMySQL()
        if DB is None:
            return -1
    if resource is None:
        return -1

    c = DB.cursor()
    # Titles routinely contain apostrophes; passing the values as
    # parameters lets the driver quote them.
    try:
        ret = c.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE pages SET title=%s WHERE resource=%s",
                (title, resource))
        except Exception:
            # Report the values this function actually has in scope.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("""UPDATE pages SET title='%s' WHERE resource='%s'""" % (title, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
				287
				288	def updateWordHTML(name, resource, desc, id, relevance):
				289	global DB
				290
				291	if DB == None:
				292	openMySQL()
				293	if DB == None:
				294	return -1
				295	if name == None:
				296	return -1
				297	if resource == None:
				298	return -1
				299	if id == None:
				300	id = ""
				301	if desc == None:
				302	desc = ""
				303	else:
				304	try:
				305	desc = string.replace(desc, "'", " ")
				306	desc = desc[0:99]
				307	except:
				308	desc = ""
				309
				310	c = DB.cursor()
				311	try:
				312	ret = c.execute(
				313	"""INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES ('%s','%s', '%s', '%s', '%d')""" %
				314	(name, resource, desc, id, relevance))
				315	except:
				316	try:
				317	ret = c.execute(
				318	"""UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" %
				319	(desc, id, relevance, name, resource))
				320	except:
				321	print "Update symbol (%s, %s, %d) failed command" % (name, resource, relevance)
				322	print """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource)
				323	print sys.exc_type, sys.exc_value
				324	return -1
				325
				326	return ret
				327
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	328	#########################################################################
				329	# #
				330	# Word dictionary and analysis routines #
				331	# #
				332	#########################################################################
				333
# word -> {(module, symbol): relevance}, filled by addWord(); an entry is
# replaced by None once the word has more than 500 references.
wordsDict = {}
# word -> {resource: (relevance, id, section)}, filled by addWordHTML(); an
# entry is replaced by None once the word has more than 15 references.
wordsDictHTML = {}
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	336
				337	def splitIdentifier(str):
				338	ret = []
				339	while str != "":
				340	cur = string.lower(str[0])
				341	str = str[1:]
				342	if ((cur < 'a') or (cur > 'z')):
				343	continue
				344	while (str != "") and (str[0] >= 'A') and (str[0] <= 'Z'):
				345	cur = cur + string.lower(str[0])
				346	str = str[1:]
				347	while (str != "") and (str[0] >= 'a') and (str[0] <= 'z'):
				348	cur = cur + str[0]
				349	str = str[1:]
				350	while (str != "") and (str[0] >= '0') and (str[0] <= '9'):
				351	str = str[1:]
				352	ret.append(cur)
				353	return ret
				354
				355	def addWord(word, module, symbol, relevance):
				356	global wordsDict
				357
				358	if word == None or len(word) < 3:
				359	return -1
				360	if module == None or symbol == None:
				361	return -1
				362	if wordsDict.has_key(word):
				363	d = wordsDict[word]
				364	if d == None:
				365	return 0
				366	if len(d) > 500:
				367	wordsDict[word] = None
				368	return 0
				369	try:
				370	relevance = relevance + d[(module, symbol)]
				371	except:
				372	pass
				373	else:
				374	wordsDict[word] = {}
				375	wordsDict[word][(module, symbol)] = relevance
				376	return relevance
				377
				378	def addString(str, module, symbol, relevance):
				379	if str == None or len(str) < 3:
				380	return -1
				381	ret = 0
				382	str = string.replace(str, ".", " ")
				383	str = string.replace(str, ",", " ")
				384	str = string.replace(str, "'", " ")
				385	str = string.replace(str, '"', " ")
				386	str = string.replace(str, ";", " ")
				387	str = string.replace(str, "-", " ")
				388	l = string.split(str)
				389	for word in l:
				390	if len(word) > 2:
				391	ret = ret + addWord(word, module, symbol, 5)
				392
				393	return ret
				394
def addWordHTML(word, resource, id, section, relevance):
    """Add `relevance` to the score of `word` for an HTML resource.

    Scores are accumulated in wordsDictHTML as (relevance, id, section)
    tuples. When the resource already has an entry for the word, its
    stored id and section are kept if they are not None. Once a word has
    more than 15 resources its entry is set to None and it is ignored.

    Returns -1 when word is None or shorter than 3 characters, or when
    resource or section is None; 0 when the word is being ignored;
    otherwise the accumulated relevance.
    """
    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1

    if word in wordsDictHTML:
        refs = wordsDictHTML[word]
        if refs is None:
            return 0
        if len(refs) > 15:
            wordsDictHTML[word] = None
            return 0
        # Explicit membership test instead of a catch-all handler.
        if resource in refs:
            (r, i, s) = refs[resource]
            if i is not None:
                id = i
            if s is not None:
                section = s
            relevance = relevance + r
    else:
        wordsDictHTML[word] = {}
    wordsDictHTML[word][resource] = (relevance, id, section)
    return relevance
				423
				424	def addStringHTML(str, resource, id, section, relevance):
				425	if str == None or len(str) < 3:
				426	return -1
				427	ret = 0
				428	str = string.replace(str, ".", " ")
				429	str = string.replace(str, ",", " ")
				430	str = string.replace(str, "'", " ")
				431	str = string.replace(str, '"', " ")
				432	str = string.replace(str, ";", " ")
				433	str = string.replace(str, "-", " ")
				434	str = string.replace(str, "(", " ")
				435	str = string.replace(str, ")", " ")
				436	str = string.replace(str, "{", " ")
				437	str = string.replace(str, "}", " ")
				438	str = string.replace(str, "<", " ")
				439	str = string.replace(str, ">", " ")
				440	str = string.replace(str, "/", " ")
				441	str = string.replace(str, "*", " ")
				442	str = string.replace(str, ":", " ")
				443	str = string.replace(str, "\n", " ")
				444	str = string.replace(str, "\r", " ")
				445	str = string.replace(str, "\xc2", " ")
				446	str = string.replace(str, "\xa0", " ")
				447	l = string.split(str)
				448	for word in l:
				449	if len(word) > 2:
				450	ret = ret + addWordHTML(word, resource, id, section, relevance)
				451
				452	return ret
				453
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	454
				455	#########################################################################
				456	# #
				457	# XML API description analysis #
				458	# #
				459	#########################################################################
				460
				461	def loadAPI(filename):
				462	doc = libxml2.parseFile(filename)
				463	print "loaded %s" % (filename)
				464	return doc
				465
				466	def foundExport(file, symbol):
				467	if file == None:
				468	return 0
				469	if symbol == None:
				470	return 0
				471	addFunction(symbol, file)
				472	l = splitIdentifier(symbol)
				473	for word in l:
				474	addWord(word, file, symbol, 10)
				475	return 1
				476
				477	def analyzeAPIFile(top):
				478	count = 0
				479	name = top.prop("name")
				480	cur = top.children
				481	while cur != None:
				482	if cur.type == 'text':
				483	cur = cur.next
				484	continue
				485	if cur.name == "exports":
				486	count = count + foundExport(name, cur.prop("symbol"))
				487	else:
				488	print "unexpected element %s in API doc <file name='%s'>" % (name)
				489	cur = cur.next
				490	return count
				491
				492	def analyzeAPIFiles(top):
				493	count = 0
				494	cur = top.children
				495
				496	while cur != None:
				497	if cur.type == 'text':
				498	cur = cur.next
				499	continue
				500	if cur.name == "file":
				501	count = count + analyzeAPIFile(cur)
				502	else:
				503	print "unexpected element %s in API doc <files>" % (cur.name)
				504	cur = cur.next
				505	return count
				506
				507	def analyzeAPIEnum(top):
				508	file = top.prop("file")
				509	if file == None:
				510	return 0
				511	symbol = top.prop("name")
				512	if symbol == None:
				513	return 0
				514
				515	addEnum(symbol, file)
				516	l = splitIdentifier(symbol)
				517	for word in l:
				518	addWord(word, file, symbol, 10)
				519
				520	return 1
				521
				522	def analyzeAPIConst(top):
				523	file = top.prop("file")
				524	if file == None:
				525	return 0
				526	symbol = top.prop("name")
				527	if symbol == None:
				528	return 0
				529
				530	addConst(symbol, file)
				531	l = splitIdentifier(symbol)
				532	for word in l:
				533	addWord(word, file, symbol, 10)
				534
				535	return 1
				536
				537	def analyzeAPIType(top):
				538	file = top.prop("file")
				539	if file == None:
				540	return 0
				541	symbol = top.prop("name")
				542	if symbol == None:
				543	return 0
				544
				545	addType(symbol, file)
				546	l = splitIdentifier(symbol)
				547	for word in l:
				548	addWord(word, file, symbol, 10)
				549	return 1
				550
				551	def analyzeAPIFunctype(top):
				552	file = top.prop("file")
				553	if file == None:
				554	return 0
				555	symbol = top.prop("name")
				556	if symbol == None:
				557	return 0
				558
				559	addFunctype(symbol, file)
				560	l = splitIdentifier(symbol)
				561	for word in l:
				562	addWord(word, file, symbol, 10)
				563	return 1
				564
				565	def analyzeAPIStruct(top):
				566	file = top.prop("file")
				567	if file == None:
				568	return 0
				569	symbol = top.prop("name")
				570	if symbol == None:
				571	return 0
				572
				573	addStruct(symbol, file)
				574	l = splitIdentifier(symbol)
				575	for word in l:
				576	addWord(word, file, symbol, 10)
				577
				578	info = top.prop("info")
				579	if info != None:
				580	l = string.split(info)
				581	for word in l:
				582	if len(word) > 2:
				583	addWord(word, file, symbol, 5)
				584	return 1
				585
				586	def analyzeAPIMacro(top):
				587	file = top.prop("file")
				588	if file == None:
				589	return 0
				590	symbol = top.prop("name")
				591	if symbol == None:
				592	return 0
				593
				594	info = None
				595	cur = top.children
				596	while cur != None:
				597	if cur.type == 'text':
				598	cur = cur.next
				599	continue
				600	if cur.name == "info":
				601	info = cur.content
				602	break
				603	cur = cur.next
				604
				605	l = splitIdentifier(symbol)
				606	for word in l:
				607	addWord(word, file, symbol, 10)
				608
				609	if info == None:
				610	addMacro(symbol, file)
				611	print "Macro %s description has no <info>" % (symbol)
				612	return 0
				613
				614	addMacro(symbol, file, info)
				615	l = string.split(info)
				616	for word in l:
				617	if len(word) > 2:
				618	addWord(word, file, symbol, 5)
				619	return 1
				620
				621	def analyzeAPIFunction(top):
				622	file = top.prop("file")
				623	if file == None:
				624	return 0
				625	symbol = top.prop("name")
				626	if symbol == None:
				627	return 0
				628
				629	info = None
				630	cur = top.children
				631	while cur != None:
				632	if cur.type == 'text':
				633	cur = cur.next
				634	continue
				635	if cur.name == "info":
				636	info = cur.content
				637	elif cur.name == "return":
				638	rinfo = cur.prop("info")
				639	if rinfo != None:
				640	addString(rinfo, file, symbol, 7)
				641	elif cur.name == "arg":
				642	ainfo = cur.prop("info")
				643	if rinfo != None:
				644	addString(ainfo, file, symbol, 5)
				645	name = cur.prop("name")
				646	if name != None:
				647	addWord(name, file, symbol, 7)
				648	cur = cur.next
				649	if info == None:
				650	print "Function %s description has no <info>" % (symbol)
				651	addFunction(symbol, file, "")
				652	else:
				653	addFunction(symbol, file, info)
				654	addString(info, file, symbol, 5)
				655
				656	l = splitIdentifier(symbol)
				657	for word in l:
				658	addWord(word, file, symbol, 10)
				659
				660	return 1
				661
				662	def analyzeAPISymbols(top):
				663	count = 0
				664	cur = top.children
				665
				666	while cur != None:
				667	if cur.type == 'text':
				668	cur = cur.next
				669	continue
				670	if cur.name == "macro":
				671	count = count + analyzeAPIMacro(cur)
				672	elif cur.name == "function":
				673	count = count + analyzeAPIFunction(cur)
				674	elif cur.name == "const":
				675	count = count + analyzeAPIConst(cur)
				676	elif cur.name == "typedef":
				677	count = count + analyzeAPIType(cur)
				678	elif cur.name == "struct":
				679	count = count + analyzeAPIStruct(cur)
				680	elif cur.name == "enum":
				681	count = count + analyzeAPIEnum(cur)
				682	elif cur.name == "functype":
				683	count = count + analyzeAPIFunctype(cur)
				684	else:
				685	print "unexpected element %s in API doc <files>" % (cur.name)
				686	cur = cur.next
				687	return count
				688
				689	def analyzeAPI(doc):
				690	count = 0
				691	if doc == None:
				692	return -1
				693	root = doc.getRootElement()
				694	if root.name != "api":
				695	print "Unexpected root name"
				696	return -1
				697	cur = root.children
				698	while cur != None:
				699	if cur.type == 'text':
				700	cur = cur.next
				701	continue
				702	if cur.name == "files":
				703	pass
				704	# count = count + analyzeAPIFiles(cur)
				705	elif cur.name == "symbols":
				706	count = count + analyzeAPISymbols(cur)
				707	else:
				708	print "unexpected element %s in API doc" % (cur.name)
				709	cur = cur.next
				710	return count
				711
				712	#########################################################################
				713	# #
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	714	# Web pages parsing and analysis #
				715	# #
				716	#########################################################################
				717
				718	import glob
				719
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of text node `p` with relevance 5.

    Returns the addStringHTML() result, or -1 if anything goes wrong.
    """
    try:
        indexed = addStringHTML(p.content, resource, id, section, 5)
    except:
        return -1
    return indexed
				728
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of paragraph node `p` with relevance 5.

    Returns the addStringHTML() result, or -1 if anything goes wrong.
    """
    try:
        text = p.content
        return addStringHTML(text, resource, id, section, 5)
    except:
        return -1
				737
				738	def analyzeHTMLPre(doc, resource, p, section, id):
				739	words = 0
				740	try:
				741	content = p.content
				742	words = words + addStringHTML(content, resource, id, section, 5)
				743	except:
				744	return -1
				745	return words
				746
# NOTE(review): the name analyzeHTML is bound again by the two-argument
# definition that follows, so this five-argument version cannot be reached
# by name once the module has loaded -- confirm whether it can be removed.
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of node `p` with relevance 5.

    Returns the addStringHTML() result, or -1 if anything goes wrong.
    """
    words = 0
    try:
        content = p.content
        words = words + addStringHTML(content, resource, id, section, 5)
    except:
        return -1
    return words
				755
def analyzeHTML(doc, resource):
    """Index one parsed HTML page.

    The page title (or "Page <resource>" when none is found) is recorded
    with addPage(). The h1/h2/h3 headings and text nodes are then visited:
    a heading sets the current section and, from its "id" or "name"
    property, the current anchor; each text node is indexed under the
    current section and anchor.

    Returns the number of text blocks handed to the indexing helpers.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        try:
            res = ctxt.xpathEval("//head/title")
            title = res[0].content
        except Exception:
            title = "Page %s" % (resource)
        addPage(resource, title)
        try:
            # '|' is the XPath union operator and must not be escaped.
            items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
            section = title
            id = ""
            for item in items:
                if item.name in ('h1', 'h2', 'h3'):
                    section = item.content
                    if item.prop("id"):
                        id = item.prop("id")
                    elif item.prop("name"):
                        id = item.prop("name")
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'text':
                    analyzeHTMLPara(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, id)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource, item.name))
        except Exception:
            print("Page %s: problem analyzing" % (resource))
            print("%s %s" % sys.exc_info()[:2])
    finally:
        # The XPath context must be released explicitly.
        ctxt.xpathFreeContext()

    return para
				792
				793	def analyzeHTMLPages():
				794	ret = 0
				795	HTMLfiles = glob.glob(".html") + glob.glob("tutorial/.html")
				796	for html in HTMLfiles:
				797	if html[0:3] == "API":
				798	continue
				799	if html == "xml.html":
				800	continue
				801	try:
				802	doc = libxml2.htmlParseFile(html, None)
				803	res = analyzeHTML(doc, html)
				804	print "Parsed %s : %d paragraphs" % (html, res)
				805	ret = ret + 1
				806	except:
				807	print "could not parse %s" % (html)
				808	return ret
				809
				810	#########################################################################
				811	# #
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	812	# Main code: open the DB, the API XML and analyze it #
				813	# #
				814	#########################################################################
				815	try:
				816	openMySQL()
				817	except:
				818	print "Failed to open the database"
				819	print sys.exc_type, sys.exc_value
				820	sys.exit(1)
				821
Daniel Veillard	141d04b	2002-10-06 21:51:18 +0000	[diff] [blame]	822	ret = analyzeHTMLPages()
				823	print "Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret)
				824
				825	i = 0
				826	skipped = 0
				827	for word in wordsDictHTML.keys():
				828	refs = wordsDictHTML[word]
				829	if refs == None:
				830	skipped = skipped + 1
				831	continue;
				832	for resource in refs.keys():
				833	(relevance, id, section) = refs[resource]
				834	updateWordHTML(word, resource, section, id, relevance)
				835	i = i + 1
				836
				837	print "Found %d associations in HTML pages" % (i)
				838
Daniel Veillard	3371ff8	2002-10-01 13:37:48 +0000	[diff] [blame]	839	try:
				840	doc = loadAPI(API)
				841	ret = analyzeAPI(doc)
				842	print "Analyzed %d blocs" % (ret)
				843	doc.freeDoc()
				844	except:
				845	print "Failed to parse and analyze %s" % (API)
				846	print sys.exc_type, sys.exc_value
				847	sys.exit(1)
				848
				849	print "Indexed %d words" % (len(wordsDict))
				850	i = 0
				851	skipped = 0
				852	for word in wordsDict.keys():
				853	refs = wordsDict[word]
				854	if refs == None:
				855	skipped = skipped + 1
				856	continue;
				857	for (module, symbol) in refs.keys():
				858	updateWord(word, symbol, refs[(module, symbol)])
				859	i = i + 1
				860
				861	print "Found %d associations, skipped %d words" % (i, skipped)