Ben Murdoch | ca12bfa | 2013-07-23 11:17:05 +0100 | [diff] [blame] | 1 | # Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 | # Use of this source code is governed by a BSD-style license that can be |
| 3 | # found in the LICENSE file. |
| 4 | |
| 5 | from collections import defaultdict, deque, namedtuple |
| 6 | from HTMLParser import HTMLParser, HTMLParseError |
Ben Murdoch | bb1529c | 2013-08-08 10:24:53 +0100 | [diff] [blame^] | 7 | from itertools import groupby |
| 8 | from operator import itemgetter |
Ben Murdoch | ca12bfa | 2013-07-23 11:17:05 +0100 | [diff] [blame] | 9 | import posixpath |
| 10 | from urlparse import urlsplit |
| 11 | |
| 12 | from file_system_util import CreateURLsFromPaths |
| 13 | import svn_constants |
| 14 | |
# Result of rendering one page: |status| is the HTTP status code returned by
# the renderer, |links| the outgoing (non-anchor) link targets found on the
# page, |anchors| the ids/names defined on the page, and |anchor_refs| the
# links that carry a '#fragment' component.
Page = namedtuple('Page', 'status, links, anchors, anchor_refs')
| 16 | |
| 17 | def _SplitAnchor(url): |
| 18 | components = urlsplit(url) |
| 19 | return components.path, components.fragment |
| 20 | |
def _Process(path, renderer):
  '''Render the page at |path| using a |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the http status code
  of the page render, the href of all the links that occurred on the page, all
  of the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-html page is properly rendered, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    # Malformed html is treated as a successfully rendered page with no links.
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  # dirname handles a path with no '/' (a top-level page) by returning '',
  # whereas rsplit('/', 1) would raise ValueError in that case.
  base = posixpath.dirname(path)
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      # Replace all but the final '.' (the one in '.html') with '_'.
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      # A bare fragment refers to an anchor on the current page.
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link[1:]
      else:
        # Resolve the relative link against the current page's directory.
        link = posixpath.normpath('%s/%s' % (base, link))

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.append(link)

  return Page(200, edges, anchors, anchor_refs)
| 77 | |
Ben Murdoch | ca12bfa | 2013-07-23 11:17:05 +0100 | [diff] [blame] | 78 | class _ContentParser(HTMLParser): |
| 79 | '''Parse an html file pulling out all links and anchor_refs, where an |
| 80 | anchor_ref is a link that contains an anchor. |
| 81 | ''' |
| 82 | |
| 83 | def __init__(self): |
| 84 | HTMLParser.__init__(self) |
| 85 | self.links = [] |
| 86 | self.anchors = set() |
| 87 | |
| 88 | def handle_starttag(self, tag, raw_attrs): |
| 89 | attrs = dict(raw_attrs) |
| 90 | |
| 91 | if tag == 'a': |
| 92 | # Handle special cases for href's that: start with a space, contain |
| 93 | # just a '.' (period), contain python templating code, are an absolute |
| 94 | # url, are a zip file, or execute javascript on the page. |
| 95 | href = attrs.get('href', '').strip() |
| 96 | if href and not href == '.' and not '{{' in href: |
| 97 | if not urlsplit(href).scheme in ('http', 'https'): |
| 98 | if not href.endswith('.zip') and not 'javascript:' in href: |
| 99 | self.links.append(href) |
| 100 | |
| 101 | if attrs.get('id'): |
| 102 | self.anchors.add(attrs['id']) |
| 103 | if attrs.get('name'): |
| 104 | self.anchors.add(attrs['name']) |
| 105 | |
class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links, those with
  a target page that 404s or contain an anchor that doesn't exist, or pages that
  have no links to them.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes for
    the orphaned page search.

    Rendering happens eagerly: the constructor renders and processes every
    public page via _RenderAllPages before returning.
    '''
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    # Unknown urls default to a 404 Page so lookups never raise KeyError.
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    # Pages that legitimately have no inbound links; they are seeded into the
    # orphan search so they are never reported as orphans.
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
    # Link targets that are expected to redirect and should not be reported.
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory rendering each URL and
    processing the resultant html to pull out all links and anchors.
    '''
    # (svn directory path, url prefix its files are served under).
    top_level_directories = (
        (svn_constants.PUBLIC_TEMPLATE_PATH, ''),
        (svn_constants.STATIC_PATH, 'static/'),
        (svn_constants.EXAMPLES_PATH, 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, path in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          print(url, ', a url derived from the path', dirpath +
              ', resulted in a', self._pages[url].status)

  def _FollowRedirections(self, starting_url, limit=4):
    '''Follow redirection until a non-redirectable page is reached. Start at
    |starting_url| which must return a 301 or 302 status code.

    Return a tuple of: the status of rendering |starting_url|, the final url,
    and a list of the pages reached including |starting_url|. If no redirection
    occurred, returns (None, None, None).
    '''
    pages_reached = [starting_url]
    redirect_link = None
    target_page = self._renderer(starting_url)
    original_status = status = target_page.status
    count = 0

    while status in (301, 302):
      # Give up after |limit| hops to avoid chasing a redirect loop.
      if count > limit:
        return None, None, None
      redirect_link = target_page.headers.get('Location')
      target_page = self._renderer(redirect_link)
      status = target_page.status
      pages_reached.append(redirect_link)
      count += 1

    # redirect_link is still None iff the loop never ran (no redirection).
    if redirect_link is None:
      return None, None, None

    return original_status, redirect_link, pages_reached

  def _CategorizeBrokenLinks(self, url, page, pages):
    '''Find all broken links on a page and create appropriate notes describing
    why they are broken (broken anchor, target redirects, etc). |page| is the
    current page being checked and is the result of rendering |url|. |pages|
    is a callable that takes a path and returns a Page.

    Returns a list of (status, url, link, note) tuples.
    '''
    broken_links = []

    for link in page.links + page.anchor_refs:
      components = urlsplit(link)
      fragment = components.fragment

      if components.path == '':
        # Same-page anchor reference; '#top' and a bare '#' always resolve.
        if fragment == 'top' or fragment == '':
          continue
        if not fragment in page.anchors:
          broken_links.append((200, url, link, 'target anchor not found'))
      else:
        # Render the target page
        target_page = pages(components.path)

        if target_page.status != 200:
          if components.path in self._redirection_whitelist:
            continue

          # Distinguish a redirect (reportable with its destination) from a
          # genuinely missing page.
          status, relink, _ = self._FollowRedirections(components.path)
          if relink:
            broken_links.append((
                status,
                url,
                link,
                'redirects to %s' % relink))
          else:
            broken_links.append((
                target_page.status, url, link, 'target page not found'))

        elif fragment:
          if not fragment in target_page.anchors:
            broken_links.append((
                target_page.status, url, link, 'target anchor not found'))

    return broken_links

  def GetBrokenLinks(self):
    '''Find all broken links. A broken link is a link that leads to a page
    that does not exist (404s), redirects to another page (301 or 302), or
    has an anchor whose target does not exist.

    Returns a list of tuples of four elements: status, url, target_page,
    notes.
    '''
    broken_links = []

    # NOTE(review): the lambda below can insert 404 entries into the
    # defaultdict |self._pages| while this loop runs; this relies on Python 2's
    # .keys() returning a list copy — would raise RuntimeError on Python 3.
    for url in self._pages.keys():
      page = self._pages[url]
      if page.status != 200:
        continue
      broken_links.extend(self._CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x]))

    return broken_links

  def GetOrphanedPages(self):
    '''Crawls the server find all pages that are connected to the pages at
    |seed_url|s. Return the links that are valid on the server but are not in
    part of the connected component containing the |root_pages|. These pages
    are orphans and cannot be reached simply by clicking through the server.
    '''
    # Breadth-first search seeded with the root pages plus the pages that are
    # detached by design (so they are not reported as orphans).
    pages_to_check = deque(self._root_pages.union(self._always_detached))
    found = set(self._root_pages) | self._always_detached

    while pages_to_check:
      item = pages_to_check.popleft()
      target_page = self._pages[item]

      if target_page.status != 200:
        # The page may live behind a redirect; follow it and crawl the
        # destination's links instead.
        redirected_page = self._FollowRedirections(item)[1]
        if not redirected_page is None:
          target_page = self._pages[redirected_page]

      for link in target_page.links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    # Only successfully rendered pages can be orphans; non-html urls (images,
    # scripts) are excluded from the report.
    all_urls = set(
        [url for url, page in self._pages.iteritems() if page.status == 200])

    return [url for url in all_urls - found if url.endswith('.html')]
Ben Murdoch | bb1529c | 2013-08-08 10:24:53 +0100 | [diff] [blame^] | 268 | |
def StringifyBrokenLinks(broken_links):
  '''Format |broken_links| — a list of (status, url, target, notes) tuples as
  produced by LinkErrorDetector.GetBrokenLinks — into a readable, column-
  aligned, newline-separated string. Returns the string (does not print it).
  Returns '' when |broken_links| is empty.
  '''
  if not broken_links:
    # The max() calls below raise ValueError on an empty sequence.
    return ''

  def fixed_width(string, width):
    # Left-justify |string| within a field of |width| characters.
    return "%s%s" % (string, (width - len(string)) * ' ')

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  # Group by the link target (tuple index 2); named to avoid being shadowed
  # by the |target| loop variable below.
  link_key = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    return "%s -> %s %s" % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  for target, links in groupby(sorted(broken_links, key=link_key), link_key):
    links = list(links)
    # Compress messages: many pages pointing at the same missing (non-anchor)
    # target collapse to a single summary line.
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      for link in links:
        output.append(pretty_print(link))

  return '\n'.join(output)