blob: 528c8107e6a6497330454a234c3953ac66a847ad [file] [log] [blame]
Ben Murdochca12bfa2013-07-23 11:17:05 +01001# Copyright 2013 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5from collections import defaultdict, deque, namedtuple
6from HTMLParser import HTMLParser, HTMLParseError
Ben Murdochbb1529c2013-08-08 10:24:53 +01007from itertools import groupby
8from operator import itemgetter
Ben Murdochca12bfa2013-07-23 11:17:05 +01009import posixpath
10from urlparse import urlsplit
11
12from file_system_util import CreateURLsFromPaths
13import svn_constants
14
# Result of rendering and parsing one page: |status| is the http status code
# of the render; |links| holds the hrefs found on the page; |anchors| holds
# the ids/names defined on the page; |anchor_refs| holds links that include
# a '#fragment' component.
Page = namedtuple('Page', 'status, links, anchors, anchor_refs')
16
17def _SplitAnchor(url):
18 components = urlsplit(url)
19 return components.path, components.fragment
20
def _Process(path, renderer):
  '''Render the page at |path| using a |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the http status code
  of the page render, the href of all the links that occurred on the page, all
  of the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-html page is properly rendered, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    # Malformed html is treated as a successfully rendered page with no links.
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  # posixpath.dirname gives the same result as rsplit('/', 1)[0] for any path
  # containing a '/', and returns '' (instead of raising ValueError) for a
  # top-level path with no '/'.
  base = posixpath.dirname(path)
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      # Replace every '.' except the one before the extension with '_'.
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      # A bare fragment refers to an anchor on the current page.
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link[1:]
      else:
        link = posixpath.normpath('%s/%s' % (base, link))

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.append(link)

  return Page(200, edges, anchors, anchor_refs)
77
Ben Murdochca12bfa2013-07-23 11:17:05 +010078class _ContentParser(HTMLParser):
79 '''Parse an html file pulling out all links and anchor_refs, where an
80 anchor_ref is a link that contains an anchor.
81 '''
82
83 def __init__(self):
84 HTMLParser.__init__(self)
85 self.links = []
86 self.anchors = set()
87
88 def handle_starttag(self, tag, raw_attrs):
89 attrs = dict(raw_attrs)
90
91 if tag == 'a':
92 # Handle special cases for href's that: start with a space, contain
93 # just a '.' (period), contain python templating code, are an absolute
94 # url, are a zip file, or execute javascript on the page.
95 href = attrs.get('href', '').strip()
96 if href and not href == '.' and not '{{' in href:
97 if not urlsplit(href).scheme in ('http', 'https'):
98 if not href.endswith('.zip') and not 'javascript:' in href:
99 self.links.append(href)
100
101 if attrs.get('id'):
102 self.anchors.add(attrs['id'])
103 if attrs.get('name'):
104 self.anchors.add(attrs['name'])
105
class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links, those with
  a target page that 404s or contain an anchor that doesn't exist, or pages that
  have no links to them.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes for
    the orphaned page search.
    '''
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    # Missing pages look up as a 404 |Page| with no links or anchors; note
    # that merely reading a missing key inserts that entry (defaultdict).
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    # Pages intentionally unreachable by clicking through the server; they are
    # seeded into the orphan search so they are never reported as orphans.
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
    # Link targets whose redirection is expected and not reported as broken.
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory rendering each URL and
    processing the resultant html to pull out all links and anchors.
    '''
    top_level_directories = (
      (svn_constants.PUBLIC_TEMPLATE_PATH, ''),
      (svn_constants.STATIC_PATH, 'static/'),
      (svn_constants.EXAMPLES_PATH, 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, path in files:
        self._pages[url] = _Process(url, self._renderer)

        # Log any page that did not render cleanly; it is still recorded so
        # links to it can be categorized later.
        if self._pages[url].status != 200:
          print(url, ', a url derived from the path', dirpath +
              ', resulted in a', self._pages[url].status)

  def _FollowRedirections(self, starting_url, limit=4):
    '''Follow redirection until a non-redirectable page is reached. Start at
    |starting_url| which must return a 301 or 302 status code.

    Return a tuple of: the status of rendering |starting_url|, the final url,
    and a list of the pages reached including |starting_url|. If no redirection
    occurred, returns (None, None, None).
    '''
    pages_reached = [starting_url]
    redirect_link = None
    target_page = self._renderer(starting_url)
    original_status = status = target_page.status
    count = 0

    # Chase Location headers; give up (treat as no redirection) after |limit|
    # hops to avoid looping forever on a redirect cycle.
    while status in (301, 302):
      if count > limit:
        return None, None, None
      redirect_link = target_page.headers.get('Location')
      target_page = self._renderer(redirect_link)
      status = target_page.status
      pages_reached.append(redirect_link)
      count += 1

    if redirect_link is None:
      return None, None, None

    return original_status, redirect_link, pages_reached

  def _CategorizeBrokenLinks(self, url, page, pages):
    '''Find all broken links on a page and create appropriate notes describing
    why they are broken (broken anchor, target redirects, etc). |page| is the
    current page being checked and is the result of rendering |url|. |pages|
    is a callable that takes a path and returns a Page.

    Returns a list of (status, url, link, note) tuples.
    '''
    broken_links = []

    for link in page.links + page.anchor_refs:
      components = urlsplit(link)
      fragment = components.fragment

      if components.path == '':
        # A same-page anchor reference. '#top' and a bare '#' always work.
        if fragment == 'top' or fragment == '':
          continue
        if not fragment in page.anchors:
          broken_links.append((200, url, link, 'target anchor not found'))
      else:
        # Render the target page
        target_page = pages(components.path)

        if target_page.status != 200:
          if components.path in self._redirection_whitelist:
            continue

          # Distinguish a target that redirects from one that is truly
          # missing; a redirecting target is reported with its final url.
          status, relink, _ = self._FollowRedirections(components.path)
          if relink:
            broken_links.append((
                status,
                url,
                link,
                'redirects to %s' % relink))
          else:
            broken_links.append((
                target_page.status, url, link, 'target page not found'))

        elif fragment:
          if not fragment in target_page.anchors:
            broken_links.append((
                target_page.status, url, link, 'target anchor not found'))

    return broken_links

  def GetBrokenLinks(self):
    '''Find all broken links. A broken link is a link that leads to a page
    that does not exist (404s), redirects to another page (301 or 302), or
    has an anchor whose target does not exist.

    Returns a list of tuples of four elements: status, url, target_page,
    notes.
    '''
    broken_links = []

    # NOTE(review): .keys() makes a list copy under Python 2, which matters:
    # the lambda below can insert new 404 entries into the defaultdict while
    # we iterate. Under Python 3 this would need list(self._pages).
    for url in self._pages.keys():
      page = self._pages[url]
      if page.status != 200:
        continue
      broken_links.extend(self._CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x]))

    return broken_links

  def GetOrphanedPages(self):
    '''Crawls the server find all pages that are connected to the pages at
    |seed_url|s. Return the links that are valid on the server but are not in
    part of the connected component containing the |root_pages|. These pages
    are orphans and cannot be reached simply by clicking through the server.
    '''
    # Breadth-first search seeded with the root pages plus the pages that are
    # detached by design (so they are never reported as orphans).
    pages_to_check = deque(self._root_pages.union(self._always_detached))
    found = set(self._root_pages) | self._always_detached

    while pages_to_check:
      item = pages_to_check.popleft()
      target_page = self._pages[item]

      if target_page.status != 200:
        # A non-200 page may simply redirect; follow it and crawl the links
        # of wherever it lands instead.
        redirected_page = self._FollowRedirections(item)[1]
        if not redirected_page is None:
          target_page = self._pages[redirected_page]

      for link in target_page.links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
      [url for url, page in self._pages.iteritems() if page.status == 200])

    # Only html pages count as orphans; static assets etc. are ignored.
    return [url for url in all_urls - found if url.endswith('.html')]
Ben Murdochbb1529c2013-08-08 10:24:53 +0100268
def StringifyBrokenLinks(broken_links):
  '''Returns a readable multi-line string describing |broken_links|, a
  collection of (status, url, target, notes) tuples as produced by
  LinkErrorDetector.GetBrokenLinks. Links are grouped by target; groups of
  more than 50 links to a non-anchor target are compressed to a single
  summary line. Returns the empty string for an empty collection.
  '''
  if not broken_links:
    # The max() calls below raise ValueError on an empty sequence.
    return ''

  def fixed_width(string, width):
    # Left-justify |string| in a field |width| characters wide.
    return "%s%s" % (string, (width - len(string)) * ' ')

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  target = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    return "%s -> %s %s" % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  for target, links in groupby(sorted(broken_links, key=target), target):
    links = list(links)
    # Compress messages: many pages linking to the same broken (non-anchor)
    # target are reported as one summary line rather than one line each.
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      for link in links:
        output.append(pretty_print(link))

  return '\n'.join(output)