Ben Murdoch | ca12bfa | 2013-07-23 11:17:05 +0100 | [diff] [blame] | 1 | # Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 | # Use of this source code is governed by a BSD-style license that can be |
| 3 | # found in the LICENSE file. |
| 4 | |
| 5 | from collections import defaultdict, deque, namedtuple |
| 6 | from HTMLParser import HTMLParser, HTMLParseError |
Ben Murdoch | bb1529c | 2013-08-08 10:24:53 +0100 | [diff] [blame^] | 7 | from itertools import groupby |
| 8 | from operator import itemgetter |
Ben Murdoch | ca12bfa | 2013-07-23 11:17:05 +0100 | [diff] [blame] | 9 | import posixpath |
| 10 | from urlparse import urlsplit |
| 11 | |
| 12 | from file_system_util import CreateURLsFromPaths |
| 13 | import svn_constants |
| 14 | |
# Result of rendering one page: |status| is the HTTP status code returned by
# the renderer, |links| the outgoing (non-anchor) link targets found on the
# page, |anchors| the ids/names defined on the page, and |anchor_refs| the
# links that carry a '#fragment' component.
Page = namedtuple('Page', 'status, links, anchors, anchor_refs')
| 16 | |
| 17 | def _SplitAnchor(url): |
| 18 | components = urlsplit(url) |
| 19 | return components.path, components.fragment |
| 20 | |
def _Process(path, renderer):
  '''Render the page at |path| using a |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the http status code
  of the page render, the href of all the links that occurred on the page, all
  of the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-html page is properly rendered, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    # Malformed html is treated as a successfully rendered page with no links.
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  # dirname handles a path with no '/' (a top-level page) by returning '',
  # whereas rsplit('/', 1) would raise ValueError in that case.
  base = posixpath.dirname(path)
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      # Replace all but the final '.' (the one in '.html') with '_'.
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      # A bare fragment refers to an anchor on the current page.
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link[1:]
      else:
        # Resolve the relative link against the current page's directory.
        link = posixpath.normpath('%s/%s' % (base, link))

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.append(link)

  return Page(200, edges, anchors, anchor_refs)
| 77 | |
Ben Murdoch | ca12bfa | 2013-07-23 11:17:05 +0100 | [diff] [blame] | 78 | class _ContentParser(HTMLParser): |
| 79 | '''Parse an html file pulling out all links and anchor_refs, where an |
| 80 | anchor_ref is a link that contains an anchor. |
| 81 | ''' |
| 82 | |
| 83 | def __init__(self): |
| 84 | HTMLParser.__init__(self) |
| 85 | self.links = [] |
| 86 | self.anchors = set() |
| 87 | |
| 88 | def handle_starttag(self, tag, raw_attrs): |
| 89 | attrs = dict(raw_attrs) |
| 90 | |
| 91 | if tag == 'a': |
| 92 | # Handle special cases for href's that: start with a space, contain |
| 93 | # just a '.' (period), contain python templating code, are an absolute |
| 94 | # url, are a zip file, or execute javascript on the page. |
| 95 | href = attrs.get('href', '').strip() |
| 96 | if href and not href == '.' and not '{{' in href: |
| 97 | if not urlsplit(href).scheme in ('http', 'https'): |
| 98 | if not href.endswith('.zip') and not 'javascript:' in href: |
| 99 | self.links.append(href) |
| 100 | |
| 101 | if attrs.get('id'): |
| 102 | self.anchors.add(attrs['id']) |
| 103 | if attrs.get('name'): |
| 104 | self.anchors.add(attrs['name']) |
| 105 | |
class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links, those with
  a target page that 404s or contain an anchor that doesn't exist, or pages that
  have no links to them.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes for
    the orphaned page search.

    Rendering happens eagerly: the constructor renders and processes every
    public page via _RenderAllPages before returning.
    '''
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    # Unknown urls default to a 404 Page so lookups never raise KeyError.
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    # Pages that legitimately have no inbound links; they are seeded into the
    # orphan search so they are never reported as orphans.
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
    # Link targets that are expected to redirect and should not be reported.
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory rendering each URL and
    processing the resultant html to pull out all links and anchors.
    '''
    # (svn directory path, url prefix its files are served under).
    top_level_directories = (
        (svn_constants.PUBLIC_TEMPLATE_PATH, ''),
        (svn_constants.STATIC_PATH, 'static/'),
        (svn_constants.EXAMPLES_PATH, 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, path in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          print(url, ', a url derived from the path', dirpath +
              ', resulted in a', self._pages[url].status)

  def _FollowRedirections(self, starting_url, limit=4):
    '''Follow redirection until a non-redirectable page is reached. Start at
    |starting_url| which must return a 301 or 302 status code.

    Return a tuple of: the status of rendering |starting_url|, the final url,
    and a list of the pages reached including |starting_url|. If no redirection
    occurred, returns (None, None, None).
    '''
    pages_reached = [starting_url]
    redirect_link = None
    target_page = self._renderer(starting_url)
    original_status = status = target_page.status
    count = 0

    while status in (301, 302):
      # Give up after |limit| hops to avoid chasing a redirect loop.
      if count > limit:
        return None, None, None
      redirect_link = target_page.headers.get('Location')
      target_page = self._renderer(redirect_link)
      status = target_page.status
      pages_reached.append(redirect_link)
      count += 1

    # redirect_link is still None iff the loop never ran (no redirection).
    if redirect_link is None:
      return None, None, None

    return original_status, redirect_link, pages_reached

  def _CategorizeBrokenLinks(self, url, page, pages):
    '''Find all broken links on a page and create appropriate notes describing
    why they are broken (broken anchor, target redirects, etc). |page| is the
    current page being checked and is the result of rendering |url|. |pages|
    is a callable that takes a path and returns a Page.

    Returns a list of (status, url, link, note) tuples.
    '''
    broken_links = []

    for link in page.links + page.anchor_refs:
      components = urlsplit(link)
      fragment = components.fragment

      if components.path == '':
        # Same-page anchor reference; '#top' and a bare '#' always resolve.
        if fragment == 'top' or fragment == '':
          continue
        if not fragment in page.anchors:
          broken_links.append((200, url, link, 'target anchor not found'))
      else:
        # Render the target page
        target_page = pages(components.path)

        if target_page.status != 200:
          if components.path in self._redirection_whitelist:
            continue

          # Distinguish a redirect (reportable with its destination) from a
          # genuinely missing page.
          status, relink, _ = self._FollowRedirections(components.path)
          if relink:
            broken_links.append((
                status,
                url,
                link,
                'redirects to %s' % relink))
          else:
            broken_links.append((
                target_page.status, url, link, 'target page not found'))

        elif fragment:
          if not fragment in target_page.anchors:
            broken_links.append((
                target_page.status, url, link, 'target anchor not found'))

    return broken_links

  def GetBrokenLinks(self):
    '''Find all broken links. A broken link is a link that leads to a page
    that does not exist (404s), redirects to another page (301 or 302), or
    has an anchor whose target does not exist.

    Returns a list of tuples of four elements: status, url, target_page,
    notes.
    '''
    broken_links = []

    # NOTE(review): the lambda below can insert 404 entries into the
    # defaultdict |self._pages| while this loop runs; this relies on Python 2's
    # .keys() returning a list copy — would raise RuntimeError on Python 3.
    for url in self._pages.keys():
      page = self._pages[url]
      if page.status != 200:
        continue
      broken_links.extend(self._CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x]))

    return broken_links

  def GetOrphanedPages(self):
    '''Crawls the server find all pages that are connected to the pages at
    |seed_url|s. Return the links that are valid on the server but are not in
    part of the connected component containing the |root_pages|. These pages
    are orphans and cannot be reached simply by clicking through the server.
    '''
    # Breadth-first search seeded with the root pages plus the pages that are
    # detached by design (so they are not reported as orphans).
    pages_to_check = deque(self._root_pages.union(self._always_detached))
    found = set(self._root_pages) | self._always_detached

    while pages_to_check:
      item = pages_to_check.popleft()
      target_page = self._pages[item]

      if target_page.status != 200:
        # The page may live behind a redirect; follow it and crawl the
        # destination's links instead.
        redirected_page = self._FollowRedirections(item)[1]
        if not redirected_page is None:
          target_page = self._pages[redirected_page]

      for link in target_page.links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    # Only successfully rendered pages can be orphans; non-html urls (images,
    # scripts) are excluded from the report.
    all_urls = set(
        [url for url, page in self._pages.iteritems() if page.status == 200])

    return [url for url in all_urls - found if url.endswith('.html')]
Ben Murdoch | bb1529c | 2013-08-08 10:24:53 +0100 | [diff] [blame^] | 268 | |
def StringifyBrokenLinks(broken_links):
  '''Format |broken_links| — a list of (status, url, target, notes) tuples as
  produced by LinkErrorDetector.GetBrokenLinks — into a readable, column-
  aligned, newline-separated string. Returns the string (does not print it).
  Returns '' when |broken_links| is empty.
  '''
  if not broken_links:
    # The max() calls below raise ValueError on an empty sequence.
    return ''

  def fixed_width(string, width):
    # Left-justify |string| within a field of |width| characters.
    return "%s%s" % (string, (width - len(string)) * ' ')

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  # Group by the link target (tuple index 2); named to avoid being shadowed
  # by the |target| loop variable below.
  link_key = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    return "%s -> %s %s" % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  for target, links in groupby(sorted(broken_links, key=link_key), link_key):
    links = list(links)
    # Compress messages: many pages pointing at the same missing (non-anchor)
    # target collapse to a single summary line.
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      for link in links:
        output.append(pretty_print(link))

  return '\n'.join(output)