"""Spider using the screen-scraping "simple" PyPI API.

This module contains the class Crawler, a simple spider that
can be used to find and retrieve distributions from a project index
(like the Python Package Index), using its so-called simple API (see
reference implementation available at http://pypi.python.org/simple/).
"""

import base64
import http.client
import re
import socket
import sys
import urllib.request
import urllib.parse
import urllib.error
import os


from fnmatch import translate
from packaging import logger
from packaging.metadata import Metadata
from packaging.version import get_version_predicate
from packaging import __version__ as packaging_version
from packaging.pypi.base import BaseClient
from packaging.pypi.dist import (ReleasesList, EXTENSIONS,
                                 get_infos_from_url, MD5_HASH)
from packaging.pypi.errors import (PackagingPyPIError, DownloadError,
                                   UnableToDownload, CantParseArchiveName,
                                   ReleaseNotFound, ProjectNotFound)
from packaging.pypi.mirrors import get_mirrors

__all__ = ['Crawler', 'DEFAULT_SIMPLE_INDEX_URL']

# -- Constants -----------------------------------------------
DEFAULT_SIMPLE_INDEX_URL = "http://a.pypi.python.org/simple/"
DEFAULT_HOSTS = ("*",)
SOCKET_TIMEOUT = 15
USER_AGENT = "Python-urllib/%s packaging/%s" % (
    sys.version[:3], packaging_version)

# -- Regexps -------------------------------------------------
EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$')
HREF = re.compile("""href\\s*=\\s*['"]?([^'"> ]+)""", re.I)
URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match

# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
ENTITY_SUB = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
REL = re.compile(r"""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)


def socket_timeout(timeout=SOCKET_TIMEOUT):
    """Decorator to add a socket timeout when requesting pages on PyPI."""
    def _socket_timeout(func):
        def _socket_timeout(self, *args, **kwargs):
            old_timeout = socket.getdefaulttimeout()
            # prefer the instance-level timeout if the object defines one
            new_timeout = getattr(self, "_timeout", timeout)
            socket.setdefaulttimeout(new_timeout)
            try:
                return func(self, *args, **kwargs)
            finally:
                socket.setdefaulttimeout(old_timeout)
        return _socket_timeout
    return _socket_timeout
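# Illustrative sketch: socket_timeout can wrap any method of an object that
# may define a ``_timeout`` attribute (the Client class below is made up):
#
#   class Client:
#       _timeout = 5
#
#       @socket_timeout()
#       def fetch(self, url):
#           return urllib.request.urlopen(url).read()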


def with_mirror_support():
    """Decorator that makes the mirroring support easier."""
    def wrapper(func):
        def wrapped(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except DownloadError:
                # if an error occurs, try with the next index_url
                if self._mirrors_tries >= self._mirrors_max_tries:
                    try:
                        self._switch_to_next_mirror()
                    except KeyError:
                        raise UnableToDownload("Tried all mirrors")
                else:
                    self._mirrors_tries += 1
                self._projects.clear()
                return wrapped(self, *args, **kwargs)
        return wrapped
    return wrapper
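# Illustrative note: with_mirror_support is meant for Crawler methods that
# hit the network; a decorated method is retried against the next mirror
# whenever it raises DownloadError (see search_projects and
# _process_index_page below), relying on the _mirrors* attributes set up in
# Crawler.__init__.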


class Crawler(BaseClient):
    """Provides useful tools to request the Python Package Index simple API.

    You can specify both mirrors and mirrors_url, but mirrors_url will only
    be used if mirrors is set to None.

    :param index_url: the url of the simple index to search on.
    :param prefer_final: if the version is not mentioned, and the last
                         version is not a "final" one (alpha, beta, etc.),
                         pick up the last final version.
    :param prefer_source: if the distribution type is not mentioned, pick up
                          the source one if available.
    :param follow_externals: tell if following external links is needed or
                             not. Default is False.
    :param hosts: a list of hosts allowed to be processed while using
                  follow_externals=True. Default behavior is to follow all
                  hosts.
    :param mirrors_url: the url to look on for DNS records giving mirror
                        addresses.
    :param mirrors: a list of mirrors (see PEP 381).
    :param timeout: time in seconds to consider a url has timed out.
    :param mirrors_max_tries: number of times to try requesting information
                              on mirrors before switching.
    """
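
    # Example (illustrative sketch: the project name and version constraint
    # are made up, and a reachable simple index is assumed):
    #
    #   crawler = Crawler(prefer_final=True)
    #   release = crawler.get_release("FooBar (<=1.2)")
    #   dist = release.get_distribution()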

    def __init__(self, index_url=DEFAULT_SIMPLE_INDEX_URL, prefer_final=False,
                 prefer_source=True, hosts=DEFAULT_HOSTS,
                 follow_externals=False, mirrors_url=None, mirrors=None,
                 timeout=SOCKET_TIMEOUT, mirrors_max_tries=0):
        super(Crawler, self).__init__(prefer_final, prefer_source)
        self.follow_externals = follow_externals

        # mirroring attributes.
        if not index_url.endswith("/"):
            index_url += "/"
        # if no mirrors are defined, use the method described in PEP 381.
        if mirrors is None:
            mirrors = get_mirrors(mirrors_url)
        self._mirrors = set(mirrors)
        self._mirrors_used = set()
        self.index_url = index_url
        self._mirrors_max_tries = mirrors_max_tries
        self._mirrors_tries = 0
        self._timeout = timeout

        # create a regexp to match all given hosts
        self._allowed_hosts = re.compile('|'.join(map(translate, hosts))).match

        # we keep an index of pages we have processed, in order to avoid
        # scanning them multiple times (e.g. if several pages point to the
        # same one)
        self._processed_urls = []
        self._projects = {}

    @with_mirror_support()
    def search_projects(self, name=None, **kwargs):
        """Search the index for projects containing the given name.

        Return the list of matching projects.
        """
        with self._open_url(self.index_url) as index:
            if '*' not in name:
                # no wildcard given: search for names containing the string
                name = "%s%s%s" % ('*.?', name, '*.?')
            name = name.replace('*', '[^<]*')  # avoid matching end tag
            projectname = re.compile('<a[^>]*>(%s)</a>' % name, re.I)
            matching_projects = []

            index_content = index.read()

            # FIXME should use bytes I/O and regexes instead of decoding
            index_content = index_content.decode()

            for match in projectname.finditer(index_content):
                project_name = match.group(1)
                matching_projects.append(self._get_project(project_name))
            return matching_projects
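    # Illustrative sketch (assumes a reachable index; the search term is
    # made up):
    #
    #   projects = crawler.search_projects("foo")
    #   names = [project.name for project in projects]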

    def get_releases(self, requirements, prefer_final=None,
                     force_update=False):
        """Search for releases and return a ReleasesList object containing
        the results.
        """
        predicate = get_version_predicate(requirements)
        if predicate.name.lower() in self._projects and not force_update:
            return self._projects.get(predicate.name.lower())
        prefer_final = self._get_prefer_final(prefer_final)
        logger.info('reading info on PyPI about %s', predicate.name)
        self._process_index_page(predicate.name)

        if predicate.name.lower() not in self._projects:
            raise ProjectNotFound()

        releases = self._projects.get(predicate.name.lower())
        releases.sort_releases(prefer_final=prefer_final)
        return releases

    def get_release(self, requirements, prefer_final=None):
        """Return only one release that fulfills the given requirements."""
        predicate = get_version_predicate(requirements)
        release = self.get_releases(predicate, prefer_final)\
                      .get_last(predicate)
        if not release:
            raise ReleaseNotFound("No release matches the given criteria")
        return release
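    # Illustrative sketch: requirements are passed as version predicate
    # strings of the form "name (constraints)", for instance (with a made-up
    # project name):
    #
    #   release = crawler.get_release("FooBar (>=1.0, <2.0)")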

    def get_distributions(self, project_name, version):
        """Return the distributions found on the index for the given
        release."""
        # as the default behavior of get_release is to return a release
        # containing the distributions, just alias it.
        return self.get_release("%s (%s)" % (project_name, version))

    def get_metadata(self, project_name, version):
        """Return the metadata from the simple index.

        Currently, this downloads one archive, extracts it and uses its
        PKG-INFO file.
        """
        release = self.get_distributions(project_name, version)
        if not release.metadata:
            location = release.get_distribution().unpack()
            pkg_info = os.path.join(location, 'PKG-INFO')
            release.metadata = Metadata(pkg_info)
        return release
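    # Illustrative sketch (this downloads and unpacks one archive, so it
    # needs network and filesystem access; project name and version are
    # made up):
    #
    #   release = crawler.get_metadata("FooBar", "1.1")
    #   summary = release.metadata['Summary']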

    def _switch_to_next_mirror(self):
        """Switch to the next mirror (eg. point self.index_url to the next
        mirror url).

        Raise a KeyError if all mirrors have been tried.
        """
        self._mirrors_used.add(self.index_url)
        index_url = self._mirrors.pop()
        # bare host names coming from the mirror list default to http
        if not index_url.startswith(("http://", "https://", "file://")):
            index_url = "http://%s" % index_url

        if not index_url.endswith("/simple"):
            index_url = "%s/simple/" % index_url

        self.index_url = index_url

    def _is_browsable(self, url):
        """Tell if the given URL can be browsed or not.

        It uses the follow_externals and the hosts list to tell if the given
        url is browsable or not.
        """
        # if _index_url is contained in the given URL, we are browsing the
        # index, and it's always "browsable".
        # local files are always considered browsable resources
        if self.index_url in url or urllib.parse.urlparse(url)[0] == "file":
            return True
        elif self.follow_externals:
            if self._allowed_hosts(urllib.parse.urlparse(url)[1]):  # 1 is netloc
                return True
            else:
                return False
        return False
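    # For instance, with the defaults (follow_externals=False) only URLs
    # under self.index_url and local file:// URLs are browsable; with
    # follow_externals=True, candidate hosts are additionally filtered
    # through the fnmatch-style patterns given in ``hosts``.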

    def _is_distribution(self, link):
        """Tell if the given URL matches a distribution name or not."""
        # XXX find a better way to check that links are distributions
        # (using a regexp?)
        for ext in EXTENSIONS:
            if ext in link:
                return True
        return False

    def _register_release(self, release=None, release_info=None):
        """Register a new release.

        Either a release object or a dict of release_info can be provided;
        the preferred (i.e. quicker) way is the dict.

        Return the list of existing releases for the given project.
        """
        # use a fresh dict rather than a shared mutable default argument
        if release_info is None:
            release_info = {}
        # Check if the project already has a list of releases (referring to
        # the project name). If not, create a new release list.
        # Then, add the release to the list.
        if release:
            name = release.name
        else:
            name = release_info['name']
        if name.lower() not in self._projects:
            self._projects[name.lower()] = ReleasesList(name, index=self._index)

        if release:
            self._projects[name.lower()].add_release(release=release)
        else:
            name = release_info.pop('name')
            version = release_info.pop('version')
            dist_type = release_info.pop('dist_type')
            self._projects[name.lower()].add_release(version, dist_type,
                                                     **release_info)
        return self._projects[name.lower()]

    def _process_url(self, url, project_name=None, follow_links=True):
        """Process a url and search for distribution packages.

        For each URL found, if it's a download, create a distribution
        object. If it's a homepage and we can follow links, process it too.

        :param url: the url to process
        :param project_name: the project name we are searching for.
        :param follow_links: we do not want to follow links more than one
                             level deep. This parameter tells if we want to
                             follow the links we find (eg. run this method
                             recursively on them).
        """
        with self._open_url(url) as f:
            base_url = f.url
            if url not in self._processed_urls:
                self._processed_urls.append(url)
                link_matcher = self._get_link_matcher(url)
                for link, is_download in link_matcher(f.read().decode(), base_url):
                    if link not in self._processed_urls:
                        if self._is_distribution(link) or is_download:
                            self._processed_urls.append(link)
                            # it's a distribution, so create a dist object
                            try:
                                infos = get_infos_from_url(link, project_name,
                                    is_external=self.index_url not in url)
                            except CantParseArchiveName as e:
                                logger.warning(
                                    "version has not been parsed: %s", e)
                            else:
                                self._register_release(release_info=infos)
                        else:
                            if self._is_browsable(link) and follow_links:
                                self._process_url(link, project_name,
                                                  follow_links=False)

    def _get_link_matcher(self, url):
        """Return the right link matcher function for the given url."""
        if self.index_url in url:
            return self._simple_link_matcher
        else:
            return self._default_link_matcher

    def _get_full_url(self, url, base_url):
        return urllib.parse.urljoin(base_url, self._htmldecode(url))

    def _simple_link_matcher(self, content, base_url):
        """Yield all links with a rel="download" or rel="homepage".

        This matches the simple index requirements for matching links.
        If follow_externals is set to False, don't yield the external
        urls.

        :param content: the content of the page we want to parse
        :param base_url: the url of this page.
        """
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if MD5_HASH.match(url):
                yield (url, True)

        for match in REL.finditer(content):
            # search for rel links.
            tag, rel = match.groups()
            rels = [s.strip() for s in rel.lower().split(',')]
            if 'homepage' in rels or 'download' in rels:
                for match in HREF.finditer(tag):
                    url = self._get_full_url(match.group(1), base_url)
                    if 'download' in rels or self._is_browsable(url):
                        # yield a list of (url, is_download)
                        yield (url, 'download' in rels)

    def _default_link_matcher(self, content, base_url):
        """Yield all links found on the page."""
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if self._is_browsable(url):
                yield (url, False)

    @with_mirror_support()
    def _process_index_page(self, name):
        """Find and process a PyPI page for the given project name.

        :param name: the name of the project to find the page for.
        """
        # Browse and index the content of the given PyPI page.
        url = self.index_url + name + "/"
        self._process_url(url, name)

    @socket_timeout()
    def _open_url(self, url):
        """Open a urllib request, handling HTTP authentication and local
        files support.
        """
        scheme, netloc, path, params, query, frag = urllib.parse.urlparse(url)

        # authentication stuff
        if scheme in ('http', 'https'):
            auth, host = urllib.parse.splituser(netloc)
        else:
            auth = None

        # add index.html automatically for filesystem paths
        if scheme == 'file':
            if url.endswith('/'):
                url += "index.html"

        # add authorization headers if auth is provided
        if auth:
            auth = "Basic " + base64.standard_b64encode(
                urllib.parse.unquote(auth).encode()).decode()
            new_url = urllib.parse.urlunparse((
                scheme, host, path, params, query, frag))
            request = urllib.request.Request(new_url)
            request.add_header("Authorization", auth)
        else:
            request = urllib.request.Request(url)
        request.add_header('User-Agent', USER_AGENT)
        try:
            fp = urllib.request.urlopen(request)
        except (ValueError, http.client.InvalidURL) as v:
            msg = ' '.join([str(arg) for arg in v.args])
            raise PackagingPyPIError('%s %s' % (url, msg))
        except urllib.error.HTTPError as v:
            return v
        except urllib.error.URLError as v:
            raise DownloadError("Download error for %s: %s" % (url, v.reason))
        except http.client.BadStatusLine as v:
            raise DownloadError('%s returned a bad status line. '
                                'The server might be down, %s' % (url, v.line))
        except http.client.HTTPException as v:
            raise DownloadError("Download error for %s: %s" % (url, v))
        except socket.timeout:
            raise DownloadError("The server timed out")

        if auth:
            # Put authentication info back into request URL if same host,
            # so that links found on the page will work
            s2, h2, path2, param2, query2, frag2 = \
                urllib.parse.urlparse(fp.url)
            if s2 == scheme and h2 == host:
                fp.url = urllib.parse.urlunparse(
                    (s2, netloc, path2, param2, query2, frag2))
        return fp

    def _decode_entity(self, match):
        what = match.group(1)
        if what.startswith('#x'):
            what = int(what[2:], 16)
        elif what.startswith('#'):
            what = int(what[1:])
        else:
            from html.entities import name2codepoint
            if what not in name2codepoint:
                # unknown named entity: keep the original text
                return match.group(0)
            what = name2codepoint[what]
        return chr(what)

    def _htmldecode(self, text):
        """Decode HTML entities in the given text."""
        return ENTITY_SUB(self._decode_entity, text)
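    # Illustrative sketch of the entity decoding helpers:
    #
    #   crawler._htmldecode("foo&amp;bar &#65;")   # -> "foo&bar A"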