"""Spider using the screen-scraping "simple" PyPI API.

This module contains the class Crawler, a simple spider that
can be used to find and retrieve distributions from a project index
(like the Python Package Index), using its so-called simple API (see
reference implementation available at http://pypi.python.org/simple/).
"""

import base64
import http.client
import re
import socket
import sys
import urllib.request
import urllib.parse
import urllib.error
import os

from fnmatch import translate
from functools import wraps
from packaging import logger
from packaging.metadata import Metadata
from packaging.version import get_version_predicate
from packaging import __version__ as packaging_version
from packaging.pypi.base import BaseClient
from packaging.pypi.dist import (ReleasesList, EXTENSIONS,
                                 get_infos_from_url, MD5_HASH)
from packaging.pypi.errors import (PackagingPyPIError, DownloadError,
                                   UnableToDownload, CantParseArchiveName,
                                   ReleaseNotFound, ProjectNotFound)
from packaging.pypi.mirrors import get_mirrors

__all__ = ['Crawler', 'DEFAULT_SIMPLE_INDEX_URL']

# -- Constants -----------------------------------------------
DEFAULT_SIMPLE_INDEX_URL = "http://a.pypi.python.org/simple/"
DEFAULT_HOSTS = ("*",)
SOCKET_TIMEOUT = 15
USER_AGENT = "Python-urllib/%s packaging/%s" % (
    sys.version[:3], packaging_version)

# -- Regexps -------------------------------------------------
EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$')
HREF = re.compile("""href\\s*=\\s*['"]?([^'"> ]+)""", re.I)
URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match

# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
ENTITY_SUB = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
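# For instance (illustrative values, not from the original module):
#   HREF.search('<a href="/simple/foo/">foo</a>').group(1) == '/simple/foo/'
#   ENTITY_SUB(lambda match: '?', 'a&amp;b') == 'a?b'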


def socket_timeout(timeout=SOCKET_TIMEOUT):
    """Decorator to add a socket timeout when requesting pages on PyPI."""
    def wrapper(func):
        @wraps(func)
        def wrapped(self, *args, **kwargs):
            old_timeout = socket.getdefaulttimeout()
            # use the instance's _timeout when it defines one, else fall
            # back on the decorator's default
            new_timeout = getattr(self, "_timeout", timeout)
            socket.setdefaulttimeout(new_timeout)
            try:
                return func(self, *args, **kwargs)
            finally:
                socket.setdefaulttimeout(old_timeout)
        return wrapped
    return wrapper
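# Usage sketch: applied as @socket_timeout() on the network-facing methods
# below (see _open_url), so each request runs under the crawler's timeout.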
Tarek Ziade1231a4e2011-05-19 13:07:25 +020068
69
70def with_mirror_support():
71 """Decorator that makes the mirroring support easier"""
72 def wrapper(func):
Éric Araujo3c8ca082011-06-17 21:10:21 +020073 @wraps(func)
Tarek Ziade1231a4e2011-05-19 13:07:25 +020074 def wrapped(self, *args, **kwargs):
75 try:
76 return func(self, *args, **kwargs)
77 except DownloadError:
78 # if an error occurs, try with the next index_url
79 if self._mirrors_tries >= self._mirrors_max_tries:
80 try:
81 self._switch_to_next_mirror()
82 except KeyError:
83 raise UnableToDownload("Tried all mirrors")
84 else:
85 self._mirrors_tries += 1
86 self._projects.clear()
87 return wrapped(self, *args, **kwargs)
88 return wrapped
89 return wrapper
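# Note: on DownloadError the decorated call clears the project cache and
# retries itself against the next mirror, until every mirror has been used
# and UnableToDownload is raised.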


class Crawler(BaseClient):
    """Provides useful tools to request the Python Package Index simple API.

    You can specify both mirrors and mirrors_url, but mirrors_url will only
    be used if mirrors is set to None.

    :param index_url: the url of the simple index to search on.
    :param prefer_final: if the version is not mentioned and the last version
                         is not a "final" one (alpha, beta, etc.), pick up
                         the last final version.
    :param prefer_source: if the distribution type is not mentioned, pick up
                          the source one if available.
    :param follow_externals: tell if following external links is needed or
                             not. Default is False.
    :param hosts: a list of hosts allowed to be processed while using
                  follow_externals=True. Default behavior is to follow all
                  hosts.
    :param mirrors_url: the url to look on for DNS records giving mirror
                        addresses.
    :param mirrors: a list of mirrors (see PEP 381).
    :param timeout: time in seconds after which a url is considered to have
                    timed out.
    :param mirrors_max_tries: number of times to try requesting information
                              on mirrors before switching.
    """
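
    # A short usage sketch (illustrative, not part of the original module;
    # it assumes the index is reachable and hosts a project named 'FooBar'):
    #
    #   crawler = Crawler(prefer_final=True)
    #   releases = crawler.get_releases('FooBar')          # ReleasesList
    #   release = crawler.get_release('FooBar (< 1.2)')    # best single match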

    def __init__(self, index_url=DEFAULT_SIMPLE_INDEX_URL, prefer_final=False,
                 prefer_source=True, hosts=DEFAULT_HOSTS,
                 follow_externals=False, mirrors_url=None, mirrors=None,
                 timeout=SOCKET_TIMEOUT, mirrors_max_tries=0):
        super(Crawler, self).__init__(prefer_final, prefer_source)
        self.follow_externals = follow_externals

        # mirroring attributes.
        parsed = urllib.parse.urlparse(index_url)
        self.scheme = parsed[0]
        if self.scheme == 'file':
            ender = os.path.sep
        else:
            ender = '/'
        if not index_url.endswith(ender):
            index_url += ender
        # if no mirrors are defined, use the method described in PEP 381.
        if mirrors is None:
            mirrors = get_mirrors(mirrors_url)
        self._mirrors = set(mirrors)
        self._mirrors_used = set()
        self.index_url = index_url
        self._mirrors_max_tries = mirrors_max_tries
        self._mirrors_tries = 0
        self._timeout = timeout

        # create a regexp to match all given hosts
        self._allowed_hosts = re.compile('|'.join(map(translate, hosts))).match

        # we keep an index of pages we have processed, in order to avoid
        # scanning them multiple times (e.g. when several pages point to
        # the same one)
        self._processed_urls = []
        self._projects = {}

    @with_mirror_support()
    def search_projects(self, name=None, **kwargs):
        """Search the index for projects containing the given name.

        Return a list of matching projects.
        """
        if '*' not in name:
            # surround the name with wildcards so substring matches are found
            name = "%s%s%s" % ('*.?', name, '*.?')
        name = name.replace('*', '[^<]*')  # avoid matching end tag
        pattern = ('<a[^>]*>(%s)</a>' % name).encode('utf-8')
        projectname = re.compile(pattern, re.I)
        matching_projects = []

        with self._open_url(self.index_url) as index:
            index_content = index.read()

        for match in projectname.finditer(index_content):
            project_name = match.group(1).decode('utf-8')
            matching_projects.append(self._get_project(project_name))
        return matching_projects
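
    # For instance (a sketch; 'foo' is an arbitrary query string):
    #   crawler.search_projects('foo')   # projects whose name contains 'foo'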

    def get_releases(self, requirements, prefer_final=None,
                     force_update=False):
        """Search for releases and return a ReleasesList object containing
        the results.
        """
        predicate = get_version_predicate(requirements)
        if predicate.name.lower() in self._projects and not force_update:
            return self._projects.get(predicate.name.lower())
        prefer_final = self._get_prefer_final(prefer_final)
        logger.debug('Reading info on PyPI about %s', predicate.name)
        self._process_index_page(predicate.name)

        if predicate.name.lower() not in self._projects:
            raise ProjectNotFound

        releases = self._projects.get(predicate.name.lower())
        releases.sort_releases(prefer_final=prefer_final)
        return releases

    def get_release(self, requirements, prefer_final=None):
        """Return only one release that fulfills the given requirements"""
        predicate = get_version_predicate(requirements)
        release = self.get_releases(predicate, prefer_final)\
                      .get_last(predicate)
        if not release:
            raise ReleaseNotFound("No release matches the given criteria")
        return release

    def get_distributions(self, project_name, version):
        """Return the distributions found on the index for the given
        release."""
        # as the default behavior of get_release is to return a release
        # containing the distributions, just alias it.
        return self.get_release("%s (%s)" % (project_name, version))

    def get_metadata(self, project_name, version):
        """Return the metadata from the simple index.

        Currently, this downloads one archive, unpacks it and uses the
        PKG-INFO file.
        """
        release = self.get_distributions(project_name, version)
        if not release.metadata:
            location = release.get_distribution().unpack()
            pkg_info = os.path.join(location, 'PKG-INFO')
            release.metadata = Metadata(pkg_info)
        return release
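
    # Illustrative only (assumes the index hosts 'FooBar' 1.1 with a
    # downloadable archive):
    #   release = crawler.get_metadata('FooBar', '1.1')
    #   pkg_metadata = release.metadata   # a packaging.metadata.Metadata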

    def _switch_to_next_mirror(self):
        """Switch to the next mirror (i.e. point self.index_url to the next
        mirror url).

        Raise a KeyError if all mirrors have been tried.
        """
        self._mirrors_used.add(self.index_url)
        index_url = self._mirrors.pop()
        # XXX use urllib.parse for a real check of missing scheme part
        if not index_url.startswith(("http://", "https://", "file://")):
            index_url = "http://%s" % index_url

        if not index_url.endswith("/simple"):
            index_url = "%s/simple/" % index_url

        self.index_url = index_url

    def _is_browsable(self, url):
        """Tell if the given URL can be browsed or not.

        It uses the follow_externals and the hosts list to tell if the given
        url is browsable or not.
        """
        # if _index_url is contained in the given URL, we are browsing the
        # index, and it's always "browsable".
        # local files are always considered browsable resources
        if self.index_url in url or urllib.parse.urlparse(url)[0] == "file":
            return True
        elif self.follow_externals:
            if self._allowed_hosts(urllib.parse.urlparse(url)[1]):  # netloc
                return True
            else:
                return False
        return False

    def _is_distribution(self, link):
        """Tell if the given URL matches a distribution name or not."""
        # XXX find a better way to check that links are distributions
        # (using a regexp?)
        for ext in EXTENSIONS:
            if ext in link:
                return True
        return False

    def _register_release(self, release=None, release_info=None):
        """Register a new release.

        Either a release object or a dict of release information can be
        provided; the preferred (i.e. quicker) way is the dict.

        Return the list of existing releases for the given project.
        """
        # Check if the project already has a list of releases (referring to
        # the project name). If not, create a new release list.
        # Then, add the release to the list.
        if release_info is None:  # avoid a shared mutable default argument
            release_info = {}
        if release:
            name = release.name
        else:
            name = release_info['name']
        if name.lower() not in self._projects:
            self._projects[name.lower()] = ReleasesList(name, index=self._index)

        if release:
            self._projects[name.lower()].add_release(release=release)
        else:
            name = release_info.pop('name')
            version = release_info.pop('version')
            dist_type = release_info.pop('dist_type')
            self._projects[name.lower()].add_release(version, dist_type,
                                                     **release_info)
        return self._projects[name.lower()]

    def _process_url(self, url, project_name=None, follow_links=True):
        """Process a url and search for distribution packages.

        For each URL found, if it's a download, create a PyPI distribution
        object. If it's a homepage and we can follow links, process it too.

        :param url: the url to process
        :param project_name: the project name we are searching for.
        :param follow_links: tell if we want to follow the links we find
                             (i.e. run this method recursively on them);
                             links are never followed more than one level
                             deep.
        """
        with self._open_url(url) as f:
            base_url = f.url
            if url not in self._processed_urls:
                self._processed_urls.append(url)
                link_matcher = self._get_link_matcher(url)
                for link, is_download in link_matcher(f.read().decode(),
                                                      base_url):
                    if link not in self._processed_urls:
                        if self._is_distribution(link) or is_download:
                            self._processed_urls.append(link)
                            # it's a distribution, so create a dist object
                            try:
                                infos = get_infos_from_url(link, project_name,
                                            is_external=self.index_url not in url)
                            except CantParseArchiveName as e:
                                logger.warning(
                                    "version has not been parsed: %s", e)
                            else:
                                self._register_release(release_info=infos)
                        else:
                            if self._is_browsable(link) and follow_links:
                                self._process_url(link, project_name,
                                                  follow_links=False)

    def _get_link_matcher(self, url):
        """Return the right link matcher function for the given url"""
        if self.index_url in url:
            return self._simple_link_matcher
        else:
            return self._default_link_matcher

    def _get_full_url(self, url, base_url):
        return urllib.parse.urljoin(base_url, self._htmldecode(url))

    def _simple_link_matcher(self, content, base_url):
        """Yield all links with a rel="download" or rel="homepage".

        This matches the simple index requirements for matching links.
        If follow_externals is set to False, don't yield the external
        urls.

        :param content: the content of the page we want to parse
        :param base_url: the url of this page.
        """
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if MD5_HASH.match(url):
                yield (url, True)

        for match in REL.finditer(content):
            # search for rel links.
            tag, rel = match.groups()
            rels = [s.strip() for s in rel.lower().split(',')]
            if 'homepage' in rels or 'download' in rels:
                for match in HREF.finditer(tag):
                    url = self._get_full_url(match.group(1), base_url)
                    if 'download' in rels or self._is_browsable(url):
                        # yield a tuple (url, is_download)
                        yield (url, 'download' in rels)
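
    # For example, an (illustrative) page fragment such as
    #   <a rel="download" href="dist/foo-1.0.tar.gz">foo 1.0</a>
    # makes this method yield (urljoin(base_url, 'dist/foo-1.0.tar.gz'), True),
    # the True flag coming from the rel="download" attribute.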

    def _default_link_matcher(self, content, base_url):
        """Yield all links found on the page."""
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if self._is_browsable(url):
                yield (url, False)

    @with_mirror_support()
    def _process_index_page(self, name):
        """Find and process a PyPI page for the given project name.

        :param name: the name of the project whose page we want to process
        """
        # Browse and index the content of the given PyPI page.
        if self.scheme == 'file':
            ender = os.path.sep
        else:
            ender = '/'
        url = self.index_url + name + ender
        self._process_url(url, name)

    @socket_timeout()
    def _open_url(self, url):
        """Open a urllib request, handling HTTP authentication and local
        file support.
        """
        scheme, netloc, path, params, query, frag = urllib.parse.urlparse(url)

        # authentication stuff
        if scheme in ('http', 'https'):
            auth, host = urllib.parse.splituser(netloc)
        else:
            auth = None

        # add index.html automatically for filesystem paths
        if scheme == 'file':
            if url.endswith(os.path.sep):
                url += "index.html"

        # add authorization headers if auth is provided
        if auth:
            # build the Basic Authorization header (base64-encoded
            # "user:password" credentials)
            auth = "Basic " + base64.standard_b64encode(
                urllib.parse.unquote(auth).encode('utf-8')).decode('ascii')
            new_url = urllib.parse.urlunparse((
                scheme, host, path, params, query, frag))
            request = urllib.request.Request(new_url)
            request.add_header("Authorization", auth)
        else:
            request = urllib.request.Request(url)
        request.add_header('User-Agent', USER_AGENT)
        try:
            fp = urllib.request.urlopen(request)
        except (ValueError, http.client.InvalidURL) as v:
            msg = ' '.join([str(arg) for arg in v.args])
            raise PackagingPyPIError('%s %s' % (url, msg))
        except urllib.error.HTTPError as v:
            return v
        except urllib.error.URLError as v:
            raise DownloadError("Download error for %s: %s" % (url, v.reason))
        except http.client.BadStatusLine as v:
            raise DownloadError('%s returned a bad status line. '
                                'The server might be down, %s' % (url, v.line))
        except http.client.HTTPException as v:
            raise DownloadError("Download error for %s: %s" % (url, v))
        except socket.timeout:
            raise DownloadError("The server timed out")

        if auth:
            # Put authentication info back into the request URL if it is the
            # same host, so that links found on the page will work
            s2, h2, path2, param2, query2, frag2 = \
                urllib.parse.urlparse(fp.url)
            if s2 == scheme and h2 == host:
                fp.url = urllib.parse.urlunparse(
                    (s2, netloc, path2, param2, query2, frag2))
        return fp

    def _decode_entity(self, match):
        what = match.group(1)
        if what.startswith('#x'):
            return chr(int(what[2:], 16))
        elif what.startswith('#'):
            return chr(int(what[1:]))
        from html.entities import name2codepoint
        if what in name2codepoint:
            return chr(name2codepoint[what])
        # leave unknown named references as they are
        return match.group(0)

    def _htmldecode(self, text):
        """Decode HTML entities in the given text."""
        return ENTITY_SUB(self._decode_entity, text)
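
    # For instance, self._htmldecode('foo&amp;bar&#47;baz') returns
    # 'foo&bar/baz' (named, decimal and hexadecimal references are handled).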