"""Spider using the screen-scraping "simple" PyPI API.

This module contains the class Crawler, a simple spider that
can be used to find and retrieve distributions from a project index
(like the Python Package Index), using its so-called simple API (see
reference implementation available at http://pypi.python.org/simple/).
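
A short usage sketch (``FooBar`` is only a placeholder project name; a
reachable index and the rest of the packaging.pypi machinery are assumed)::

    crawler = Crawler()   # uses DEFAULT_SIMPLE_INDEX_URL
    releases = crawler.get_releases('FooBar (>=1.0)')
    release = crawler.get_release('FooBar')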
7"""
8
9import http.client
10import re
11import socket
12import sys
13import urllib.request
14import urllib.parse
15import urllib.error
16import os

from fnmatch import translate
from functools import wraps
from packaging import logger
from packaging.metadata import Metadata
from packaging.version import get_version_predicate
from packaging import __version__ as packaging_version
from packaging.pypi.base import BaseClient
from packaging.pypi.dist import (ReleasesList, EXTENSIONS,
                                 get_infos_from_url, MD5_HASH)
from packaging.pypi.errors import (PackagingPyPIError, DownloadError,
                                   UnableToDownload, CantParseArchiveName,
                                   ReleaseNotFound, ProjectNotFound)
from packaging.pypi.mirrors import get_mirrors

__all__ = ['Crawler', 'DEFAULT_SIMPLE_INDEX_URL']

# -- Constants -----------------------------------------------
DEFAULT_SIMPLE_INDEX_URL = "http://a.pypi.python.org/simple/"
DEFAULT_HOSTS = ("*",)
SOCKET_TIMEOUT = 15
USER_AGENT = "Python-urllib/%s packaging/%s" % (
    sys.version[:3], packaging_version)

# -- Regexps -------------------------------------------------
EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$')
HREF = re.compile("""href\\s*=\\s*['"]?([^'"> ]+)""", re.I)
URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match

# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
ENTITY_SUB = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)


def socket_timeout(timeout=SOCKET_TIMEOUT):
    """Decorator to add a socket timeout when requesting pages on PyPI.
    """
    def wrapper(func):
        @wraps(func)
        def wrapped(self, *args, **kwargs):
            old_timeout = socket.getdefaulttimeout()
            if hasattr(self, "_timeout"):
                timeout = self._timeout
            socket.setdefaulttimeout(timeout)
            try:
                return func(self, *args, **kwargs)
            finally:
                socket.setdefaulttimeout(old_timeout)
        return wrapped
    return wrapper


def with_mirror_support():
    """Decorator that makes the mirroring support easier"""
    def wrapper(func):
        @wraps(func)
        def wrapped(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except DownloadError:
                # if an error occurs, try with the next index_url
                if self._mirrors_tries >= self._mirrors_max_tries:
                    try:
                        self._switch_to_next_mirror()
                    except KeyError:
                        raise UnableToDownload("Tried all mirrors")
                else:
                    self._mirrors_tries += 1
                self._projects.clear()
                return wrapped(self, *args, **kwargs)
        return wrapped
    return wrapper


class Crawler(BaseClient):
94 """Provides useful tools to request the Python Package Index simple API.
95
96 You can specify both mirrors and mirrors_url, but mirrors_url will only be
97 used if mirrors is set to None.
98
99 :param index_url: the url of the simple index to search on.
100 :param prefer_final: if the version is not mentioned, and the last
101 version is not a "final" one (alpha, beta, etc.),
102 pick up the last final version.
103 :param prefer_source: if the distribution type is not mentioned, pick up
104 the source one if available.
105 :param follow_externals: tell if following external links is needed or
106 not. Default is False.
107 :param hosts: a list of hosts allowed to be processed while using
108 follow_externals=True. Default behavior is to follow all
109 hosts.
110 :param follow_externals: tell if following external links is needed or
111 not. Default is False.
112 :param mirrors_url: the url to look on for DNS records giving mirror
Éric Araujo348c5722011-06-19 18:53:31 +0200113 addresses.
Tarek Ziade1231a4e2011-05-19 13:07:25 +0200114 :param mirrors: a list of mirrors (see PEP 381).
115 :param timeout: time in seconds to consider a url has timeouted.
116 :param mirrors_max_tries": number of times to try requesting informations
117 on mirrors before switching.
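
    A construction sketch (all argument values below are placeholders)::

        crawler = Crawler(index_url=DEFAULT_SIMPLE_INDEX_URL,
                          follow_externals=True,
                          hosts=('*.python.org',),
                          timeout=10)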
118 """

    def __init__(self, index_url=DEFAULT_SIMPLE_INDEX_URL, prefer_final=False,
                 prefer_source=True, hosts=DEFAULT_HOSTS,
                 follow_externals=False, mirrors_url=None, mirrors=None,
                 timeout=SOCKET_TIMEOUT, mirrors_max_tries=0, verbose=False):
        super(Crawler, self).__init__(prefer_final, prefer_source)
        self.follow_externals = follow_externals
        self.verbose = verbose

        # mirroring attributes.
        parsed = urllib.parse.urlparse(index_url)
        self.scheme = parsed[0]
        if self.scheme == 'file':
            ender = os.path.sep
        else:
            ender = '/'
        if not index_url.endswith(ender):
            index_url += ender
        # if no mirrors are defined, use the method described in PEP 381.
        if mirrors is None:
            mirrors = get_mirrors(mirrors_url)
        self._mirrors = set(mirrors)
        self._mirrors_used = set()
        self.index_url = index_url
        self._mirrors_max_tries = mirrors_max_tries
        self._mirrors_tries = 0
        self._timeout = timeout

        # create a regexp to match all given hosts
        self._allowed_hosts = re.compile('|'.join(map(translate, hosts))).match

        # we keep an index of pages we have processed, in order to avoid
        # scanning them multiple times (e.g. if multiple pages point to the
        # same one)
        self._processed_urls = []
        self._projects = {}

    @with_mirror_support()
    def search_projects(self, name=None, **kwargs):
158 """Search the index for projects containing the given name.
159
160 Return a list of names.
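
        A usage sketch (``crawler`` is an existing Crawler instance; the
        pattern below is only a placeholder)::

            projects = crawler.search_projects('distutils*')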
161 """
        with self._open_url(self.index_url) as index:
            if '*' not in name:
                # no wildcard given: search for names containing the string
                name = "%s%s%s" % ('*.?', name, '*.?')
            name = name.replace('*', '[^<]*')  # avoid matching end tag
            projectname = re.compile('<a[^>]*>(%s)</a>' % name, re.I)
            matching_projects = []

            index_content = index.read()

            # FIXME should use bytes I/O and regexes instead of decoding
            index_content = index_content.decode()

        for match in projectname.finditer(index_content):
            project_name = match.group(1)
            matching_projects.append(self._get_project(project_name))
        return matching_projects

    def get_releases(self, requirements, prefer_final=None,
                     force_update=False):
        """Search for releases and return a ReleasesList object containing
        the results.
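
        A sketch of the expected requirement string (the project name and
        version below are placeholders)::

            releases = crawler.get_releases('FooBar (<=1.3)')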
185 """
        predicate = get_version_predicate(requirements)
        if predicate.name.lower() in self._projects and not force_update:
            return self._projects.get(predicate.name.lower())
        prefer_final = self._get_prefer_final(prefer_final)
        logger.debug('Reading info on PyPI about %s', predicate.name)
        self._process_index_page(predicate.name)

        if predicate.name.lower() not in self._projects:
            raise ProjectNotFound

        releases = self._projects.get(predicate.name.lower())
        releases.sort_releases(prefer_final=prefer_final)
        return releases

    def get_release(self, requirements, prefer_final=None):
        """Return only one release that fulfills the given requirements."""
        predicate = get_version_predicate(requirements)
        release = self.get_releases(predicate, prefer_final)\
                      .get_last(predicate)
        if not release:
            raise ReleaseNotFound("No release matches the given criteria")
        return release

    def get_distributions(self, project_name, version):
        """Return the distributions found on the index for the given
        release."""
        # as the default behavior of get_release is to return a release
        # containing the distributions, just alias it.
        return self.get_release("%s (%s)" % (project_name, version))

    def get_metadata(self, project_name, version):
        """Return the metadata from the simple index.

        Currently, download one archive, extract it and use the PKG-INFO file.
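
        A usage sketch (project name and version are placeholders)::

            release = crawler.get_metadata('FooBar', '1.1')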
220 """
221 release = self.get_distributions(project_name, version)
222 if not release.metadata:
223 location = release.get_distribution().unpack()
224 pkg_info = os.path.join(location, 'PKG-INFO')
225 release.metadata = Metadata(pkg_info)
226 return release
227
    def _switch_to_next_mirror(self):
        """Switch to the next mirror (e.g. point self.index_url to the next
        mirror url).

        Raise a KeyError if all mirrors have been tried.
        """
        self._mirrors_used.add(self.index_url)
        index_url = self._mirrors.pop()
        # XXX use urllib.parse for a real check of missing scheme part
        if not index_url.startswith(("http://", "https://", "file://")):
            index_url = "http://%s" % index_url

        if not index_url.endswith("/simple"):
            index_url = "%s/simple/" % index_url

        self.index_url = index_url

    def _is_browsable(self, url):
        """Tell if the given URL can be browsed or not.

        It uses the follow_externals and the hosts list to tell if the given
        url is browsable or not.
        """
        # if index_url is contained in the given URL, we are browsing the
        # index, and it's always "browsable".
        # local files are always considered browsable resources
        if self.index_url in url or urllib.parse.urlparse(url)[0] == "file":
            return True
        elif self.follow_externals:
            if self._allowed_hosts(urllib.parse.urlparse(url)[1]):  # 1 is netloc
                return True
            else:
                return False
        return False

    def _is_distribution(self, link):
        """Tell if the given URL matches a distribution name or not.
        """
        # XXX find a better way to check that links are distributions
        # (using a regexp?)
        for ext in EXTENSIONS:
            if ext in link:
                return True
        return False

    def _register_release(self, release=None, release_info=None):
        """Register a new release.

        Either a release object or a dict of release_info can be provided;
        the preferred (e.g. the quicker) way is the dict one.

        Return the list of existing releases for the given project.
        """
        # avoid a shared mutable default argument
        if release_info is None:
            release_info = {}
        # Check if the project already has a list of releases (referring to
        # the project name). If not, create a new release list.
        # Then, add the release to the list.
        if release:
            name = release.name
        else:
            name = release_info['name']
        if name.lower() not in self._projects:
            self._projects[name.lower()] = ReleasesList(name, index=self._index)

        if release:
            self._projects[name.lower()].add_release(release=release)
        else:
            name = release_info.pop('name')
            version = release_info.pop('version')
            dist_type = release_info.pop('dist_type')
            self._projects[name.lower()].add_release(version, dist_type,
                                                     **release_info)
        return self._projects[name.lower()]

    def _process_url(self, url, project_name=None, follow_links=True):
        """Process a URL and search for distribution packages.

        For each link found, if it points to a download, create a PyPI
        distribution object. If it's a homepage and we can follow links,
        process it too.

        :param url: the url to process
        :param project_name: the project name we are searching for.
        :param follow_links: tell if we want to follow the links we find
                             (e.g. run this method recursively on them);
                             links are never followed more than one level
                             deep.
        """
        with self._open_url(url) as f:
            base_url = f.url
            if url not in self._processed_urls:
                self._processed_urls.append(url)
                link_matcher = self._get_link_matcher(url)
                for link, is_download in link_matcher(f.read().decode(),
                                                      base_url):
                    if link not in self._processed_urls:
                        if self._is_distribution(link) or is_download:
                            self._processed_urls.append(link)
                            # it's a distribution, so create a dist object
                            try:
                                infos = get_infos_from_url(link, project_name,
                                    is_external=self.index_url not in url)
                            except CantParseArchiveName as e:
                                if self.verbose:
                                    logger.warning(
                                        "version has not been parsed: %s", e)
                            else:
                                self._register_release(release_info=infos)
                        else:
                            if self._is_browsable(link) and follow_links:
                                self._process_url(link, project_name,
                                    follow_links=False)

    def _get_link_matcher(self, url):
        """Return the right link matcher function for the given url.
        """
        if self.index_url in url:
            return self._simple_link_matcher
        else:
            return self._default_link_matcher

    def _get_full_url(self, url, base_url):
        return urllib.parse.urljoin(base_url, self._htmldecode(url))

    def _simple_link_matcher(self, content, base_url):
        """Yield all links with a rel="download" or rel="homepage".

        This matches the simple index requirements for matching links.
        If follow_externals is set to False, don't yield the external
        URLs.

        :param content: the content of the page we want to parse
        :param base_url: the url of this page.
        """
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if MD5_HASH.match(url):
                yield (url, True)

        for match in REL.finditer(content):
            # search for rel links.
            tag, rel = match.groups()
            rels = [s.strip() for s in rel.lower().split(',')]
            if 'homepage' in rels or 'download' in rels:
                for match in HREF.finditer(tag):
                    url = self._get_full_url(match.group(1), base_url)
                    if 'download' in rels or self._is_browsable(url):
                        # yield a tuple (url, is_download)
                        yield (url, 'download' in rels)

    def _default_link_matcher(self, content, base_url):
        """Yield all links found on the page.
        """
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if self._is_browsable(url):
                yield (url, False)

    @with_mirror_support()
    def _process_index_page(self, name):
        """Find and process a PyPI page for the given project name.

        :param name: the name of the project whose page we want to find
        """
        # Browse and index the content of the given PyPI page.
        if self.scheme == 'file':
            ender = os.path.sep
        else:
            ender = '/'
        url = self.index_url + name + ender
        self._process_url(url, name)

    @socket_timeout()
    def _open_url(self, url):
        """Open a urllib request, handling HTTP authentication and local
        file support.
        """
        scheme, netloc, path, params, query, frag = urllib.parse.urlparse(url)

        # authentication stuff
        if scheme in ('http', 'https'):
            auth, host = urllib.parse.splituser(netloc)
        else:
            auth = None

        # add index.html automatically for filesystem paths
        if scheme == 'file':
            if url.endswith(os.path.sep):
                url += "index.html"

        # add authorization headers if auth is provided
        if auth:
            # build a Basic auth header from the user:password part of the
            # URL (base64-encoded credentials)
            auth = "Basic " + base64.standard_b64encode(
                urllib.parse.unquote(auth).encode()).decode().strip()
            new_url = urllib.parse.urlunparse((
                scheme, host, path, params, query, frag))
            request = urllib.request.Request(new_url)
            request.add_header("Authorization", auth)
        else:
            request = urllib.request.Request(url)
        request.add_header('User-Agent', USER_AGENT)
        try:
            fp = urllib.request.urlopen(request)
        except (ValueError, http.client.InvalidURL) as v:
            msg = ' '.join([str(arg) for arg in v.args])
            raise PackagingPyPIError('%s %s' % (url, msg))
        except urllib.error.HTTPError as v:
            return v
        except urllib.error.URLError as v:
            raise DownloadError("Download error for %s: %s" % (url, v.reason))
        except http.client.BadStatusLine as v:
            raise DownloadError('%s returned a bad status line. '
                                'The server might be down, %s' % (url, v.line))
        except http.client.HTTPException as v:
            raise DownloadError("Download error for %s: %s" % (url, v))
        except socket.timeout:
            raise DownloadError("The server timed out")

        if auth:
            # Put authentication info back into request URL if same host,
            # so that links found on the page will work
            s2, h2, path2, param2, query2, frag2 = \
                urllib.parse.urlparse(fp.url)
            if s2 == scheme and h2 == host:
                fp.url = urllib.parse.urlunparse(
                    (s2, netloc, path2, param2, query2, frag2))
        return fp

    def _decode_entity(self, match):
        what = match.group(1)
        if what.startswith('#x'):
            what = int(what[2:], 16)
        elif what.startswith('#'):
            what = int(what[1:])
        else:
            from html.entities import name2codepoint
            codepoint = name2codepoint.get(what)
            if codepoint is None:
                # unknown named entity: leave the original text untouched
                return match.group(0)
            what = codepoint
        return chr(what)

    def _htmldecode(self, text):
        """Decode HTML entities in the given text."""
        return ENTITY_SUB(self._decode_entity, text)