"""Spider using the screen-scraping "simple" PyPI API.

This module contains the class Crawler, a simple spider that
can be used to find and retrieve distributions from a project index
(like the Python Package Index), using its so-called simple API (see
reference implementation available at http://pypi.python.org/simple/).
"""

import base64  # used to build the HTTP basic auth header in _open_url
import http.client
import re
import socket
import sys
import urllib.request
import urllib.parse
import urllib.error
import os


from fnmatch import translate
from packaging import logger
from packaging.metadata import Metadata
from packaging.version import get_version_predicate
from packaging import __version__ as packaging_version
from packaging.pypi.base import BaseClient
from packaging.pypi.dist import (ReleasesList, EXTENSIONS,
                                 get_infos_from_url, MD5_HASH)
from packaging.pypi.errors import (PackagingPyPIError, DownloadError,
                                   UnableToDownload, CantParseArchiveName,
                                   ReleaseNotFound, ProjectNotFound)
from packaging.pypi.mirrors import get_mirrors

__all__ = ['Crawler', 'DEFAULT_SIMPLE_INDEX_URL']

# -- Constants -----------------------------------------------
DEFAULT_SIMPLE_INDEX_URL = "http://a.pypi.python.org/simple/"
DEFAULT_HOSTS = ("*",)
SOCKET_TIMEOUT = 15
USER_AGENT = "Python-urllib/%s packaging/%s" % (
    sys.version[:3], packaging_version)

# -- Regexps -------------------------------------------------
EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$')
HREF = re.compile("""href\\s*=\\s*['"]?([^'"> ]+)""", re.I)
URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match

# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
ENTITY_SUB = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)


def socket_timeout(timeout=SOCKET_TIMEOUT):
    """Decorator to add a socket timeout when requesting pages on PyPI.
    """
    def _socket_timeout(func):
        def _socket_timeout(self, *args, **kwargs):
            old_timeout = socket.getdefaulttimeout()
            if hasattr(self, "_timeout"):
                timeout = self._timeout
            socket.setdefaulttimeout(timeout)
            try:
                return func(self, *args, **kwargs)
            finally:
                socket.setdefaulttimeout(old_timeout)
        return _socket_timeout
    return _socket_timeout


def with_mirror_support():
    """Decorator that adds mirror fallback support to a Crawler method."""
    def wrapper(func):
        def wrapped(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except DownloadError:
                # if an error occurs, try with the next index_url
                if self._mirrors_tries >= self._mirrors_max_tries:
                    try:
                        self._switch_to_next_mirror()
                    except KeyError:
                        raise UnableToDownload("Tried all mirrors")
                else:
                    self._mirrors_tries += 1
                self._projects.clear()
                return wrapped(self, *args, **kwargs)
        return wrapped
    return wrapper


class Crawler(BaseClient):
    """Provides useful tools to request the Python Package Index simple API.

    You can specify both mirrors and mirrors_url, but mirrors_url will only be
    used if mirrors is set to None.

    :param index_url: the url of the simple index to search on.
    :param prefer_final: if the version is not mentioned, and the last
                         version is not a "final" one (alpha, beta, etc.),
                         pick up the last final version.
    :param prefer_source: if the distribution type is not mentioned, pick up
                          the source one if available.
    :param follow_externals: tell if following external links is needed or
                             not. Default is False.
    :param hosts: a list of hosts allowed to be processed while using
                  follow_externals=True. Default behavior is to follow all
                  hosts.
    :param mirrors_url: the url to look on for DNS records giving mirror
                        addresses.
    :param mirrors: a list of mirrors (see PEP 381).
    :param timeout: time in seconds to consider a url has timed out.
    :param mirrors_max_tries: number of times to try requesting information
                              on mirrors before switching.
    """
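
    # Example construction (a sketch with illustrative values; the hosts
    # pattern and the timeout below are assumptions, not library defaults):
    #
    #   crawler = Crawler(index_url=DEFAULT_SIMPLE_INDEX_URL,
    #                     follow_externals=True,
    #                     hosts=("*.python.org",),
    #                     timeout=30)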

    def __init__(self, index_url=DEFAULT_SIMPLE_INDEX_URL, prefer_final=False,
                 prefer_source=True, hosts=DEFAULT_HOSTS,
                 follow_externals=False, mirrors_url=None, mirrors=None,
                 timeout=SOCKET_TIMEOUT, mirrors_max_tries=0, verbose=False):
        super(Crawler, self).__init__(prefer_final, prefer_source)
        self.follow_externals = follow_externals
        self.verbose = verbose

        # mirroring attributes.
        parsed = urllib.parse.urlparse(index_url)
        self.scheme = parsed[0]
        if self.scheme == 'file':
            ender = os.path.sep
        else:
            ender = '/'
        if not index_url.endswith(ender):
            index_url += ender
        # if no mirrors are defined, use the method described in PEP 381.
        if mirrors is None:
            mirrors = get_mirrors(mirrors_url)
        self._mirrors = set(mirrors)
        self._mirrors_used = set()
        self.index_url = index_url
        self._mirrors_max_tries = mirrors_max_tries
        self._mirrors_tries = 0
        self._timeout = timeout

        # create a regexp to match all given hosts
        self._allowed_hosts = re.compile('|'.join(map(translate, hosts))).match

        # we keep an index of pages we have processed, in order to avoid
        # scanning them multiple times (e.g. if multiple pages point to the
        # same one)
        self._processed_urls = []
        self._projects = {}

    @with_mirror_support()
    def search_projects(self, name=None, **kwargs):
        """Search the index for projects containing the given name.

        Return a list of names.
        """
        with self._open_url(self.index_url) as index:
            if '*' not in name:
                # no wildcard given: search for names containing the string
                name = "%s%s%s" % ('*.?', name, '*.?')
            name = name.replace('*', '[^<]*')  # avoid matching end tag
            projectname = re.compile('<a[^>]*>(%s)</a>' % name, re.I)
            matching_projects = []

            index_content = index.read()

            # FIXME should use bytes I/O and regexes instead of decoding
            index_content = index_content.decode()

            for match in projectname.finditer(index_content):
                project_name = match.group(1)
                matching_projects.append(self._get_project(project_name))
            return matching_projects

    def get_releases(self, requirements, prefer_final=None,
                     force_update=False):
        """Search for releases and return a ReleasesList object containing
        the results.
        """
        predicate = get_version_predicate(requirements)
        if predicate.name.lower() in self._projects and not force_update:
            return self._projects.get(predicate.name.lower())
        prefer_final = self._get_prefer_final(prefer_final)
        logger.debug('Reading info on PyPI about %s', predicate.name)
        self._process_index_page(predicate.name)

        if predicate.name.lower() not in self._projects:
            raise ProjectNotFound()

        releases = self._projects.get(predicate.name.lower())
        releases.sort_releases(prefer_final=prefer_final)
        return releases

    def get_release(self, requirements, prefer_final=None):
        """Return only one release that fulfills the given requirements."""
        predicate = get_version_predicate(requirements)
        release = self.get_releases(predicate, prefer_final)\
                      .get_last(predicate)
        if not release:
            raise ReleaseNotFound("No release matches the given criteria")
        return release

    def get_distributions(self, project_name, version):
        """Return the distributions found on the index for the given
        release."""
        # as the default behavior of get_release is to return a release
        # containing the distributions, just alias it.
        return self.get_release("%s (%s)" % (project_name, version))

    def get_metadata(self, project_name, version):
        """Return the metadata from the simple index.

        Currently, this downloads one archive, unpacks it and uses the
        PKG-INFO file.
        """
        release = self.get_distributions(project_name, version)
        if not release.metadata:
            location = release.get_distribution().unpack()
            pkg_info = os.path.join(location, 'PKG-INFO')
            release.metadata = Metadata(pkg_info)
        return release

    def _switch_to_next_mirror(self):
        """Switch to the next mirror (i.e. point self.index_url to the next
        mirror url).

        Raise a KeyError if all mirrors have been tried.
        """
        self._mirrors_used.add(self.index_url)
        index_url = self._mirrors.pop()
        # make sure the mirror url has a scheme; default to http
        if not index_url.startswith(("http://", "https://", "file://")):
            index_url = "http://%s" % index_url

        if not index_url.endswith("/simple"):
            index_url = "%s/simple/" % index_url

        self.index_url = index_url

    def _is_browsable(self, url):
        """Tell if the given URL can be browsed or not.

        It uses the follow_externals and the hosts list to tell if the given
        url is browsable or not.
        """
        # if _index_url is contained in the given URL, we are browsing the
        # index, and it's always "browsable".
        # local files are always considered browsable resources
        if self.index_url in url or urllib.parse.urlparse(url)[0] == "file":
            return True
        elif self.follow_externals:
            if self._allowed_hosts(urllib.parse.urlparse(url)[1]):  # 1 is netloc
                return True
            else:
                return False
        return False

    def _is_distribution(self, link):
        """Tell if the given URL matches a distribution name or not.
        """
        #XXX find a better way to check that links are distributions
        # Using a regexp ?
        for ext in EXTENSIONS:
            if ext in link:
                return True
        return False

    def _register_release(self, release=None, release_info={}):
        """Register a new release.

        Either a release object or a dict of release_info can be provided;
        the preferred way (i.e. the quicker one) is the dict.

        Return the list of existing releases for the given project.
        """
        # Check if the project already has a list of releases (referring to
        # the project name). If not, create a new release list.
        # Then, add the release to the list.
        if release:
            name = release.name
        else:
            name = release_info['name']
        if name.lower() not in self._projects:
            self._projects[name.lower()] = ReleasesList(name, index=self._index)

        if release:
            self._projects[name.lower()].add_release(release=release)
        else:
            name = release_info.pop('name')
            version = release_info.pop('version')
            dist_type = release_info.pop('dist_type')
            self._projects[name.lower()].add_release(version, dist_type,
                                                     **release_info)
        return self._projects[name.lower()]

    def _process_url(self, url, project_name=None, follow_links=True):
        """Process a URL and search for distribution packages.

        For each URL found, if it's a download, create a PyPIdistribution
        object. If it's a homepage and we can follow links, process it too.

        :param url: the url to process
        :param project_name: the project name we are searching for.
        :param follow_links: tell if we want to follow the links we find
                             (e.g. run this method recursively on them).
                             Links are never followed more than one level
                             deep.
        """
        with self._open_url(url) as f:
            base_url = f.url
            if url not in self._processed_urls:
                self._processed_urls.append(url)
                link_matcher = self._get_link_matcher(url)
                for link, is_download in link_matcher(f.read().decode(), base_url):
                    if link not in self._processed_urls:
                        if self._is_distribution(link) or is_download:
                            self._processed_urls.append(link)
                            # it's a distribution, so create a dist object
                            try:
                                infos = get_infos_from_url(link, project_name,
                                    is_external=self.index_url not in url)
                            except CantParseArchiveName as e:
                                if self.verbose:
                                    logger.warning(
                                        "version has not been parsed: %s", e)
                            else:
                                self._register_release(release_info=infos)
                        else:
                            if self._is_browsable(link) and follow_links:
                                self._process_url(link, project_name,
                                                  follow_links=False)

    def _get_link_matcher(self, url):
        """Return the right link matcher function for the given url.
        """
        if self.index_url in url:
            return self._simple_link_matcher
        else:
            return self._default_link_matcher

    def _get_full_url(self, url, base_url):
        return urllib.parse.urljoin(base_url, self._htmldecode(url))

    def _simple_link_matcher(self, content, base_url):
        """Yield all links with a rel="download" or rel="homepage".

        This matches the simple index requirements for matching links.
        If follow_externals is set to False, don't yield the external
        urls.

        :param content: the content of the page we want to parse
        :param base_url: the url of this page.
        """
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if MD5_HASH.match(url):
                yield (url, True)

        for match in REL.finditer(content):
            # search for rel links.
            tag, rel = match.groups()
            rels = [s.strip() for s in rel.lower().split(',')]
            if 'homepage' in rels or 'download' in rels:
                for match in HREF.finditer(tag):
                    url = self._get_full_url(match.group(1), base_url)
                    if 'download' in rels or self._is_browsable(url):
                        # yield a tuple of (url, is_download)
                        yield (url, 'download' in rels)

    def _default_link_matcher(self, content, base_url):
        """Yield all links found on the page.
        """
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if self._is_browsable(url):
                yield (url, False)

    @with_mirror_support()
    def _process_index_page(self, name):
        """Find and process a PyPI page for the given project name.

        :param name: the name of the project to find the page for.
        """
        # Browse and index the content of the given PyPI page.
        if self.scheme == 'file':
            ender = os.path.sep
        else:
            ender = '/'
        url = self.index_url + name + ender
        self._process_url(url, name)

    @socket_timeout()
    def _open_url(self, url):
        """Open a urllib request, handling HTTP authentication and local
        files.
        """
        scheme, netloc, path, params, query, frag = urllib.parse.urlparse(url)

        # authentication stuff
        if scheme in ('http', 'https'):
            auth, host = urllib.parse.splituser(netloc)
        else:
            auth = None

        # add index.html automatically for filesystem paths
        if scheme == 'file':
            if url.endswith(os.path.sep):
                url += "index.html"

        # add authorization headers if auth is provided
        if auth:
            # str.encode('base64') does not exist in Python 3; build the HTTP
            # basic auth header with the base64 module instead
            auth = "Basic " + base64.standard_b64encode(
                urllib.parse.unquote(auth).encode('utf-8')).decode('ascii')
            new_url = urllib.parse.urlunparse((
                scheme, host, path, params, query, frag))
            request = urllib.request.Request(new_url)
            request.add_header("Authorization", auth)
        else:
            request = urllib.request.Request(url)
        request.add_header('User-Agent', USER_AGENT)
        try:
            fp = urllib.request.urlopen(request)
        except (ValueError, http.client.InvalidURL) as v:
            msg = ' '.join([str(arg) for arg in v.args])
            raise PackagingPyPIError('%s %s' % (url, msg))
        except urllib.error.HTTPError as v:
            return v
        except urllib.error.URLError as v:
            raise DownloadError("Download error for %s: %s" % (url, v.reason))
        except http.client.BadStatusLine as v:
            raise DownloadError('%s returned a bad status line. '
                                'The server might be down, %s' % (url, v.line))
        except http.client.HTTPException as v:
            raise DownloadError("Download error for %s: %s" % (url, v))
        except socket.timeout:
            raise DownloadError("The server timed out")

        if auth:
            # Put authentication info back into request URL if same host,
            # so that links found on the page will work
            s2, h2, path2, param2, query2, frag2 = \
                urllib.parse.urlparse(fp.url)
            if s2 == scheme and h2 == host:
                fp.url = urllib.parse.urlunparse(
                    (s2, netloc, path2, param2, query2, frag2))
        return fp

    def _decode_entity(self, match):
        """Turn a matched HTML entity into the corresponding character."""
        what = match.group(1)
        if what.startswith('#x'):
            what = int(what[2:], 16)
        elif what.startswith('#'):
            what = int(what[1:])
        else:
            from html.entities import name2codepoint
            what = name2codepoint.get(what, match.group(0))
        return chr(what)

    def _htmldecode(self, text):
        """Decode HTML entities in the given text."""
        return ENTITY_SUB(self._decode_entity, text)