"""Spider using the screen-scraping "simple" PyPI API.

This module contains the class Crawler, a simple spider that
can be used to find and retrieve distributions from a project index
(like the Python Package Index), using its so-called simple API (see
reference implementation available at http://pypi.python.org/simple/).
"""

import base64
import http.client
import re
import socket
import sys
import urllib.request
import urllib.parse
import urllib.error
import os


from fnmatch import translate
from packaging import logger
from packaging.metadata import Metadata
from packaging.version import get_version_predicate
from packaging import __version__ as packaging_version
from packaging.pypi.base import BaseClient
from packaging.pypi.dist import (ReleasesList, EXTENSIONS,
                                 get_infos_from_url, MD5_HASH)
from packaging.pypi.errors import (PackagingPyPIError, DownloadError,
                                   UnableToDownload, CantParseArchiveName,
                                   ReleaseNotFound, ProjectNotFound)
from packaging.pypi.mirrors import get_mirrors

__all__ = ['Crawler', 'DEFAULT_SIMPLE_INDEX_URL']

# -- Constants -----------------------------------------------
DEFAULT_SIMPLE_INDEX_URL = "http://a.pypi.python.org/simple/"
DEFAULT_HOSTS = ("*",)
SOCKET_TIMEOUT = 15
USER_AGENT = "Python-urllib/%s packaging/%s" % (
    sys.version[:3], packaging_version)

# -- Regexps -------------------------------------------------
EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$')
HREF = re.compile("""href\\s*=\\s*['"]?([^'"> ]+)""", re.I)
URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match

# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
ENTITY_SUB = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
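# Illustrative examples (not exhaustive) of what the two patterns above are
# meant to catch: ENTITY_SUB rewrites entities such as "&gt;", "&#62;" or
# "&#x3e;" back to ">", and REL captures tags like
# <a rel="download" href="http://example.org/dists/foo-1.0.tar.gz">.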


def socket_timeout(timeout=SOCKET_TIMEOUT):
    """Decorator to add a socket timeout when requesting pages on PyPI."""
    def _socket_timeout(func):
        def _socket_timeout(self, *args, **kwargs):
            old_timeout = socket.getdefaulttimeout()
            # prefer the instance timeout; fall back to the decorator default
            new_timeout = getattr(self, "_timeout", timeout)
            socket.setdefaulttimeout(new_timeout)
            try:
                return func(self, *args, **kwargs)
            finally:
                socket.setdefaulttimeout(old_timeout)
        return _socket_timeout
    return _socket_timeout


def with_mirror_support():
    """Decorator that retries the call on the next mirror when a download
    from the current index fails."""
    def wrapper(func):
        def wrapped(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except DownloadError:
                # if an error occurs, try with the next index_url
                if self._mirrors_tries >= self._mirrors_max_tries:
                    try:
                        self._switch_to_next_mirror()
                    except KeyError:
                        raise UnableToDownload("Tried all mirrors")
                else:
                    self._mirrors_tries += 1
                self._projects.clear()
                return wrapped(self, *args, **kwargs)
        return wrapped
    return wrapper


class Crawler(BaseClient):
    """Provides useful tools to request the Python Package Index simple API.

    You can specify both mirrors and mirrors_url, but mirrors_url will only be
    used if mirrors is set to None.

    :param index_url: the url of the simple index to search on.
    :param prefer_final: if the version is not mentioned and the latest
                         version is not a "final" one (alpha, beta, etc.),
                         pick the latest final version.
    :param prefer_source: if the distribution type is not mentioned, pick
                          the source distribution if available.
    :param follow_externals: tell whether external links should be followed
                             or not. Default is False.
    :param hosts: a list of hosts allowed to be processed when
                  follow_externals=True. Default behavior is to follow all
                  hosts.
    :param mirrors_url: the url to look on for DNS records giving mirror
                        addresses.
    :param mirrors: a list of mirrors (see PEP 381).
    :param timeout: time in seconds after which a request is considered to
                    have timed out.
    :param mirrors_max_tries: number of times to try requesting information
                              on a mirror before switching to the next one.
    :param verbose: if true, log a warning when an archive name cannot be
                    parsed.
    """
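    # A rough usage sketch (the project name and requirement string below are
    # purely illustrative):
    #
    #   crawler = Crawler(prefer_final=True)
    #   releases = crawler.get_releases("FooBar (>= 1.0)")
    #   release = crawler.get_release("FooBar (>= 1.0)")
    #   dists = crawler.get_distributions("FooBar", "1.0")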

    def __init__(self, index_url=DEFAULT_SIMPLE_INDEX_URL, prefer_final=False,
                 prefer_source=True, hosts=DEFAULT_HOSTS,
                 follow_externals=False, mirrors_url=None, mirrors=None,
                 timeout=SOCKET_TIMEOUT, mirrors_max_tries=0, verbose=False):
        super(Crawler, self).__init__(prefer_final, prefer_source)
        self.follow_externals = follow_externals
        self.verbose = verbose

        # mirroring attributes.
        parsed = urllib.parse.urlparse(index_url)
        self.scheme = parsed[0]
        if self.scheme == 'file':
            ender = os.path.sep
        else:
            ender = '/'
        if not index_url.endswith(ender):
            index_url += ender
        # if no mirrors are defined, use the method described in PEP 381.
        if mirrors is None:
            mirrors = get_mirrors(mirrors_url)
        self._mirrors = set(mirrors)
        self._mirrors_used = set()
        self.index_url = index_url
        self._mirrors_max_tries = mirrors_max_tries
        self._mirrors_tries = 0
        self._timeout = timeout

        # create a regexp to match all given hosts
        self._allowed_hosts = re.compile('|'.join(map(translate, hosts))).match

        # we keep an index of the pages we have processed, in order to avoid
        # scanning them multiple times (e.g. when several pages point to the
        # same resource)
        self._processed_urls = []
        self._projects = {}

    @with_mirror_support()
    def search_projects(self, name=None, **kwargs):
        """Search the index for projects containing the given name.

        Return a list of matching projects.
        """
        with self._open_url(self.index_url) as index:
            if '*' not in name:
                name = "%s%s%s" % ('*.?', name, '*.?')
            name = name.replace('*', '[^<]*')  # avoid matching end tag
            projectname = re.compile('<a[^>]*>(%s)</a>' % name, re.I)
            matching_projects = []

            index_content = index.read()

            # FIXME should use bytes I/O and regexes instead of decoding
            index_content = index_content.decode()

            for match in projectname.finditer(index_content):
                project_name = match.group(1)
                matching_projects.append(self._get_project(project_name))
            return matching_projects

    def get_releases(self, requirements, prefer_final=None,
                     force_update=False):
        """Search for releases and return a ReleasesList object containing
        the results.
        """
        predicate = get_version_predicate(requirements)
        if predicate.name.lower() in self._projects and not force_update:
            return self._projects.get(predicate.name.lower())
        prefer_final = self._get_prefer_final(prefer_final)
        logger.debug('Reading info on PyPI about %s', predicate.name)
        self._process_index_page(predicate.name)

        if predicate.name.lower() not in self._projects:
            raise ProjectNotFound

        releases = self._projects.get(predicate.name.lower())
        releases.sort_releases(prefer_final=prefer_final)
        return releases

    def get_release(self, requirements, prefer_final=None):
        """Return only one release that fulfills the given requirements."""
        predicate = get_version_predicate(requirements)
        release = self.get_releases(predicate, prefer_final)\
                      .get_last(predicate)
        if not release:
            raise ReleaseNotFound("No release matches the given criteria")
        return release

    def get_distributions(self, project_name, version):
        """Return the distributions found on the index for the given
        release."""
        # as the default behavior of get_release is to return a release
        # containing the distributions, just alias it.
        return self.get_release("%s (%s)" % (project_name, version))

    def get_metadata(self, project_name, version):
        """Return the metadata from the simple index.

        Currently, this downloads one archive, unpacks it and reads its
        PKG-INFO file.
        """
        release = self.get_distributions(project_name, version)
        if not release.metadata:
            location = release.get_distribution().unpack()
            pkg_info = os.path.join(location, 'PKG-INFO')
            release.metadata = Metadata(pkg_info)
        return release

    def _switch_to_next_mirror(self):
        """Switch to the next mirror (i.e. point self.index_url to the next
        mirror url).

        Raise a KeyError if all mirrors have been tried.
        """
        self._mirrors_used.add(self.index_url)
        index_url = self._mirrors.pop()
        # XXX use urllib.parse for a real check of missing scheme part
        if not index_url.startswith(("http://", "https://", "file://")):
            index_url = "http://%s" % index_url

        # normalize the mirror url so it ends with "/simple/", like the
        # default index url
        if index_url.endswith("/simple"):
            index_url += "/"
        elif not index_url.endswith("/simple/"):
            index_url = "%s/simple/" % index_url

        self.index_url = index_url

    def _is_browsable(self, url):
        """Tell if the given URL can be browsed or not.

        It uses the follow_externals and the hosts list to tell if the given
        url is browsable or not.
        """
        # if the index_url is contained in the given URL, we are browsing the
        # index, and it's always "browsable".
        # local files are always considered browsable resources
        if self.index_url in url or urllib.parse.urlparse(url)[0] == "file":
            return True
        elif self.follow_externals:
            if self._allowed_hosts(urllib.parse.urlparse(url)[1]):  # 1 is netloc
                return True
            else:
                return False
        return False

    def _is_distribution(self, link):
        """Tell if the given URL matches a distribution name or not."""
        #XXX find a better way to check that links are distributions
        # Using a regexp ?
        for ext in EXTENSIONS:
            if ext in link:
                return True
        return False

    def _register_release(self, release=None, release_info={}):
        """Register a new release.

        Either a release object or a release_info dict can be provided; the
        preferred (i.e. quicker) way is the dict.

        Return the list of existing releases for the given project.
        """
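        # An illustrative release_info dict (keys other than these three are
        # simply forwarded to add_release):
        #   {'name': 'FooBar', 'version': '1.0', 'dist_type': 'sdist',
        #    'url': 'http://example.org/FooBar-1.0.tar.gz'}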
        # Check if the project already has a list of releases (referring to
        # the project name). If not, create a new release list.
        # Then, add the release to the list.
        if release:
            name = release.name
        else:
            name = release_info['name']
        if name.lower() not in self._projects:
            self._projects[name.lower()] = ReleasesList(name, index=self._index)

        if release:
            self._projects[name.lower()].add_release(release=release)
        else:
            name = release_info.pop('name')
            version = release_info.pop('version')
            dist_type = release_info.pop('dist_type')
            self._projects[name.lower()].add_release(version, dist_type,
                                                     **release_info)
        return self._projects[name.lower()]

    def _process_url(self, url, project_name=None, follow_links=True):
        """Process a URL and search for distribution packages.

        For each URL found, if it's a download, creates a PyPIdistribution
        object. If it's a homepage and we can follow links, process it too.

        :param url: the url to process
        :param project_name: the project name we are searching for.
        :param follow_links: we only want to follow links one level deep;
                             this parameter tells if we want to follow the
                             links we find (e.g. run this method recursively
                             on them)
        """
        with self._open_url(url) as f:
            base_url = f.url
            if url not in self._processed_urls:
                self._processed_urls.append(url)
                link_matcher = self._get_link_matcher(url)
                for link, is_download in link_matcher(f.read().decode(), base_url):
                    if link not in self._processed_urls:
                        if self._is_distribution(link) or is_download:
                            self._processed_urls.append(link)
                            # it's a distribution, so create a dist object
                            try:
                                infos = get_infos_from_url(link, project_name,
                                    is_external=self.index_url not in url)
                            except CantParseArchiveName as e:
                                if self.verbose:
                                    logger.warning(
                                        "version has not been parsed: %s", e)
                            else:
                                self._register_release(release_info=infos)
                        else:
                            if self._is_browsable(link) and follow_links:
                                self._process_url(link, project_name,
                                                  follow_links=False)

    def _get_link_matcher(self, url):
        """Return the right link matcher function for the given url."""
        if self.index_url in url:
            return self._simple_link_matcher
        else:
            return self._default_link_matcher

    def _get_full_url(self, url, base_url):
        return urllib.parse.urljoin(base_url, self._htmldecode(url))

    def _simple_link_matcher(self, content, base_url):
        """Yield all links with a rel="download" or rel="homepage".

        This matches the simple index requirements for matching links.
        If follow_externals is set to False, don't yield the external
        urls.

        :param content: the content of the page we want to parse
        :param base_url: the url of this page.
        """
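        # On a simple index page, download links typically look like
        #   <a href="../../packages/source/f/foo/foo-1.0.tar.gz#md5=abc123">
        # (the "#md5=" fragment is what MD5_HASH matches below), while
        # rel="homepage"/rel="download" links point to external pages.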
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if MD5_HASH.match(url):
                yield (url, True)

        for match in REL.finditer(content):
            # search for rel links.
            tag, rel = match.groups()
            rels = [s.strip() for s in rel.lower().split(',')]
            if 'homepage' in rels or 'download' in rels:
                for match in HREF.finditer(tag):
                    url = self._get_full_url(match.group(1), base_url)
                    if 'download' in rels or self._is_browsable(url):
                        # yield a tuple (url, is_download)
                        yield (url, 'download' in rels)

    def _default_link_matcher(self, content, base_url):
        """Yield all links found on the page."""
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if self._is_browsable(url):
                yield (url, False)

    @with_mirror_support()
    def _process_index_page(self, name):
        """Find and process a PyPI page for the given project name.

        :param name: the name of the project whose page we want to process
        """
        # Browse and index the content of the given PyPI page.
        if self.scheme == 'file':
            ender = os.path.sep
        else:
            ender = '/'
        url = self.index_url + name + ender
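        # e.g. with the default index and the (illustrative) name "FooBar",
        # this requests http://a.pypi.python.org/simple/FooBar/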
        self._process_url(url, name)

    @socket_timeout()
    def _open_url(self, url):
        """Open a urllib request, handling HTTP authentication and local
        file support.
        """
        scheme, netloc, path, params, query, frag = urllib.parse.urlparse(url)

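        # e.g. for "http://user:password@example.org/simple/", splituser()
        # below returns the pair ("user:password", "example.org")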
        # authentication stuff
        if scheme in ('http', 'https'):
            auth, host = urllib.parse.splituser(netloc)
        else:
            auth = None

        # add index.html automatically for filesystem paths
        if scheme == 'file':
            if url.endswith(os.path.sep):
                url += "index.html"

        # add authorization headers if auth is provided
        if auth:
            auth = "Basic " + base64.standard_b64encode(
                urllib.parse.unquote(auth).encode('utf-8')).decode('ascii')
            new_url = urllib.parse.urlunparse((
                scheme, host, path, params, query, frag))
            request = urllib.request.Request(new_url)
            request.add_header("Authorization", auth)
        else:
            request = urllib.request.Request(url)
        request.add_header('User-Agent', USER_AGENT)
        try:
            fp = urllib.request.urlopen(request)
        except (ValueError, http.client.InvalidURL) as v:
            msg = ' '.join([str(arg) for arg in v.args])
            raise PackagingPyPIError('%s %s' % (url, msg))
        except urllib.error.HTTPError as v:
            return v
        except urllib.error.URLError as v:
            raise DownloadError("Download error for %s: %s" % (url, v.reason))
        except http.client.BadStatusLine as v:
            raise DownloadError('%s returned a bad status line. '
                                'The server might be down, %s' % (url, v.line))
        except http.client.HTTPException as v:
            raise DownloadError("Download error for %s: %s" % (url, v))
        except socket.timeout:
            raise DownloadError("The server timed out")

        if auth:
            # Put authentication info back into request URL if same host,
            # so that links found on the page will work
            s2, h2, path2, param2, query2, frag2 = \
                urllib.parse.urlparse(fp.url)
            if s2 == scheme and h2 == host:
                fp.url = urllib.parse.urlunparse(
                    (s2, netloc, path2, param2, query2, frag2))
        return fp

    def _decode_entity(self, match):
        what = match.group(1)
        if what.startswith('#x'):
            what = int(what[2:], 16)
        elif what.startswith('#'):
            what = int(what[1:])
        else:
            from html.entities import name2codepoint
            try:
                what = name2codepoint[what]
            except KeyError:
                # unknown named entity: leave the original text untouched
                return match.group(0)
        return chr(what)

    def _htmldecode(self, text):
        """Decode HTML entities in the given text."""
        return ENTITY_SUB(self._decode_entity, text)