"""Spider using the screen-scraping "simple" PyPI API.

This module contains the class Crawler, a simple spider that
can be used to find and retrieve distributions from a project index
(like the Python Package Index), using its so-called simple API (see
reference implementation available at http://pypi.python.org/simple/).
"""

import base64
import http.client
import re
import socket
import sys
import urllib.request
import urllib.parse
import urllib.error
import os

from fnmatch import translate
from functools import wraps
from packaging import logger
from packaging.metadata import Metadata
from packaging.version import get_version_predicate
from packaging import __version__ as packaging_version
from packaging.pypi.base import BaseClient
from packaging.pypi.dist import (ReleasesList, EXTENSIONS,
                                 get_infos_from_url, MD5_HASH)
from packaging.pypi.errors import (PackagingPyPIError, DownloadError,
                                   UnableToDownload, CantParseArchiveName,
                                   ReleaseNotFound, ProjectNotFound)
from packaging.pypi.mirrors import get_mirrors

__all__ = ['Crawler', 'DEFAULT_SIMPLE_INDEX_URL']

# -- Constants -----------------------------------------------
DEFAULT_SIMPLE_INDEX_URL = "http://a.pypi.python.org/simple/"
DEFAULT_HOSTS = ("*",)
SOCKET_TIMEOUT = 15
USER_AGENT = "Python-urllib/%s packaging/%s" % (
    sys.version[:3], packaging_version)

# -- Regexps -------------------------------------------------
EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$')
HREF = re.compile("""href\\s*=\\s*['"]?([^'"> ]+)""", re.I)
URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match

# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
ENTITY_SUB = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
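# For instance (illustrative values, not from the original module):
#   HREF.search('<a href="/simple/foo/">foo</a>').group(1) == '/simple/foo/'
#   ENTITY_SUB(lambda match: '?', 'a&amp;b') == 'a?b'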


def socket_timeout(timeout=SOCKET_TIMEOUT):
    """Decorator to add a socket timeout when requesting pages on PyPI."""
    def wrapper(func):
        @wraps(func)
        def wrapped(self, *args, **kwargs):
            old_timeout = socket.getdefaulttimeout()
            # use the instance's _timeout when it defines one, else fall
            # back on the decorator's default
            new_timeout = getattr(self, "_timeout", timeout)
            socket.setdefaulttimeout(new_timeout)
            try:
                return func(self, *args, **kwargs)
            finally:
                socket.setdefaulttimeout(old_timeout)
        return wrapped
    return wrapper
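# Usage sketch: applied as @socket_timeout() on the network-facing methods
# below (see _open_url), so each request runs under the crawler's timeout.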
Tarek Ziade1231a4e2011-05-19 13:07:25 +020068
69
70def with_mirror_support():
71 """Decorator that makes the mirroring support easier"""
72 def wrapper(func):
Éric Araujo3c8ca082011-06-17 21:10:21 +020073 @wraps(func)
Tarek Ziade1231a4e2011-05-19 13:07:25 +020074 def wrapped(self, *args, **kwargs):
75 try:
76 return func(self, *args, **kwargs)
77 except DownloadError:
78 # if an error occurs, try with the next index_url
79 if self._mirrors_tries >= self._mirrors_max_tries:
80 try:
81 self._switch_to_next_mirror()
82 except KeyError:
83 raise UnableToDownload("Tried all mirrors")
84 else:
85 self._mirrors_tries += 1
86 self._projects.clear()
87 return wrapped(self, *args, **kwargs)
88 return wrapped
89 return wrapper
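# Note: on DownloadError the decorated call clears the project cache and
# retries itself against the next mirror, until every mirror has been used
# and UnableToDownload is raised.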


class Crawler(BaseClient):
    """Provides useful tools to request the Python Package Index simple API.

    You can specify both mirrors and mirrors_url, but mirrors_url will only
    be used if mirrors is set to None.

    :param index_url: the url of the simple index to search on.
    :param prefer_final: if the version is not mentioned and the last version
                         is not a "final" one (alpha, beta, etc.), pick up
                         the last final version.
    :param prefer_source: if the distribution type is not mentioned, pick up
                          the source one if available.
    :param follow_externals: tell if following external links is needed or
                             not. Default is False.
    :param hosts: a list of hosts allowed to be processed while using
                  follow_externals=True. Default behavior is to follow all
                  hosts.
    :param mirrors_url: the url to look on for DNS records giving mirror
                        addresses.
    :param mirrors: a list of mirrors (see PEP 381).
    :param timeout: time in seconds after which a url is considered to have
                    timed out.
    :param mirrors_max_tries: number of times to try requesting information
                              on mirrors before switching.
    """
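
    # A short usage sketch (illustrative, not part of the original module;
    # it assumes the index is reachable and hosts a project named 'FooBar'):
    #
    #   crawler = Crawler(prefer_final=True)
    #   releases = crawler.get_releases('FooBar')          # ReleasesList
    #   release = crawler.get_release('FooBar (< 1.2)')    # best single match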

    def __init__(self, index_url=DEFAULT_SIMPLE_INDEX_URL, prefer_final=False,
                 prefer_source=True, hosts=DEFAULT_HOSTS,
                 follow_externals=False, mirrors_url=None, mirrors=None,
                 timeout=SOCKET_TIMEOUT, mirrors_max_tries=0):
        super(Crawler, self).__init__(prefer_final, prefer_source)
        self.follow_externals = follow_externals

        # mirroring attributes.
        parsed = urllib.parse.urlparse(index_url)
        self.scheme = parsed[0]
        if self.scheme == 'file':
            ender = os.path.sep
        else:
            ender = '/'
        if not index_url.endswith(ender):
            index_url += ender
        # if no mirrors are defined, use the method described in PEP 381.
        if mirrors is None:
            mirrors = get_mirrors(mirrors_url)
        self._mirrors = set(mirrors)
        self._mirrors_used = set()
        self.index_url = index_url
        self._mirrors_max_tries = mirrors_max_tries
        self._mirrors_tries = 0
        self._timeout = timeout

        # create a regexp to match all given hosts
        self._allowed_hosts = re.compile('|'.join(map(translate, hosts))).match

        # we keep an index of pages we have processed, in order to avoid
        # scanning them multiple times (e.g. when several pages point to
        # the same one)
        self._processed_urls = []
        self._projects = {}

    @with_mirror_support()
    def search_projects(self, name=None, **kwargs):
        """Search the index for projects containing the given name.

        Return a list of matching projects.
        """
        if '*' not in name:
            # surround the name with wildcards so substring matches are found
            name = "%s%s%s" % ('*.?', name, '*.?')
        name = name.replace('*', '[^<]*')  # avoid matching end tag
        pattern = ('<a[^>]*>(%s)</a>' % name).encode('utf-8')
        projectname = re.compile(pattern, re.I)
        matching_projects = []

        with self._open_url(self.index_url) as index:
            index_content = index.read()

        for match in projectname.finditer(index_content):
            project_name = match.group(1).decode('utf-8')
            matching_projects.append(self._get_project(project_name))
        return matching_projects
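
    # For instance (a sketch; 'foo' is an arbitrary query string):
    #   crawler.search_projects('foo')   # projects whose name contains 'foo'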

    def get_releases(self, requirements, prefer_final=None,
                     force_update=False):
        """Search for releases and return a ReleasesList object containing
        the results.
        """
        predicate = get_version_predicate(requirements)
        if predicate.name.lower() in self._projects and not force_update:
            return self._projects.get(predicate.name.lower())
        prefer_final = self._get_prefer_final(prefer_final)
        logger.debug('Reading info on PyPI about %s', predicate.name)
        self._process_index_page(predicate.name)

        if predicate.name.lower() not in self._projects:
            raise ProjectNotFound

        releases = self._projects.get(predicate.name.lower())
        releases.sort_releases(prefer_final=prefer_final)
        return releases

    def get_release(self, requirements, prefer_final=None):
        """Return only one release that fulfills the given requirements"""
        predicate = get_version_predicate(requirements)
        release = self.get_releases(predicate, prefer_final)\
                      .get_last(predicate)
        if not release:
            raise ReleaseNotFound("No release matches the given criteria")
        return release

    def get_distributions(self, project_name, version):
        """Return the distributions found on the index for the given
        release."""
        # as the default behavior of get_release is to return a release
        # containing the distributions, just alias it.
        return self.get_release("%s (%s)" % (project_name, version))

    def get_metadata(self, project_name, version):
        """Return the metadata from the simple index.

        Currently, this downloads one archive, unpacks it and uses the
        PKG-INFO file.
        """
        release = self.get_distributions(project_name, version)
        if not release.metadata:
            location = release.get_distribution().unpack()
            pkg_info = os.path.join(location, 'PKG-INFO')
            release.metadata = Metadata(pkg_info)
        return release
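
    # Illustrative only (assumes the index hosts 'FooBar' 1.1 with a
    # downloadable archive):
    #   release = crawler.get_metadata('FooBar', '1.1')
    #   pkg_metadata = release.metadata   # a packaging.metadata.Metadata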

    def _switch_to_next_mirror(self):
        """Switch to the next mirror (i.e. point self.index_url to the next
        mirror url).

        Raise a KeyError if all mirrors have been tried.
        """
        self._mirrors_used.add(self.index_url)
        index_url = self._mirrors.pop()
        # XXX use urllib.parse for a real check of missing scheme part
        if not index_url.startswith(("http://", "https://", "file://")):
            index_url = "http://%s" % index_url

        if not index_url.endswith("/simple"):
            index_url = "%s/simple/" % index_url

        self.index_url = index_url

    def _is_browsable(self, url):
        """Tell if the given URL can be browsed or not.

        It uses the follow_externals and the hosts list to tell if the given
        url is browsable or not.
        """
        # if _index_url is contained in the given URL, we are browsing the
        # index, and it's always "browsable".
        # local files are always considered browsable resources
        if self.index_url in url or urllib.parse.urlparse(url)[0] == "file":
            return True
        elif self.follow_externals:
            if self._allowed_hosts(urllib.parse.urlparse(url)[1]):  # netloc
                return True
            else:
                return False
        return False

    def _is_distribution(self, link):
        """Tell if the given URL matches a distribution name or not."""
        # XXX find a better way to check that links are distributions
        # (using a regexp?)
        for ext in EXTENSIONS:
            if ext in link:
                return True
        return False

    def _register_release(self, release=None, release_info=None):
        """Register a new release.

        Either a release object or a dict of release information can be
        provided; the preferred (i.e. quicker) way is the dict.

        Return the list of existing releases for the given project.
        """
        # Check if the project already has a list of releases (referring to
        # the project name). If not, create a new release list.
        # Then, add the release to the list.
        if release_info is None:  # avoid a shared mutable default argument
            release_info = {}
        if release:
            name = release.name
        else:
            name = release_info['name']
        if name.lower() not in self._projects:
            self._projects[name.lower()] = ReleasesList(name, index=self._index)

        if release:
            self._projects[name.lower()].add_release(release=release)
        else:
            name = release_info.pop('name')
            version = release_info.pop('version')
            dist_type = release_info.pop('dist_type')
            self._projects[name.lower()].add_release(version, dist_type,
                                                     **release_info)
        return self._projects[name.lower()]

    def _process_url(self, url, project_name=None, follow_links=True):
        """Process a url and search for distribution packages.

        For each URL found, if it's a download, create a PyPI distribution
        object. If it's a homepage and we can follow links, process it too.

        :param url: the url to process
        :param project_name: the project name we are searching for.
        :param follow_links: tell if we want to follow the links we find
                             (i.e. run this method recursively on them);
                             links are never followed more than one level
                             deep.
        """
        with self._open_url(url) as f:
            base_url = f.url
            if url not in self._processed_urls:
                self._processed_urls.append(url)
                link_matcher = self._get_link_matcher(url)
                for link, is_download in link_matcher(f.read().decode(),
                                                      base_url):
                    if link not in self._processed_urls:
                        if self._is_distribution(link) or is_download:
                            self._processed_urls.append(link)
                            # it's a distribution, so create a dist object
                            try:
                                infos = get_infos_from_url(link, project_name,
                                            is_external=self.index_url not in url)
                            except CantParseArchiveName as e:
                                logger.warning(
                                    "version has not been parsed: %s", e)
                            else:
                                self._register_release(release_info=infos)
                        else:
                            if self._is_browsable(link) and follow_links:
                                self._process_url(link, project_name,
                                                  follow_links=False)

    def _get_link_matcher(self, url):
        """Return the right link matcher function for the given url"""
        if self.index_url in url:
            return self._simple_link_matcher
        else:
            return self._default_link_matcher

    def _get_full_url(self, url, base_url):
        return urllib.parse.urljoin(base_url, self._htmldecode(url))

    def _simple_link_matcher(self, content, base_url):
        """Yield all links with a rel="download" or rel="homepage".

        This matches the simple index requirements for matching links.
        If follow_externals is set to False, don't yield the external
        urls.

        :param content: the content of the page we want to parse
        :param base_url: the url of this page.
        """
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if MD5_HASH.match(url):
                yield (url, True)

        for match in REL.finditer(content):
            # search for rel links.
            tag, rel = match.groups()
            rels = [s.strip() for s in rel.lower().split(',')]
            if 'homepage' in rels or 'download' in rels:
                for match in HREF.finditer(tag):
                    url = self._get_full_url(match.group(1), base_url)
                    if 'download' in rels or self._is_browsable(url):
                        # yield a tuple (url, is_download)
                        yield (url, 'download' in rels)
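
    # For example, an (illustrative) page fragment such as
    #   <a rel="download" href="dist/foo-1.0.tar.gz">foo 1.0</a>
    # makes this method yield (urljoin(base_url, 'dist/foo-1.0.tar.gz'), True),
    # the True flag coming from the rel="download" attribute.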

    def _default_link_matcher(self, content, base_url):
        """Yield all links found on the page."""
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if self._is_browsable(url):
                yield (url, False)

    @with_mirror_support()
    def _process_index_page(self, name):
        """Find and process a PyPI page for the given project name.

        :param name: the name of the project whose page we want to process
        """
        # Browse and index the content of the given PyPI page.
        if self.scheme == 'file':
            ender = os.path.sep
        else:
            ender = '/'
        url = self.index_url + name + ender
        self._process_url(url, name)

    @socket_timeout()
    def _open_url(self, url):
        """Open a urllib request, handling HTTP authentication and local
        file support.
        """
        scheme, netloc, path, params, query, frag = urllib.parse.urlparse(url)

        # authentication stuff
        if scheme in ('http', 'https'):
            auth, host = urllib.parse.splituser(netloc)
        else:
            auth = None

        # add index.html automatically for filesystem paths
        if scheme == 'file':
            if url.endswith(os.path.sep):
                url += "index.html"

        # add authorization headers if auth is provided
        if auth:
            # build the Basic Authorization header (base64-encoded
            # "user:password" credentials)
            auth = "Basic " + base64.standard_b64encode(
                urllib.parse.unquote(auth).encode('utf-8')).decode('ascii')
            new_url = urllib.parse.urlunparse((
                scheme, host, path, params, query, frag))
            request = urllib.request.Request(new_url)
            request.add_header("Authorization", auth)
        else:
            request = urllib.request.Request(url)
        request.add_header('User-Agent', USER_AGENT)
        try:
            fp = urllib.request.urlopen(request)
        except (ValueError, http.client.InvalidURL) as v:
            msg = ' '.join([str(arg) for arg in v.args])
            raise PackagingPyPIError('%s %s' % (url, msg))
        except urllib.error.HTTPError as v:
            return v
        except urllib.error.URLError as v:
            raise DownloadError("Download error for %s: %s" % (url, v.reason))
        except http.client.BadStatusLine as v:
            raise DownloadError('%s returned a bad status line. '
                                'The server might be down, %s' % (url, v.line))
        except http.client.HTTPException as v:
            raise DownloadError("Download error for %s: %s" % (url, v))
        except socket.timeout:
            raise DownloadError("The server timed out")

        if auth:
            # Put authentication info back into the request URL if it is the
            # same host, so that links found on the page will work
            s2, h2, path2, param2, query2, frag2 = \
                urllib.parse.urlparse(fp.url)
            if s2 == scheme and h2 == host:
                fp.url = urllib.parse.urlunparse(
                    (s2, netloc, path2, param2, query2, frag2))
        return fp

    def _decode_entity(self, match):
        what = match.group(1)
        if what.startswith('#x'):
            return chr(int(what[2:], 16))
        elif what.startswith('#'):
            return chr(int(what[1:]))
        from html.entities import name2codepoint
        if what in name2codepoint:
            return chr(name2codepoint[what])
        # leave unknown named references as they are
        return match.group(0)

    def _htmldecode(self, text):
        """Decode HTML entities in the given text."""
        return ENTITY_SUB(self._decode_entity, text)
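
    # For instance, self._htmldecode('foo&amp;bar&#47;baz') returns
    # 'foo&bar/baz' (named, decimal and hexadecimal references are handled).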