"""Spider using the screen-scraping "simple" PyPI API.

This module contains the class Crawler, a simple spider that
can be used to find and retrieve distributions from a project index
(like the Python Package Index), using its so-called simple API (see
reference implementation available at http://pypi.python.org/simple/).
"""

import base64  # used to build the HTTP basic auth header in _open_url
import http.client
import re
import socket
import sys
import urllib.request
import urllib.parse
import urllib.error
import os


from fnmatch import translate
from packaging import logger
from packaging.metadata import Metadata
from packaging.version import get_version_predicate
from packaging import __version__ as packaging_version
from packaging.pypi.base import BaseClient
from packaging.pypi.dist import (ReleasesList, EXTENSIONS,
                                 get_infos_from_url, MD5_HASH)
from packaging.pypi.errors import (PackagingPyPIError, DownloadError,
                                   UnableToDownload, CantParseArchiveName,
                                   ReleaseNotFound, ProjectNotFound)
from packaging.pypi.mirrors import get_mirrors

__all__ = ['Crawler', 'DEFAULT_SIMPLE_INDEX_URL']

# -- Constants -----------------------------------------------
DEFAULT_SIMPLE_INDEX_URL = "http://a.pypi.python.org/simple/"
DEFAULT_HOSTS = ("*",)
SOCKET_TIMEOUT = 15
USER_AGENT = "Python-urllib/%s packaging/%s" % (
    sys.version[:3], packaging_version)

# -- Regexps -------------------------------------------------
EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$')
HREF = re.compile("""href\\s*=\\s*['"]?([^'"> ]+)""", re.I)
URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match

# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
ENTITY_SUB = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)


def socket_timeout(timeout=SOCKET_TIMEOUT):
    """Decorator to add a socket timeout when requesting pages on PyPI.
    """
    def _socket_timeout(func):
        def _socket_timeout(self, *args, **kwargs):
            old_timeout = socket.getdefaulttimeout()
            if hasattr(self, "_timeout"):
                timeout = self._timeout
            socket.setdefaulttimeout(timeout)
            try:
                return func(self, *args, **kwargs)
            finally:
                socket.setdefaulttimeout(old_timeout)
        return _socket_timeout
    return _socket_timeout


def with_mirror_support():
    """Decorator that adds mirror fallback support to a Crawler method."""
    def wrapper(func):
        def wrapped(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except DownloadError:
                # if an error occurs, try with the next index_url
                if self._mirrors_tries >= self._mirrors_max_tries:
                    try:
                        self._switch_to_next_mirror()
                    except KeyError:
                        raise UnableToDownload("Tried all mirrors")
                else:
                    self._mirrors_tries += 1
                self._projects.clear()
                return wrapped(self, *args, **kwargs)
        return wrapped
    return wrapper


class Crawler(BaseClient):
    """Provides useful tools to request the Python Package Index simple API.

    You can specify both mirrors and mirrors_url, but mirrors_url will only be
    used if mirrors is set to None.

    :param index_url: the url of the simple index to search on.
    :param prefer_final: if the version is not mentioned, and the last
                         version is not a "final" one (alpha, beta, etc.),
                         pick up the last final version.
    :param prefer_source: if the distribution type is not mentioned, pick up
                          the source one if available.
    :param follow_externals: tell if following external links is needed or
                             not. Default is False.
    :param hosts: a list of hosts allowed to be processed while using
                  follow_externals=True. Default behavior is to follow all
                  hosts.
    :param mirrors_url: the url to look on for DNS records giving mirror
                        addresses.
    :param mirrors: a list of mirrors (see PEP 381).
    :param timeout: time in seconds to consider a url has timed out.
    :param mirrors_max_tries: number of times to try requesting information
                              on mirrors before switching.
    """
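
    # Example construction (a sketch with illustrative values; the hosts
    # pattern and the timeout below are assumptions, not library defaults):
    #
    #   crawler = Crawler(index_url=DEFAULT_SIMPLE_INDEX_URL,
    #                     follow_externals=True,
    #                     hosts=("*.python.org",),
    #                     timeout=30)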

    def __init__(self, index_url=DEFAULT_SIMPLE_INDEX_URL, prefer_final=False,
                 prefer_source=True, hosts=DEFAULT_HOSTS,
                 follow_externals=False, mirrors_url=None, mirrors=None,
                 timeout=SOCKET_TIMEOUT, mirrors_max_tries=0, verbose=False):
        super(Crawler, self).__init__(prefer_final, prefer_source)
        self.follow_externals = follow_externals
        self.verbose = verbose

        # mirroring attributes.
        parsed = urllib.parse.urlparse(index_url)
        self.scheme = parsed[0]
        if self.scheme == 'file':
            ender = os.path.sep
        else:
            ender = '/'
        if not index_url.endswith(ender):
            index_url += ender
        # if no mirrors are defined, use the method described in PEP 381.
        if mirrors is None:
            mirrors = get_mirrors(mirrors_url)
        self._mirrors = set(mirrors)
        self._mirrors_used = set()
        self.index_url = index_url
        self._mirrors_max_tries = mirrors_max_tries
        self._mirrors_tries = 0
        self._timeout = timeout

        # create a regexp to match all given hosts
        self._allowed_hosts = re.compile('|'.join(map(translate, hosts))).match

        # we keep an index of pages we have processed, in order to avoid
        # scanning them multiple times (e.g. if multiple pages point to the
        # same one)
        self._processed_urls = []
        self._projects = {}

    @with_mirror_support()
    def search_projects(self, name=None, **kwargs):
        """Search the index for projects containing the given name.

        Return a list of names.
        """
        with self._open_url(self.index_url) as index:
            if '*' not in name:
                # no wildcard given: search for names containing the string
                name = "%s%s%s" % ('*.?', name, '*.?')
            name = name.replace('*', '[^<]*')  # avoid matching end tag
            projectname = re.compile('<a[^>]*>(%s)</a>' % name, re.I)
            matching_projects = []

            index_content = index.read()

            # FIXME should use bytes I/O and regexes instead of decoding
            index_content = index_content.decode()

            for match in projectname.finditer(index_content):
                project_name = match.group(1)
                matching_projects.append(self._get_project(project_name))
            return matching_projects

    def get_releases(self, requirements, prefer_final=None,
                     force_update=False):
        """Search for releases and return a ReleasesList object containing
        the results.
        """
        predicate = get_version_predicate(requirements)
        if predicate.name.lower() in self._projects and not force_update:
            return self._projects.get(predicate.name.lower())
        prefer_final = self._get_prefer_final(prefer_final)
        logger.debug('Reading info on PyPI about %s', predicate.name)
        self._process_index_page(predicate.name)

        if predicate.name.lower() not in self._projects:
            raise ProjectNotFound()

        releases = self._projects.get(predicate.name.lower())
        releases.sort_releases(prefer_final=prefer_final)
        return releases

    def get_release(self, requirements, prefer_final=None):
        """Return only one release that fulfills the given requirements."""
        predicate = get_version_predicate(requirements)
        release = self.get_releases(predicate, prefer_final)\
                      .get_last(predicate)
        if not release:
            raise ReleaseNotFound("No release matches the given criteria")
        return release

    def get_distributions(self, project_name, version):
        """Return the distributions found on the index for the given
        release."""
        # as the default behavior of get_release is to return a release
        # containing the distributions, just alias it.
        return self.get_release("%s (%s)" % (project_name, version))

    def get_metadata(self, project_name, version):
        """Return the metadata from the simple index.

        Currently, this downloads one archive, unpacks it and uses the
        PKG-INFO file.
        """
        release = self.get_distributions(project_name, version)
        if not release.metadata:
            location = release.get_distribution().unpack()
            pkg_info = os.path.join(location, 'PKG-INFO')
            release.metadata = Metadata(pkg_info)
        return release

    def _switch_to_next_mirror(self):
        """Switch to the next mirror (i.e. point self.index_url to the next
        mirror url).

        Raise a KeyError if all mirrors have been tried.
        """
        self._mirrors_used.add(self.index_url)
        index_url = self._mirrors.pop()
        # make sure the mirror url has a scheme; default to http
        if not index_url.startswith(("http://", "https://", "file://")):
            index_url = "http://%s" % index_url

        if not index_url.endswith("/simple"):
            index_url = "%s/simple/" % index_url

        self.index_url = index_url

    def _is_browsable(self, url):
        """Tell if the given URL can be browsed or not.

        It uses the follow_externals and the hosts list to tell if the given
        url is browsable or not.
        """
        # if _index_url is contained in the given URL, we are browsing the
        # index, and it's always "browsable".
        # local files are always considered browsable resources
        if self.index_url in url or urllib.parse.urlparse(url)[0] == "file":
            return True
        elif self.follow_externals:
            if self._allowed_hosts(urllib.parse.urlparse(url)[1]):  # 1 is netloc
                return True
            else:
                return False
        return False

    def _is_distribution(self, link):
        """Tell if the given URL matches a distribution name or not.
        """
        #XXX find a better way to check that links are distributions
        # Using a regexp ?
        for ext in EXTENSIONS:
            if ext in link:
                return True
        return False

    def _register_release(self, release=None, release_info={}):
        """Register a new release.

        Either a release object or a dict of release_info can be provided;
        the preferred way (i.e. the quicker one) is the dict.

        Return the list of existing releases for the given project.
        """
        # Check if the project already has a list of releases (referring to
        # the project name). If not, create a new release list.
        # Then, add the release to the list.
        if release:
            name = release.name
        else:
            name = release_info['name']
        if name.lower() not in self._projects:
            self._projects[name.lower()] = ReleasesList(name, index=self._index)

        if release:
            self._projects[name.lower()].add_release(release=release)
        else:
            name = release_info.pop('name')
            version = release_info.pop('version')
            dist_type = release_info.pop('dist_type')
            self._projects[name.lower()].add_release(version, dist_type,
                                                     **release_info)
        return self._projects[name.lower()]

    def _process_url(self, url, project_name=None, follow_links=True):
        """Process a URL and search for distribution packages.

        For each URL found, if it's a download, create a PyPIdistribution
        object. If it's a homepage and we can follow links, process it too.

        :param url: the url to process
        :param project_name: the project name we are searching for.
        :param follow_links: tell if we want to follow the links we find
                             (e.g. run this method recursively on them).
                             Links are never followed more than one level
                             deep.
        """
        with self._open_url(url) as f:
            base_url = f.url
            if url not in self._processed_urls:
                self._processed_urls.append(url)
                link_matcher = self._get_link_matcher(url)
                for link, is_download in link_matcher(f.read().decode(), base_url):
                    if link not in self._processed_urls:
                        if self._is_distribution(link) or is_download:
                            self._processed_urls.append(link)
                            # it's a distribution, so create a dist object
                            try:
                                infos = get_infos_from_url(link, project_name,
                                    is_external=self.index_url not in url)
                            except CantParseArchiveName as e:
                                if self.verbose:
                                    logger.warning(
                                        "version has not been parsed: %s", e)
                            else:
                                self._register_release(release_info=infos)
                        else:
                            if self._is_browsable(link) and follow_links:
                                self._process_url(link, project_name,
                                                  follow_links=False)

    def _get_link_matcher(self, url):
        """Return the right link matcher function for the given url.
        """
        if self.index_url in url:
            return self._simple_link_matcher
        else:
            return self._default_link_matcher

    def _get_full_url(self, url, base_url):
        return urllib.parse.urljoin(base_url, self._htmldecode(url))

    def _simple_link_matcher(self, content, base_url):
        """Yield all links with a rel="download" or rel="homepage".

        This matches the simple index requirements for matching links.
        If follow_externals is set to False, don't yield the external
        urls.

        :param content: the content of the page we want to parse
        :param base_url: the url of this page.
        """
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if MD5_HASH.match(url):
                yield (url, True)

        for match in REL.finditer(content):
            # search for rel links.
            tag, rel = match.groups()
            rels = [s.strip() for s in rel.lower().split(',')]
            if 'homepage' in rels or 'download' in rels:
                for match in HREF.finditer(tag):
                    url = self._get_full_url(match.group(1), base_url)
                    if 'download' in rels or self._is_browsable(url):
                        # yield a tuple of (url, is_download)
                        yield (url, 'download' in rels)

    def _default_link_matcher(self, content, base_url):
        """Yield all links found on the page.
        """
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if self._is_browsable(url):
                yield (url, False)

    @with_mirror_support()
    def _process_index_page(self, name):
        """Find and process a PyPI page for the given project name.

        :param name: the name of the project to find the page for.
        """
        # Browse and index the content of the given PyPI page.
        if self.scheme == 'file':
            ender = os.path.sep
        else:
            ender = '/'
        url = self.index_url + name + ender
        self._process_url(url, name)

    @socket_timeout()
    def _open_url(self, url):
        """Open a urllib request, handling HTTP authentication and local
        files.
        """
        scheme, netloc, path, params, query, frag = urllib.parse.urlparse(url)

        # authentication stuff
        if scheme in ('http', 'https'):
            auth, host = urllib.parse.splituser(netloc)
        else:
            auth = None

        # add index.html automatically for filesystem paths
        if scheme == 'file':
            if url.endswith(os.path.sep):
                url += "index.html"

        # add authorization headers if auth is provided
        if auth:
            # str.encode('base64') does not exist in Python 3; build the HTTP
            # basic auth header with the base64 module instead
            auth = "Basic " + base64.standard_b64encode(
                urllib.parse.unquote(auth).encode('utf-8')).decode('ascii')
            new_url = urllib.parse.urlunparse((
                scheme, host, path, params, query, frag))
            request = urllib.request.Request(new_url)
            request.add_header("Authorization", auth)
        else:
            request = urllib.request.Request(url)
        request.add_header('User-Agent', USER_AGENT)
        try:
            fp = urllib.request.urlopen(request)
        except (ValueError, http.client.InvalidURL) as v:
            msg = ' '.join([str(arg) for arg in v.args])
            raise PackagingPyPIError('%s %s' % (url, msg))
        except urllib.error.HTTPError as v:
            return v
        except urllib.error.URLError as v:
            raise DownloadError("Download error for %s: %s" % (url, v.reason))
        except http.client.BadStatusLine as v:
            raise DownloadError('%s returned a bad status line. '
                                'The server might be down, %s' % (url, v.line))
        except http.client.HTTPException as v:
            raise DownloadError("Download error for %s: %s" % (url, v))
        except socket.timeout:
            raise DownloadError("The server timed out")

        if auth:
            # Put authentication info back into request URL if same host,
            # so that links found on the page will work
            s2, h2, path2, param2, query2, frag2 = \
                urllib.parse.urlparse(fp.url)
            if s2 == scheme and h2 == host:
                fp.url = urllib.parse.urlunparse(
                    (s2, netloc, path2, param2, query2, frag2))
        return fp

    def _decode_entity(self, match):
        """Turn a matched HTML entity into the corresponding character."""
        what = match.group(1)
        if what.startswith('#x'):
            what = int(what[2:], 16)
        elif what.startswith('#'):
            what = int(what[1:])
        else:
            from html.entities import name2codepoint
            what = name2codepoint.get(what, match.group(0))
        return chr(what)

    def _htmldecode(self, text):
        """Decode HTML entities in the given text."""
        return ENTITY_SUB(self._decode_entity, text)