Blame - Lib/packaging/pypi/simple.py - platform/external/python/cpython3

blob: ee7a1137547e50d46c42b43456a47e1993cffaeb [file] [log] [blame]

Tarek Ziade	1231a4e	2011-05-19 13:07:25 +0200	[diff] [blame]	1	"""Spider using the screen-scraping "simple" PyPI API.
				2
				3	This module contains the class Crawler, a simple spider that
				4	can be used to find and retrieve distributions from a project index
				5	(like the Python Package Index), using its so-called simple API (see
				6	reference implementation available at http://pypi.python.org/simple/).
				7	"""
				8
				9	import http.client
				10	import re
				11	import socket
				12	import sys
				13	import urllib.request
				14	import urllib.parse
				15	import urllib.error
				16	import os
				17
				18
				19	from fnmatch import translate
				20	from packaging import logger
				21	from packaging.metadata import Metadata
				22	from packaging.version import get_version_predicate
				23	from packaging import __version__ as packaging_version
				24	from packaging.pypi.base import BaseClient
				25	from packaging.pypi.dist import (ReleasesList, EXTENSIONS,
				26	get_infos_from_url, MD5_HASH)
				27	from packaging.pypi.errors import (PackagingPyPIError, DownloadError,
				28	UnableToDownload, CantParseArchiveName,
				29	ReleaseNotFound, ProjectNotFound)
				30	from packaging.pypi.mirrors import get_mirrors
				31	from packaging.metadata import Metadata
				32
				33	__all__ = ['Crawler', 'DEFAULT_SIMPLE_INDEX_URL']
				34
				35	# -- Constants -----------------------------------------------
				36	DEFAULT_SIMPLE_INDEX_URL = "http://a.pypi.python.org/simple/"
				37	DEFAULT_HOSTS = ("*",)
				38	SOCKET_TIMEOUT = 15
				39	USER_AGENT = "Python-urllib/%s packaging/%s" % (
				40	sys.version[:3], packaging_version)
				41
				42	# -- Regexps -------------------------------------------------
				43	EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$')
				44	HREF = re.compile("""href\\s=\\s['"]?([^'"> ]+)""", re.I)
				45	URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match
				46
				47	# This pattern matches a character entity reference (a decimal numeric
				48	# references, a hexadecimal numeric reference, or a named reference).
				49	ENTITY_SUB = re.compile(r'&(#(\d+\|x[\da-fA-F]+)\|[\w.:-]+);?').sub
				50	REL = re.compile("""<([^>]\srel\s=\s['"]?([^'">]+)[^>])>""", re.I)
				51
				52
				53	def socket_timeout(timeout=SOCKET_TIMEOUT):
				54	"""Decorator to add a socket timeout when requesting pages on PyPI.
				55	"""
				56	def _socket_timeout(func):
				57	def _socket_timeout(self, args, *kwargs):
				58	old_timeout = socket.getdefaulttimeout()
				59	if hasattr(self, "_timeout"):
				60	timeout = self._timeout
				61	socket.setdefaulttimeout(timeout)
				62	try:
				63	return func(self, args, *kwargs)
				64	finally:
				65	socket.setdefaulttimeout(old_timeout)
				66	return _socket_timeout
				67	return _socket_timeout
				68
				69
				70	def with_mirror_support():
				71	"""Decorator that makes the mirroring support easier"""
				72	def wrapper(func):
				73	def wrapped(self, args, *kwargs):
				74	try:
				75	return func(self, args, *kwargs)
				76	except DownloadError:
				77	# if an error occurs, try with the next index_url
				78	if self._mirrors_tries >= self._mirrors_max_tries:
				79	try:
				80	self._switch_to_next_mirror()
				81	except KeyError:
				82	raise UnableToDownload("Tried all mirrors")
				83	else:
				84	self._mirrors_tries += 1
				85	self._projects.clear()
				86	return wrapped(self, args, *kwargs)
				87	return wrapped
				88	return wrapper
				89
				90
				91	class Crawler(BaseClient):
				92	"""Provides useful tools to request the Python Package Index simple API.
				93
				94	You can specify both mirrors and mirrors_url, but mirrors_url will only be
				95	used if mirrors is set to None.
				96
				97	:param index_url: the url of the simple index to search on.
				98	:param prefer_final: if the version is not mentioned, and the last
				99	version is not a "final" one (alpha, beta, etc.),
				100	pick up the last final version.
				101	:param prefer_source: if the distribution type is not mentioned, pick up
				102	the source one if available.
				103	:param follow_externals: tell if following external links is needed or
				104	not. Default is False.
				105	:param hosts: a list of hosts allowed to be processed while using
				106	follow_externals=True. Default behavior is to follow all
				107	hosts.
				108	:param follow_externals: tell if following external links is needed or
				109	not. Default is False.
				110	:param mirrors_url: the url to look on for DNS records giving mirror
				111	adresses.
				112	:param mirrors: a list of mirrors (see PEP 381).
				113	:param timeout: time in seconds to consider a url has timeouted.
				114	:param mirrors_max_tries": number of times to try requesting informations
				115	on mirrors before switching.
				116	"""
				117
				118	def __init__(self, index_url=DEFAULT_SIMPLE_INDEX_URL, prefer_final=False,
				119	prefer_source=True, hosts=DEFAULT_HOSTS,
				120	follow_externals=False, mirrors_url=None, mirrors=None,
				121	timeout=SOCKET_TIMEOUT, mirrors_max_tries=0):
				122	super(Crawler, self).__init__(prefer_final, prefer_source)
				123	self.follow_externals = follow_externals
				124
				125	# mirroring attributes.
Tarek Ziade	cc243cc	2011-05-21 22:47:40 +0200	[diff] [blame]	126	parsed = urllib.parse.urlparse(index_url)
				127	self.scheme = parsed[0]
				128	if self.scheme == 'file':
				129	ender = os.path.sep
				130	else:
				131	ender = '/'
				132	if not index_url.endswith(ender):
				133	index_url += ender
Tarek Ziade	1231a4e	2011-05-19 13:07:25 +0200	[diff] [blame]	134	# if no mirrors are defined, use the method described in PEP 381.
				135	if mirrors is None:
				136	mirrors = get_mirrors(mirrors_url)
				137	self._mirrors = set(mirrors)
				138	self._mirrors_used = set()
				139	self.index_url = index_url
				140	self._mirrors_max_tries = mirrors_max_tries
				141	self._mirrors_tries = 0
				142	self._timeout = timeout
				143
				144	# create a regexp to match all given hosts
				145	self._allowed_hosts = re.compile('\|'.join(map(translate, hosts))).match
				146
				147	# we keep an index of pages we have processed, in order to avoid
				148	# scanning them multple time (eg. if there is multiple pages pointing
				149	# on one)
				150	self._processed_urls = []
				151	self._projects = {}
				152
				153	@with_mirror_support()
				154	def search_projects(self, name=None, **kwargs):
				155	"""Search the index for projects containing the given name.
				156
				157	Return a list of names.
				158	"""
				159	with self._open_url(self.index_url) as index:
				160	if '*' in name:
				161	name.replace('', '.')
				162	else:
				163	name = "%s%s%s" % ('.?', name, '.?')
				164	name = name.replace('', '[^<]') # avoid matching end tag
				165	projectname = re.compile('<a[^>]*>(%s)</a>' % name, re.I)
				166	matching_projects = []
				167
				168	index_content = index.read()
				169
				170	# FIXME should use bytes I/O and regexes instead of decoding
				171	index_content = index_content.decode()
				172
				173	for match in projectname.finditer(index_content):
				174	project_name = match.group(1)
				175	matching_projects.append(self._get_project(project_name))
				176	return matching_projects
				177
				178	def get_releases(self, requirements, prefer_final=None,
				179	force_update=False):
				180	"""Search for releases and return a ReleaseList object containing
				181	the results.
				182	"""
				183	predicate = get_version_predicate(requirements)
				184	if predicate.name.lower() in self._projects and not force_update:
				185	return self._projects.get(predicate.name.lower())
				186	prefer_final = self._get_prefer_final(prefer_final)
				187	logger.info('reading info on PyPI about %s', predicate.name)
				188	self._process_index_page(predicate.name)
				189
				190	if predicate.name.lower() not in self._projects:
				191	raise ProjectNotFound()
				192
				193	releases = self._projects.get(predicate.name.lower())
				194	releases.sort_releases(prefer_final=prefer_final)
				195	return releases
				196
				197	def get_release(self, requirements, prefer_final=None):
				198	"""Return only one release that fulfill the given requirements"""
				199	predicate = get_version_predicate(requirements)
				200	release = self.get_releases(predicate, prefer_final)\
				201	.get_last(predicate)
				202	if not release:
				203	raise ReleaseNotFound("No release matches the given criterias")
				204	return release
				205
				206	def get_distributions(self, project_name, version):
				207	"""Return the distributions found on the index for the specific given
				208	release"""
				209	# as the default behavior of get_release is to return a release
				210	# containing the distributions, just alias it.
				211	return self.get_release("%s (%s)" % (project_name, version))
				212
				213	def get_metadata(self, project_name, version):
				214	"""Return the metadatas from the simple index.
				215
				216	Currently, download one archive, extract it and use the PKG-INFO file.
				217	"""
				218	release = self.get_distributions(project_name, version)
				219	if not release.metadata:
				220	location = release.get_distribution().unpack()
				221	pkg_info = os.path.join(location, 'PKG-INFO')
				222	release.metadata = Metadata(pkg_info)
				223	return release
				224
				225	def _switch_to_next_mirror(self):
				226	"""Switch to the next mirror (eg. point self.index_url to the next
				227	mirror url.
				228
				229	Raise a KeyError if all mirrors have been tried.
				230	"""
				231	self._mirrors_used.add(self.index_url)
				232	index_url = self._mirrors.pop()
				233	if not ("http://" or "https://" or "file://") in index_url:
				234	index_url = "http://%s" % index_url
				235
				236	if not index_url.endswith("/simple"):
				237	index_url = "%s/simple/" % index_url
				238
				239	self.index_url = index_url
				240
				241	def _is_browsable(self, url):
				242	"""Tell if the given URL can be browsed or not.
				243
				244	It uses the follow_externals and the hosts list to tell if the given
				245	url is browsable or not.
				246	"""
				247	# if _index_url is contained in the given URL, we are browsing the
				248	# index, and it's always "browsable".
				249	# local files are always considered browable resources
				250	if self.index_url in url or urllib.parse.urlparse(url)[0] == "file":
				251	return True
				252	elif self.follow_externals:
				253	if self._allowed_hosts(urllib.parse.urlparse(url)[1]): # 1 is netloc
				254	return True
				255	else:
				256	return False
				257	return False
				258
				259	def _is_distribution(self, link):
				260	"""Tell if the given URL matches to a distribution name or not.
				261	"""
				262	#XXX find a better way to check that links are distributions
				263	# Using a regexp ?
				264	for ext in EXTENSIONS:
				265	if ext in link:
				266	return True
				267	return False
				268
				269	def _register_release(self, release=None, release_info={}):
				270	"""Register a new release.
				271
				272	Both a release or a dict of release_info can be provided, the prefered
				273	way (eg. the quicker) is the dict one.
				274
				275	Return the list of existing releases for the given project.
				276	"""
				277	# Check if the project already has a list of releases (refering to
				278	# the project name). If not, create a new release list.
				279	# Then, add the release to the list.
				280	if release:
				281	name = release.name
				282	else:
				283	name = release_info['name']
				284	if not name.lower() in self._projects:
				285	self._projects[name.lower()] = ReleasesList(name, index=self._index)
				286
				287	if release:
				288	self._projects[name.lower()].add_release(release=release)
				289	else:
				290	name = release_info.pop('name')
				291	version = release_info.pop('version')
				292	dist_type = release_info.pop('dist_type')
				293	self._projects[name.lower()].add_release(version, dist_type,
				294	**release_info)
				295	return self._projects[name.lower()]
				296
				297	def _process_url(self, url, project_name=None, follow_links=True):
				298	"""Process an url and search for distributions packages.
				299
				300	For each URL found, if it's a download, creates a PyPIdistribution
				301	object. If it's a homepage and we can follow links, process it too.
				302
				303	:param url: the url to process
				304	:param project_name: the project name we are searching for.
				305	:param follow_links: Do not want to follow links more than from one
				306	level. This parameter tells if we want to follow
				307	the links we find (eg. run recursively this
				308	method on it)
				309	"""
				310	with self._open_url(url) as f:
				311	base_url = f.url
				312	if url not in self._processed_urls:
				313	self._processed_urls.append(url)
				314	link_matcher = self._get_link_matcher(url)
				315	for link, is_download in link_matcher(f.read().decode(), base_url):
				316	if link not in self._processed_urls:
				317	if self._is_distribution(link) or is_download:
				318	self._processed_urls.append(link)
				319	# it's a distribution, so create a dist object
				320	try:
				321	infos = get_infos_from_url(link, project_name,
				322	is_external=not self.index_url in url)
				323	except CantParseArchiveName as e:
				324	logger.warning(
				325	"version has not been parsed: %s", e)
				326	else:
				327	self._register_release(release_info=infos)
				328	else:
				329	if self._is_browsable(link) and follow_links:
				330	self._process_url(link, project_name,
				331	follow_links=False)
				332
				333	def _get_link_matcher(self, url):
				334	"""Returns the right link matcher function of the given url
				335	"""
				336	if self.index_url in url:
				337	return self._simple_link_matcher
				338	else:
				339	return self._default_link_matcher
				340
				341	def _get_full_url(self, url, base_url):
				342	return urllib.parse.urljoin(base_url, self._htmldecode(url))
				343
				344	def _simple_link_matcher(self, content, base_url):
				345	"""Yield all links with a rel="download" or rel="homepage".
				346
				347	This matches the simple index requirements for matching links.
				348	If follow_externals is set to False, dont yeld the external
				349	urls.
				350
				351	:param content: the content of the page we want to parse
				352	:param base_url: the url of this page.
				353	"""
				354	for match in HREF.finditer(content):
				355	url = self._get_full_url(match.group(1), base_url)
				356	if MD5_HASH.match(url):
				357	yield (url, True)
				358
				359	for match in REL.finditer(content):
				360	# search for rel links.
				361	tag, rel = match.groups()
				362	rels = [s.strip() for s in rel.lower().split(',')]
				363	if 'homepage' in rels or 'download' in rels:
				364	for match in HREF.finditer(tag):
				365	url = self._get_full_url(match.group(1), base_url)
				366	if 'download' in rels or self._is_browsable(url):
				367	# yield a list of (url, is_download)
				368	yield (url, 'download' in rels)
				369
				370	def _default_link_matcher(self, content, base_url):
				371	"""Yield all links found on the page.
				372	"""
				373	for match in HREF.finditer(content):
				374	url = self._get_full_url(match.group(1), base_url)
				375	if self._is_browsable(url):
				376	yield (url, False)
				377
				378	@with_mirror_support()
				379	def _process_index_page(self, name):
				380	"""Find and process a PyPI page for the given project name.
				381
				382	:param name: the name of the project to find the page
				383	"""
				384	# Browse and index the content of the given PyPI page.
Tarek Ziade	cc243cc	2011-05-21 22:47:40 +0200	[diff] [blame]	385	if self.scheme == 'file':
				386	ender = os.path.sep
				387	else:
				388	ender = '/'
				389	url = self.index_url + name + ender
Tarek Ziade	1231a4e	2011-05-19 13:07:25 +0200	[diff] [blame]	390	self._process_url(url, name)
				391
				392	@socket_timeout()
				393	def _open_url(self, url):
				394	"""Open a urllib2 request, handling HTTP authentication, and local
				395	files support.
				396
				397	"""
				398	scheme, netloc, path, params, query, frag = urllib.parse.urlparse(url)
				399
				400	# authentication stuff
				401	if scheme in ('http', 'https'):
				402	auth, host = urllib.parse.splituser(netloc)
				403	else:
				404	auth = None
				405
				406	# add index.html automatically for filesystem paths
				407	if scheme == 'file':
Tarek Ziade	cc243cc	2011-05-21 22:47:40 +0200	[diff] [blame]	408	if url.endswith(os.path.sep):
Tarek Ziade	1231a4e	2011-05-19 13:07:25 +0200	[diff] [blame]	409	url += "index.html"
				410
				411	# add authorization headers if auth is provided
				412	if auth:
				413	auth = "Basic " + \
				414	urllib.parse.unquote(auth).encode('base64').strip()
				415	new_url = urllib.parse.urlunparse((
				416	scheme, host, path, params, query, frag))
				417	request = urllib.request.Request(new_url)
				418	request.add_header("Authorization", auth)
				419	else:
				420	request = urllib.request.Request(url)
				421	request.add_header('User-Agent', USER_AGENT)
				422	try:
				423	fp = urllib.request.urlopen(request)
				424	except (ValueError, http.client.InvalidURL) as v:
				425	msg = ' '.join([str(arg) for arg in v.args])
				426	raise PackagingPyPIError('%s %s' % (url, msg))
				427	except urllib.error.HTTPError as v:
				428	return v
				429	except urllib.error.URLError as v:
				430	raise DownloadError("Download error for %s: %s" % (url, v.reason))
				431	except http.client.BadStatusLine as v:
				432	raise DownloadError('%s returned a bad status line. '
				433	'The server might be down, %s' % (url, v.line))
				434	except http.client.HTTPException as v:
				435	raise DownloadError("Download error for %s: %s" % (url, v))
				436	except socket.timeout:
				437	raise DownloadError("The server timeouted")
				438
				439	if auth:
				440	# Put authentication info back into request URL if same host,
				441	# so that links found on the page will work
				442	s2, h2, path2, param2, query2, frag2 = \
				443	urllib.parse.urlparse(fp.url)
				444	if s2 == scheme and h2 == host:
				445	fp.url = urllib.parse.urlunparse(
				446	(s2, netloc, path2, param2, query2, frag2))
				447	return fp
				448
				449	def _decode_entity(self, match):
				450	what = match.group(1)
				451	if what.startswith('#x'):
				452	what = int(what[2:], 16)
				453	elif what.startswith('#'):
				454	what = int(what[1:])
				455	else:
				456	from html.entities import name2codepoint
				457	what = name2codepoint.get(what, match.group(0))
				458	return chr(what)
				459
				460	def _htmldecode(self, text):
				461	"""Decode HTML entities in the given text."""
				462	return ENTITY_SUB(self._decode_entity, text)