"""Spider using the screen-scraping "simple" PyPI API.

This module contains the class Crawler, a simple spider that
can be used to find and retrieve distributions from a project index
(like the Python Package Index), using its so-called simple API (see
reference implementation available at http://pypi.python.org/simple/).
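
A short usage sketch (``FooBar`` is only a placeholder project name; a
reachable index and the rest of the packaging.pypi machinery are assumed)::

    crawler = Crawler()   # uses DEFAULT_SIMPLE_INDEX_URL
    releases = crawler.get_releases('FooBar (>=1.0)')
    release = crawler.get_release('FooBar')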
7"""
8
9import http.client
10import re
11import socket
12import sys
13import urllib.request
14import urllib.parse
15import urllib.error
16import os

from fnmatch import translate
from functools import wraps
from packaging import logger
from packaging.metadata import Metadata
from packaging.version import get_version_predicate
from packaging import __version__ as packaging_version
from packaging.pypi.base import BaseClient
from packaging.pypi.dist import (ReleasesList, EXTENSIONS,
                                 get_infos_from_url, MD5_HASH)
from packaging.pypi.errors import (PackagingPyPIError, DownloadError,
                                   UnableToDownload, CantParseArchiveName,
                                   ReleaseNotFound, ProjectNotFound)
from packaging.pypi.mirrors import get_mirrors

__all__ = ['Crawler', 'DEFAULT_SIMPLE_INDEX_URL']

# -- Constants -----------------------------------------------
DEFAULT_SIMPLE_INDEX_URL = "http://a.pypi.python.org/simple/"
DEFAULT_HOSTS = ("*",)
SOCKET_TIMEOUT = 15
USER_AGENT = "Python-urllib/%s packaging/%s" % (
    sys.version[:3], packaging_version)

# -- Regexps -------------------------------------------------
EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$')
HREF = re.compile("""href\\s*=\\s*['"]?([^'"> ]+)""", re.I)
URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match

# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
ENTITY_SUB = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)


def socket_timeout(timeout=SOCKET_TIMEOUT):
    """Decorator to add a socket timeout when requesting pages on PyPI.
    """
    def wrapper(func):
        @wraps(func)
        def wrapped(self, *args, **kwargs):
            old_timeout = socket.getdefaulttimeout()
            if hasattr(self, "_timeout"):
                timeout = self._timeout
            socket.setdefaulttimeout(timeout)
            try:
                return func(self, *args, **kwargs)
            finally:
                socket.setdefaulttimeout(old_timeout)
        return wrapped
    return wrapper


def with_mirror_support():
    """Decorator that makes the mirroring support easier"""
    def wrapper(func):
        @wraps(func)
        def wrapped(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except DownloadError:
                # if an error occurs, try with the next index_url
                if self._mirrors_tries >= self._mirrors_max_tries:
                    try:
                        self._switch_to_next_mirror()
                    except KeyError:
                        raise UnableToDownload("Tried all mirrors")
                else:
                    self._mirrors_tries += 1
                self._projects.clear()
                return wrapped(self, *args, **kwargs)
        return wrapped
    return wrapper


class Crawler(BaseClient):
94 """Provides useful tools to request the Python Package Index simple API.
95
96 You can specify both mirrors and mirrors_url, but mirrors_url will only be
97 used if mirrors is set to None.
98
99 :param index_url: the url of the simple index to search on.
100 :param prefer_final: if the version is not mentioned, and the last
101 version is not a "final" one (alpha, beta, etc.),
102 pick up the last final version.
103 :param prefer_source: if the distribution type is not mentioned, pick up
104 the source one if available.
105 :param follow_externals: tell if following external links is needed or
106 not. Default is False.
107 :param hosts: a list of hosts allowed to be processed while using
108 follow_externals=True. Default behavior is to follow all
109 hosts.
110 :param follow_externals: tell if following external links is needed or
111 not. Default is False.
112 :param mirrors_url: the url to look on for DNS records giving mirror
Éric Araujo348c5722011-06-19 18:53:31 +0200113 addresses.
Tarek Ziade1231a4e2011-05-19 13:07:25 +0200114 :param mirrors: a list of mirrors (see PEP 381).
115 :param timeout: time in seconds to consider a url has timeouted.
116 :param mirrors_max_tries": number of times to try requesting informations
117 on mirrors before switching.
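
    A construction sketch (all argument values below are placeholders)::

        crawler = Crawler(index_url=DEFAULT_SIMPLE_INDEX_URL,
                          follow_externals=True,
                          hosts=('*.python.org',),
                          timeout=10)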
118 """

    def __init__(self, index_url=DEFAULT_SIMPLE_INDEX_URL, prefer_final=False,
                 prefer_source=True, hosts=DEFAULT_HOSTS,
                 follow_externals=False, mirrors_url=None, mirrors=None,
                 timeout=SOCKET_TIMEOUT, mirrors_max_tries=0, verbose=False):
        super(Crawler, self).__init__(prefer_final, prefer_source)
        self.follow_externals = follow_externals
        self.verbose = verbose

        # mirroring attributes.
        parsed = urllib.parse.urlparse(index_url)
        self.scheme = parsed[0]
        if self.scheme == 'file':
            ender = os.path.sep
        else:
            ender = '/'
        if not index_url.endswith(ender):
            index_url += ender
        # if no mirrors are defined, use the method described in PEP 381.
        if mirrors is None:
            mirrors = get_mirrors(mirrors_url)
        self._mirrors = set(mirrors)
        self._mirrors_used = set()
        self.index_url = index_url
        self._mirrors_max_tries = mirrors_max_tries
        self._mirrors_tries = 0
        self._timeout = timeout

        # create a regexp to match all given hosts
        self._allowed_hosts = re.compile('|'.join(map(translate, hosts))).match

        # we keep an index of pages we have processed, in order to avoid
        # scanning them multiple times (e.g. if multiple pages point to the
        # same one)
        self._processed_urls = []
        self._projects = {}

    @with_mirror_support()
    def search_projects(self, name=None, **kwargs):
158 """Search the index for projects containing the given name.
159
160 Return a list of names.
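
        A usage sketch (``crawler`` is an existing Crawler instance; the
        pattern below is only a placeholder)::

            projects = crawler.search_projects('distutils*')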
161 """
        with self._open_url(self.index_url) as index:
            if '*' not in name:
                # no wildcard given: search for names containing the string
                name = "%s%s%s" % ('*.?', name, '*.?')
            name = name.replace('*', '[^<]*')  # avoid matching end tag
            projectname = re.compile('<a[^>]*>(%s)</a>' % name, re.I)
            matching_projects = []

            index_content = index.read()

            # FIXME should use bytes I/O and regexes instead of decoding
            index_content = index_content.decode()

        for match in projectname.finditer(index_content):
            project_name = match.group(1)
            matching_projects.append(self._get_project(project_name))
        return matching_projects

    def get_releases(self, requirements, prefer_final=None,
                     force_update=False):
        """Search for releases and return a ReleasesList object containing
        the results.
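
        A sketch of the expected requirement string (the project name and
        version below are placeholders)::

            releases = crawler.get_releases('FooBar (<=1.3)')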
185 """
        predicate = get_version_predicate(requirements)
        if predicate.name.lower() in self._projects and not force_update:
            return self._projects.get(predicate.name.lower())
        prefer_final = self._get_prefer_final(prefer_final)
        logger.debug('Reading info on PyPI about %s', predicate.name)
        self._process_index_page(predicate.name)

        if predicate.name.lower() not in self._projects:
            raise ProjectNotFound

        releases = self._projects.get(predicate.name.lower())
        releases.sort_releases(prefer_final=prefer_final)
        return releases

    def get_release(self, requirements, prefer_final=None):
        """Return only one release that fulfills the given requirements."""
        predicate = get_version_predicate(requirements)
        release = self.get_releases(predicate, prefer_final)\
                      .get_last(predicate)
        if not release:
            raise ReleaseNotFound("No release matches the given criteria")
        return release

    def get_distributions(self, project_name, version):
        """Return the distributions found on the index for the given
        release."""
        # as the default behavior of get_release is to return a release
        # containing the distributions, just alias it.
        return self.get_release("%s (%s)" % (project_name, version))

    def get_metadata(self, project_name, version):
        """Return the metadata from the simple index.

        Currently, download one archive, extract it and use the PKG-INFO file.
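
        A usage sketch (project name and version are placeholders)::

            release = crawler.get_metadata('FooBar', '1.1')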
220 """
221 release = self.get_distributions(project_name, version)
222 if not release.metadata:
223 location = release.get_distribution().unpack()
224 pkg_info = os.path.join(location, 'PKG-INFO')
225 release.metadata = Metadata(pkg_info)
226 return release
227
    def _switch_to_next_mirror(self):
        """Switch to the next mirror (e.g. point self.index_url to the next
        mirror url).

        Raise a KeyError if all mirrors have been tried.
        """
        self._mirrors_used.add(self.index_url)
        index_url = self._mirrors.pop()
        # XXX use urllib.parse for a real check of missing scheme part
        if not index_url.startswith(("http://", "https://", "file://")):
            index_url = "http://%s" % index_url

        if not index_url.endswith("/simple"):
            index_url = "%s/simple/" % index_url

        self.index_url = index_url

    def _is_browsable(self, url):
        """Tell if the given URL can be browsed or not.

        It uses the follow_externals and the hosts list to tell if the given
        url is browsable or not.
        """
        # if index_url is contained in the given URL, we are browsing the
        # index, and it's always "browsable".
        # local files are always considered browsable resources
        if self.index_url in url or urllib.parse.urlparse(url)[0] == "file":
            return True
        elif self.follow_externals:
            if self._allowed_hosts(urllib.parse.urlparse(url)[1]):  # 1 is netloc
                return True
            else:
                return False
        return False

    def _is_distribution(self, link):
        """Tell if the given URL matches a distribution name or not.
        """
        # XXX find a better way to check that links are distributions
        # (using a regexp?)
        for ext in EXTENSIONS:
            if ext in link:
                return True
        return False

    def _register_release(self, release=None, release_info=None):
        """Register a new release.

        Either a release object or a dict of release_info can be provided;
        the preferred (e.g. the quicker) way is the dict one.

        Return the list of existing releases for the given project.
        """
        # avoid a shared mutable default argument
        if release_info is None:
            release_info = {}
        # Check if the project already has a list of releases (referring to
        # the project name). If not, create a new release list.
        # Then, add the release to the list.
        if release:
            name = release.name
        else:
            name = release_info['name']
        if name.lower() not in self._projects:
            self._projects[name.lower()] = ReleasesList(name, index=self._index)

        if release:
            self._projects[name.lower()].add_release(release=release)
        else:
            name = release_info.pop('name')
            version = release_info.pop('version')
            dist_type = release_info.pop('dist_type')
            self._projects[name.lower()].add_release(version, dist_type,
                                                     **release_info)
        return self._projects[name.lower()]

    def _process_url(self, url, project_name=None, follow_links=True):
        """Process a URL and search for distribution packages.

        For each link found, if it points to a download, create a PyPI
        distribution object. If it's a homepage and we can follow links,
        process it too.

        :param url: the url to process
        :param project_name: the project name we are searching for.
        :param follow_links: tell if we want to follow the links we find
                             (e.g. run this method recursively on them);
                             links are never followed more than one level
                             deep.
        """
        with self._open_url(url) as f:
            base_url = f.url
            if url not in self._processed_urls:
                self._processed_urls.append(url)
                link_matcher = self._get_link_matcher(url)
                for link, is_download in link_matcher(f.read().decode(),
                                                      base_url):
                    if link not in self._processed_urls:
                        if self._is_distribution(link) or is_download:
                            self._processed_urls.append(link)
                            # it's a distribution, so create a dist object
                            try:
                                infos = get_infos_from_url(link, project_name,
                                    is_external=self.index_url not in url)
                            except CantParseArchiveName as e:
                                if self.verbose:
                                    logger.warning(
                                        "version has not been parsed: %s", e)
                            else:
                                self._register_release(release_info=infos)
                        else:
                            if self._is_browsable(link) and follow_links:
                                self._process_url(link, project_name,
                                    follow_links=False)

    def _get_link_matcher(self, url):
        """Return the right link matcher function for the given url.
        """
        if self.index_url in url:
            return self._simple_link_matcher
        else:
            return self._default_link_matcher

    def _get_full_url(self, url, base_url):
        return urllib.parse.urljoin(base_url, self._htmldecode(url))

    def _simple_link_matcher(self, content, base_url):
        """Yield all links with a rel="download" or rel="homepage".

        This matches the simple index requirements for matching links.
        If follow_externals is set to False, don't yield the external
        URLs.

        :param content: the content of the page we want to parse
        :param base_url: the url of this page.
        """
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if MD5_HASH.match(url):
                yield (url, True)

        for match in REL.finditer(content):
            # search for rel links.
            tag, rel = match.groups()
            rels = [s.strip() for s in rel.lower().split(',')]
            if 'homepage' in rels or 'download' in rels:
                for match in HREF.finditer(tag):
                    url = self._get_full_url(match.group(1), base_url)
                    if 'download' in rels or self._is_browsable(url):
                        # yield a tuple (url, is_download)
                        yield (url, 'download' in rels)

    def _default_link_matcher(self, content, base_url):
        """Yield all links found on the page.
        """
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if self._is_browsable(url):
                yield (url, False)

    @with_mirror_support()
    def _process_index_page(self, name):
        """Find and process a PyPI page for the given project name.

        :param name: the name of the project whose page we want to find
        """
        # Browse and index the content of the given PyPI page.
        if self.scheme == 'file':
            ender = os.path.sep
        else:
            ender = '/'
        url = self.index_url + name + ender
        self._process_url(url, name)

    @socket_timeout()
    def _open_url(self, url):
        """Open a urllib request, handling HTTP authentication and local
        file support.
        """
        scheme, netloc, path, params, query, frag = urllib.parse.urlparse(url)

        # authentication stuff
        if scheme in ('http', 'https'):
            auth, host = urllib.parse.splituser(netloc)
        else:
            auth = None

        # add index.html automatically for filesystem paths
        if scheme == 'file':
            if url.endswith(os.path.sep):
                url += "index.html"

        # add authorization headers if auth is provided
        if auth:
            # build a Basic auth header from the user:password part of the
            # URL (base64-encoded credentials)
            auth = "Basic " + base64.standard_b64encode(
                urllib.parse.unquote(auth).encode()).decode().strip()
            new_url = urllib.parse.urlunparse((
                scheme, host, path, params, query, frag))
            request = urllib.request.Request(new_url)
            request.add_header("Authorization", auth)
        else:
            request = urllib.request.Request(url)
        request.add_header('User-Agent', USER_AGENT)
        try:
            fp = urllib.request.urlopen(request)
        except (ValueError, http.client.InvalidURL) as v:
            msg = ' '.join([str(arg) for arg in v.args])
            raise PackagingPyPIError('%s %s' % (url, msg))
        except urllib.error.HTTPError as v:
            return v
        except urllib.error.URLError as v:
            raise DownloadError("Download error for %s: %s" % (url, v.reason))
        except http.client.BadStatusLine as v:
            raise DownloadError('%s returned a bad status line. '
                                'The server might be down, %s' % (url, v.line))
        except http.client.HTTPException as v:
            raise DownloadError("Download error for %s: %s" % (url, v))
        except socket.timeout:
            raise DownloadError("The server timed out")

        if auth:
            # Put authentication info back into request URL if same host,
            # so that links found on the page will work
            s2, h2, path2, param2, query2, frag2 = \
                urllib.parse.urlparse(fp.url)
            if s2 == scheme and h2 == host:
                fp.url = urllib.parse.urlunparse(
                    (s2, netloc, path2, param2, query2, frag2))
        return fp

    def _decode_entity(self, match):
        what = match.group(1)
        if what.startswith('#x'):
            what = int(what[2:], 16)
        elif what.startswith('#'):
            what = int(what[1:])
        else:
            from html.entities import name2codepoint
            codepoint = name2codepoint.get(what)
            if codepoint is None:
                # unknown named entity: leave the original text untouched
                return match.group(0)
            what = codepoint
        return chr(what)

    def _htmldecode(self, text):
        """Decode HTML entities in the given text."""
        return ENTITY_SUB(self._decode_entity, text)