"""Spider using the screen-scraping "simple" PyPI API.

This module contains the class Crawler, a simple spider that
can be used to find and retrieve distributions from a project index
(like the Python Package Index), using its so-called simple API (see
reference implementation available at http://pypi.python.org/simple/).
"""

import base64
import http.client
import re
import socket
import sys
import urllib.request
import urllib.parse
import urllib.error
import os


from fnmatch import translate
from packaging import logger
from packaging.metadata import Metadata
from packaging.version import get_version_predicate
from packaging import __version__ as packaging_version
from packaging.pypi.base import BaseClient
from packaging.pypi.dist import (ReleasesList, EXTENSIONS,
                                 get_infos_from_url, MD5_HASH)
from packaging.pypi.errors import (PackagingPyPIError, DownloadError,
                                   UnableToDownload, CantParseArchiveName,
                                   ReleaseNotFound, ProjectNotFound)
from packaging.pypi.mirrors import get_mirrors

__all__ = ['Crawler', 'DEFAULT_SIMPLE_INDEX_URL']

# -- Constants -----------------------------------------------
DEFAULT_SIMPLE_INDEX_URL = "http://a.pypi.python.org/simple/"
DEFAULT_HOSTS = ("*",)
SOCKET_TIMEOUT = 15
USER_AGENT = "Python-urllib/%s packaging/%s" % (
    sys.version[:3], packaging_version)

# -- Regexps -------------------------------------------------
EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$')
HREF = re.compile("""href\\s*=\\s*['"]?([^'"> ]+)""", re.I)
URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match

# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
ENTITY_SUB = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
REL = re.compile(r"""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)


def socket_timeout(timeout=SOCKET_TIMEOUT):
    """Decorator to add a socket timeout when requesting pages on PyPI."""
    def _socket_timeout(func):
        def _socket_timeout(self, *args, **kwargs):
            old_timeout = socket.getdefaulttimeout()
            # prefer the instance-level timeout if the object defines one
            new_timeout = getattr(self, "_timeout", timeout)
            socket.setdefaulttimeout(new_timeout)
            try:
                return func(self, *args, **kwargs)
            finally:
                socket.setdefaulttimeout(old_timeout)
        return _socket_timeout
    return _socket_timeout
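# Illustrative sketch: socket_timeout can wrap any method of an object that
# may define a ``_timeout`` attribute (the Client class below is made up):
#
#   class Client:
#       _timeout = 5
#
#       @socket_timeout()
#       def fetch(self, url):
#           return urllib.request.urlopen(url).read()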


def with_mirror_support():
    """Decorator that makes the mirroring support easier."""
    def wrapper(func):
        def wrapped(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except DownloadError:
                # if an error occurs, try with the next index_url
                if self._mirrors_tries >= self._mirrors_max_tries:
                    try:
                        self._switch_to_next_mirror()
                    except KeyError:
                        raise UnableToDownload("Tried all mirrors")
                else:
                    self._mirrors_tries += 1
                self._projects.clear()
                return wrapped(self, *args, **kwargs)
        return wrapped
    return wrapper
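# Illustrative note: with_mirror_support is meant for Crawler methods that
# hit the network; a decorated method is retried against the next mirror
# whenever it raises DownloadError (see search_projects and
# _process_index_page below), relying on the _mirrors* attributes set up in
# Crawler.__init__.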


class Crawler(BaseClient):
    """Provides useful tools to request the Python Package Index simple API.

    You can specify both mirrors and mirrors_url, but mirrors_url will only
    be used if mirrors is set to None.

    :param index_url: the url of the simple index to search on.
    :param prefer_final: if the version is not mentioned, and the last
                         version is not a "final" one (alpha, beta, etc.),
                         pick up the last final version.
    :param prefer_source: if the distribution type is not mentioned, pick up
                          the source one if available.
    :param follow_externals: tell if following external links is needed or
                             not. Default is False.
    :param hosts: a list of hosts allowed to be processed while using
                  follow_externals=True. Default behavior is to follow all
                  hosts.
    :param mirrors_url: the url to look on for DNS records giving mirror
                        addresses.
    :param mirrors: a list of mirrors (see PEP 381).
    :param timeout: time in seconds to consider a url has timed out.
    :param mirrors_max_tries: number of times to try requesting information
                              on mirrors before switching.
    """
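
    # Example (illustrative sketch: the project name and version constraint
    # are made up, and a reachable simple index is assumed):
    #
    #   crawler = Crawler(prefer_final=True)
    #   release = crawler.get_release("FooBar (<=1.2)")
    #   dist = release.get_distribution()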

    def __init__(self, index_url=DEFAULT_SIMPLE_INDEX_URL, prefer_final=False,
                 prefer_source=True, hosts=DEFAULT_HOSTS,
                 follow_externals=False, mirrors_url=None, mirrors=None,
                 timeout=SOCKET_TIMEOUT, mirrors_max_tries=0):
        super(Crawler, self).__init__(prefer_final, prefer_source)
        self.follow_externals = follow_externals

        # mirroring attributes.
        if not index_url.endswith("/"):
            index_url += "/"
        # if no mirrors are defined, use the method described in PEP 381.
        if mirrors is None:
            mirrors = get_mirrors(mirrors_url)
        self._mirrors = set(mirrors)
        self._mirrors_used = set()
        self.index_url = index_url
        self._mirrors_max_tries = mirrors_max_tries
        self._mirrors_tries = 0
        self._timeout = timeout

        # create a regexp to match all given hosts
        self._allowed_hosts = re.compile('|'.join(map(translate, hosts))).match

        # we keep an index of pages we have processed, in order to avoid
        # scanning them multiple times (e.g. if several pages point to the
        # same one)
        self._processed_urls = []
        self._projects = {}

    @with_mirror_support()
    def search_projects(self, name=None, **kwargs):
        """Search the index for projects containing the given name.

        Return the list of matching projects.
        """
        with self._open_url(self.index_url) as index:
            if '*' not in name:
                # no wildcard given: search for names containing the string
                name = "%s%s%s" % ('*.?', name, '*.?')
            name = name.replace('*', '[^<]*')  # avoid matching end tag
            projectname = re.compile('<a[^>]*>(%s)</a>' % name, re.I)
            matching_projects = []

            index_content = index.read()

            # FIXME should use bytes I/O and regexes instead of decoding
            index_content = index_content.decode()

            for match in projectname.finditer(index_content):
                project_name = match.group(1)
                matching_projects.append(self._get_project(project_name))
            return matching_projects
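    # Illustrative sketch (assumes a reachable index; the search term is
    # made up):
    #
    #   projects = crawler.search_projects("foo")
    #   names = [project.name for project in projects]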

    def get_releases(self, requirements, prefer_final=None,
                     force_update=False):
        """Search for releases and return a ReleasesList object containing
        the results.
        """
        predicate = get_version_predicate(requirements)
        if predicate.name.lower() in self._projects and not force_update:
            return self._projects.get(predicate.name.lower())
        prefer_final = self._get_prefer_final(prefer_final)
        logger.info('reading info on PyPI about %s', predicate.name)
        self._process_index_page(predicate.name)

        if predicate.name.lower() not in self._projects:
            raise ProjectNotFound()

        releases = self._projects.get(predicate.name.lower())
        releases.sort_releases(prefer_final=prefer_final)
        return releases

    def get_release(self, requirements, prefer_final=None):
        """Return only one release that fulfills the given requirements."""
        predicate = get_version_predicate(requirements)
        release = self.get_releases(predicate, prefer_final)\
                      .get_last(predicate)
        if not release:
            raise ReleaseNotFound("No release matches the given criteria")
        return release
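    # Illustrative sketch: requirements are passed as version predicate
    # strings of the form "name (constraints)", for instance (with a made-up
    # project name):
    #
    #   release = crawler.get_release("FooBar (>=1.0, <2.0)")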

    def get_distributions(self, project_name, version):
        """Return the distributions found on the index for the given
        release."""
        # as the default behavior of get_release is to return a release
        # containing the distributions, just alias it.
        return self.get_release("%s (%s)" % (project_name, version))

    def get_metadata(self, project_name, version):
        """Return the metadata from the simple index.

        Currently, this downloads one archive, extracts it and uses its
        PKG-INFO file.
        """
        release = self.get_distributions(project_name, version)
        if not release.metadata:
            location = release.get_distribution().unpack()
            pkg_info = os.path.join(location, 'PKG-INFO')
            release.metadata = Metadata(pkg_info)
        return release
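    # Illustrative sketch (this downloads and unpacks one archive, so it
    # needs network and filesystem access; project name and version are
    # made up):
    #
    #   release = crawler.get_metadata("FooBar", "1.1")
    #   summary = release.metadata['Summary']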

    def _switch_to_next_mirror(self):
        """Switch to the next mirror (eg. point self.index_url to the next
        mirror url).

        Raise a KeyError if all mirrors have been tried.
        """
        self._mirrors_used.add(self.index_url)
        index_url = self._mirrors.pop()
        # bare host names coming from the mirror list default to http
        if not index_url.startswith(("http://", "https://", "file://")):
            index_url = "http://%s" % index_url

        if not index_url.endswith("/simple"):
            index_url = "%s/simple/" % index_url

        self.index_url = index_url

    def _is_browsable(self, url):
        """Tell if the given URL can be browsed or not.

        It uses the follow_externals and the hosts list to tell if the given
        url is browsable or not.
        """
        # if _index_url is contained in the given URL, we are browsing the
        # index, and it's always "browsable".
        # local files are always considered browsable resources
        if self.index_url in url or urllib.parse.urlparse(url)[0] == "file":
            return True
        elif self.follow_externals:
            if self._allowed_hosts(urllib.parse.urlparse(url)[1]):  # 1 is netloc
                return True
            else:
                return False
        return False
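    # For instance, with the defaults (follow_externals=False) only URLs
    # under self.index_url and local file:// URLs are browsable; with
    # follow_externals=True, candidate hosts are additionally filtered
    # through the fnmatch-style patterns given in ``hosts``.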

    def _is_distribution(self, link):
        """Tell if the given URL matches a distribution name or not."""
        # XXX find a better way to check that links are distributions
        # (using a regexp?)
        for ext in EXTENSIONS:
            if ext in link:
                return True
        return False

    def _register_release(self, release=None, release_info=None):
        """Register a new release.

        Either a release object or a dict of release_info can be provided;
        the preferred (i.e. quicker) way is the dict.

        Return the list of existing releases for the given project.
        """
        # use a fresh dict rather than a shared mutable default argument
        if release_info is None:
            release_info = {}
        # Check if the project already has a list of releases (referring to
        # the project name). If not, create a new release list.
        # Then, add the release to the list.
        if release:
            name = release.name
        else:
            name = release_info['name']
        if name.lower() not in self._projects:
            self._projects[name.lower()] = ReleasesList(name, index=self._index)

        if release:
            self._projects[name.lower()].add_release(release=release)
        else:
            name = release_info.pop('name')
            version = release_info.pop('version')
            dist_type = release_info.pop('dist_type')
            self._projects[name.lower()].add_release(version, dist_type,
                                                     **release_info)
        return self._projects[name.lower()]

    def _process_url(self, url, project_name=None, follow_links=True):
        """Process a url and search for distribution packages.

        For each URL found, if it's a download, create a distribution
        object. If it's a homepage and we can follow links, process it too.

        :param url: the url to process
        :param project_name: the project name we are searching for.
        :param follow_links: we do not want to follow links more than one
                             level deep. This parameter tells if we want to
                             follow the links we find (eg. run this method
                             recursively on them).
        """
        with self._open_url(url) as f:
            base_url = f.url
            if url not in self._processed_urls:
                self._processed_urls.append(url)
                link_matcher = self._get_link_matcher(url)
                for link, is_download in link_matcher(f.read().decode(), base_url):
                    if link not in self._processed_urls:
                        if self._is_distribution(link) or is_download:
                            self._processed_urls.append(link)
                            # it's a distribution, so create a dist object
                            try:
                                infos = get_infos_from_url(link, project_name,
                                    is_external=self.index_url not in url)
                            except CantParseArchiveName as e:
                                logger.warning(
                                    "version has not been parsed: %s", e)
                            else:
                                self._register_release(release_info=infos)
                        else:
                            if self._is_browsable(link) and follow_links:
                                self._process_url(link, project_name,
                                                  follow_links=False)

    def _get_link_matcher(self, url):
        """Return the right link matcher function for the given url."""
        if self.index_url in url:
            return self._simple_link_matcher
        else:
            return self._default_link_matcher

    def _get_full_url(self, url, base_url):
        return urllib.parse.urljoin(base_url, self._htmldecode(url))

    def _simple_link_matcher(self, content, base_url):
        """Yield all links with a rel="download" or rel="homepage".

        This matches the simple index requirements for matching links.
        If follow_externals is set to False, don't yield the external
        urls.

        :param content: the content of the page we want to parse
        :param base_url: the url of this page.
        """
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if MD5_HASH.match(url):
                yield (url, True)

        for match in REL.finditer(content):
            # search for rel links.
            tag, rel = match.groups()
            rels = [s.strip() for s in rel.lower().split(',')]
            if 'homepage' in rels or 'download' in rels:
                for match in HREF.finditer(tag):
                    url = self._get_full_url(match.group(1), base_url)
                    if 'download' in rels or self._is_browsable(url):
                        # yield a list of (url, is_download)
                        yield (url, 'download' in rels)

    def _default_link_matcher(self, content, base_url):
        """Yield all links found on the page."""
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if self._is_browsable(url):
                yield (url, False)

    @with_mirror_support()
    def _process_index_page(self, name):
        """Find and process a PyPI page for the given project name.

        :param name: the name of the project to find the page for.
        """
        # Browse and index the content of the given PyPI page.
        url = self.index_url + name + "/"
        self._process_url(url, name)

    @socket_timeout()
    def _open_url(self, url):
        """Open a urllib request, handling HTTP authentication and local
        files support.
        """
        scheme, netloc, path, params, query, frag = urllib.parse.urlparse(url)

        # authentication stuff
        if scheme in ('http', 'https'):
            auth, host = urllib.parse.splituser(netloc)
        else:
            auth = None

        # add index.html automatically for filesystem paths
        if scheme == 'file':
            if url.endswith('/'):
                url += "index.html"

        # add authorization headers if auth is provided
        if auth:
            auth = "Basic " + base64.standard_b64encode(
                urllib.parse.unquote(auth).encode()).decode()
            new_url = urllib.parse.urlunparse((
                scheme, host, path, params, query, frag))
            request = urllib.request.Request(new_url)
            request.add_header("Authorization", auth)
        else:
            request = urllib.request.Request(url)
        request.add_header('User-Agent', USER_AGENT)
        try:
            fp = urllib.request.urlopen(request)
        except (ValueError, http.client.InvalidURL) as v:
            msg = ' '.join([str(arg) for arg in v.args])
            raise PackagingPyPIError('%s %s' % (url, msg))
        except urllib.error.HTTPError as v:
            return v
        except urllib.error.URLError as v:
            raise DownloadError("Download error for %s: %s" % (url, v.reason))
        except http.client.BadStatusLine as v:
            raise DownloadError('%s returned a bad status line. '
                                'The server might be down, %s' % (url, v.line))
        except http.client.HTTPException as v:
            raise DownloadError("Download error for %s: %s" % (url, v))
        except socket.timeout:
            raise DownloadError("The server timed out")

        if auth:
            # Put authentication info back into request URL if same host,
            # so that links found on the page will work
            s2, h2, path2, param2, query2, frag2 = \
                urllib.parse.urlparse(fp.url)
            if s2 == scheme and h2 == host:
                fp.url = urllib.parse.urlunparse(
                    (s2, netloc, path2, param2, query2, frag2))
        return fp

    def _decode_entity(self, match):
        what = match.group(1)
        if what.startswith('#x'):
            what = int(what[2:], 16)
        elif what.startswith('#'):
            what = int(what[1:])
        else:
            from html.entities import name2codepoint
            if what not in name2codepoint:
                # unknown named entity: keep the original text
                return match.group(0)
            what = name2codepoint[what]
        return chr(what)

    def _htmldecode(self, text):
        """Decode HTML entities in the given text."""
        return ENTITY_SUB(self._decode_entity, text)
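    # Illustrative sketch of the entity decoding helpers:
    #
    #   crawler._htmldecode("foo&amp;bar &#65;")   # -> "foo&bar A"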