blob: bb67267639bf14059cde2dbcbc7945f5ab67cc8a [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib. pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back. One difference is that you can also pass
20a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers. Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
33OpenerDirector --
34
35Request -- An object that encapsulates the state of a request. The
36state can be as simple as the URL. It can also include extra HTTP
37headers, e.g. a User-Agent.
38
39BaseHandler --
40
41internals:
42BaseHandler and parent
43_call_chain conventions
44
45Example usage:
46
Georg Brandl029986a2008-06-23 11:44:14 +000047import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000048
49# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000050authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000051authinfo.add_password(realm='PDQ Application',
52 uri='https://mahler:8092/site-updates.py',
53 user='klem',
54 passwd='geheim$parole')
55
Georg Brandl029986a2008-06-23 11:44:14 +000056proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000057
58# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000059opener = urllib.request.build_opener(proxy_support, authinfo,
60 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000061
62# install it
Georg Brandl029986a2008-06-23 11:44:14 +000063urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000064
Georg Brandl029986a2008-06-23 11:44:14 +000065f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066"""
67
68# XXX issues:
69# If an authentication error handler that tries to perform
70# authentication for some reason but fails, how should the error be
71# signalled? The client needs to know the HTTP error code. But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algorithm requested in the challenge, it would be good to
74# pass that information along to the client, too.
75# ftp errors aren't handled cleanly
76# check digest against correct (i.e. non-apache) implementation
77
78# Possible extensions:
79# complex proxies XXX not sure what exactly was meant by this
80# abstract factory for opener
81
82import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000083import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000084import email
85import hashlib
86import http.client
87import io
88import os
89import posixpath
90import random
91import re
92import socket
93import sys
94import time
Jeremy Hylton1afc1692008-06-18 20:49:58 +000095
Georg Brandl13e89462008-07-01 19:56:00 +000096from urllib.error import URLError, HTTPError, ContentTooShortError
97from urllib.parse import (
98 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
99 splittype, splithost, splitport, splituser, splitpasswd,
Facundo Batistaf24802c2008-08-17 03:36:03 +0000100 splitattr, splitquery, splitvalue, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000101from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000102
103# check for SSL
104try:
105 import ssl
106except:
107 _have_ssl = False
108else:
109 _have_ssl = True
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000110
111# used in User-Agent header sent
112__version__ = sys.version[:3]
113
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Open *url* (a URL string or a Request object) and return a
    file-like response, using the module-wide opener.

    The default opener is built lazily on first use; *data*, when not
    None, is POSTed.
    """
    global _opener
    opener = _opener
    if opener is None:
        _opener = opener = build_opener()
    return opener.open(url, data, timeout)
120
def install_opener(opener):
    """Install *opener* as the module-wide opener used by urlopen()."""
    global _opener
    _opener = opener
124
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* into a local file using the shared FancyURLopener,
    creating the opener on first use."""
    global _urlopener
    opener = _urlopener
    if not opener:
        _urlopener = opener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
132
def urlcleanup():
    """Discard cached state held by the module-level openers."""
    global _opener
    if _urlopener:
        _urlopener.cleanup()
    if _opener:
        _opener = None
139
# copied from cookielib.py
# Matches a trailing ":port" on an authority; re.ASCII keeps \d from
# matching non-ASCII digits.
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.
    """
    host = urlparse(request.full_url)[1]
    if not host:
        host = request.get_header("Host", "")
    # strip any trailing ":port" from the authority
    host = _cut_port_re.sub("", host, 1)
    return host.lower()
157
class Request:
    """Encapsulate the state of one URL request.

    The state can be as simple as the URL; it may also include POST data
    and extra HTTP headers (e.g. a User-Agent).

    Public attributes:
      full_url          -- the unwrapped request URL
      type, host, selector -- scheme, authority and path parsed from it
      data              -- body to POST, or None for a GET
      headers           -- normal headers (keys stored capitalized)
      unredirected_hdrs -- headers dropped when the request is redirected
      origin_req_host   -- request-host of the originating transaction
      unverifiable      -- RFC 2965 "unverifiable" flag
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False):
        # 'headers' historically defaulted to a shared mutable {}; default
        # to None instead to avoid the mutable-default-argument pitfall.
        # Passing a mapping (or nothing) behaves exactly as before.
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.data = data
        self.headers = {}
        if headers:
            for key, value in headers.items():
                self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        """Split full_url into type (scheme), host and selector.

        Raises ValueError if the URL carries no scheme.
        """
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return the HTTP method: POST when data is present, else GET."""
        if self.data is not None:
            return "POST"
        else:
            return "GET"

    # Begin deprecated methods (thin accessors kept for backward
    # compatibility; prefer the attributes they wrap).

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route the request via a proxy: when talking to a proxy the
        selector must be the full URL, not just the path."""
        self.host, self.type = host, type
        self.selector = self.full_url

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Return a header's value, falling back to unredirected headers."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return all headers as (name, value) pairs; normal headers
        shadow unredirected ones of the same name."""
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
248
class OpenerDirector:
    """Manage a chain of handlers and use them to open URLs.

    Handlers are registered with add_handler(); open() then routes each
    request through the registered request pre-processors, protocol
    openers and response post-processors, in handler_order order.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        # Headers added to every request (handlers may add more).
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []          # every registered handler, sorted
        self.handle_open = {}       # scheme -> [handlers with <scheme>_open]
        self.handle_error = {}      # scheme -> {code -> [error handlers]}
        self.process_response = {}  # scheme -> [response post-processors]
        self.process_request = {}   # scheme -> [request pre-processors]

    def add_handler(self, handler):
        """Register *handler*, indexing its methods by name convention.

        Recognized method names: '<proto>_open', '<proto>_request',
        '<proto>_response' and '<proto>_error_<code>'.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # Split "<protocol>_<condition...>" at the first underscore.
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # "<proto>_error_<kind>": kind is an int status code when
                # possible, otherwise the raw suffix string.
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            # Keep each per-kind list sorted by handler_order (__lt__).
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in a specific order, the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could. Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or Request); POST *data* if given."""
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # Try default_open first, then the protocol-specific opener,
        # then unknown_open as the last resort.
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered '<proto>_error' handlers."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        # Nothing handled the specific HTTP code: fall back to the
        # catch-all http_error_default handlers.
        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
387
388# XXX probably also want an abstract factory that knows when it makes
389# sense to skip a superclass in favor of a subclass and when it might
390# make sense to include both
391
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        # True for types and anything class-like (has __bases__).
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # A default class is dropped when the caller supplied a subclass of
    # it, or an instance of (a subclass of) it.
    skip = {klass for klass in default_classes
            for check in handlers
            if (issubclass(check, klass) if isclass(check)
                else isinstance(check, klass))}

    for klass in default_classes:
        if klass not in skip:
            opener.add_handler(klass())

    for h in handlers:
        opener.add_handler(h() if isclass(h) else h)
    return opener
429
class BaseHandler:
    """Base class for protocol handlers registered with OpenerDirector."""

    # Handlers are sorted by this value when chained; lower runs earlier.
    handler_order = 500

    def add_parent(self, parent):
        # Remember the OpenerDirector that owns this handler.
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        # Objects without a handler_order sort after us: preserves the old
        # behavior of placing order-unaware custom classes after defaults.
        try:
            other_order = other.handler_order
        except AttributeError:
            return True
        return self.handler_order < other_order
447
448
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Hand any non-2xx response to the parent's error machinery."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, only a "2xx" code indicates that the
        # client's request was successfully received, understood, and
        # accepted.
        if code < 200 or code >= 300:
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response
465
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: turn any unhandled HTTP error response into
    an HTTPError exception."""
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000469
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 redirects, with loop detection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        # Only redirect automatically for the method/code combinations
        # below; anything else becomes an HTTPError.
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # Body-describing headers no longer apply to the redirected GET.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL (empty path -> "/")
        urlparts = urlparse(newurl)
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        # Resolve a relative Location against the original URL.
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
562
563
564def _parse_proxy(proxy):
565 """Return (scheme, user, password, host/port) given a URL or an authority.
566
567 If a URL is supplied, it must have an authority (host:port) component.
568 According to RFC 3986, having an authority component means the URL must
569 have two slashes after the scheme:
570
571 >>> _parse_proxy('file:/ftp.example.com/')
572 Traceback (most recent call last):
573 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
574
575 The first three items of the returned tuple may be None.
576
577 Examples of authority parsing:
578
579 >>> _parse_proxy('proxy.example.com')
580 (None, None, None, 'proxy.example.com')
581 >>> _parse_proxy('proxy.example.com:3128')
582 (None, None, None, 'proxy.example.com:3128')
583
584 The authority component may optionally include userinfo (assumed to be
585 username:password):
586
587 >>> _parse_proxy('joe:password@proxy.example.com')
588 (None, 'joe', 'password', 'proxy.example.com')
589 >>> _parse_proxy('joe:password@proxy.example.com:3128')
590 (None, 'joe', 'password', 'proxy.example.com:3128')
591
592 Same examples, but with URLs instead:
593
594 >>> _parse_proxy('http://proxy.example.com/')
595 ('http', None, None, 'proxy.example.com')
596 >>> _parse_proxy('http://proxy.example.com:3128/')
597 ('http', None, None, 'proxy.example.com:3128')
598 >>> _parse_proxy('http://joe:password@proxy.example.com/')
599 ('http', 'joe', 'password', 'proxy.example.com')
600 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
601 ('http', 'joe', 'password', 'proxy.example.com:3128')
602
603 Everything after the authority is ignored:
604
605 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
606 ('ftp', 'joe', 'password', 'proxy.example.com')
607
608 Test for no trailing '/' case:
609
610 >>> _parse_proxy('http://joe:password@proxy.example.com')
611 ('http', 'joe', 'password', 'proxy.example.com')
612
613 """
Georg Brandl13e89462008-07-01 19:56:00 +0000614 scheme, r_scheme = splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000615 if not r_scheme.startswith("/"):
616 # authority
617 scheme = None
618 authority = proxy
619 else:
620 # URL
621 if not r_scheme.startswith("//"):
622 raise ValueError("proxy URL with no authority: %r" % proxy)
623 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
624 # and 3.3.), path is empty or starts with '/'
625 end = r_scheme.find("/", 2)
626 if end == -1:
627 end = None
628 authority = r_scheme[2:end]
Georg Brandl13e89462008-07-01 19:56:00 +0000629 userinfo, hostport = splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000630 if userinfo is not None:
Georg Brandl13e89462008-07-01 19:56:00 +0000631 user, password = splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000632 else:
633 user = password = None
634 return scheme, user, password, hostport
635
class ProxyHandler(BaseHandler):
    """Route requests through the proxies in a {scheme: proxy-url} mapping
    (defaulting to the proxies reported by getproxies())."""
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Install a "<scheme>_open" method for each configured scheme.
        # The lambda binds proxy/type/meth as default arguments so each
        # method captures the loop's current values (avoids the classic
        # late-binding-closure pitfall).
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Rewrite *req* to go via *proxy*; add Proxy-authorization when
        the proxy URL carries userinfo credentials."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req)
673
class HTTPPasswordMgr:
    """Map (realm, URI-prefix) pairs to (user, password) credentials.

    Used by the Basic and Digest auth handlers to look up the credentials
    matching a challenge's realm and the URI being requested.
    """

    def __init__(self):
        # {realm: {tuple-of-reduced-uris: (user, password)}}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for one URI or a sequence of URIs."""
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        # Store the URIs both with and without an explicit default port so
        # that lookups match either spelling (host vs host:80).
        for default_port in True, False:
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uri)
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for realm/authuri, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            # Make the scheme's default port explicit so "host" and
            # "host:80" compare equal.
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False
736
737
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to a None ("default") realm."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first; fall back to the default (None) realm."""
        creds = HTTPPasswordMgr.find_user_password(self, realm, authuri)
        if creds[0] is None:
            creds = HTTPPasswordMgr.find_user_password(self, None, authuri)
        return creds
746
747
class AbstractBasicAuthHandler:
    """Shared machinery for retrying requests with HTTP Basic credentials."""

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        self.passwd = HTTPPasswordMgr() if password_mgr is None else password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        mo = AbstractBasicAuthHandler.rx.search(challenge)
        if not mo:
            return None
        scheme, quote_char, realm = mo.groups()
        if scheme.lower() != 'basic':
            return None
        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Resend *req* with Basic credentials for *realm*, if we have any."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        credentials = "%s:%s" % (user, pw)
        auth = "Basic " + base64.b64encode(credentials.encode()).decode("ascii")
        if req.headers.get(self.auth_header, None) == auth:
            # These exact credentials were already sent and refused.
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
791
792
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Retry requests that drew a 401 with HTTP Basic credentials."""

    # Header used to send credentials back to the origin server.
    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.full_url
        return self.http_error_auth_reqed('www-authenticate',
                                          url, req, headers)
801
802
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Retry requests that drew a 407 with proxy Basic credentials."""

    # Header used to send credentials to the proxy (not the origin server).
    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.host
        return self.http_error_auth_reqed('proxy-authenticate',
                                          authority, req, headers)
815
816
def randombytes(n):
    """Return *n* random bytes (thin wrapper around os.urandom)."""
    return os.urandom(n)
820
821class AbstractDigestAuthHandler:
822 # Digest authentication is specified in RFC 2617.
823
824 # XXX The client does not inspect the Authentication-Info header
825 # in a successful response.
826
827 # XXX It should be possible to test this implementation against
828 # a mock server that just generates a static set of challenges.
829
830 # XXX qop="auth-int" supports is shaky
831
832 def __init__(self, passwd=None):
833 if passwd is None:
834 passwd = HTTPPasswordMgr()
835 self.passwd = passwd
836 self.add_password = self.passwd.add_password
837 self.retried = 0
838 self.nonce_count = 0
839
    def reset_retry_count(self):
        # Give http_error_auth_reqed a fresh retry budget.
        self.retried = 0
842
    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry with Digest credentials, or raise after too many attempts."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            # Only act on Digest challenges; other schemes fall through
            # (returning None) so another handler may try.
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
859
    def retry_http_digest_auth(self, req, auth):
        """Re-issue *req* with an Authorization header built from the
        server's Digest challenge; return the new response, or None."""
        token, challenge = auth.split(' ', 1)
        # Parse 'k1="v1", k2="v2", ...' into a dict of challenge fields.
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # Identical credentials were already sent and refused.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp
871
872 def get_cnonce(self, nonce):
873 # The cnonce-value is an opaque
874 # quoted string value provided by the client and used by both client
875 # and server to avoid chosen plaintext attacks, to provide mutual
876 # authentication, and to provide some message integrity protection.
877 # This isn't a fabulous effort, but it's probably Good Enough.
878 s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
879 b = s.encode("ascii") + randombytes(8)
880 dig = hashlib.sha1(b).hexdigest()
881 return dig[:16]
882
883 def get_authorization(self, req, chal):
884 try:
885 realm = chal['realm']
886 nonce = chal['nonce']
887 qop = chal.get('qop')
888 algorithm = chal.get('algorithm', 'MD5')
889 # mod_digest doesn't send an opaque, even though it isn't
890 # supposed to be optional
891 opaque = chal.get('opaque', None)
892 except KeyError:
893 return None
894
895 H, KD = self.get_algorithm_impls(algorithm)
896 if H is None:
897 return None
898
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000899 user, pw = self.passwd.find_user_password(realm, req.full_url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000900 if user is None:
901 return None
902
903 # XXX not implemented yet
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000904 if req.data is not None:
905 entdig = self.get_entity_digest(req.data, chal)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000906 else:
907 entdig = None
908
909 A1 = "%s:%s:%s" % (user, realm, pw)
910 A2 = "%s:%s" % (req.get_method(),
911 # XXX selector: what about proxies and full urls
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000912 req.selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000913 if qop == 'auth':
914 self.nonce_count += 1
915 ncvalue = '%08x' % self.nonce_count
916 cnonce = self.get_cnonce(nonce)
917 noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
918 respdig = KD(H(A1), noncebit)
919 elif qop is None:
920 respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
921 else:
922 # XXX handle auth-int.
Georg Brandl13e89462008-07-01 19:56:00 +0000923 raise URLError("qop '%s' is not supported." % qop)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000924
925 # XXX should the partial digests be encoded too?
926
927 base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000928 'response="%s"' % (user, realm, nonce, req.selector,
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000929 respdig)
930 if opaque:
931 base += ', opaque="%s"' % opaque
932 if entdig:
933 base += ', digest="%s"' % entdig
934 base += ', algorithm="%s"' % algorithm
935 if qop:
936 base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
937 return base
938
939 def get_algorithm_impls(self, algorithm):
940 # lambdas assume digest modules are imported at the top level
941 if algorithm == 'MD5':
942 H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
943 elif algorithm == 'SHA':
944 H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
945 # XXX MD5-sess
946 KD = lambda s, d: H("%s:%s" % (s, d))
947 return H, KD
948
949 def get_entity_digest(self, data, chal):
950 # XXX not implemented yet
951 return None
952
953
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The netloc component (index 1) of the parsed URL is the host
        # we must authenticate against.
        www_host = urlparse(req.full_url)[1]
        response = self.http_error_auth_reqed('www-authenticate',
                                              www_host, req, headers)
        self.reset_retry_count()
        return response
970
971
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication against proxies (HTTP 407 responses)."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # For proxy authentication the request's own host is the
        # authority to present credentials for.
        authority = req.host
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              authority, req, headers)
        self.reset_retry_count()
        return response
983
class AbstractHTTPHandler(BaseHandler):
    """Shared machinery for HTTPHandler/HTTPSHandler: request fix-up
    (do_request_) and the actual network exchange (do_open)."""

    def __init__(self, debuglevel=0):
        # Debug level is stored for use by concrete handlers.
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in missing headers (Content-type/length for POST, Host,
        and the opener's addheaders) as unredirected headers and return
        the request.  Raises URLError if the request has no host."""
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                # NOTE(review): len(data) is the byte count only when data
                # is a bytes object — confirm callers pass bytes.
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        sel_host = host
        if request.has_proxy():
            # When going through a proxy the selector is an absolute URL;
            # the Host header must name the origin server, not the proxy.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            # Opener-wide headers never override per-request ones.
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout) # will parse host:port
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Normalize header capitalization (Foo-Bar) so duplicates collapse.
        headers = dict((name.title(), val) for name, val in headers.items())
        try:
            h.request(req.get_method(), req.selector, req.data, headers)
            r = h.getresponse() # an HTTPResponse instance
        except socket.error as err:
            # Wrap low-level socket failures in the urllib exception type.
            raise URLError(err)

        r.url = req.full_url
        # Replace the .msg attribute of the HTTPResponse (header object)
        # with the reason string, because urllib clients expect the
        # response to have the reason in .msg.  It would be good to mark
        # this attribute as deprecated and get them to use info() or
        # .headers.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001058
1059
class HTTPHandler(AbstractHTTPHandler):
    """Open http:// URLs through http.client.HTTPConnection."""

    def http_open(self, req):
        # All the real work happens in the shared do_open machinery.
        return self.do_open(http.client.HTTPConnection, req)

    # Request preprocessing is identical for http and https.
    http_request = AbstractHTTPHandler.do_request_
1066
# http.client only defines HTTPSConnection when Python was built with
# SSL support, so the https handler is conditional.
if hasattr(http.client, 'HTTPSConnection'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Open https:// URLs through http.client.HTTPSConnection."""

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_
1074
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies from a CookieJar to outgoing requests and harvest
    cookies from incoming responses."""

    def __init__(self, cookiejar=None):
        # Imported lazily so the module is only loaded when cookie
        # processing is actually used.
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        # Add any stored cookies that match this request's URL.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Record cookies the server set on this response.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1092
class UnknownHandler(BaseHandler):
    """Last-resort handler: any scheme nobody else claimed is an error."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.type)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001097
def parse_keqv_list(l):
    """Parse a list of key=value strings into a dict.

    Keys are assumed not to be duplicated.  Values surrounded by double
    quotes have the quotes stripped (used for digest-auth challenges).
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Guard the quote check: the old v[0] indexing raised IndexError
        # for an empty value such as 'key='.
        if len(v) > 1 and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1107
1108def parse_http_list(s):
1109 """Parse lists as described by RFC 2068 Section 2.
1110
1111 In particular, parse comma-separated lists where the elements of
1112 the list may include quoted-strings. A quoted-string could
1113 contain a comma. A non-quoted string could have quotes in the
1114 middle. Neither commas nor quotes count if they are escaped.
1115 Only double-quotes count, not single-quotes.
1116 """
1117 res = []
1118 part = ''
1119
1120 escape = quote = False
1121 for cur in s:
1122 if escape:
1123 part += cur
1124 escape = False
1125 continue
1126 if quote:
1127 if cur == '\\':
1128 escape = True
1129 continue
1130 elif cur == '"':
1131 quote = False
1132 part += cur
1133 continue
1134
1135 if cur == ',':
1136 res.append(part)
1137 part = ''
1138 continue
1139
1140 if cur == '"':
1141 quote = True
1142
1143 part += cur
1144
1145 # append last part
1146 if part:
1147 res.append(part)
1148
1149 return [part.strip() for part in res]
1150
class FileHandler(BaseHandler):
    """Open file: URLs from the local filesystem, delegating
    file://host/... forms with a non-empty host to the FTP handler."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/':
            # file://host/path with a non-empty host: reinterpret as FTP
            # and re-dispatch through the opener.
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    # Cached lazily in get_names(); shared by all instances via the
    # class attribute.
    names = None
    def get_names(self):
        """Return the tuple of IP addresses that mean 'this machine'."""
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                    socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                # Hostname resolution failed; fall back to localhost only.
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Serve a local file with synthesized HTTP-style headers
        (Content-type, Content-length, Last-modified)."""
        import email.utils
        import mimetypes
        host = req.host
        file = req.selector
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(file)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            # Only serve the file when no host was given, or the host
            # resolves to this machine (and no explicit port was given).
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                return addinfourl(open(localfile, 'rb'), headers, 'file:'+file)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001196
1197def _safe_gethostbyname(host):
1198 try:
1199 return socket.gethostbyname(host)
1200 except socket.gaierror:
1201 return None
1202
class FTPHandler(BaseHandler):
    """Open ftp: URLs via ftplib, honoring user:password@host syntax
    and ;type= path attributes."""

    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        # URL components may be percent-encoded; decode before use.
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        # Split off ;type=... attributes, then decode each path segment.
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            # Leading empty segment from the initial '/'.
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Binary ('I') transfer for files, directory listing ('D')
            # when the path ends in '/'.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            # Re-raise as URLError but keep the original traceback.
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # Overridden by CacheFTPHandler to reuse connections.
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
        return fw
1260
class CacheFTPHandler(FTPHandler):
    """FTP handler that caches ftpwrapper connections for reuse.

    Idle connections are kept for ``delay`` seconds and the cache is
    bounded by ``max_conns`` entries.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}       # key -> ftpwrapper connection
        self.timeout = {}     # key -> absolute expiry time
        self.soonest = 0      # earliest expiry among cached entries
        self.delay = 60       # seconds an idle connection is kept
        self.max_conns = 16   # upper bound on cached connections

    def setTimeout(self, t):
        """Set how long (in seconds) idle connections are kept."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for this endpoint, creating one
        and refreshing its expiry time as needed."""
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce the size bound."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            # The original called min() unconditionally, which raised
            # ValueError once every entry had expired.
            if self.timeout:
                self.soonest = min(self.timeout.values())
            else:
                self.soonest = 0

        # then check the size (>= so an overshoot still triggers a trim)
        if len(self.cache) >= self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # Close before dropping, otherwise the FTP connection
                    # leaks.
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            if self.timeout:
                self.soonest = min(self.timeout.values())
            else:
                self.soonest = 0
1307
1308# Code move from the old urllib module
1309
MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
# NOTE(review): os.name == 'mac' refers to classic Mac OS; presumably a
# legacy branch inherited from the old urllib — confirm it is still
# reachable on supported platforms.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001327
1328# This really consists of two pieces:
1329# (1) a class which handles opening of all sorts of URLs
1330# (plus assorted utilities etc.)
1331# (2) a set of functions for parsing URLs
1332# XXX Should these be separated out into different modules?
1333
1334
# Module-wide FTP connection cache shared by URLopener instances by
# default (see URLopener.__init__).  XXX not thread-safe.
ftpcache = {}
1336class URLopener:
1337 """Class to open URLs.
1338 This is a class rather than just a subroutine because we may need
1339 more than one set of global protocol-specific options.
1340 Note -- this is a base class for those who don't want the
1341 automatic handling of errors type 302 (relocated) and 401
1342 (authorization needed)."""
1343
1344 __tempfiles = None
1345
1346 version = "Python-urllib/%s" % __version__
1347
1348 # Constructor
    def __init__(self, proxies=None, **x509):
        """Create an opener.

        proxies maps scheme -> proxy URL; it defaults to the settings
        returned by getproxies().  x509 may supply key_file/cert_file
        for HTTPS client authentication.
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve(). This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe. Bah.
1371
    def __del__(self):
        # Remove temporary files when the opener is garbage collected.
        self.close()
1374
    def close(self):
        # Public hook to release resources (temp files, temp cache).
        self.cleanup()
1377
    def cleanup(self):
        """Delete tracked temporary files and clear the temp cache."""
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    # Best-effort: the file may already be gone.
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()
1391
    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        # args is stored verbatim as a (name, value) tuple.
        self.addheaders.append(args)
1396
1397 # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(to_bytes(fullurl))
        # Percent-quote unsafe characters while leaving reserved URL
        # syntax intact.
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]")
        if self.tempcache and fullurl in self.tempcache:
            # Serve from the (optional) retrieve() cache.
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            # No scheme given: treat as a local file path.
            urltype = 'file'
        if urltype in self.proxies:
            # Route through the configured proxy; open_*() detects the
            # tuple form of url as the proxied case.
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        # Dispatch to the scheme-specific open_<scheme> method.
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error as msg:
            # Present socket failures as IOError, keeping the traceback.
            raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1431
1432 def open_unknown(self, fullurl, data=None):
1433 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001434 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001435 raise IOError('url error', 'unknown url type', type)
1436
1437 def open_unknown_proxy(self, proxy, fullurl, data=None):
1438 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001439 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001440 raise IOError('url error', 'invalid proxy for %s' % type, proxy)
1441
1442 # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        if filename is None and (not type or type == 'file'):
            # Local file and no target name: no copy needed, hand back
            # the filesystem path directly.
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError as msg:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target name: download into a temp file whose suffix
                # matches the URL path's extension; track it for cleanup().
                import tempfile
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if reporthook:
                    if "content-length" in headers:
                        size = int(headers["Content-Length"])
                    # Initial callback before any data arrives.
                    reporthook(blocknum, bs, size)
                # Copy in bs-sized blocks, reporting progress as we go.
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result
1508
1509 # Each method named open_<type> knows how to open that type of URL
1510
1511 def _open_generic_http(self, connection_factory, url, data):
1512 """Make an HTTP connection using connection_class.
1513
1514 This is an internal method that should be called from
1515 open_http() or open_https().
1516
1517 Arguments:
1518 - connection_factory should take a host name and return an
1519 HTTPConnection instance.
1520 - url is the url to retrieval or a host, relative-path pair.
1521 - data is payload for a POST request or None.
1522 """
1523
1524 user_passwd = None
1525 proxy_passwd= None
1526 if isinstance(url, str):
Georg Brandl13e89462008-07-01 19:56:00 +00001527 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001528 if host:
Georg Brandl13e89462008-07-01 19:56:00 +00001529 user_passwd, host = splituser(host)
1530 host = unquote(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001531 realhost = host
1532 else:
1533 host, selector = url
1534 # check whether the proxy contains authorization information
Georg Brandl13e89462008-07-01 19:56:00 +00001535 proxy_passwd, host = splituser(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001536 # now we proceed with the url we want to obtain
Georg Brandl13e89462008-07-01 19:56:00 +00001537 urltype, rest = splittype(selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001538 url = rest
1539 user_passwd = None
1540 if urltype.lower() != 'http':
1541 realhost = None
1542 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001543 realhost, rest = splithost(rest)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001544 if realhost:
Georg Brandl13e89462008-07-01 19:56:00 +00001545 user_passwd, realhost = splituser(realhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001546 if user_passwd:
1547 selector = "%s://%s%s" % (urltype, realhost, rest)
1548 if proxy_bypass(realhost):
1549 host = realhost
1550
1551 #print "proxy via http:", host, selector
1552 if not host: raise IOError('http error', 'no host given')
1553
1554 if proxy_passwd:
1555 import base64
1556 proxy_auth = base64.b64encode(proxy_passwd).strip()
1557 else:
1558 proxy_auth = None
1559
1560 if user_passwd:
1561 import base64
1562 auth = base64.b64encode(user_passwd).strip()
1563 else:
1564 auth = None
1565 http_conn = connection_factory(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001566 headers = {}
1567 if proxy_auth:
1568 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1569 if auth:
1570 headers["Authorization"] = "Basic %s" % auth
1571 if realhost:
1572 headers["Host"] = realhost
1573 for header, value in self.addheaders:
1574 headers[header] = value
1575
1576 if data is not None:
1577 headers["Content-Type"] = "application/x-www-form-urlencoded"
1578 http_conn.request("POST", selector, data, headers)
1579 else:
1580 http_conn.request("GET", selector, headers=headers)
1581
1582 try:
1583 response = http_conn.getresponse()
1584 except http.client.BadStatusLine:
1585 # something went wrong with the HTTP status line
Georg Brandl13e89462008-07-01 19:56:00 +00001586 raise URLError("http protocol error: bad status line")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001587
1588 # According to RFC 2616, "2xx" code indicates that the client's
1589 # request was successfully received, understood, and accepted.
1590 if 200 <= response.status < 300:
Antoine Pitroub353c122009-02-11 00:39:14 +00001591 return addinfourl(response, response.msg, "http:" + url,
Georg Brandl13e89462008-07-01 19:56:00 +00001592 response.status)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001593 else:
1594 return self.http_error(
1595 url, response.fp,
1596 response.status, response.reason, response.msg, data)
1597
    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        # Thin wrapper binding the plain-HTTP connection class.
        return self._open_generic_http(http.client.HTTPConnection, url, data)
1601
1602 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1603 """Handle http errors.
1604
1605 Derived class can override this, or provide specific handlers
1606 named http_error_DDD where DDD is the 3-digit error code."""
1607 # First check if there's a specific handler for this error
1608 name = 'http_error_%d' % errcode
1609 if hasattr(self, name):
1610 method = getattr(self, name)
1611 if data is None:
1612 result = method(url, fp, errcode, errmsg, headers)
1613 else:
1614 result = method(url, fp, errcode, errmsg, headers, data)
1615 if result: return result
1616 return self.http_error_default(url, fp, errcode, errmsg, headers)
1617
1618 def http_error_default(self, url, fp, errcode, errmsg, headers):
1619 """Default error handler: close the connection and raise IOError."""
1620 void = fp.read()
1621 fp.close()
Georg Brandl13e89462008-07-01 19:56:00 +00001622 raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001623
    # HTTPS support exists only when the ssl module is available.
    if _have_ssl:
        def _https_connection(self, host):
            # Connection factory for _open_generic_http, forwarding the
            # client certificate configuration captured in __init__.
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)
1633
1634 def open_file(self, url):
1635 """Use local file or FTP depending on form of URL."""
1636 if not isinstance(url, str):
1637 raise URLError('file error', 'proxy support for file protocol currently not implemented')
1638 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
1639 return self.open_ftp(url)
1640 else:
1641 return self.open_local_file(url)
1642
    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, email.utils
        from io import StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            # NOTE(review): URLError appears to take (reason, filename);
            # passing three positional args may itself raise TypeError —
            # verify against urllib.error.
            raise URLError(e.errno, e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        # Synthesize HTTP-style headers for the local file.
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = splitport(host)
        # A host is acceptable only without an explicit port and when it
        # resolves to this machine.
        if (not port
            and socket.gethostbyname(host) in (localhost(), thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error', 'not on local host')
1672
1673 def open_ftp(self, url):
1674 """Use FTP protocol."""
1675 if not isinstance(url, str):
1676 raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
1677 import mimetypes
1678 from io import StringIO
Georg Brandl13e89462008-07-01 19:56:00 +00001679 host, path = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001680 if not host: raise URLError('ftp error', 'no host given')
Georg Brandl13e89462008-07-01 19:56:00 +00001681 host, port = splitport(host)
1682 user, host = splituser(host)
1683 if user: user, passwd = splitpasswd(user)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001684 else: passwd = None
Georg Brandl13e89462008-07-01 19:56:00 +00001685 host = unquote(host)
1686 user = unquote(user or '')
1687 passwd = unquote(passwd or '')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001688 host = socket.gethostbyname(host)
1689 if not port:
1690 import ftplib
1691 port = ftplib.FTP_PORT
1692 else:
1693 port = int(port)
Georg Brandl13e89462008-07-01 19:56:00 +00001694 path, attrs = splitattr(path)
1695 path = unquote(path)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001696 dirs = path.split('/')
1697 dirs, file = dirs[:-1], dirs[-1]
1698 if dirs and not dirs[0]: dirs = dirs[1:]
1699 if dirs and not dirs[0]: dirs[0] = '/'
1700 key = user, host, port, '/'.join(dirs)
1701 # XXX thread unsafe!
1702 if len(self.ftpcache) > MAXFTPCACHE:
1703 # Prune the cache, rather arbitrarily
1704 for k in self.ftpcache.keys():
1705 if k != key:
1706 v = self.ftpcache[k]
1707 del self.ftpcache[k]
1708 v.close()
1709 try:
1710 if not key in self.ftpcache:
1711 self.ftpcache[key] = \
1712 ftpwrapper(user, passwd, host, port, dirs)
1713 if not file: type = 'D'
1714 else: type = 'I'
1715 for attr in attrs:
Georg Brandl13e89462008-07-01 19:56:00 +00001716 attr, value = splitvalue(attr)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001717 if attr.lower() == 'type' and \
1718 value in ('a', 'A', 'i', 'I', 'd', 'D'):
1719 type = value.upper()
1720 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
1721 mtype = mimetypes.guess_type("ftp:" + url)[0]
1722 headers = ""
1723 if mtype:
1724 headers += "Content-Type: %s\n" % mtype
1725 if retrlen is not None and retrlen >= 0:
1726 headers += "Content-Length: %d\n" % retrlen
1727 headers = email.message_from_string(headers)
Georg Brandl13e89462008-07-01 19:56:00 +00001728 return addinfourl(fp, headers, "ftp:" + url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001729 except ftperrors() as msg:
1730 raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1731
1732 def open_data(self, url, data=None):
1733 """Use "data" URL."""
1734 if not isinstance(url, str):
1735 raise URLError('data error', 'proxy support for data protocol currently not implemented')
1736 # ignore POSTed data
1737 #
1738 # syntax of data URLs:
1739 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
1740 # mediatype := [ type "/" subtype ] *( ";" parameter )
1741 # data := *urlchar
1742 # parameter := attribute "=" value
1743 try:
1744 [type, data] = url.split(',', 1)
1745 except ValueError:
1746 raise IOError('data error', 'bad data URL')
1747 if not type:
1748 type = 'text/plain;charset=US-ASCII'
1749 semi = type.rfind(';')
1750 if semi >= 0 and '=' not in type[semi:]:
1751 encoding = type[semi+1:]
1752 type = type[:semi]
1753 else:
1754 encoding = ''
1755 msg = []
1756 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
1757 time.gmtime(time.time())))
1758 msg.append('Content-type: %s' % type)
1759 if encoding == 'base64':
1760 import base64
Georg Brandl706824f2009-06-04 09:42:55 +00001761 # XXX is this encoding/decoding ok?
1762 data = base64.decodebytes(data.encode('ascii')).decode('latin1')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001763 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001764 data = unquote(data)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001765 msg.append('Content-Length: %d' % len(data))
1766 msg.append('')
1767 msg.append(data)
1768 msg = '\n'.join(msg)
Georg Brandl13e89462008-07-01 19:56:00 +00001769 headers = email.message_from_string(msg)
1770 f = io.StringIO(msg)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001771 #f.fileno = None # needed for addinfourl
Georg Brandl13e89462008-07-01 19:56:00 +00001772 return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001773
1774
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # Cache of (user, passwd) tuples keyed by 'realm@host'.
        self.auth_cache = {}
        # Redirect-recursion guard: tries counts redirects for the
        # current request; maxtries == 0 disables the limit.
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            # Too many consecutive redirects: report as a synthetic
            # 500 rather than looping forever.
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        # Follow a redirect to the URL named by the Location (or,
        # failing that, URI) response header; returns None when the
        # response carries neither.
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        # Drain and close the old response before reopening.
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # NOTE(review): the three URLopener.http_error_default calls
        # below have no 'return'; this relies on that method raising
        # (so control never reaches the next line) -- confirm.
        if not 'www-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # Dispatch on the scheme of the original request, e.g.
        # retry_http_basic_auth or retry_https_basic_auth.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Same structure as http_error_401, but driven by the
        # Proxy-Authenticate header and the retry_proxy_* methods.
        if not 'proxy-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request after embedding user:passwd credentials
        # in the http proxy URL stored in self.proxies.
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any credentials already present in the proxy host.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        # https analogue of retry_proxy_http_basic_auth.
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request with user:passwd credentials embedded
        # directly in the http URL's host part.
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        # https analogue of retry_http_basic_auth.
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        # Return cached credentials for realm@host, prompting the user
        # (via prompt_user_passwd) on a cache miss; clear_cache forces
        # a fresh prompt after a failed attempt.
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            # User aborted the prompt; signal "no credentials".
            print()
            return None, None
1962
1963
1964# Utility functions
1965
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    # Resolve once and memoize in the module-level cache.
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
1973
_thishost = None
def thishost():
    """Return the IP address of the current host."""
    # Resolve once and memoize in the module-level cache.
    global _thishost
    if _thishost is not None:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
1981
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    # Import ftplib lazily and cache its all_errors tuple.
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
1990
_noheaders = None
def noheaders():
    """Return an empty email Message object."""
    # Built once on first use; all callers share the same instance.
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
1998
1999
2000# Utility classes
2001
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        # Connection parameters are stored so init() can reconnect
        # after the server drops an idle cached connection.
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        # (Re)establish the FTP connection, log in, and change into
        # the target directory chain.
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        # Retrieve *file* with transfer type 'A'/'I', or a directory
        # listing for type 'D'/'d'; returns (file-like object, length).
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection went stale; reconnect and retry.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 means "not a plain file": fall through to the
                # directory-listing path below; anything else is fatal.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'), self.endtransfer), conn[1])
    def endtransfer(self):
        # Consume the end-of-transfer response so the control
        # connection is ready for the next command.
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        # Best-effort shutdown of the control connection.
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2078
2079# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention. If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    # Environment variable names are matched case-insensitively;
    # empty values are ignored.
    return {name.lower()[:-6]: value
            for name, value in os.environ.items()
            if value and name.lower().endswith('_proxy')}
2095
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    if no_proxy == '*':
        # '*' is special case for always bypass
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # bypass when the host, with or without its port, ends in one of
    # the listed DNS suffixes
    return int(any(suffix and (hostonly.endswith(suffix) or host.endswith(suffix))
                   for suffix in no_proxy.split(',')))
2114
2115
# Platform-specific getproxies()/proxy_bypass() implementations:
# Internet Config on macOS, the registry on Windows, and plain
# environment variables everywhere else.
if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies. An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXX To be done.
        # Gopher: XXX To be done.
        return proxies

    def proxy_bypass(host):
        # Honor no_proxy when proxies come from the environment;
        # Internet Config has no bypass list, so never bypass otherwise.
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return 0

    def getproxies():
        # Environment variables take precedence over Internet Config.
        return getproxies_environment() or getproxies_internetconfig()

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        # Consult the ProxyOverride registry value: a ';'-separated
        # list of glob patterns, optionally including the magic token
        # '<local>' meaning "hosts without a dot".
        try:
            import winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # Split the override list; the special '<local>' entry is
        # handled inline in the loop below, every other entry is a
        # glob pattern translated to a regular expression.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                # '<local>' matches any host name without a dot.
                if '.' not in rawHost:
                    return 1
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return 1 if *host* should be contacted directly (bypassing
        the proxy), 0 otherwise.

        Uses the no_proxy environment setting when proxies come from
        the environment; otherwise consults the registry.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment