Blame - Lib/urllib/request.py - platform/external/python/cpython2

blob: 6c7215219cfeee573669cf2a6467ed8f6655b04e [file] [log] [blame]

Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1	"""An extensible library for opening URLs using a variety of protocols
				2
				3	The simplest way to use this module is to call the urlopen function,
				4	which accepts a string containing a URL or a Request object (described
				5	below). It opens the URL and returns the results as a file-like
				6	object; the returned object has some extra methods described below.
				7
				8	The OpenerDirector manages a collection of Handler objects that do
				9	all the actual work. Each Handler implements a particular protocol or
				10	option. The OpenerDirector is a composite object that invokes the
				11	Handlers needed to open the requested URL. For example, the
				12	HTTPHandler performs HTTP GET and POST requests and deals with
				13	non-error returns. The HTTPRedirectHandler automatically deals with
				14	HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
				15	deals with digest authentication.
				16
				17	urlopen(url, data=None) -- Basic usage is the same as original
				18	urllib. pass the url and optionally data to post to an HTTP URL, and
				19	get a file-like object back. One difference is that you can also pass
				20	a Request instance instead of URL. Raises a URLError (subclass of
				21	IOError); for HTTP errors, raises an HTTPError, which can also be
				22	treated as a valid response.
				23
				24	build_opener -- Function that creates a new OpenerDirector instance.
				25	Will install the default handlers. Accepts one or more Handlers as
				26	arguments, either instances or Handler classes that it will
				27	instantiate. If one of the arguments is a subclass of the default
				28	handler, the argument will be installed instead of the default.
				29
				30	install_opener -- Installs a new opener as the default opener.
				31
				32	objects of interest:
Senthil Kumaran	1107c5d	2009-11-15 06:20:55 +0000	[diff] [blame]	33
				34	OpenerDirector -- Sets up the User-Agent as the Python-urllib and manages the
				35	Handler classes while dealing with both requests and responses.
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	36
				37	Request -- An object that encapsulates the state of a request. The
				38	state can be as simple as the URL. It can also include extra HTTP
				39	headers, e.g. a User-Agent.
				40
				41	BaseHandler --
				42
				43	internals:
				44	BaseHandler and parent
				45	_call_chain conventions
				46
				47	Example usage:
				48
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	49	import urllib.request
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	50
				51	# set up authentication info
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	52	authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	53	authinfo.add_password(realm='PDQ Application',
				54	uri='https://mahler:8092/site-updates.py',
				55	user='klem',
				56	passwd='geheim$parole')
				57
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	58	proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	59
				60	# build a new opener that adds authentication and caching FTP handlers
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	61	opener = urllib.request.build_opener(proxy_support, authinfo,
				62	urllib.request.CacheFTPHandler)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	63
				64	# install it
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	65	urllib.request.install_opener(opener)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	66
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	67	f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	68	"""
				69
				70	# XXX issues:
				71	# If an authentication error handler tries to perform
				72	# authentication but fails for some reason, how should the error be
				73	# signalled? The client needs to know the HTTP error code. But if
				74	# the handler knows that the problem was, e.g., that it didn't know
				75	# the hash algo that was requested in the challenge, it would be good to
				76	# pass that information along to the client, too.
				77	# ftp errors aren't handled cleanly
				78	# check digest against correct (i.e. non-apache) implementation
				79
				80	# Possible extensions:
				81	# complex proxies XXX not sure what exactly was meant by this
				82	# abstract factory for opener
				83
				84	import base64
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	85	import bisect
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	86	import email
				87	import hashlib
				88	import http.client
				89	import io
				90	import os
				91	import posixpath
				92	import random
				93	import re
				94	import socket
				95	import sys
				96	import time
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	97
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	98	from urllib.error import URLError, HTTPError, ContentTooShortError
				99	from urllib.parse import (
				100	urlparse, urlsplit, urljoin, unwrap, quote, unquote,
				101	splittype, splithost, splitport, splituser, splitpasswd,
Facundo Batista	f24802c	2008-08-17 03:36:03 +0000	[diff] [blame]	102	splitattr, splitquery, splitvalue, to_bytes, urlunparse)
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	103	from urllib.response import addinfourl, addclosehook
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	104
				105	# check for SSL
				106	try:
				107	import ssl
				108	except:
				109	_have_ssl = False
				110	else:
				111	_have_ssl = True
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	112
				113	# used in User-Agent header sent
				114	__version__ = sys.version[:3]
				115
				116	_opener = None
				117	def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
				118	global _opener
				119	if _opener is None:
				120	_opener = build_opener()
				121	return _opener.open(url, data, timeout)
				122
				123	def install_opener(opener):
				124	global _opener
				125	_opener = opener
				126
				127	# TODO(jhylton): Make this work with the same global opener.
				128	_urlopener = None
				129	def urlretrieve(url, filename=None, reporthook=None, data=None):
				130	global _urlopener
				131	if not _urlopener:
				132	_urlopener = FancyURLopener()
				133	return _urlopener.retrieve(url, filename, reporthook, data)
				134
				135	def urlcleanup():
				136	if _urlopener:
				137	_urlopener.cleanup()
				138	global _opener
				139	if _opener:
				140	_opener = None
				141
				142	# copied from cookielib.py
Antoine Pitrou	fd03645	2008-08-19 17:56:33 +0000	[diff] [blame]	143	_cut_port_re = re.compile(r":\d+$", re.ASCII)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	144	def request_host(request):
				145	"""Return request-host, as defined by RFC 2965.
				146
				147	Variation from RFC: returned value is lowercased, for convenient
				148	comparison.
				149
				150	"""
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	151	url = request.full_url
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	152	host = urlparse(url)[1]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	153	if host == "":
				154	host = request.get_header("Host", "")
				155
				156	# remove port, if present
				157	host = _cut_port_re.sub("", host, 1)
				158	return host.lower()
				159
				160	class Request:
				161
				162	def __init__(self, url, data=None, headers={},
				163	origin_req_host=None, unverifiable=False):
				164	# unwrap('<URL:type://host/path>') --> 'type://host/path'
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	165	self.full_url = unwrap(url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	166	self.data = data
				167	self.headers = {}
Senthil Kumaran	97f0c6b	2009-07-25 04:24:38 +0000	[diff] [blame]	168	self._tunnel_host = None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	169	for key, value in headers.items():
				170	self.add_header(key, value)
				171	self.unredirected_hdrs = {}
				172	if origin_req_host is None:
				173	origin_req_host = request_host(self)
				174	self.origin_req_host = origin_req_host
				175	self.unverifiable = unverifiable
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	176	self._parse()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	177
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	178	def _parse(self):
				179	self.type, rest = splittype(self.full_url)
				180	if self.type is None:
				181	raise ValueError("unknown url type: %s" % self.full_url)
				182	self.host, self.selector = splithost(rest)
				183	if self.host:
				184	self.host = unquote(self.host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	185
				186	def get_method(self):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	187	if self.data is not None:
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	188	return "POST"
				189	else:
				190	return "GET"
				191
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	192	# Begin deprecated methods
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	193
				194	def add_data(self, data):
				195	self.data = data
				196
				197	def has_data(self):
				198	return self.data is not None
				199
				200	def get_data(self):
				201	return self.data
				202
				203	def get_full_url(self):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	204	return self.full_url
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	205
				206	def get_type(self):
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	207	return self.type
				208
				209	def get_host(self):
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	210	return self.host
				211
				212	def get_selector(self):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	213	return self.selector
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	214
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	215	def is_unverifiable(self):
				216	return self.unverifiable
Facundo Batista	72dc1ea	2008-08-16 14:44:32 +0000	[diff] [blame]	217
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	218	def get_origin_req_host(self):
				219	return self.origin_req_host
				220
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	221	# End deprecated methods
				222
				223	def set_proxy(self, host, type):
Senthil Kumaran	97f0c6b	2009-07-25 04:24:38 +0000	[diff] [blame]	224	if self.type == 'https' and not self._tunnel_host:
				225	self._tunnel_host = self.host
				226	else:
				227	self.type= type
				228	self.selector = self.full_url
				229	self.host = host
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	230
				231	def has_proxy(self):
				232	return self.selector == self.full_url
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	233
				234	def add_header(self, key, val):
				235	# useful for something like authentication
				236	self.headers[key.capitalize()] = val
				237
				238	def add_unredirected_header(self, key, val):
				239	# will not be added to a redirected request
				240	self.unredirected_hdrs[key.capitalize()] = val
				241
				242	def has_header(self, header_name):
				243	return (header_name in self.headers or
				244	header_name in self.unredirected_hdrs)
				245
				246	def get_header(self, header_name, default=None):
				247	return self.headers.get(
				248	header_name,
				249	self.unredirected_hdrs.get(header_name, default))
				250
				251	def header_items(self):
				252	hdrs = self.unredirected_hdrs.copy()
				253	hdrs.update(self.headers)
				254	return list(hdrs.items())
				255
				256	class OpenerDirector:
				257	def __init__(self):
				258	client_version = "Python-urllib/%s" % __version__
				259	self.addheaders = [('User-agent', client_version)]
				260	# manage the individual handlers
				261	self.handlers = []
				262	self.handle_open = {}
				263	self.handle_error = {}
				264	self.process_response = {}
				265	self.process_request = {}
				266
				267	def add_handler(self, handler):
				268	if not hasattr(handler, "add_parent"):
				269	raise TypeError("expected BaseHandler instance, got %r" %
				270	type(handler))
				271
				272	added = False
				273	for meth in dir(handler):
				274	if meth in ["redirect_request", "do_open", "proxy_open"]:
				275	# oops, coincidental match
				276	continue
				277
				278	i = meth.find("_")
				279	protocol = meth[:i]
				280	condition = meth[i+1:]
				281
				282	if condition.startswith("error"):
				283	j = condition.find("_") + i + 1
				284	kind = meth[j+1:]
				285	try:
				286	kind = int(kind)
				287	except ValueError:
				288	pass
				289	lookup = self.handle_error.get(protocol, {})
				290	self.handle_error[protocol] = lookup
				291	elif condition == "open":
				292	kind = protocol
				293	lookup = self.handle_open
				294	elif condition == "response":
				295	kind = protocol
				296	lookup = self.process_response
				297	elif condition == "request":
				298	kind = protocol
				299	lookup = self.process_request
				300	else:
				301	continue
				302
				303	handlers = lookup.setdefault(kind, [])
				304	if handlers:
				305	bisect.insort(handlers, handler)
				306	else:
				307	handlers.append(handler)
				308	added = True
				309
				310	if added:
				311	# the handlers must work in an specific order, the order
				312	# is specified in a Handler attribute
				313	bisect.insort(self.handlers, handler)
				314	handler.add_parent(self)
				315
				316	def close(self):
				317	# Only exists for backwards compatibility.
				318	pass
				319
				320	def _call_chain(self, chain, kind, meth_name, *args):
				321	# Handlers raise an exception if no one else should try to handle
				322	# the request, or return None if they can't but another handler
				323	# could. Otherwise, they return the response.
				324	handlers = chain.get(kind, ())
				325	for handler in handlers:
				326	func = getattr(handler, meth_name)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	327	result = func(*args)
				328	if result is not None:
				329	return result
				330
				331	def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
				332	# accept a URL or a Request object
				333	if isinstance(fullurl, str):
				334	req = Request(fullurl, data)
				335	else:
				336	req = fullurl
				337	if data is not None:
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	338	req.data = data
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	339
				340	req.timeout = timeout
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	341	protocol = req.type
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	342
				343	# pre-process request
				344	meth_name = protocol+"_request"
				345	for processor in self.process_request.get(protocol, []):
				346	meth = getattr(processor, meth_name)
				347	req = meth(req)
				348
				349	response = self._open(req, data)
				350
				351	# post-process response
				352	meth_name = protocol+"_response"
				353	for processor in self.process_response.get(protocol, []):
				354	meth = getattr(processor, meth_name)
				355	response = meth(req, response)
				356
				357	return response
				358
				359	def _open(self, req, data=None):
				360	result = self._call_chain(self.handle_open, 'default',
				361	'default_open', req)
				362	if result:
				363	return result
				364
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	365	protocol = req.type
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	366	result = self._call_chain(self.handle_open, protocol, protocol +
				367	'_open', req)
				368	if result:
				369	return result
				370
				371	return self._call_chain(self.handle_open, 'unknown',
				372	'unknown_open', req)
				373
				374	def error(self, proto, *args):
				375	if proto in ('http', 'https'):
				376	# XXX http[s] protocols are special-cased
				377	dict = self.handle_error['http'] # https is not different than http
				378	proto = args[2] # YUCK!
				379	meth_name = 'http_error_%s' % proto
				380	http_err = 1
				381	orig_args = args
				382	else:
				383	dict = self.handle_error
				384	meth_name = proto + '_error'
				385	http_err = 0
				386	args = (dict, proto, meth_name) + args
				387	result = self._call_chain(*args)
				388	if result:
				389	return result
				390
				391	if http_err:
				392	args = (dict, 'default', 'http_error_default') + orig_args
				393	return self._call_chain(*args)
				394
				395	# XXX probably also want an abstract factory that knows when it makes
				396	# sense to skip a superclass in favor of a subclass and when it might
				397	# make sense to include both
				398
				399	def build_opener(*handlers):
				400	"""Create an opener object from a list of handlers.
				401
				402	The opener will use several default handlers, including support
Senthil Kumaran	1107c5d	2009-11-15 06:20:55 +0000	[diff] [blame]	403	for HTTP, FTP and when applicable HTTPS.
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	404
				405	If any of the handlers passed as arguments are subclasses of the
				406	default handlers, the default handlers will not be used.
				407	"""
				408	def isclass(obj):
				409	return isinstance(obj, type) or hasattr(obj, "__bases__")
				410
				411	opener = OpenerDirector()
				412	default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
				413	HTTPDefaultErrorHandler, HTTPRedirectHandler,
				414	FTPHandler, FileHandler, HTTPErrorProcessor]
				415	if hasattr(http.client, "HTTPSConnection"):
				416	default_classes.append(HTTPSHandler)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	417	skip = set()
				418	for klass in default_classes:
				419	for check in handlers:
				420	if isclass(check):
				421	if issubclass(check, klass):
				422	skip.add(klass)
				423	elif isinstance(check, klass):
				424	skip.add(klass)
				425	for klass in skip:
				426	default_classes.remove(klass)
				427
				428	for klass in default_classes:
				429	opener.add_handler(klass())
				430
				431	for h in handlers:
				432	if isclass(h):
				433	h = h()
				434	opener.add_handler(h)
				435	return opener
				436
				437	class BaseHandler:
				438	handler_order = 500
				439
				440	def add_parent(self, parent):
				441	self.parent = parent
				442
				443	def close(self):
				444	# Only exists for backwards compatibility
				445	pass
				446
				447	def __lt__(self, other):
				448	if not hasattr(other, "handler_order"):
				449	# Try to preserve the old behavior of having custom classes
				450	# inserted after default ones (works only for custom user
				451	# classes which are not aware of handler_order).
				452	return True
				453	return self.handler_order < other.handler_order
				454
				455
				456	class HTTPErrorProcessor(BaseHandler):
				457	"""Process HTTP error responses."""
				458	handler_order = 1000 # after all other processing
				459
				460	def http_response(self, request, response):
				461	code, msg, hdrs = response.code, response.msg, response.info()
				462
				463	# According to RFC 2616, "2xx" code indicates that the client's
				464	# request was successfully received, understood, and accepted.
				465	if not (200 <= code < 300):
				466	response = self.parent.error(
				467	'http', request, response, code, msg, hdrs)
				468
				469	return response
				470
				471	https_response = http_response
				472
				473	class HTTPDefaultErrorHandler(BaseHandler):
				474	def http_error_default(self, req, fp, code, msg, hdrs):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	475	raise HTTPError(req.full_url, code, msg, hdrs, fp)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	476
				477	class HTTPRedirectHandler(BaseHandler):
				478	# maximum number of redirections to any single URL
				479	# this is needed because of the state that cookies introduce
				480	max_repeats = 4
				481	# maximum total number of redirections (regardless of URL) before
				482	# assuming we're in a loop
				483	max_redirections = 10
				484
				485	def redirect_request(self, req, fp, code, msg, headers, newurl):
				486	"""Return a Request or None in response to a redirect.
				487
				488	This is called by the http_error_30x methods when a
				489	redirection response is received. If a redirection should
				490	take place, return a new Request to allow http_error_30x to
				491	perform the redirect. Otherwise, raise HTTPError if no-one
				492	else should try to handle this url. Return None if you can't
				493	but another Handler might.
				494	"""
				495	m = req.get_method()
				496	if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
				497	or code in (301, 302, 303) and m == "POST")):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	498	raise HTTPError(req.full_url, code, msg, headers, fp)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	499
				500	# Strictly (according to RFC 2616), 301 or 302 in response to
				501	# a POST MUST NOT cause a redirection without confirmation
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	502	# from the user (of urllib.request, in this case). In practice,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	503	# essentially all clients do redirect in this case, so we do
				504	# the same.
				505	# be conciliant with URIs containing a space
				506	newurl = newurl.replace(' ', '%20')
				507	CONTENT_HEADERS = ("content-length", "content-type")
				508	newheaders = dict((k, v) for k, v in req.headers.items()
				509	if k.lower() not in CONTENT_HEADERS)
				510	return Request(newurl,
				511	headers=newheaders,
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	512	origin_req_host=req.origin_req_host,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	513	unverifiable=True)
				514
				515	# Implementation note: To avoid the server sending us into an
				516	# infinite loop, the request object needs to track what URLs we
				517	# have already seen. Do this by adding a handler-specific
				518	# attribute to the Request object.
				519	def http_error_302(self, req, fp, code, msg, headers):
				520	# Some servers (incorrectly) return multiple Location headers
				521	# (so probably same goes for URI). Use first header.
				522	if "location" in headers:
				523	newurl = headers["location"]
				524	elif "uri" in headers:
				525	newurl = headers["uri"]
				526	else:
				527	return
Facundo Batista	f24802c	2008-08-17 03:36:03 +0000	[diff] [blame]	528
				529	# fix a possible malformed URL
				530	urlparts = urlparse(newurl)
				531	if not urlparts.path:
				532	urlparts = list(urlparts)
				533	urlparts[2] = "/"
				534	newurl = urlunparse(urlparts)
				535
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	536	newurl = urljoin(req.full_url, newurl)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	537
				538	# XXX Probably want to forget about the state of the current
				539	# request, although that might interact poorly with other
				540	# handlers that also use handler-specific request attributes
				541	new = self.redirect_request(req, fp, code, msg, headers, newurl)
				542	if new is None:
				543	return
				544
				545	# loop detection
				546	# .redirect_dict has a key url if url was previously visited.
				547	if hasattr(req, 'redirect_dict'):
				548	visited = new.redirect_dict = req.redirect_dict
				549	if (visited.get(newurl, 0) >= self.max_repeats or
				550	len(visited) >= self.max_redirections):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	551	raise HTTPError(req.full_url, code,
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	552	self.inf_msg + msg, headers, fp)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	553	else:
				554	visited = new.redirect_dict = req.redirect_dict = {}
				555	visited[newurl] = visited.get(newurl, 0) + 1
				556
				557	# Don't close the fp until we are sure that we won't use it
				558	# with HTTPError.
				559	fp.read()
				560	fp.close()
				561
Senthil Kumaran	fb8cc2f	2009-07-19 02:44:19 +0000	[diff] [blame]	562	return self.parent.open(new, timeout=req.timeout)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	563
				564	http_error_301 = http_error_303 = http_error_307 = http_error_302
				565
				566	inf_msg = "The HTTP server returned a redirect error that would " \
				567	"lead to an infinite loop.\n" \
				568	"The last 30x error message was:\n"
				569
				570
				571	def _parse_proxy(proxy):
				572	"""Return (scheme, user, password, host/port) given a URL or an authority.
				573
				574	If a URL is supplied, it must have an authority (host:port) component.
				575	According to RFC 3986, having an authority component means the URL must
				576	have two slashes after the scheme:
				577
				578	>>> _parse_proxy('file:/ftp.example.com/')
				579	Traceback (most recent call last):
				580	ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
				581
				582	The first three items of the returned tuple may be None.
				583
				584	Examples of authority parsing:
				585
				586	>>> _parse_proxy('proxy.example.com')
				587	(None, None, None, 'proxy.example.com')
				588	>>> _parse_proxy('proxy.example.com:3128')
				589	(None, None, None, 'proxy.example.com:3128')
				590
				591	The authority component may optionally include userinfo (assumed to be
				592	username:password):
				593
				594	>>> _parse_proxy('joe:password@proxy.example.com')
				595	(None, 'joe', 'password', 'proxy.example.com')
				596	>>> _parse_proxy('joe:password@proxy.example.com:3128')
				597	(None, 'joe', 'password', 'proxy.example.com:3128')
				598
				599	Same examples, but with URLs instead:
				600
				601	>>> _parse_proxy('http://proxy.example.com/')
				602	('http', None, None, 'proxy.example.com')
				603	>>> _parse_proxy('http://proxy.example.com:3128/')
				604	('http', None, None, 'proxy.example.com:3128')
				605	>>> _parse_proxy('http://joe:password@proxy.example.com/')
				606	('http', 'joe', 'password', 'proxy.example.com')
				607	>>> _parse_proxy('http://joe:password@proxy.example.com:3128')
				608	('http', 'joe', 'password', 'proxy.example.com:3128')
				609
				610	Everything after the authority is ignored:
				611
				612	>>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
				613	('ftp', 'joe', 'password', 'proxy.example.com')
				614
				615	Test for no trailing '/' case:
				616
				617	>>> _parse_proxy('http://joe:password@proxy.example.com')
				618	('http', 'joe', 'password', 'proxy.example.com')
				619
				620	"""
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	621	scheme, r_scheme = splittype(proxy)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	622	if not r_scheme.startswith("/"):
				623	# authority
				624	scheme = None
				625	authority = proxy
				626	else:
				627	# URL
				628	if not r_scheme.startswith("//"):
				629	raise ValueError("proxy URL with no authority: %r" % proxy)
				630	# We have an authority, so for RFC 3986-compliant URLs (by ss 3.
				631	# and 3.3.), path is empty or starts with '/'
				632	end = r_scheme.find("/", 2)
				633	if end == -1:
				634	end = None
				635	authority = r_scheme[2:end]
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	636	userinfo, hostport = splituser(authority)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	637	if userinfo is not None:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	638	user, password = splitpasswd(userinfo)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	639	else:
				640	user = password = None
				641	return scheme, user, password, hostport
				642
				643	class ProxyHandler(BaseHandler):
				644	# Proxies must be in front
				645	handler_order = 100
				646
				647	def __init__(self, proxies=None):
				648	if proxies is None:
				649	proxies = getproxies()
				650	assert hasattr(proxies, 'keys'), "proxies must be a mapping"
				651	self.proxies = proxies
				652	for type, url in proxies.items():
				653	setattr(self, '%s_open' % type,
				654	lambda r, proxy=url, type=type, meth=self.proxy_open: \
				655	meth(r, proxy, type))
				656
				657	def proxy_open(self, req, proxy, type):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	658	orig_type = req.type
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	659	proxy_type, user, password, hostport = _parse_proxy(proxy)
				660	if proxy_type is None:
				661	proxy_type = orig_type
Senthil Kumaran	7bb0497	2009-10-11 04:58:55 +0000	[diff] [blame]	662
				663	if req.host and proxy_bypass(req.host):
				664	return None
				665
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	666	if user and password:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	667	user_pass = '%s:%s' % (unquote(user),
				668	unquote(password))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	669	creds = base64.b64encode(user_pass.encode()).decode("ascii")
				670	req.add_header('Proxy-authorization', 'Basic ' + creds)
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	671	hostport = unquote(hostport)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	672	req.set_proxy(hostport, proxy_type)
Senthil Kumaran	97f0c6b	2009-07-25 04:24:38 +0000	[diff] [blame]	673	if orig_type == proxy_type or orig_type == 'https':
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	674	# let other handlers take care of it
				675	return None
				676	else:
				677	# need to start over, because the other handlers don't
				678	# grok the proxy's URL type
				679	# e.g. if we have a constructor arg proxies like so:
				680	# {'http': 'ftp://proxy.example.com'}, we may end up turning
				681	# a request for http://acme.example.com/a into one for
				682	# ftp://proxy.example.com/a
Senthil Kumaran	fb8cc2f	2009-07-19 02:44:19 +0000	[diff] [blame]	683	return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	684
				685	class HTTPPasswordMgr:
				686
				687	def __init__(self):
				688	self.passwd = {}
				689
				690	def add_password(self, realm, uri, user, passwd):
				691	# uri could be a single URI or a sequence
				692	if isinstance(uri, str):
				693	uri = [uri]
				694	if not realm in self.passwd:
				695	self.passwd[realm] = {}
				696	for default_port in True, False:
				697	reduced_uri = tuple(
				698	[self.reduce_uri(u, default_port) for u in uri])
				699	self.passwd[realm][reduced_uri] = (user, passwd)
				700
				701	def find_user_password(self, realm, authuri):
				702	domains = self.passwd.get(realm, {})
				703	for default_port in True, False:
				704	reduced_authuri = self.reduce_uri(authuri, default_port)
				705	for uris, authinfo in domains.items():
				706	for uri in uris:
				707	if self.is_suburi(uri, reduced_authuri):
				708	return authinfo
				709	return None, None
				710
				711	def reduce_uri(self, uri, default_port=True):
				712	"""Accept authority or URI and extract only the authority and path."""
				713	# note HTTP URLs do not have a userinfo component
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	714	parts = urlsplit(uri)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	715	if parts[1]:
				716	# URI
				717	scheme = parts[0]
				718	authority = parts[1]
				719	path = parts[2] or '/'
				720	else:
				721	# host or host:port
				722	scheme = None
				723	authority = uri
				724	path = '/'
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	725	host, port = splitport(authority)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	726	if default_port and port is None and scheme is not None:
				727	dport = {"http": 80,
				728	"https": 443,
				729	}.get(scheme)
				730	if dport is not None:
				731	authority = "%s:%d" % (host, dport)
				732	return authority, path
				733
				734	def is_suburi(self, base, test):
				735	"""Check if test is below base in a URI tree
				736
				737	Both args must be URIs in reduced form.
				738	"""
				739	if base == test:
				740	return True
				741	if base[0] != test[0]:
				742	return False
				743	common = posixpath.commonprefix((base[1], test[1]))
				744	if len(common) == len(base[1]):
				745	return True
				746	return False
				747
				748
				749	class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
				750
				751	def find_user_password(self, realm, authuri):
				752	user, password = HTTPPasswordMgr.find_user_password(self, realm,
				753	authuri)
				754	if user is not None:
				755	return user, password
				756	return HTTPPasswordMgr.find_user_password(self, None, authuri)
				757
				758
				759	class AbstractBasicAuthHandler:
				760
				761	# XXX this allows for multiple auth-schemes, but will stupidly pick
				762	# the last one with a realm specified.
				763
				764	# allow for double- and single-quoted realm values
				765	# (single quotes are a violation of the RFC, but appear in the wild)
				766	rx = re.compile('(?:.,)[ \t]*([^ \t]+)[ \t]+'
				767	'realm=(["\'])(.*?)\\2', re.I)
				768
				769	# XXX could pre-emptively send auth info already accepted (RFC 2617,
				770	# end of section 2, and section 1.2 immediately after "credentials"
				771	# production).
				772
				773	def __init__(self, password_mgr=None):
				774	if password_mgr is None:
				775	password_mgr = HTTPPasswordMgr()
				776	self.passwd = password_mgr
				777	self.add_password = self.passwd.add_password
				778
				779	def http_error_auth_reqed(self, authreq, host, req, headers):
				780	# host may be an authority (without userinfo) or a URL with an
				781	# authority
				782	# XXX could be multiple headers
				783	authreq = headers.get(authreq, None)
				784	if authreq:
				785	mo = AbstractBasicAuthHandler.rx.search(authreq)
				786	if mo:
				787	scheme, quote, realm = mo.groups()
				788	if scheme.lower() == 'basic':
				789	return self.retry_http_basic_auth(host, req, realm)
				790
				791	def retry_http_basic_auth(self, host, req, realm):
				792	user, pw = self.passwd.find_user_password(realm, host)
				793	if pw is not None:
				794	raw = "%s:%s" % (user, pw)
				795	auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
				796	if req.headers.get(self.auth_header, None) == auth:
				797	return None
				798	req.add_header(self.auth_header, auth)
Senthil Kumaran	fb8cc2f	2009-07-19 02:44:19 +0000	[diff] [blame]	799	return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	800	else:
				801	return None
				802
				803
				804	class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
				805
				806	auth_header = 'Authorization'
				807
				808	def http_error_401(self, req, fp, code, msg, headers):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	809	url = req.full_url
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	810	return self.http_error_auth_reqed('www-authenticate',
				811	url, req, headers)
				812
				813
				814	class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
				815
				816	auth_header = 'Proxy-authorization'
				817
				818	def http_error_407(self, req, fp, code, msg, headers):
				819	# http_error_auth_reqed requires that there is no userinfo component in
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	820	# authority. Assume there isn't one, since urllib.request does not (and
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	821	# should not, RFC 3986 s. 3.2.1) support requests for URLs containing
				822	# userinfo.
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	823	authority = req.host
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	824	return self.http_error_auth_reqed('proxy-authenticate',
				825	authority, req, headers)
				826
				827
				828	def randombytes(n):
				829	"""Return n random bytes."""
				830	return os.urandom(n)
				831
				832	class AbstractDigestAuthHandler:
				833	# Digest authentication is specified in RFC 2617.
				834
				835	# XXX The client does not inspect the Authentication-Info header
				836	# in a successful response.
				837
				838	# XXX It should be possible to test this implementation against
				839	# a mock server that just generates a static set of challenges.
				840
				841	# XXX qop="auth-int" supports is shaky
				842
				843	def __init__(self, passwd=None):
				844	if passwd is None:
				845	passwd = HTTPPasswordMgr()
				846	self.passwd = passwd
				847	self.add_password = self.passwd.add_password
				848	self.retried = 0
				849	self.nonce_count = 0
Senthil Kumaran	4c7eaee	2009-11-15 08:43:45 +0000	[diff] [blame]	850	self.last_nonce = None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	851
				852	def reset_retry_count(self):
				853	self.retried = 0
				854
				855	def http_error_auth_reqed(self, auth_header, host, req, headers):
				856	authreq = headers.get(auth_header, None)
				857	if self.retried > 5:
				858	# Don't fail endlessly - if we failed once, we'll probably
				859	# fail a second time. Hm. Unless the Password Manager is
				860	# prompting for the information. Crap. This isn't great
				861	# but it's better than the current 'repeat until recursion
				862	# depth exceeded' approach <wink>
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	863	raise HTTPError(req.full_url, 401, "digest auth failed",
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	864	headers, None)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	865	else:
				866	self.retried += 1
				867	if authreq:
				868	scheme = authreq.split()[0]
				869	if scheme.lower() == 'digest':
				870	return self.retry_http_digest_auth(req, authreq)
				871
				872	def retry_http_digest_auth(self, req, auth):
				873	token, challenge = auth.split(' ', 1)
				874	chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
				875	auth = self.get_authorization(req, chal)
				876	if auth:
				877	auth_val = 'Digest %s' % auth
				878	if req.headers.get(self.auth_header, None) == auth_val:
				879	return None
				880	req.add_unredirected_header(self.auth_header, auth_val)
Senthil Kumaran	fb8cc2f	2009-07-19 02:44:19 +0000	[diff] [blame]	881	resp = self.parent.open(req, timeout=req.timeout)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	882	return resp
				883
				884	def get_cnonce(self, nonce):
				885	# The cnonce-value is an opaque
				886	# quoted string value provided by the client and used by both client
				887	# and server to avoid chosen plaintext attacks, to provide mutual
				888	# authentication, and to provide some message integrity protection.
				889	# This isn't a fabulous effort, but it's probably Good Enough.
				890	s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
				891	b = s.encode("ascii") + randombytes(8)
				892	dig = hashlib.sha1(b).hexdigest()
				893	return dig[:16]
				894
				895	def get_authorization(self, req, chal):
				896	try:
				897	realm = chal['realm']
				898	nonce = chal['nonce']
				899	qop = chal.get('qop')
				900	algorithm = chal.get('algorithm', 'MD5')
				901	# mod_digest doesn't send an opaque, even though it isn't
				902	# supposed to be optional
				903	opaque = chal.get('opaque', None)
				904	except KeyError:
				905	return None
				906
				907	H, KD = self.get_algorithm_impls(algorithm)
				908	if H is None:
				909	return None
				910
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	911	user, pw = self.passwd.find_user_password(realm, req.full_url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	912	if user is None:
				913	return None
				914
				915	# XXX not implemented yet
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	916	if req.data is not None:
				917	entdig = self.get_entity_digest(req.data, chal)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	918	else:
				919	entdig = None
				920
				921	A1 = "%s:%s:%s" % (user, realm, pw)
				922	A2 = "%s:%s" % (req.get_method(),
				923	# XXX selector: what about proxies and full urls
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	924	req.selector)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	925	if qop == 'auth':
Senthil Kumaran	4c7eaee	2009-11-15 08:43:45 +0000	[diff] [blame]	926	if nonce == self.last_nonce:
				927	self.nonce_count += 1
				928	else:
				929	self.nonce_count = 1
				930	self.last_nonce = nonce
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	931	ncvalue = '%08x' % self.nonce_count
				932	cnonce = self.get_cnonce(nonce)
				933	noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
				934	respdig = KD(H(A1), noncebit)
				935	elif qop is None:
				936	respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
				937	else:
				938	# XXX handle auth-int.
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	939	raise URLError("qop '%s' is not supported." % qop)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	940
				941	# XXX should the partial digests be encoded too?
				942
				943	base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	944	'response="%s"' % (user, realm, nonce, req.selector,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	945	respdig)
				946	if opaque:
				947	base += ', opaque="%s"' % opaque
				948	if entdig:
				949	base += ', digest="%s"' % entdig
				950	base += ', algorithm="%s"' % algorithm
				951	if qop:
				952	base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
				953	return base
				954
				955	def get_algorithm_impls(self, algorithm):
				956	# lambdas assume digest modules are imported at the top level
				957	if algorithm == 'MD5':
				958	H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
				959	elif algorithm == 'SHA':
				960	H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
				961	# XXX MD5-sess
				962	KD = lambda s, d: H("%s:%s" % (s, d))
				963	return H, KD
				964
				965	def get_entity_digest(self, data, chal):
				966	# XXX not implemented yet
				967	return None
				968
				969
				970	class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
				971	"""An authentication protocol defined by RFC 2069
				972
				973	Digest authentication improves on basic authentication because it
				974	does not transmit passwords in the clear.
				975	"""
				976
				977	auth_header = 'Authorization'
				978	handler_order = 490 # before Basic auth
				979
				980	def http_error_401(self, req, fp, code, msg, headers):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	981	host = urlparse(req.full_url)[1]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	982	retry = self.http_error_auth_reqed('www-authenticate',
				983	host, req, headers)
				984	self.reset_retry_count()
				985	return retry
				986
				987
				988	class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
				989
				990	auth_header = 'Proxy-Authorization'
				991	handler_order = 490 # before Basic auth
				992
				993	def http_error_407(self, req, fp, code, msg, headers):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	994	host = req.host
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	995	retry = self.http_error_auth_reqed('proxy-authenticate',
				996	host, req, headers)
				997	self.reset_retry_count()
				998	return retry
				999
				1000	class AbstractHTTPHandler(BaseHandler):
				1001
				1002	def __init__(self, debuglevel=0):
				1003	self._debuglevel = debuglevel
				1004
				1005	def set_http_debuglevel(self, level):
				1006	self._debuglevel = level
				1007
				1008	def do_request_(self, request):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1009	host = request.host
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1010	if not host:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1011	raise URLError('no host given')
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1012
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1013	if request.data is not None: # POST
				1014	data = request.data
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1015	if not request.has_header('Content-type'):
				1016	request.add_unredirected_header(
				1017	'Content-type',
				1018	'application/x-www-form-urlencoded')
				1019	if not request.has_header('Content-length'):
				1020	request.add_unredirected_header(
				1021	'Content-length', '%d' % len(data))
				1022
Facundo Batista	72dc1ea	2008-08-16 14:44:32 +0000	[diff] [blame]	1023	sel_host = host
				1024	if request.has_proxy():
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1025	scheme, sel = splittype(request.selector)
Facundo Batista	72dc1ea	2008-08-16 14:44:32 +0000	[diff] [blame]	1026	sel_host, sel_path = splithost(sel)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1027	if not request.has_header('Host'):
Facundo Batista	72dc1ea	2008-08-16 14:44:32 +0000	[diff] [blame]	1028	request.add_unredirected_header('Host', sel_host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1029	for name, value in self.parent.addheaders:
				1030	name = name.capitalize()
				1031	if not request.has_header(name):
				1032	request.add_unredirected_header(name, value)
				1033
				1034	return request
				1035
				1036	def do_open(self, http_class, req):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1037	"""Return an HTTPResponse object for the request, using http_class.
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1038
				1039	http_class must implement the HTTPConnection API from http.client.
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1040	"""
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1041	host = req.host
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1042	if not host:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1043	raise URLError('no host given')
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1044
				1045	h = http_class(host, timeout=req.timeout) # will parse host:port
				1046	headers = dict(req.headers)
				1047	headers.update(req.unredirected_hdrs)
				1048
				1049	# TODO(jhylton): Should this be redesigned to handle
				1050	# persistent connections?
				1051
				1052	# We want to make an HTTP/1.1 request, but the addinfourl
				1053	# class isn't prepared to deal with a persistent connection.
				1054	# It will try to read all remaining data from the socket,
				1055	# which will block while the server waits for the next request.
				1056	# So make sure the connection gets closed after the (only)
				1057	# request.
				1058	headers["Connection"] = "close"
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1059	headers = dict((name.title(), val) for name, val in headers.items())
Senthil Kumaran	97f0c6b	2009-07-25 04:24:38 +0000	[diff] [blame]	1060
				1061	if req._tunnel_host:
				1062	h.set_tunnel(req._tunnel_host)
				1063
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1064	try:
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1065	h.request(req.get_method(), req.selector, req.data, headers)
				1066	r = h.getresponse() # an HTTPResponse instance
				1067	except socket.error as err:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1068	raise URLError(err)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1069
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1070	r.url = req.full_url
				1071	# This line replaces the .msg attribute of the HTTPResponse
				1072	# with .headers, because urllib clients expect the response to
				1073	# have the reason in .msg. It would be good to mark this
				1074	# attribute is deprecated and get then to use info() or
				1075	# .headers.
				1076	r.msg = r.reason
				1077	return r
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1078
				1079
				1080	class HTTPHandler(AbstractHTTPHandler):
				1081
				1082	def http_open(self, req):
				1083	return self.do_open(http.client.HTTPConnection, req)
				1084
				1085	http_request = AbstractHTTPHandler.do_request_
				1086
				1087	if hasattr(http.client, 'HTTPSConnection'):
				1088	class HTTPSHandler(AbstractHTTPHandler):
				1089
				1090	def https_open(self, req):
				1091	return self.do_open(http.client.HTTPSConnection, req)
				1092
				1093	https_request = AbstractHTTPHandler.do_request_
				1094
				1095	class HTTPCookieProcessor(BaseHandler):
				1096	def __init__(self, cookiejar=None):
				1097	import http.cookiejar
				1098	if cookiejar is None:
				1099	cookiejar = http.cookiejar.CookieJar()
				1100	self.cookiejar = cookiejar
				1101
				1102	def http_request(self, request):
				1103	self.cookiejar.add_cookie_header(request)
				1104	return request
				1105
				1106	def http_response(self, request, response):
				1107	self.cookiejar.extract_cookies(response, request)
				1108	return response
				1109
				1110	https_request = http_request
				1111	https_response = http_response
				1112
				1113	class UnknownHandler(BaseHandler):
				1114	def unknown_open(self, req):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1115	type = req.type
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1116	raise URLError('unknown url type: %s' % type)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1117
				1118	def parse_keqv_list(l):
				1119	"""Parse list of key=value strings where keys are not duplicated."""
				1120	parsed = {}
				1121	for elt in l:
				1122	k, v = elt.split('=', 1)
				1123	if v[0] == '"' and v[-1] == '"':
				1124	v = v[1:-1]
				1125	parsed[k] = v
				1126	return parsed
				1127
				1128	def parse_http_list(s):
				1129	"""Parse lists as described by RFC 2068 Section 2.
				1130
				1131	In particular, parse comma-separated lists where the elements of
				1132	the list may include quoted-strings. A quoted-string could
				1133	contain a comma. A non-quoted string could have quotes in the
				1134	middle. Neither commas nor quotes count if they are escaped.
				1135	Only double-quotes count, not single-quotes.
				1136	"""
				1137	res = []
				1138	part = ''
				1139
				1140	escape = quote = False
				1141	for cur in s:
				1142	if escape:
				1143	part += cur
				1144	escape = False
				1145	continue
				1146	if quote:
				1147	if cur == '\\':
				1148	escape = True
				1149	continue
				1150	elif cur == '"':
				1151	quote = False
				1152	part += cur
				1153	continue
				1154
				1155	if cur == ',':
				1156	res.append(part)
				1157	part = ''
				1158	continue
				1159
				1160	if cur == '"':
				1161	quote = True
				1162
				1163	part += cur
				1164
				1165	# append last part
				1166	if part:
				1167	res.append(part)
				1168
				1169	return [part.strip() for part in res]
				1170
				1171	class FileHandler(BaseHandler):
				1172	# Use local file or FTP depending on form of URL
				1173	def file_open(self, req):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1174	url = req.selector
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1175	if url[:2] == '//' and url[2:3] != '/':
				1176	req.type = 'ftp'
				1177	return self.parent.open(req)
				1178	else:
				1179	return self.open_local_file(req)
				1180
				1181	# names for the localhost
				1182	names = None
				1183	def get_names(self):
				1184	if FileHandler.names is None:
				1185	try:
				1186	FileHandler.names = (socket.gethostbyname('localhost'),
				1187	socket.gethostbyname(socket.gethostname()))
				1188	except socket.gaierror:
				1189	FileHandler.names = (socket.gethostbyname('localhost'),)
				1190	return FileHandler.names
				1191
				1192	# not entirely sure what the rules are here
				1193	def open_local_file(self, req):
				1194	import email.utils
				1195	import mimetypes
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1196	host = req.host
				1197	file = req.selector
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1198	localfile = url2pathname(file)
				1199	try:
				1200	stats = os.stat(localfile)
				1201	size = stats.st_size
				1202	modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
				1203	mtype = mimetypes.guess_type(file)[0]
				1204	headers = email.message_from_string(
				1205	'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
				1206	(mtype or 'text/plain', size, modified))
				1207	if host:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1208	host, port = splitport(host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1209	if not host or \
				1210	(not port and _safe_gethostbyname(host) in self.get_names()):
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1211	return addinfourl(open(localfile, 'rb'), headers, 'file:'+file)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1212	except OSError as msg:
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	1213	# users shouldn't expect OSErrors coming from urlopen()
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1214	raise URLError(msg)
				1215	raise URLError('file not on local host')
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1216
				1217	def _safe_gethostbyname(host):
				1218	try:
				1219	return socket.gethostbyname(host)
				1220	except socket.gaierror:
				1221	return None
				1222
				1223	class FTPHandler(BaseHandler):
				1224	def ftp_open(self, req):
				1225	import ftplib
				1226	import mimetypes
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1227	host = req.host
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1228	if not host:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1229	raise URLError('ftp error: no host given')
				1230	host, port = splitport(host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1231	if port is None:
				1232	port = ftplib.FTP_PORT
				1233	else:
				1234	port = int(port)
				1235
				1236	# username/password handling
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1237	user, host = splituser(host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1238	if user:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1239	user, passwd = splitpasswd(user)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1240	else:
				1241	passwd = None
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1242	host = unquote(host)
				1243	user = unquote(user or '')
				1244	passwd = unquote(passwd or '')
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1245
				1246	try:
				1247	host = socket.gethostbyname(host)
				1248	except socket.error as msg:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1249	raise URLError(msg)
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1250	path, attrs = splitattr(req.selector)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1251	dirs = path.split('/')
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1252	dirs = list(map(unquote, dirs))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1253	dirs, file = dirs[:-1], dirs[-1]
				1254	if dirs and not dirs[0]:
				1255	dirs = dirs[1:]
				1256	try:
				1257	fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
				1258	type = file and 'I' or 'D'
				1259	for attr in attrs:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1260	attr, value = splitvalue(attr)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1261	if attr.lower() == 'type' and \
				1262	value in ('a', 'A', 'i', 'I', 'd', 'D'):
				1263	type = value.upper()
				1264	fp, retrlen = fw.retrfile(file, type)
				1265	headers = ""
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1266	mtype = mimetypes.guess_type(req.full_url)[0]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1267	if mtype:
				1268	headers += "Content-type: %s\n" % mtype
				1269	if retrlen is not None and retrlen >= 0:
				1270	headers += "Content-length: %d\n" % retrlen
				1271	headers = email.message_from_string(headers)
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1272	return addinfourl(fp, headers, req.full_url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1273	except ftplib.all_errors as msg:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1274	exc = URLError('ftp error: %s' % msg)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1275	raise exc.with_traceback(sys.exc_info()[2])
				1276
				1277	def connect_ftp(self, user, passwd, host, port, dirs, timeout):
				1278	fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
				1279	return fw
				1280
				1281	class CacheFTPHandler(FTPHandler):
				1282	# XXX would be nice to have pluggable cache strategies
				1283	# XXX this stuff is definitely not thread safe
				1284	def __init__(self):
				1285	self.cache = {}
				1286	self.timeout = {}
				1287	self.soonest = 0
				1288	self.delay = 60
				1289	self.max_conns = 16
				1290
				1291	def setTimeout(self, t):
				1292	self.delay = t
				1293
				1294	def setMaxConns(self, m):
				1295	self.max_conns = m
				1296
				1297	def connect_ftp(self, user, passwd, host, port, dirs, timeout):
				1298	key = user, host, port, '/'.join(dirs), timeout
				1299	if key in self.cache:
				1300	self.timeout[key] = time.time() + self.delay
				1301	else:
				1302	self.cache[key] = ftpwrapper(user, passwd, host, port,
				1303	dirs, timeout)
				1304	self.timeout[key] = time.time() + self.delay
				1305	self.check_cache()
				1306	return self.cache[key]
				1307
				1308	def check_cache(self):
				1309	# first check for old ones
				1310	t = time.time()
				1311	if self.soonest <= t:
				1312	for k, v in list(self.timeout.items()):
				1313	if v < t:
				1314	self.cache[k].close()
				1315	del self.cache[k]
				1316	del self.timeout[k]
				1317	self.soonest = min(list(self.timeout.values()))
				1318
				1319	# then check the size
				1320	if len(self.cache) == self.max_conns:
				1321	for k, v in list(self.timeout.items()):
				1322	if v == self.soonest:
				1323	del self.cache[k]
				1324	del self.timeout[k]
				1325	break
				1326	self.soonest = min(list(self.timeout.values()))
				1327
# Code moved from the old urllib module
				1329
				1330	MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
				1331
				1332	# Helper for non-unix systems
				1333	if os.name == 'mac':
				1334	from macurl2path import url2pathname, pathname2url
				1335	elif os.name == 'nt':
				1336	from nturl2path import url2pathname, pathname2url
				1337	else:
				1338	def url2pathname(pathname):
				1339	"""OS-specific conversion from a relative URL of the 'file' scheme
				1340	to a file system path; not recommended for general use."""
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1341	return unquote(pathname)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1342
				1343	def pathname2url(pathname):
				1344	"""OS-specific conversion from a file system path to a relative URL
				1345	of the 'file' scheme; not recommended for general use."""
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1346	return quote(pathname)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1347
				1348	# This really consists of two pieces:
				1349	# (1) a class which handles opening of all sorts of URLs
				1350	# (plus assorted utilities etc.)
				1351	# (2) a set of functions for parsing URLs
				1352	# XXX Should these be separated out into different modules?
				1353
				1354
				1355	ftpcache = {}
				1356	class URLopener:
				1357	"""Class to open URLs.
				1358	This is a class rather than just a subroutine because we may need
				1359	more than one set of global protocol-specific options.
				1360	Note -- this is a base class for those who don't want the
				1361	automatic handling of errors type 302 (relocated) and 401
				1362	(authorization needed)."""
				1363
				1364	__tempfiles = None
				1365
				1366	version = "Python-urllib/%s" % __version__
				1367
				1368	# Constructor
				1369	def __init__(self, proxies=None, **x509):
				1370	if proxies is None:
				1371	proxies = getproxies()
				1372	assert hasattr(proxies, 'keys'), "proxies must be a mapping"
				1373	self.proxies = proxies
				1374	self.key_file = x509.get('key_file')
				1375	self.cert_file = x509.get('cert_file')
				1376	self.addheaders = [('User-Agent', self.version)]
				1377	self.__tempfiles = []
				1378	self.__unlink = os.unlink # See cleanup()
				1379	self.tempcache = None
				1380	# Undocumented feature: if you assign {} to tempcache,
				1381	# it is used to cache files retrieved with
				1382	# self.retrieve(). This is not enabled by default
				1383	# since it does not work for changing documents (and I
				1384	# haven't got the logic to check expiration headers
				1385	# yet).
				1386	self.ftpcache = ftpcache
				1387	# Undocumented feature: you can use a different
				1388	# ftp cache by assigning to the .ftpcache member;
				1389	# in case you want logically independent URL openers
				1390	# XXX This is not threadsafe. Bah.
				1391
				1392	def __del__(self):
				1393	self.close()
				1394
				1395	def close(self):
				1396	self.cleanup()
				1397
				1398	def cleanup(self):
				1399	# This code sometimes runs when the rest of this module
				1400	# has already been deleted, so it can't use any globals
				1401	# or import anything.
				1402	if self.__tempfiles:
				1403	for file in self.__tempfiles:
				1404	try:
				1405	self.__unlink(file)
				1406	except OSError:
				1407	pass
				1408	del self.__tempfiles[:]
				1409	if self.tempcache:
				1410	self.tempcache.clear()
				1411
				1412	def addheader(self, *args):
				1413	"""Add a header to be used by the HTTP interface only
				1414	e.g. u.addheader('Accept', 'sound/basic')"""
				1415	self.addheaders.append(args)
				1416
				1417	# External interface
				1418	def open(self, fullurl, data=None):
				1419	"""Use URLopener().open(file) instead of open(file, 'r')."""
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1420	fullurl = unwrap(to_bytes(fullurl))
Senthil Kumaran	690ce9b	2009-05-05 18:41:13 +0000	[diff] [blame]	1421	fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]")
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1422	if self.tempcache and fullurl in self.tempcache:
				1423	filename, headers = self.tempcache[fullurl]
				1424	fp = open(filename, 'rb')
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1425	return addinfourl(fp, headers, fullurl)
				1426	urltype, url = splittype(fullurl)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1427	if not urltype:
				1428	urltype = 'file'
				1429	if urltype in self.proxies:
				1430	proxy = self.proxies[urltype]
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1431	urltype, proxyhost = splittype(proxy)
				1432	host, selector = splithost(proxyhost)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1433	url = (host, fullurl) # Signal special case to open_*()
				1434	else:
				1435	proxy = None
				1436	name = 'open_' + urltype
				1437	self.type = urltype
				1438	name = name.replace('-', '_')
				1439	if not hasattr(self, name):
				1440	if proxy:
				1441	return self.open_unknown_proxy(proxy, fullurl, data)
				1442	else:
				1443	return self.open_unknown(fullurl, data)
				1444	try:
				1445	if data is None:
				1446	return getattr(self, name)(url)
				1447	else:
				1448	return getattr(self, name)(url, data)
				1449	except socket.error as msg:
				1450	raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
				1451
				1452	def open_unknown(self, fullurl, data=None):
				1453	"""Overridable interface to open unknown URL type."""
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1454	type, url = splittype(fullurl)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1455	raise IOError('url error', 'unknown url type', type)
				1456
				1457	def open_unknown_proxy(self, proxy, fullurl, data=None):
				1458	"""Overridable interface to open unknown URL type."""
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1459	type, url = splittype(fullurl)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1460	raise IOError('url error', 'invalid proxy for %s' % type, proxy)
				1461
				1462	# External interface
				1463	def retrieve(self, url, filename=None, reporthook=None, data=None):
				1464	"""retrieve(url) returns (filename, headers) for a local object
				1465	or (tempfilename, headers) for a remote object."""
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1466	url = unwrap(to_bytes(url))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1467	if self.tempcache and url in self.tempcache:
				1468	return self.tempcache[url]
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1469	type, url1 = splittype(url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1470	if filename is None and (not type or type == 'file'):
				1471	try:
				1472	fp = self.open_local_file(url1)
				1473	hdrs = fp.info()
Philip Jenvey	cb134d7	2009-12-03 02:45:01 +0000	[diff] [blame]	1474	fp.close()
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1475	return url2pathname(splithost(url1)[1]), hdrs
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1476	except IOError as msg:
				1477	pass
				1478	fp = self.open(url, data)
Benjamin Peterson	5f28b7b	2009-03-26 21:49:58 +0000	[diff] [blame]	1479	try:
				1480	headers = fp.info()
				1481	if filename:
				1482	tfp = open(filename, 'wb')
				1483	else:
				1484	import tempfile
				1485	garbage, path = splittype(url)
				1486	garbage, path = splithost(path or "")
				1487	path, garbage = splitquery(path or "")
				1488	path, garbage = splitattr(path or "")
				1489	suffix = os.path.splitext(path)[1]
				1490	(fd, filename) = tempfile.mkstemp(suffix)
				1491	self.__tempfiles.append(filename)
				1492	tfp = os.fdopen(fd, 'wb')
				1493	try:
				1494	result = filename, headers
				1495	if self.tempcache is not None:
				1496	self.tempcache[url] = result
				1497	bs = 1024*8
				1498	size = -1
				1499	read = 0
				1500	blocknum = 0
				1501	if reporthook:
				1502	if "content-length" in headers:
				1503	size = int(headers["Content-Length"])
				1504	reporthook(blocknum, bs, size)
				1505	while 1:
				1506	block = fp.read(bs)
				1507	if not block:
				1508	break
				1509	read += len(block)
				1510	tfp.write(block)
				1511	blocknum += 1
				1512	if reporthook:
				1513	reporthook(blocknum, bs, size)
				1514	finally:
				1515	tfp.close()
				1516	finally:
				1517	fp.close()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1518
				1519	# raise exception if actual size does not match content-length header
				1520	if size >= 0 and read < size:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1521	raise ContentTooShortError(
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1522	"retrieval incomplete: got only %i out of %i bytes"
				1523	% (read, size), result)
				1524
				1525	return result
				1526
				1527	# Each method named open_<type> knows how to open that type of URL
				1528
				1529	def _open_generic_http(self, connection_factory, url, data):
				1530	"""Make an HTTP connection using connection_class.
				1531
				1532	This is an internal method that should be called from
				1533	open_http() or open_https().
				1534
				1535	Arguments:
				1536	- connection_factory should take a host name and return an
				1537	HTTPConnection instance.
				1538	- url is the url to retrieval or a host, relative-path pair.
				1539	- data is payload for a POST request or None.
				1540	"""
				1541
				1542	user_passwd = None
				1543	proxy_passwd= None
				1544	if isinstance(url, str):
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1545	host, selector = splithost(url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1546	if host:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1547	user_passwd, host = splituser(host)
				1548	host = unquote(host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1549	realhost = host
				1550	else:
				1551	host, selector = url
				1552	# check whether the proxy contains authorization information
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1553	proxy_passwd, host = splituser(host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1554	# now we proceed with the url we want to obtain
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1555	urltype, rest = splittype(selector)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1556	url = rest
				1557	user_passwd = None
				1558	if urltype.lower() != 'http':
				1559	realhost = None
				1560	else:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1561	realhost, rest = splithost(rest)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1562	if realhost:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1563	user_passwd, realhost = splituser(realhost)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1564	if user_passwd:
				1565	selector = "%s://%s%s" % (urltype, realhost, rest)
				1566	if proxy_bypass(realhost):
				1567	host = realhost
				1568
				1569	#print "proxy via http:", host, selector
				1570	if not host: raise IOError('http error', 'no host given')
				1571
				1572	if proxy_passwd:
				1573	import base64
				1574	proxy_auth = base64.b64encode(proxy_passwd).strip()
				1575	else:
				1576	proxy_auth = None
				1577
				1578	if user_passwd:
				1579	import base64
				1580	auth = base64.b64encode(user_passwd).strip()
				1581	else:
				1582	auth = None
				1583	http_conn = connection_factory(host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1584	headers = {}
				1585	if proxy_auth:
				1586	headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
				1587	if auth:
				1588	headers["Authorization"] = "Basic %s" % auth
				1589	if realhost:
				1590	headers["Host"] = realhost
				1591	for header, value in self.addheaders:
				1592	headers[header] = value
				1593
				1594	if data is not None:
				1595	headers["Content-Type"] = "application/x-www-form-urlencoded"
				1596	http_conn.request("POST", selector, data, headers)
				1597	else:
				1598	http_conn.request("GET", selector, headers=headers)
				1599
				1600	try:
				1601	response = http_conn.getresponse()
				1602	except http.client.BadStatusLine:
				1603	# something went wrong with the HTTP status line
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1604	raise URLError("http protocol error: bad status line")
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1605
				1606	# According to RFC 2616, "2xx" code indicates that the client's
				1607	# request was successfully received, understood, and accepted.
				1608	if 200 <= response.status < 300:
Antoine Pitrou	b353c12	2009-02-11 00:39:14 +0000	[diff] [blame]	1609	return addinfourl(response, response.msg, "http:" + url,
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1610	response.status)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1611	else:
				1612	return self.http_error(
				1613	url, response.fp,
				1614	response.status, response.reason, response.msg, data)
				1615
				1616	def open_http(self, url, data=None):
				1617	"""Use HTTP protocol."""
				1618	return self._open_generic_http(http.client.HTTPConnection, url, data)
				1619
				1620	def http_error(self, url, fp, errcode, errmsg, headers, data=None):
				1621	"""Handle http errors.
				1622
				1623	Derived class can override this, or provide specific handlers
				1624	named http_error_DDD where DDD is the 3-digit error code."""
				1625	# First check if there's a specific handler for this error
				1626	name = 'http_error_%d' % errcode
				1627	if hasattr(self, name):
				1628	method = getattr(self, name)
				1629	if data is None:
				1630	result = method(url, fp, errcode, errmsg, headers)
				1631	else:
				1632	result = method(url, fp, errcode, errmsg, headers, data)
				1633	if result: return result
				1634	return self.http_error_default(url, fp, errcode, errmsg, headers)
				1635
				1636	def http_error_default(self, url, fp, errcode, errmsg, headers):
				1637	"""Default error handler: close the connection and raise IOError."""
				1638	void = fp.read()
				1639	fp.close()
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1640	raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1641
    if _have_ssl:
        def _https_connection(self, host):
            """Build an HTTPSConnection to host using this opener's key and
            certificate files."""
            tls_files = dict(key_file=self.key_file,
                             cert_file=self.cert_file)
            return http.client.HTTPSConnection(host, **tls_files)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            factory = self._https_connection
            return self._open_generic_http(factory, url, data)
				1651
				1652	def open_file(self, url):
				1653	"""Use local file or FTP depending on form of URL."""
				1654	if not isinstance(url, str):
				1655	raise URLError('file error', 'proxy support for file protocol currently not implemented')
				1656	if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
				1657	return self.open_ftp(url)
				1658	else:
				1659	return self.open_local_file(url)
				1660
				1661	def open_local_file(self, url):
				1662	"""Use local file."""
				1663	import mimetypes, email.utils
				1664	from io import StringIO
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1665	host, file = splithost(url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1666	localname = url2pathname(file)
				1667	try:
				1668	stats = os.stat(localname)
				1669	except OSError as e:
				1670	raise URLError(e.errno, e.strerror, e.filename)
				1671	size = stats.st_size
				1672	modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
				1673	mtype = mimetypes.guess_type(url)[0]
				1674	headers = email.message_from_string(
				1675	'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
				1676	(mtype or 'text/plain', size, modified))
				1677	if not host:
				1678	urlfile = file
				1679	if file[:1] == '/':
				1680	urlfile = 'file://' + file
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1681	return addinfourl(open(localname, 'rb'), headers, urlfile)
				1682	host, port = splitport(host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1683	if (not port
				1684	and socket.gethostbyname(host) in (localhost(), thishost())):
				1685	urlfile = file
				1686	if file[:1] == '/':
				1687	urlfile = 'file://' + file
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1688	return addinfourl(open(localname, 'rb'), headers, urlfile)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1689	raise URLError('local file error', 'not on local host')
				1690
    def open_ftp(self, url):
        """Use FTP protocol.

        Connections are kept in self.ftpcache, keyed by
        (user, host, port, directory).  Returns an addinfourl around the
        data stream with Content-Type / Content-Length headers when they
        can be determined.  Raises URLError for a proxy-style (non-string)
        url, a missing host, or any FTP failure.
        """
        if not isinstance(url, str):
            raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = splithost(url)
        if not host:
            raise URLError('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        if dirs and not dirs[0]:
            dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily.  Iterate over a snapshot
            # of the keys: deleting from a dict while iterating its live
            # keys() view raises RuntimeError.
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache.pop(k)
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            # Default transfer type: directory listing when no file name
            # was given, binary otherwise; a ;type= attribute overrides it.
            if not file:
                type = 'D'
            else:
                type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as msg:
            raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
				1749
				1750	def open_data(self, url, data=None):
				1751	"""Use "data" URL."""
				1752	if not isinstance(url, str):
				1753	raise URLError('data error', 'proxy support for data protocol currently not implemented')
				1754	# ignore POSTed data
				1755	#
				1756	# syntax of data URLs:
				1757	# dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
				1758	# mediatype := [ type "/" subtype ] *( ";" parameter )
				1759	# data := *urlchar
				1760	# parameter := attribute "=" value
				1761	try:
				1762	[type, data] = url.split(',', 1)
				1763	except ValueError:
				1764	raise IOError('data error', 'bad data URL')
				1765	if not type:
				1766	type = 'text/plain;charset=US-ASCII'
				1767	semi = type.rfind(';')
				1768	if semi >= 0 and '=' not in type[semi:]:
				1769	encoding = type[semi+1:]
				1770	type = type[:semi]
				1771	else:
				1772	encoding = ''
				1773	msg = []
				1774	msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
				1775	time.gmtime(time.time())))
				1776	msg.append('Content-type: %s' % type)
				1777	if encoding == 'base64':
				1778	import base64
Georg Brandl	706824f	2009-06-04 09:42:55 +0000	[diff] [blame]	1779	# XXX is this encoding/decoding ok?
				1780	data = base64.decodebytes(data.encode('ascii')).decode('latin1')
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1781	else:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1782	data = unquote(data)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1783	msg.append('Content-Length: %d' % len(data))
				1784	msg.append('')
				1785	msg.append(data)
				1786	msg = '\n'.join(msg)
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1787	headers = email.message_from_string(msg)
				1788	f = io.StringIO(msg)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1789	#f.fileno = None # needed for addinfourl
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1790	return addinfourl(f, headers, url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1791
				1792
				1793	class FancyURLopener(URLopener):
				1794	"""Derived class with handlers for errors we can handle (perhaps)."""
				1795
				1796	def __init__(self, args, *kwargs):
				1797	URLopener.__init__(self, args, *kwargs)
				1798	self.auth_cache = {}
				1799	self.tries = 0
				1800	self.maxtries = 10
				1801
				1802	def http_error_default(self, url, fp, errcode, errmsg, headers):
				1803	"""Default error handling -- don't raise an exception."""
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1804	return addinfourl(fp, headers, "http:" + url, errcode)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1805
				1806	def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
				1807	"""Error 302 -- relocated (temporarily)."""
				1808	self.tries += 1
				1809	if self.maxtries and self.tries >= self.maxtries:
				1810	if hasattr(self, "http_error_500"):
				1811	meth = self.http_error_500
				1812	else:
				1813	meth = self.http_error_default
				1814	self.tries = 0
				1815	return meth(url, fp, 500,
				1816	"Internal Server Error: Redirect Recursion", headers)
				1817	result = self.redirect_internal(url, fp, errcode, errmsg, headers,
				1818	data)
				1819	self.tries = 0
				1820	return result
				1821
				1822	def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
				1823	if 'location' in headers:
				1824	newurl = headers['location']
				1825	elif 'uri' in headers:
				1826	newurl = headers['uri']
				1827	else:
				1828	return
				1829	void = fp.read()
				1830	fp.close()
				1831	# In case the server sent a relative URL, join with original:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1832	newurl = urljoin(self.type + ":" + url, newurl)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1833	return self.open(newurl)
				1834
				1835	def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
				1836	"""Error 301 -- also relocated (permanently)."""
				1837	return self.http_error_302(url, fp, errcode, errmsg, headers, data)
				1838
				1839	def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
				1840	"""Error 303 -- also relocated (essentially identical to 302)."""
				1841	return self.http_error_302(url, fp, errcode, errmsg, headers, data)
				1842
				1843	def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
				1844	"""Error 307 -- relocated, but turn POST into error."""
				1845	if data is None:
				1846	return self.http_error_302(url, fp, errcode, errmsg, headers, data)
				1847	else:
				1848	return self.http_error_default(url, fp, errcode, errmsg, headers)
				1849
				1850	def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
				1851	"""Error 401 -- authentication required.
				1852	This function supports Basic authentication only."""
				1853	if not 'www-authenticate' in headers:
				1854	URLopener.http_error_default(self, url, fp,
				1855	errcode, errmsg, headers)
				1856	stuff = headers['www-authenticate']
				1857	import re
				1858	match = re.match('[ \t]([^ \t]+)[ \t]+realm="([^"])"', stuff)
				1859	if not match:
				1860	URLopener.http_error_default(self, url, fp,
				1861	errcode, errmsg, headers)
				1862	scheme, realm = match.groups()
				1863	if scheme.lower() != 'basic':
				1864	URLopener.http_error_default(self, url, fp,
				1865	errcode, errmsg, headers)
				1866	name = 'retry_' + self.type + '_basic_auth'
				1867	if data is None:
				1868	return getattr(self,name)(url, realm)
				1869	else:
				1870	return getattr(self,name)(url, realm, data)
				1871
				1872	def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
				1873	"""Error 407 -- proxy authentication required.
				1874	This function supports Basic authentication only."""
				1875	if not 'proxy-authenticate' in headers:
				1876	URLopener.http_error_default(self, url, fp,
				1877	errcode, errmsg, headers)
				1878	stuff = headers['proxy-authenticate']
				1879	import re
				1880	match = re.match('[ \t]([^ \t]+)[ \t]+realm="([^"])"', stuff)
				1881	if not match:
				1882	URLopener.http_error_default(self, url, fp,
				1883	errcode, errmsg, headers)
				1884	scheme, realm = match.groups()
				1885	if scheme.lower() != 'basic':
				1886	URLopener.http_error_default(self, url, fp,
				1887	errcode, errmsg, headers)
				1888	name = 'retry_proxy_' + self.type + '_basic_auth'
				1889	if data is None:
				1890	return getattr(self,name)(url, realm)
				1891	else:
				1892	return getattr(self,name)(url, realm, data)
				1893
				1894	def retry_proxy_http_basic_auth(self, url, realm, data=None):
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1895	host, selector = splithost(url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1896	newurl = 'http://' + host + selector
				1897	proxy = self.proxies['http']
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1898	urltype, proxyhost = splittype(proxy)
				1899	proxyhost, proxyselector = splithost(proxyhost)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1900	i = proxyhost.find('@') + 1
				1901	proxyhost = proxyhost[i:]
				1902	user, passwd = self.get_user_passwd(proxyhost, realm, i)
				1903	if not (user or passwd): return None
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1904	proxyhost = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1905	quote(passwd, safe=''), proxyhost)
				1906	self.proxies['http'] = 'http://' + proxyhost + proxyselector
				1907	if data is None:
				1908	return self.open(newurl)
				1909	else:
				1910	return self.open(newurl, data)
				1911
				1912	def retry_proxy_https_basic_auth(self, url, realm, data=None):
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1913	host, selector = splithost(url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1914	newurl = 'https://' + host + selector
				1915	proxy = self.proxies['https']
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1916	urltype, proxyhost = splittype(proxy)
				1917	proxyhost, proxyselector = splithost(proxyhost)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1918	i = proxyhost.find('@') + 1
				1919	proxyhost = proxyhost[i:]
				1920	user, passwd = self.get_user_passwd(proxyhost, realm, i)
				1921	if not (user or passwd): return None
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1922	proxyhost = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1923	quote(passwd, safe=''), proxyhost)
				1924	self.proxies['https'] = 'https://' + proxyhost + proxyselector
				1925	if data is None:
				1926	return self.open(newurl)
				1927	else:
				1928	return self.open(newurl, data)
				1929
				1930	def retry_http_basic_auth(self, url, realm, data=None):
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1931	host, selector = splithost(url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1932	i = host.find('@') + 1
				1933	host = host[i:]
				1934	user, passwd = self.get_user_passwd(host, realm, i)
				1935	if not (user or passwd): return None
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1936	host = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1937	quote(passwd, safe=''), host)
				1938	newurl = 'http://' + host + selector
				1939	if data is None:
				1940	return self.open(newurl)
				1941	else:
				1942	return self.open(newurl, data)
				1943
				1944	def retry_https_basic_auth(self, url, realm, data=None):
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1945	host, selector = splithost(url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1946	i = host.find('@') + 1
				1947	host = host[i:]
				1948	user, passwd = self.get_user_passwd(host, realm, i)
				1949	if not (user or passwd): return None
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1950	host = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1951	quote(passwd, safe=''), host)
				1952	newurl = 'https://' + host + selector
				1953	if data is None:
				1954	return self.open(newurl)
				1955	else:
				1956	return self.open(newurl, data)
				1957
				1958	def get_user_passwd(self, host, realm, clear_cache = 0):
				1959	key = realm + '@' + host.lower()
				1960	if key in self.auth_cache:
				1961	if clear_cache:
				1962	del self.auth_cache[key]
				1963	else:
				1964	return self.auth_cache[key]
				1965	user, passwd = self.prompt_user_passwd(host, realm)
				1966	if user or passwd: self.auth_cache[key] = (user, passwd)
				1967	return user, passwd
				1968
				1969	def prompt_user_passwd(self, host, realm):
				1970	"""Override this in a GUI environment!"""
				1971	import getpass
				1972	try:
				1973	user = input("Enter username for %s at %s: " % (realm, host))
				1974	passwd = getpass.getpass("Enter password for %s in %s at %s: " %
				1975	(user, realm, host))
				1976	return user, passwd
				1977	except KeyboardInterrupt:
				1978	print()
				1979	return None, None
				1980
				1981
				1982	# Utility functions
				1983
				1984	_localhost = None
				1985	def localhost():
				1986	"""Return the IP address of the magic hostname 'localhost'."""
				1987	global _localhost
				1988	if _localhost is None:
				1989	_localhost = socket.gethostbyname('localhost')
				1990	return _localhost
				1991
				1992	_thishost = None
				1993	def thishost():
				1994	"""Return the IP address of the current host."""
				1995	global _thishost
				1996	if _thishost is None:
				1997	_thishost = socket.gethostbyname(socket.gethostname())
				1998	return _thishost
				1999
				2000	_ftperrors = None
				2001	def ftperrors():
				2002	"""Return the set of errors raised by the FTP class."""
				2003	global _ftperrors
				2004	if _ftperrors is None:
				2005	import ftplib
				2006	_ftperrors = ftplib.all_errors
				2007	return _ftperrors
				2008
				2009	_noheaders = None
				2010	def noheaders():
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	2011	"""Return an empty email Message object."""
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2012	global _noheaders
				2013	if _noheaders is None:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	2014	_noheaders = email.message_from_string("")
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2015	return _noheaders
				2016
				2017
				2018	# Utility classes
				2019
				2020	class ftpwrapper:
				2021	"""Class used by open_ftp() for cache of open FTP connections."""
				2022
				2023	def __init__(self, user, passwd, host, port, dirs, timeout=None):
				2024	self.user = user
				2025	self.passwd = passwd
				2026	self.host = host
				2027	self.port = port
				2028	self.dirs = dirs
				2029	self.timeout = timeout
				2030	self.init()
				2031
				2032	def init(self):
				2033	import ftplib
				2034	self.busy = 0
				2035	self.ftp = ftplib.FTP()
				2036	self.ftp.connect(self.host, self.port, self.timeout)
				2037	self.ftp.login(self.user, self.passwd)
				2038	for dir in self.dirs:
				2039	self.ftp.cwd(dir)
				2040
				2041	def retrfile(self, file, type):
				2042	import ftplib
				2043	self.endtransfer()
				2044	if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
				2045	else: cmd = 'TYPE ' + type; isdir = 0
				2046	try:
				2047	self.ftp.voidcmd(cmd)
				2048	except ftplib.all_errors:
				2049	self.init()
				2050	self.ftp.voidcmd(cmd)
				2051	conn = None
				2052	if file and not isdir:
				2053	# Try to retrieve as a file
				2054	try:
				2055	cmd = 'RETR ' + file
				2056	conn = self.ftp.ntransfercmd(cmd)
				2057	except ftplib.error_perm as reason:
				2058	if str(reason)[:3] != '550':
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	2059	raise URLError('ftp error', reason).with_traceback(
				2060	sys.exc_info()[2])
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2061	if not conn:
				2062	# Set transfer mode to ASCII!
				2063	self.ftp.voidcmd('TYPE A')
				2064	# Try a directory listing. Verify that directory exists.
				2065	if file:
				2066	pwd = self.ftp.pwd()
				2067	try:
				2068	try:
				2069	self.ftp.cwd(file)
				2070	except ftplib.error_perm as reason:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	2071	raise URLError('ftp error', reason) from reason
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2072	finally:
				2073	self.ftp.cwd(pwd)
				2074	cmd = 'LIST ' + file
				2075	else:
				2076	cmd = 'LIST'
				2077	conn = self.ftp.ntransfercmd(cmd)
				2078	self.busy = 1
				2079	# Pass back both a suitably decorated object and a retrieval length
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	2080	return (addclosehook(conn[0].makefile('rb'), self.endtransfer), conn[1])
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2081	def endtransfer(self):
				2082	if not self.busy:
				2083	return
				2084	self.busy = 0
				2085	try:
				2086	self.ftp.voidresp()
				2087	except ftperrors():
				2088	pass
				2089
				2090	def close(self):
				2091	self.endtransfer()
				2092	try:
				2093	self.ftp.close()
				2094	except ftperrors():
				2095	pass
				2096
				2097	# Proxy handling
				2098	def getproxies_environment():
				2099	"""Return a dictionary of scheme -> proxy server URL mappings.
				2100
				2101	Scan the environment for variables named <scheme>_proxy;
				2102	this seems to be the standard convention. If you need a
				2103	different way, you can pass a proxies dictionary to the
				2104	[Fancy]URLopener constructor.
				2105
				2106	"""
				2107	proxies = {}
				2108	for name, value in os.environ.items():
				2109	name = name.lower()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2110	if value and name[-6:] == '_proxy':
				2111	proxies[name[:-6]] = value
				2112	return proxies
				2113
				2114	def proxy_bypass_environment(host):
				2115	"""Test if proxies should not be used for a particular host.
				2116
				2117	Checks the environment for a variable named no_proxy, which should
				2118	be a list of DNS suffixes separated by commas, or '*' for all hosts.
				2119	"""
				2120	no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
				2121	# '*' is special case for always bypass
				2122	if no_proxy == '*':
				2123	return 1
				2124	# strip port off host
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	2125	hostonly, port = splitport(host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2126	# check if the host ends with any of the DNS suffixes
				2127	for name in no_proxy.split(','):
				2128	if name and (hostonly.endswith(name) or host.endswith(name)):
				2129	return 1
				2130	# otherwise, don't bypass
				2131	return 0
				2132
				2133
				2134	if sys.platform == 'darwin':
				2135	def getproxies_internetconfig():
				2136	"""Return a dictionary of scheme -> proxy server URL mappings.
				2137
				2138	By convention the mac uses Internet Config to store
				2139	proxies. An HTTP proxy, for instance, is stored under
				2140	the HttpProxy key.
				2141
				2142	"""
				2143	try:
				2144	import ic
				2145	except ImportError:
				2146	return {}
				2147
				2148	try:
				2149	config = ic.IC()
				2150	except ic.error:
				2151	return {}
				2152	proxies = {}
				2153	# HTTP:
				2154	if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
				2155	try:
				2156	value = config['HTTPProxyHost']
				2157	except ic.error:
				2158	pass
				2159	else:
				2160	proxies['http'] = 'http://%s' % value
				2161	# FTP: XXX To be done.
				2162	# Gopher: XXX To be done.
				2163	return proxies
				2164
				2165	def proxy_bypass(host):
				2166	if getproxies_environment():
				2167	return proxy_bypass_environment(host)
				2168	else:
				2169	return 0
				2170
				2171	def getproxies():
				2172	return getproxies_environment() or getproxies_internetconfig()
				2173
				2174	elif os.name == 'nt':
				2175	def getproxies_registry():
				2176	"""Return a dictionary of scheme -> proxy server URL mappings.
				2177
				2178	Win32 uses the registry to store proxies.
				2179
				2180	"""
				2181	proxies = {}
				2182	try:
Georg Brandl	4ed72ac	2009-04-01 04:28:33 +0000	[diff] [blame]	2183	import winreg
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2184	except ImportError:
				2185	# Std module, so should be around - but you never know!
				2186	return proxies
				2187	try:
Georg Brandl	4ed72ac	2009-04-01 04:28:33 +0000	[diff] [blame]	2188	internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2189	r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl	4ed72ac	2009-04-01 04:28:33 +0000	[diff] [blame]	2190	proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2191	'ProxyEnable')[0]
				2192	if proxyEnable:
				2193	# Returned as Unicode but problems if not converted to ASCII
Georg Brandl	4ed72ac	2009-04-01 04:28:33 +0000	[diff] [blame]	2194	proxyServer = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2195	'ProxyServer')[0])
				2196	if '=' in proxyServer:
				2197	# Per-protocol settings
				2198	for p in proxyServer.split(';'):
				2199	protocol, address = p.split('=', 1)
				2200	# See if address has a type:// prefix
				2201	import re
				2202	if not re.match('^([^/:]+)://', address):
				2203	address = '%s://%s' % (protocol, address)
				2204	proxies[protocol] = address
				2205	else:
				2206	# Use one setting for all protocols
				2207	if proxyServer[:5] == 'http:':
				2208	proxies['http'] = proxyServer
				2209	else:
				2210	proxies['http'] = 'http://%s' % proxyServer
				2211	proxies['ftp'] = 'ftp://%s' % proxyServer
				2212	internetSettings.Close()
				2213	except (WindowsError, ValueError, TypeError):
				2214	# Either registry key not found etc, or the value in an
				2215	# unexpected format.
				2216	# proxies already set up to be empty so nothing to do
				2217	pass
				2218	return proxies
				2219
				2220	def getproxies():
				2221	"""Return a dictionary of scheme -> proxy server URL mappings.
				2222
				2223	Returns settings gathered from the environment, if specified,
				2224	or the registry.
				2225
				2226	"""
				2227	return getproxies_environment() or getproxies_registry()
				2228
				2229	def proxy_bypass_registry(host):
				2230	try:
Georg Brandl	4ed72ac	2009-04-01 04:28:33 +0000	[diff] [blame]	2231	import winreg
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2232	import re
				2233	except ImportError:
				2234	# Std modules, so should be around - but you never know!
				2235	return 0
				2236	try:
Georg Brandl	4ed72ac	2009-04-01 04:28:33 +0000	[diff] [blame]	2237	internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2238	r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl	4ed72ac	2009-04-01 04:28:33 +0000	[diff] [blame]	2239	proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2240	'ProxyEnable')[0]
Georg Brandl	4ed72ac	2009-04-01 04:28:33 +0000	[diff] [blame]	2241	proxyOverride = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2242	'ProxyOverride')[0])
				2243	# ^^^^ Returned as Unicode but problems if not converted to ASCII
				2244	except WindowsError:
				2245	return 0
				2246	if not proxyEnable or not proxyOverride:
				2247	return 0
				2248	# try to make a host list from name and IP address.
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	2249	rawHost, port = splitport(host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2250	host = [rawHost]
				2251	try:
				2252	addr = socket.gethostbyname(rawHost)
				2253	if addr != rawHost:
				2254	host.append(addr)
				2255	except socket.error:
				2256	pass
				2257	try:
				2258	fqdn = socket.getfqdn(rawHost)
				2259	if fqdn != rawHost:
				2260	host.append(fqdn)
				2261	except socket.error:
				2262	pass
				2263	# make a check value list from the registry entry: replace the
				2264	# '<local>' string by the localhost entry and the corresponding
				2265	# canonical entry.
				2266	proxyOverride = proxyOverride.split(';')
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2267	# now check if we match one of the registry values.
				2268	for test in proxyOverride:
Senthil Kumaran	4947606	2009-05-01 06:00:23 +0000	[diff] [blame]	2269	if test == '<local>':
				2270	if '.' not in rawHost:
				2271	return 1
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2272	test = test.replace(".", r"\.") # mask dots
				2273	test = test.replace("", r".") # change glob sequence
				2274	test = test.replace("?", r".") # change glob char
				2275	for val in host:
				2276	# print "%s <--> %s" %( test, val )
				2277	if re.match(test, val, re.I):
				2278	return 1
				2279	return 0
				2280
				2281	def proxy_bypass(host):
				2282	"""Return a dictionary of scheme -> proxy server URL mappings.
				2283
				2284	Returns settings gathered from the environment, if specified,
				2285	or the registry.
				2286
				2287	"""
				2288	if getproxies_environment():
				2289	return proxy_bypass_environment(host)
				2290	else:
				2291	return proxy_bypass_registry(host)
				2292
				2293	else:
# By default use environment variables: on platforms that are neither
# darwin nor nt, the public names are plain aliases of the
# environment-based implementations.
getproxies = getproxies_environment
proxy_bypass = proxy_bypass_environment