Blame - Lib/urllib/request.py - platform/external/python/cpython2

blob: d669aec54e1f4f15d1ff9916469d6b35d296f6f7 [file] [log] [blame]

Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1	"""An extensible library for opening URLs using a variety of protocols
				2
				3	The simplest way to use this module is to call the urlopen function,
				4	which accepts a string containing a URL or a Request object (described
				5	below). It opens the URL and returns the results as a file-like
				6	object; the returned object has some extra methods described below.
				7
				8	The OpenerDirector manages a collection of Handler objects that do
				9	all the actual work. Each Handler implements a particular protocol or
				10	option. The OpenerDirector is a composite object that invokes the
				11	Handlers needed to open the requested URL. For example, the
				12	HTTPHandler performs HTTP GET and POST requests and deals with
				13	non-error returns. The HTTPRedirectHandler automatically deals with
				14	HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
				15	deals with digest authentication.
				16
				17	urlopen(url, data=None) -- Basic usage is the same as original
				18	urllib. pass the url and optionally data to post to an HTTP URL, and
				19	get a file-like object back. One difference is that you can also pass
				20	a Request instance instead of URL. Raises a URLError (subclass of
				21	IOError); for HTTP errors, raises an HTTPError, which can also be
				22	treated as a valid response.
				23
				24	build_opener -- Function that creates a new OpenerDirector instance.
				25	Will install the default handlers. Accepts one or more Handlers as
				26	arguments, either instances or Handler classes that it will
				27	instantiate. If one of the arguments is a subclass of the default
				28	handler, the argument will be installed instead of the default.
				29
				30	install_opener -- Installs a new opener as the default opener.
				31
				32	objects of interest:
Senthil Kumaran	1107c5d	2009-11-15 06:20:55 +0000	[diff] [blame]	33
				34	OpenerDirector -- Sets up the User-Agent as the Python-urllib and manages the
				35	Handler classes while dealing with both requests and responses.
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	36
				37	Request -- An object that encapsulates the state of a request. The
				38	state can be as simple as the URL. It can also include extra HTTP
				39	headers, e.g. a User-Agent.
				40
				41	BaseHandler --
				42
				43	internals:
				44	BaseHandler and parent
				45	_call_chain conventions
				46
				47	Example usage:
				48
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	49	import urllib.request
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	50
				51	# set up authentication info
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	52	authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	53	authinfo.add_password(realm='PDQ Application',
				54	uri='https://mahler:8092/site-updates.py',
				55	user='klem',
				56	passwd='geheim$parole')
				57
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	58	proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	59
				60	# build a new opener that adds authentication and caching FTP handlers
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	61	opener = urllib.request.build_opener(proxy_support, authinfo,
				62	urllib.request.CacheFTPHandler)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	63
				64	# install it
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	65	urllib.request.install_opener(opener)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	66
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	67	f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	68	"""
				69
				70	# XXX issues:
				71	# If an authentication error handler tries to perform
				72	# authentication for some reason but fails, how should the error be
				73	# signalled? The client needs to know the HTTP error code. But if
				74	# the handler knows that the problem was, e.g., that it didn't know
				75	# the hash algo that was requested in the challenge, it would be good to
				76	# pass that information along to the client, too.
				77	# ftp errors aren't handled cleanly
				78	# check digest against correct (i.e. non-apache) implementation
				79
				80	# Possible extensions:
				81	# complex proxies XXX not sure what exactly was meant by this
				82	# abstract factory for opener
				83
				84	import base64
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	85	import bisect
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	86	import email
				87	import hashlib
				88	import http.client
				89	import io
				90	import os
				91	import posixpath
				92	import random
				93	import re
				94	import socket
				95	import sys
				96	import time
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	97
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	98	from urllib.error import URLError, HTTPError, ContentTooShortError
				99	from urllib.parse import (
				100	urlparse, urlsplit, urljoin, unwrap, quote, unquote,
				101	splittype, splithost, splitport, splituser, splitpasswd,
Facundo Batista	f24802c	2008-08-17 03:36:03 +0000	[diff] [blame]	102	splitattr, splitquery, splitvalue, to_bytes, urlunparse)
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	103	from urllib.response import addinfourl, addclosehook
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	104
				105	# check for SSL
				106	try:
				107	import ssl
				108	except:
				109	_have_ssl = False
				110	else:
				111	_have_ssl = True
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	112
				113	# used in User-Agent header sent
				114	__version__ = sys.version[:3]
				115
				116	_opener = None
				117	def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
				118	global _opener
				119	if _opener is None:
				120	_opener = build_opener()
				121	return _opener.open(url, data, timeout)
				122
				123	def install_opener(opener):
				124	global _opener
				125	_opener = opener
				126
				127	# TODO(jhylton): Make this work with the same global opener.
				128	_urlopener = None
				129	def urlretrieve(url, filename=None, reporthook=None, data=None):
				130	global _urlopener
				131	if not _urlopener:
				132	_urlopener = FancyURLopener()
				133	return _urlopener.retrieve(url, filename, reporthook, data)
				134
				135	def urlcleanup():
				136	if _urlopener:
				137	_urlopener.cleanup()
				138	global _opener
				139	if _opener:
				140	_opener = None
				141
				142	# copied from cookielib.py
Antoine Pitrou	fd03645	2008-08-19 17:56:33 +0000	[diff] [blame]	143	_cut_port_re = re.compile(r":\d+$", re.ASCII)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	144	def request_host(request):
				145	"""Return request-host, as defined by RFC 2965.
				146
				147	Variation from RFC: returned value is lowercased, for convenient
				148	comparison.
				149
				150	"""
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	151	url = request.full_url
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	152	host = urlparse(url)[1]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	153	if host == "":
				154	host = request.get_header("Host", "")
				155
				156	# remove port, if present
				157	host = _cut_port_re.sub("", host, 1)
				158	return host.lower()
				159
				160	class Request:
				161
				162	def __init__(self, url, data=None, headers={},
				163	origin_req_host=None, unverifiable=False):
				164	# unwrap('<URL:type://host/path>') --> 'type://host/path'
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	165	self.full_url = unwrap(url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	166	self.data = data
				167	self.headers = {}
Senthil Kumaran	97f0c6b	2009-07-25 04:24:38 +0000	[diff] [blame]	168	self._tunnel_host = None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	169	for key, value in headers.items():
				170	self.add_header(key, value)
				171	self.unredirected_hdrs = {}
				172	if origin_req_host is None:
				173	origin_req_host = request_host(self)
				174	self.origin_req_host = origin_req_host
				175	self.unverifiable = unverifiable
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	176	self._parse()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	177
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	178	def _parse(self):
				179	self.type, rest = splittype(self.full_url)
				180	if self.type is None:
				181	raise ValueError("unknown url type: %s" % self.full_url)
				182	self.host, self.selector = splithost(rest)
				183	if self.host:
				184	self.host = unquote(self.host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	185
				186	def get_method(self):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	187	if self.data is not None:
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	188	return "POST"
				189	else:
				190	return "GET"
				191
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	192	# Begin deprecated methods
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	193
				194	def add_data(self, data):
				195	self.data = data
				196
				197	def has_data(self):
				198	return self.data is not None
				199
				200	def get_data(self):
				201	return self.data
				202
				203	def get_full_url(self):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	204	return self.full_url
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	205
				206	def get_type(self):
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	207	return self.type
				208
				209	def get_host(self):
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	210	return self.host
				211
				212	def get_selector(self):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	213	return self.selector
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	214
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	215	def is_unverifiable(self):
				216	return self.unverifiable
Facundo Batista	72dc1ea	2008-08-16 14:44:32 +0000	[diff] [blame]	217
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	218	def get_origin_req_host(self):
				219	return self.origin_req_host
				220
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	221	# End deprecated methods
				222
				223	def set_proxy(self, host, type):
Senthil Kumaran	97f0c6b	2009-07-25 04:24:38 +0000	[diff] [blame]	224	if self.type == 'https' and not self._tunnel_host:
				225	self._tunnel_host = self.host
				226	else:
				227	self.type= type
				228	self.selector = self.full_url
				229	self.host = host
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	230
				231	def has_proxy(self):
				232	return self.selector == self.full_url
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	233
				234	def add_header(self, key, val):
				235	# useful for something like authentication
				236	self.headers[key.capitalize()] = val
				237
				238	def add_unredirected_header(self, key, val):
				239	# will not be added to a redirected request
				240	self.unredirected_hdrs[key.capitalize()] = val
				241
				242	def has_header(self, header_name):
				243	return (header_name in self.headers or
				244	header_name in self.unredirected_hdrs)
				245
				246	def get_header(self, header_name, default=None):
				247	return self.headers.get(
				248	header_name,
				249	self.unredirected_hdrs.get(header_name, default))
				250
				251	def header_items(self):
				252	hdrs = self.unredirected_hdrs.copy()
				253	hdrs.update(self.headers)
				254	return list(hdrs.items())
				255
				256	class OpenerDirector:
				257	def __init__(self):
				258	client_version = "Python-urllib/%s" % __version__
				259	self.addheaders = [('User-agent', client_version)]
				260	# manage the individual handlers
				261	self.handlers = []
				262	self.handle_open = {}
				263	self.handle_error = {}
				264	self.process_response = {}
				265	self.process_request = {}
				266
				267	def add_handler(self, handler):
				268	if not hasattr(handler, "add_parent"):
				269	raise TypeError("expected BaseHandler instance, got %r" %
				270	type(handler))
				271
				272	added = False
				273	for meth in dir(handler):
				274	if meth in ["redirect_request", "do_open", "proxy_open"]:
				275	# oops, coincidental match
				276	continue
				277
				278	i = meth.find("_")
				279	protocol = meth[:i]
				280	condition = meth[i+1:]
				281
				282	if condition.startswith("error"):
				283	j = condition.find("_") + i + 1
				284	kind = meth[j+1:]
				285	try:
				286	kind = int(kind)
				287	except ValueError:
				288	pass
				289	lookup = self.handle_error.get(protocol, {})
				290	self.handle_error[protocol] = lookup
				291	elif condition == "open":
				292	kind = protocol
				293	lookup = self.handle_open
				294	elif condition == "response":
				295	kind = protocol
				296	lookup = self.process_response
				297	elif condition == "request":
				298	kind = protocol
				299	lookup = self.process_request
				300	else:
				301	continue
				302
				303	handlers = lookup.setdefault(kind, [])
				304	if handlers:
				305	bisect.insort(handlers, handler)
				306	else:
				307	handlers.append(handler)
				308	added = True
				309
				310	if added:
				311	# the handlers must work in an specific order, the order
				312	# is specified in a Handler attribute
				313	bisect.insort(self.handlers, handler)
				314	handler.add_parent(self)
				315
				316	def close(self):
				317	# Only exists for backwards compatibility.
				318	pass
				319
				320	def _call_chain(self, chain, kind, meth_name, *args):
				321	# Handlers raise an exception if no one else should try to handle
				322	# the request, or return None if they can't but another handler
				323	# could. Otherwise, they return the response.
				324	handlers = chain.get(kind, ())
				325	for handler in handlers:
				326	func = getattr(handler, meth_name)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	327	result = func(*args)
				328	if result is not None:
				329	return result
				330
				331	def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
				332	# accept a URL or a Request object
				333	if isinstance(fullurl, str):
				334	req = Request(fullurl, data)
				335	else:
				336	req = fullurl
				337	if data is not None:
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	338	req.data = data
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	339
				340	req.timeout = timeout
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	341	protocol = req.type
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	342
				343	# pre-process request
				344	meth_name = protocol+"_request"
				345	for processor in self.process_request.get(protocol, []):
				346	meth = getattr(processor, meth_name)
				347	req = meth(req)
				348
				349	response = self._open(req, data)
				350
				351	# post-process response
				352	meth_name = protocol+"_response"
				353	for processor in self.process_response.get(protocol, []):
				354	meth = getattr(processor, meth_name)
				355	response = meth(req, response)
				356
				357	return response
				358
				359	def _open(self, req, data=None):
				360	result = self._call_chain(self.handle_open, 'default',
				361	'default_open', req)
				362	if result:
				363	return result
				364
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	365	protocol = req.type
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	366	result = self._call_chain(self.handle_open, protocol, protocol +
				367	'_open', req)
				368	if result:
				369	return result
				370
				371	return self._call_chain(self.handle_open, 'unknown',
				372	'unknown_open', req)
				373
				374	def error(self, proto, *args):
				375	if proto in ('http', 'https'):
				376	# XXX http[s] protocols are special-cased
				377	dict = self.handle_error['http'] # https is not different than http
				378	proto = args[2] # YUCK!
				379	meth_name = 'http_error_%s' % proto
				380	http_err = 1
				381	orig_args = args
				382	else:
				383	dict = self.handle_error
				384	meth_name = proto + '_error'
				385	http_err = 0
				386	args = (dict, proto, meth_name) + args
				387	result = self._call_chain(*args)
				388	if result:
				389	return result
				390
				391	if http_err:
				392	args = (dict, 'default', 'http_error_default') + orig_args
				393	return self._call_chain(*args)
				394
				395	# XXX probably also want an abstract factory that knows when it makes
				396	# sense to skip a superclass in favor of a subclass and when it might
				397	# make sense to include both
				398
				399	def build_opener(*handlers):
				400	"""Create an opener object from a list of handlers.
				401
				402	The opener will use several default handlers, including support
Senthil Kumaran	1107c5d	2009-11-15 06:20:55 +0000	[diff] [blame]	403	for HTTP, FTP and when applicable HTTPS.
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	404
				405	If any of the handlers passed as arguments are subclasses of the
				406	default handlers, the default handlers will not be used.
				407	"""
				408	def isclass(obj):
				409	return isinstance(obj, type) or hasattr(obj, "__bases__")
				410
				411	opener = OpenerDirector()
				412	default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
				413	HTTPDefaultErrorHandler, HTTPRedirectHandler,
				414	FTPHandler, FileHandler, HTTPErrorProcessor]
				415	if hasattr(http.client, "HTTPSConnection"):
				416	default_classes.append(HTTPSHandler)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	417	skip = set()
				418	for klass in default_classes:
				419	for check in handlers:
				420	if isclass(check):
				421	if issubclass(check, klass):
				422	skip.add(klass)
				423	elif isinstance(check, klass):
				424	skip.add(klass)
				425	for klass in skip:
				426	default_classes.remove(klass)
				427
				428	for klass in default_classes:
				429	opener.add_handler(klass())
				430
				431	for h in handlers:
				432	if isclass(h):
				433	h = h()
				434	opener.add_handler(h)
				435	return opener
				436
				437	class BaseHandler:
				438	handler_order = 500
				439
				440	def add_parent(self, parent):
				441	self.parent = parent
				442
				443	def close(self):
				444	# Only exists for backwards compatibility
				445	pass
				446
				447	def __lt__(self, other):
				448	if not hasattr(other, "handler_order"):
				449	# Try to preserve the old behavior of having custom classes
				450	# inserted after default ones (works only for custom user
				451	# classes which are not aware of handler_order).
				452	return True
				453	return self.handler_order < other.handler_order
				454
				455
				456	class HTTPErrorProcessor(BaseHandler):
				457	"""Process HTTP error responses."""
				458	handler_order = 1000 # after all other processing
				459
				460	def http_response(self, request, response):
				461	code, msg, hdrs = response.code, response.msg, response.info()
				462
				463	# According to RFC 2616, "2xx" code indicates that the client's
				464	# request was successfully received, understood, and accepted.
				465	if not (200 <= code < 300):
				466	response = self.parent.error(
				467	'http', request, response, code, msg, hdrs)
				468
				469	return response
				470
				471	https_response = http_response
				472
				473	class HTTPDefaultErrorHandler(BaseHandler):
				474	def http_error_default(self, req, fp, code, msg, hdrs):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	475	raise HTTPError(req.full_url, code, msg, hdrs, fp)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	476
				477	class HTTPRedirectHandler(BaseHandler):
				478	# maximum number of redirections to any single URL
				479	# this is needed because of the state that cookies introduce
				480	max_repeats = 4
				481	# maximum total number of redirections (regardless of URL) before
				482	# assuming we're in a loop
				483	max_redirections = 10
				484
				485	def redirect_request(self, req, fp, code, msg, headers, newurl):
				486	"""Return a Request or None in response to a redirect.
				487
				488	This is called by the http_error_30x methods when a
				489	redirection response is received. If a redirection should
				490	take place, return a new Request to allow http_error_30x to
				491	perform the redirect. Otherwise, raise HTTPError if no-one
				492	else should try to handle this url. Return None if you can't
				493	but another Handler might.
				494	"""
				495	m = req.get_method()
				496	if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
				497	or code in (301, 302, 303) and m == "POST")):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	498	raise HTTPError(req.full_url, code, msg, headers, fp)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	499
				500	# Strictly (according to RFC 2616), 301 or 302 in response to
				501	# a POST MUST NOT cause a redirection without confirmation
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	502	# from the user (of urllib.request, in this case). In practice,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	503	# essentially all clients do redirect in this case, so we do
				504	# the same.
				505	# be conciliant with URIs containing a space
				506	newurl = newurl.replace(' ', '%20')
				507	CONTENT_HEADERS = ("content-length", "content-type")
				508	newheaders = dict((k, v) for k, v in req.headers.items()
				509	if k.lower() not in CONTENT_HEADERS)
				510	return Request(newurl,
				511	headers=newheaders,
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	512	origin_req_host=req.origin_req_host,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	513	unverifiable=True)
				514
				515	# Implementation note: To avoid the server sending us into an
				516	# infinite loop, the request object needs to track what URLs we
				517	# have already seen. Do this by adding a handler-specific
				518	# attribute to the Request object.
				519	def http_error_302(self, req, fp, code, msg, headers):
				520	# Some servers (incorrectly) return multiple Location headers
				521	# (so probably same goes for URI). Use first header.
				522	if "location" in headers:
				523	newurl = headers["location"]
				524	elif "uri" in headers:
				525	newurl = headers["uri"]
				526	else:
				527	return
Facundo Batista	f24802c	2008-08-17 03:36:03 +0000	[diff] [blame]	528
				529	# fix a possible malformed URL
				530	urlparts = urlparse(newurl)
				531	if not urlparts.path:
				532	urlparts = list(urlparts)
				533	urlparts[2] = "/"
				534	newurl = urlunparse(urlparts)
				535
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	536	newurl = urljoin(req.full_url, newurl)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	537
				538	# XXX Probably want to forget about the state of the current
				539	# request, although that might interact poorly with other
				540	# handlers that also use handler-specific request attributes
				541	new = self.redirect_request(req, fp, code, msg, headers, newurl)
				542	if new is None:
				543	return
				544
				545	# loop detection
				546	# .redirect_dict has a key url if url was previously visited.
				547	if hasattr(req, 'redirect_dict'):
				548	visited = new.redirect_dict = req.redirect_dict
				549	if (visited.get(newurl, 0) >= self.max_repeats or
				550	len(visited) >= self.max_redirections):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	551	raise HTTPError(req.full_url, code,
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	552	self.inf_msg + msg, headers, fp)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	553	else:
				554	visited = new.redirect_dict = req.redirect_dict = {}
				555	visited[newurl] = visited.get(newurl, 0) + 1
				556
				557	# Don't close the fp until we are sure that we won't use it
				558	# with HTTPError.
				559	fp.read()
				560	fp.close()
				561
Senthil Kumaran	fb8cc2f	2009-07-19 02:44:19 +0000	[diff] [blame]	562	return self.parent.open(new, timeout=req.timeout)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	563
				564	http_error_301 = http_error_303 = http_error_307 = http_error_302
				565
				566	inf_msg = "The HTTP server returned a redirect error that would " \
				567	"lead to an infinite loop.\n" \
				568	"The last 30x error message was:\n"
				569
				570
				571	def _parse_proxy(proxy):
				572	"""Return (scheme, user, password, host/port) given a URL or an authority.
				573
				574	If a URL is supplied, it must have an authority (host:port) component.
				575	According to RFC 3986, having an authority component means the URL must
				576	have two slashes after the scheme:
				577
				578	>>> _parse_proxy('file:/ftp.example.com/')
				579	Traceback (most recent call last):
				580	ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
				581
				582	The first three items of the returned tuple may be None.
				583
				584	Examples of authority parsing:
				585
				586	>>> _parse_proxy('proxy.example.com')
				587	(None, None, None, 'proxy.example.com')
				588	>>> _parse_proxy('proxy.example.com:3128')
				589	(None, None, None, 'proxy.example.com:3128')
				590
				591	The authority component may optionally include userinfo (assumed to be
				592	username:password):
				593
				594	>>> _parse_proxy('joe:password@proxy.example.com')
				595	(None, 'joe', 'password', 'proxy.example.com')
				596	>>> _parse_proxy('joe:password@proxy.example.com:3128')
				597	(None, 'joe', 'password', 'proxy.example.com:3128')
				598
				599	Same examples, but with URLs instead:
				600
				601	>>> _parse_proxy('http://proxy.example.com/')
				602	('http', None, None, 'proxy.example.com')
				603	>>> _parse_proxy('http://proxy.example.com:3128/')
				604	('http', None, None, 'proxy.example.com:3128')
				605	>>> _parse_proxy('http://joe:password@proxy.example.com/')
				606	('http', 'joe', 'password', 'proxy.example.com')
				607	>>> _parse_proxy('http://joe:password@proxy.example.com:3128')
				608	('http', 'joe', 'password', 'proxy.example.com:3128')
				609
				610	Everything after the authority is ignored:
				611
				612	>>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
				613	('ftp', 'joe', 'password', 'proxy.example.com')
				614
				615	Test for no trailing '/' case:
				616
				617	>>> _parse_proxy('http://joe:password@proxy.example.com')
				618	('http', 'joe', 'password', 'proxy.example.com')
				619
				620	"""
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	621	scheme, r_scheme = splittype(proxy)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	622	if not r_scheme.startswith("/"):
				623	# authority
				624	scheme = None
				625	authority = proxy
				626	else:
				627	# URL
				628	if not r_scheme.startswith("//"):
				629	raise ValueError("proxy URL with no authority: %r" % proxy)
				630	# We have an authority, so for RFC 3986-compliant URLs (by ss 3.
				631	# and 3.3.), path is empty or starts with '/'
				632	end = r_scheme.find("/", 2)
				633	if end == -1:
				634	end = None
				635	authority = r_scheme[2:end]
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	636	userinfo, hostport = splituser(authority)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	637	if userinfo is not None:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	638	user, password = splitpasswd(userinfo)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	639	else:
				640	user = password = None
				641	return scheme, user, password, hostport
				642
				643	class ProxyHandler(BaseHandler):
				644	# Proxies must be in front
				645	handler_order = 100
				646
				647	def __init__(self, proxies=None):
				648	if proxies is None:
				649	proxies = getproxies()
				650	assert hasattr(proxies, 'keys'), "proxies must be a mapping"
				651	self.proxies = proxies
				652	for type, url in proxies.items():
				653	setattr(self, '%s_open' % type,
				654	lambda r, proxy=url, type=type, meth=self.proxy_open: \
				655	meth(r, proxy, type))
				656
				657	def proxy_open(self, req, proxy, type):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	658	orig_type = req.type
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	659	proxy_type, user, password, hostport = _parse_proxy(proxy)
				660	if proxy_type is None:
				661	proxy_type = orig_type
Senthil Kumaran	7bb0497	2009-10-11 04:58:55 +0000	[diff] [blame]	662
				663	if req.host and proxy_bypass(req.host):
				664	return None
				665
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	666	if user and password:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	667	user_pass = '%s:%s' % (unquote(user),
				668	unquote(password))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	669	creds = base64.b64encode(user_pass.encode()).decode("ascii")
				670	req.add_header('Proxy-authorization', 'Basic ' + creds)
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	671	hostport = unquote(hostport)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	672	req.set_proxy(hostport, proxy_type)
Senthil Kumaran	97f0c6b	2009-07-25 04:24:38 +0000	[diff] [blame]	673	if orig_type == proxy_type or orig_type == 'https':
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	674	# let other handlers take care of it
				675	return None
				676	else:
				677	# need to start over, because the other handlers don't
				678	# grok the proxy's URL type
				679	# e.g. if we have a constructor arg proxies like so:
				680	# {'http': 'ftp://proxy.example.com'}, we may end up turning
				681	# a request for http://acme.example.com/a into one for
				682	# ftp://proxy.example.com/a
Senthil Kumaran	fb8cc2f	2009-07-19 02:44:19 +0000	[diff] [blame]	683	return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	684
				685	class HTTPPasswordMgr:
				686
				687	def __init__(self):
				688	self.passwd = {}
				689
				690	def add_password(self, realm, uri, user, passwd):
				691	# uri could be a single URI or a sequence
				692	if isinstance(uri, str):
				693	uri = [uri]
				694	if not realm in self.passwd:
				695	self.passwd[realm] = {}
				696	for default_port in True, False:
				697	reduced_uri = tuple(
				698	[self.reduce_uri(u, default_port) for u in uri])
				699	self.passwd[realm][reduced_uri] = (user, passwd)
				700
				701	def find_user_password(self, realm, authuri):
				702	domains = self.passwd.get(realm, {})
				703	for default_port in True, False:
				704	reduced_authuri = self.reduce_uri(authuri, default_port)
				705	for uris, authinfo in domains.items():
				706	for uri in uris:
				707	if self.is_suburi(uri, reduced_authuri):
				708	return authinfo
				709	return None, None
				710
				711	def reduce_uri(self, uri, default_port=True):
				712	"""Accept authority or URI and extract only the authority and path."""
				713	# note HTTP URLs do not have a userinfo component
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	714	parts = urlsplit(uri)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	715	if parts[1]:
				716	# URI
				717	scheme = parts[0]
				718	authority = parts[1]
				719	path = parts[2] or '/'
				720	else:
				721	# host or host:port
				722	scheme = None
				723	authority = uri
				724	path = '/'
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	725	host, port = splitport(authority)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	726	if default_port and port is None and scheme is not None:
				727	dport = {"http": 80,
				728	"https": 443,
				729	}.get(scheme)
				730	if dport is not None:
				731	authority = "%s:%d" % (host, dport)
				732	return authority, path
				733
				734	def is_suburi(self, base, test):
				735	"""Check if test is below base in a URI tree
				736
				737	Both args must be URIs in reduced form.
				738	"""
				739	if base == test:
				740	return True
				741	if base[0] != test[0]:
				742	return False
				743	common = posixpath.commonprefix((base[1], test[1]))
				744	if len(common) == len(base[1]):
				745	return True
				746	return False
				747
				748
				749	class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
				750
				751	def find_user_password(self, realm, authuri):
				752	user, password = HTTPPasswordMgr.find_user_password(self, realm,
				753	authuri)
				754	if user is not None:
				755	return user, password
				756	return HTTPPasswordMgr.find_user_password(self, None, authuri)
				757
				758
				759	class AbstractBasicAuthHandler:
				760
				761	# XXX this allows for multiple auth-schemes, but will stupidly pick
				762	# the last one with a realm specified.
				763
				764	# allow for double- and single-quoted realm values
				765	# (single quotes are a violation of the RFC, but appear in the wild)
				766	rx = re.compile('(?:.,)[ \t]*([^ \t]+)[ \t]+'
				767	'realm=(["\'])(.*?)\\2', re.I)
				768
				769	# XXX could pre-emptively send auth info already accepted (RFC 2617,
				770	# end of section 2, and section 1.2 immediately after "credentials"
				771	# production).
				772
				773	def __init__(self, password_mgr=None):
				774	if password_mgr is None:
				775	password_mgr = HTTPPasswordMgr()
				776	self.passwd = password_mgr
				777	self.add_password = self.passwd.add_password
				778
				779	def http_error_auth_reqed(self, authreq, host, req, headers):
				780	# host may be an authority (without userinfo) or a URL with an
				781	# authority
				782	# XXX could be multiple headers
				783	authreq = headers.get(authreq, None)
				784	if authreq:
				785	mo = AbstractBasicAuthHandler.rx.search(authreq)
				786	if mo:
				787	scheme, quote, realm = mo.groups()
				788	if scheme.lower() == 'basic':
				789	return self.retry_http_basic_auth(host, req, realm)
				790
				791	def retry_http_basic_auth(self, host, req, realm):
				792	user, pw = self.passwd.find_user_password(realm, host)
				793	if pw is not None:
				794	raw = "%s:%s" % (user, pw)
				795	auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
				796	if req.headers.get(self.auth_header, None) == auth:
				797	return None
				798	req.add_header(self.auth_header, auth)
Senthil Kumaran	fb8cc2f	2009-07-19 02:44:19 +0000	[diff] [blame]	799	return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	800	else:
				801	return None
				802
				803
				804	class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
				805
				806	auth_header = 'Authorization'
				807
				808	def http_error_401(self, req, fp, code, msg, headers):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	809	url = req.full_url
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	810	return self.http_error_auth_reqed('www-authenticate',
				811	url, req, headers)
				812
				813
				814	class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
				815
				816	auth_header = 'Proxy-authorization'
				817
				818	def http_error_407(self, req, fp, code, msg, headers):
				819	# http_error_auth_reqed requires that there is no userinfo component in
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	820	# authority. Assume there isn't one, since urllib.request does not (and
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	821	# should not, RFC 3986 s. 3.2.1) support requests for URLs containing
				822	# userinfo.
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	823	authority = req.host
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	824	return self.http_error_auth_reqed('proxy-authenticate',
				825	authority, req, headers)
				826
				827
				828	def randombytes(n):
				829	"""Return n random bytes."""
				830	return os.urandom(n)
				831
				832	class AbstractDigestAuthHandler:
				833	# Digest authentication is specified in RFC 2617.
				834
				835	# XXX The client does not inspect the Authentication-Info header
				836	# in a successful response.
				837
				838	# XXX It should be possible to test this implementation against
				839	# a mock server that just generates a static set of challenges.
				840
				841	# XXX qop="auth-int" supports is shaky
				842
				843	def __init__(self, passwd=None):
				844	if passwd is None:
				845	passwd = HTTPPasswordMgr()
				846	self.passwd = passwd
				847	self.add_password = self.passwd.add_password
				848	self.retried = 0
				849	self.nonce_count = 0
Senthil Kumaran	4c7eaee	2009-11-15 08:43:45 +0000	[diff] [blame^]	850	self.last_nonce = None
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	851
				852	def reset_retry_count(self):
				853	self.retried = 0
				854
				855	def http_error_auth_reqed(self, auth_header, host, req, headers):
				856	authreq = headers.get(auth_header, None)
				857	if self.retried > 5:
				858	# Don't fail endlessly - if we failed once, we'll probably
				859	# fail a second time. Hm. Unless the Password Manager is
				860	# prompting for the information. Crap. This isn't great
				861	# but it's better than the current 'repeat until recursion
				862	# depth exceeded' approach <wink>
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	863	raise HTTPError(req.full_url, 401, "digest auth failed",
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	864	headers, None)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	865	else:
				866	self.retried += 1
				867	if authreq:
				868	scheme = authreq.split()[0]
				869	if scheme.lower() == 'digest':
				870	return self.retry_http_digest_auth(req, authreq)
				871
				872	def retry_http_digest_auth(self, req, auth):
				873	token, challenge = auth.split(' ', 1)
				874	chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
				875	auth = self.get_authorization(req, chal)
				876	if auth:
				877	auth_val = 'Digest %s' % auth
				878	if req.headers.get(self.auth_header, None) == auth_val:
				879	return None
				880	req.add_unredirected_header(self.auth_header, auth_val)
Senthil Kumaran	fb8cc2f	2009-07-19 02:44:19 +0000	[diff] [blame]	881	resp = self.parent.open(req, timeout=req.timeout)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	882	return resp
				883
				884	def get_cnonce(self, nonce):
				885	# The cnonce-value is an opaque
				886	# quoted string value provided by the client and used by both client
				887	# and server to avoid chosen plaintext attacks, to provide mutual
				888	# authentication, and to provide some message integrity protection.
				889	# This isn't a fabulous effort, but it's probably Good Enough.
				890	s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
				891	b = s.encode("ascii") + randombytes(8)
				892	dig = hashlib.sha1(b).hexdigest()
				893	return dig[:16]
				894
				895	def get_authorization(self, req, chal):
				896	try:
				897	realm = chal['realm']
				898	nonce = chal['nonce']
				899	qop = chal.get('qop')
				900	algorithm = chal.get('algorithm', 'MD5')
				901	# mod_digest doesn't send an opaque, even though it isn't
				902	# supposed to be optional
				903	opaque = chal.get('opaque', None)
				904	except KeyError:
				905	return None
				906
				907	H, KD = self.get_algorithm_impls(algorithm)
				908	if H is None:
				909	return None
				910
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	911	user, pw = self.passwd.find_user_password(realm, req.full_url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	912	if user is None:
				913	return None
				914
				915	# XXX not implemented yet
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	916	if req.data is not None:
				917	entdig = self.get_entity_digest(req.data, chal)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	918	else:
				919	entdig = None
				920
				921	A1 = "%s:%s:%s" % (user, realm, pw)
				922	A2 = "%s:%s" % (req.get_method(),
				923	# XXX selector: what about proxies and full urls
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	924	req.selector)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	925	if qop == 'auth':
Senthil Kumaran	4c7eaee	2009-11-15 08:43:45 +0000	[diff] [blame^]	926	if nonce == self.last_nonce:
				927	self.nonce_count += 1
				928	else:
				929	self.nonce_count = 1
				930	self.last_nonce = nonce
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	931	ncvalue = '%08x' % self.nonce_count
				932	cnonce = self.get_cnonce(nonce)
				933	noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
				934	respdig = KD(H(A1), noncebit)
				935	elif qop is None:
				936	respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
				937	else:
				938	# XXX handle auth-int.
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	939	raise URLError("qop '%s' is not supported." % qop)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	940
				941	# XXX should the partial digests be encoded too?
				942
				943	base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	944	'response="%s"' % (user, realm, nonce, req.selector,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	945	respdig)
				946	if opaque:
				947	base += ', opaque="%s"' % opaque
				948	if entdig:
				949	base += ', digest="%s"' % entdig
				950	base += ', algorithm="%s"' % algorithm
				951	if qop:
				952	base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
				953	return base
				954
				955	def get_algorithm_impls(self, algorithm):
				956	# lambdas assume digest modules are imported at the top level
				957	if algorithm == 'MD5':
				958	H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
				959	elif algorithm == 'SHA':
				960	H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
				961	# XXX MD5-sess
				962	KD = lambda s, d: H("%s:%s" % (s, d))
				963	return H, KD
				964
				965	def get_entity_digest(self, data, chal):
				966	# XXX not implemented yet
				967	return None
				968
				969
				970	class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
				971	"""An authentication protocol defined by RFC 2069
				972
				973	Digest authentication improves on basic authentication because it
				974	does not transmit passwords in the clear.
				975	"""
				976
				977	auth_header = 'Authorization'
				978	handler_order = 490 # before Basic auth
				979
				980	def http_error_401(self, req, fp, code, msg, headers):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	981	host = urlparse(req.full_url)[1]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	982	retry = self.http_error_auth_reqed('www-authenticate',
				983	host, req, headers)
				984	self.reset_retry_count()
				985	return retry
				986
				987
				988	class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
				989
				990	auth_header = 'Proxy-Authorization'
				991	handler_order = 490 # before Basic auth
				992
				993	def http_error_407(self, req, fp, code, msg, headers):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	994	host = req.host
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	995	retry = self.http_error_auth_reqed('proxy-authenticate',
				996	host, req, headers)
				997	self.reset_retry_count()
				998	return retry
				999
				1000	class AbstractHTTPHandler(BaseHandler):
				1001
				1002	def __init__(self, debuglevel=0):
				1003	self._debuglevel = debuglevel
				1004
				1005	def set_http_debuglevel(self, level):
				1006	self._debuglevel = level
				1007
				1008	def do_request_(self, request):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1009	host = request.host
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1010	if not host:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1011	raise URLError('no host given')
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1012
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1013	if request.data is not None: # POST
				1014	data = request.data
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1015	if not request.has_header('Content-type'):
				1016	request.add_unredirected_header(
				1017	'Content-type',
				1018	'application/x-www-form-urlencoded')
				1019	if not request.has_header('Content-length'):
				1020	request.add_unredirected_header(
				1021	'Content-length', '%d' % len(data))
				1022
Facundo Batista	72dc1ea	2008-08-16 14:44:32 +0000	[diff] [blame]	1023	sel_host = host
				1024	if request.has_proxy():
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1025	scheme, sel = splittype(request.selector)
Facundo Batista	72dc1ea	2008-08-16 14:44:32 +0000	[diff] [blame]	1026	sel_host, sel_path = splithost(sel)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1027	if not request.has_header('Host'):
Facundo Batista	72dc1ea	2008-08-16 14:44:32 +0000	[diff] [blame]	1028	request.add_unredirected_header('Host', sel_host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1029	for name, value in self.parent.addheaders:
				1030	name = name.capitalize()
				1031	if not request.has_header(name):
				1032	request.add_unredirected_header(name, value)
				1033
				1034	return request
				1035
				1036	def do_open(self, http_class, req):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1037	"""Return an HTTPResponse object for the request, using http_class.
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1038
				1039	http_class must implement the HTTPConnection API from http.client.
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1040	"""
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1041	host = req.host
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1042	if not host:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1043	raise URLError('no host given')
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1044
				1045	h = http_class(host, timeout=req.timeout) # will parse host:port
				1046	headers = dict(req.headers)
				1047	headers.update(req.unredirected_hdrs)
				1048
				1049	# TODO(jhylton): Should this be redesigned to handle
				1050	# persistent connections?
				1051
				1052	# We want to make an HTTP/1.1 request, but the addinfourl
				1053	# class isn't prepared to deal with a persistent connection.
				1054	# It will try to read all remaining data from the socket,
				1055	# which will block while the server waits for the next request.
				1056	# So make sure the connection gets closed after the (only)
				1057	# request.
				1058	headers["Connection"] = "close"
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1059	headers = dict((name.title(), val) for name, val in headers.items())
Senthil Kumaran	97f0c6b	2009-07-25 04:24:38 +0000	[diff] [blame]	1060
				1061	if req._tunnel_host:
				1062	h.set_tunnel(req._tunnel_host)
				1063
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1064	try:
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1065	h.request(req.get_method(), req.selector, req.data, headers)
				1066	r = h.getresponse() # an HTTPResponse instance
				1067	except socket.error as err:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1068	raise URLError(err)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1069
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1070	r.url = req.full_url
				1071	# This line replaces the .msg attribute of the HTTPResponse
				1072	# with .headers, because urllib clients expect the response to
				1073	# have the reason in .msg. It would be good to mark this
				1074	# attribute is deprecated and get then to use info() or
				1075	# .headers.
				1076	r.msg = r.reason
				1077	return r
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1078
				1079
				1080	class HTTPHandler(AbstractHTTPHandler):
				1081
				1082	def http_open(self, req):
				1083	return self.do_open(http.client.HTTPConnection, req)
				1084
				1085	http_request = AbstractHTTPHandler.do_request_
				1086
				1087	if hasattr(http.client, 'HTTPSConnection'):
				1088	class HTTPSHandler(AbstractHTTPHandler):
				1089
				1090	def https_open(self, req):
				1091	return self.do_open(http.client.HTTPSConnection, req)
				1092
				1093	https_request = AbstractHTTPHandler.do_request_
				1094
				1095	class HTTPCookieProcessor(BaseHandler):
				1096	def __init__(self, cookiejar=None):
				1097	import http.cookiejar
				1098	if cookiejar is None:
				1099	cookiejar = http.cookiejar.CookieJar()
				1100	self.cookiejar = cookiejar
				1101
				1102	def http_request(self, request):
				1103	self.cookiejar.add_cookie_header(request)
				1104	return request
				1105
				1106	def http_response(self, request, response):
				1107	self.cookiejar.extract_cookies(response, request)
				1108	return response
				1109
				1110	https_request = http_request
				1111	https_response = http_response
				1112
				1113	class UnknownHandler(BaseHandler):
				1114	def unknown_open(self, req):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1115	type = req.type
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1116	raise URLError('unknown url type: %s' % type)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1117
				1118	def parse_keqv_list(l):
				1119	"""Parse list of key=value strings where keys are not duplicated."""
				1120	parsed = {}
				1121	for elt in l:
				1122	k, v = elt.split('=', 1)
				1123	if v[0] == '"' and v[-1] == '"':
				1124	v = v[1:-1]
				1125	parsed[k] = v
				1126	return parsed
				1127
				1128	def parse_http_list(s):
				1129	"""Parse lists as described by RFC 2068 Section 2.
				1130
				1131	In particular, parse comma-separated lists where the elements of
				1132	the list may include quoted-strings. A quoted-string could
				1133	contain a comma. A non-quoted string could have quotes in the
				1134	middle. Neither commas nor quotes count if they are escaped.
				1135	Only double-quotes count, not single-quotes.
				1136	"""
				1137	res = []
				1138	part = ''
				1139
				1140	escape = quote = False
				1141	for cur in s:
				1142	if escape:
				1143	part += cur
				1144	escape = False
				1145	continue
				1146	if quote:
				1147	if cur == '\\':
				1148	escape = True
				1149	continue
				1150	elif cur == '"':
				1151	quote = False
				1152	part += cur
				1153	continue
				1154
				1155	if cur == ',':
				1156	res.append(part)
				1157	part = ''
				1158	continue
				1159
				1160	if cur == '"':
				1161	quote = True
				1162
				1163	part += cur
				1164
				1165	# append last part
				1166	if part:
				1167	res.append(part)
				1168
				1169	return [part.strip() for part in res]
				1170
				1171	class FileHandler(BaseHandler):
				1172	# Use local file or FTP depending on form of URL
				1173	def file_open(self, req):
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1174	url = req.selector
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1175	if url[:2] == '//' and url[2:3] != '/':
				1176	req.type = 'ftp'
				1177	return self.parent.open(req)
				1178	else:
				1179	return self.open_local_file(req)
				1180
				1181	# names for the localhost
				1182	names = None
				1183	def get_names(self):
				1184	if FileHandler.names is None:
				1185	try:
				1186	FileHandler.names = (socket.gethostbyname('localhost'),
				1187	socket.gethostbyname(socket.gethostname()))
				1188	except socket.gaierror:
				1189	FileHandler.names = (socket.gethostbyname('localhost'),)
				1190	return FileHandler.names
				1191
				1192	# not entirely sure what the rules are here
				1193	def open_local_file(self, req):
				1194	import email.utils
				1195	import mimetypes
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1196	host = req.host
				1197	file = req.selector
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1198	localfile = url2pathname(file)
				1199	try:
				1200	stats = os.stat(localfile)
				1201	size = stats.st_size
				1202	modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
				1203	mtype = mimetypes.guess_type(file)[0]
				1204	headers = email.message_from_string(
				1205	'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
				1206	(mtype or 'text/plain', size, modified))
				1207	if host:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1208	host, port = splitport(host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1209	if not host or \
				1210	(not port and _safe_gethostbyname(host) in self.get_names()):
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1211	return addinfourl(open(localfile, 'rb'), headers, 'file:'+file)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1212	except OSError as msg:
Georg Brandl	029986a	2008-06-23 11:44:14 +0000	[diff] [blame]	1213	# users shouldn't expect OSErrors coming from urlopen()
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1214	raise URLError(msg)
				1215	raise URLError('file not on local host')
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1216
				1217	def _safe_gethostbyname(host):
				1218	try:
				1219	return socket.gethostbyname(host)
				1220	except socket.gaierror:
				1221	return None
				1222
				1223	class FTPHandler(BaseHandler):
				1224	def ftp_open(self, req):
				1225	import ftplib
				1226	import mimetypes
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1227	host = req.host
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1228	if not host:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1229	raise URLError('ftp error: no host given')
				1230	host, port = splitport(host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1231	if port is None:
				1232	port = ftplib.FTP_PORT
				1233	else:
				1234	port = int(port)
				1235
				1236	# username/password handling
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1237	user, host = splituser(host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1238	if user:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1239	user, passwd = splitpasswd(user)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1240	else:
				1241	passwd = None
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1242	host = unquote(host)
				1243	user = unquote(user or '')
				1244	passwd = unquote(passwd or '')
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1245
				1246	try:
				1247	host = socket.gethostbyname(host)
				1248	except socket.error as msg:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1249	raise URLError(msg)
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1250	path, attrs = splitattr(req.selector)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1251	dirs = path.split('/')
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1252	dirs = list(map(unquote, dirs))
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1253	dirs, file = dirs[:-1], dirs[-1]
				1254	if dirs and not dirs[0]:
				1255	dirs = dirs[1:]
				1256	try:
				1257	fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
				1258	type = file and 'I' or 'D'
				1259	for attr in attrs:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1260	attr, value = splitvalue(attr)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1261	if attr.lower() == 'type' and \
				1262	value in ('a', 'A', 'i', 'I', 'd', 'D'):
				1263	type = value.upper()
				1264	fp, retrlen = fw.retrfile(file, type)
				1265	headers = ""
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1266	mtype = mimetypes.guess_type(req.full_url)[0]
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1267	if mtype:
				1268	headers += "Content-type: %s\n" % mtype
				1269	if retrlen is not None and retrlen >= 0:
				1270	headers += "Content-length: %d\n" % retrlen
				1271	headers = email.message_from_string(headers)
Jeremy Hylton	6c5e28c	2009-03-31 14:35:53 +0000	[diff] [blame]	1272	return addinfourl(fp, headers, req.full_url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1273	except ftplib.all_errors as msg:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1274	exc = URLError('ftp error: %s' % msg)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1275	raise exc.with_traceback(sys.exc_info()[2])
				1276
				1277	def connect_ftp(self, user, passwd, host, port, dirs, timeout):
				1278	fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
				1279	return fw
				1280
				1281	class CacheFTPHandler(FTPHandler):
				1282	# XXX would be nice to have pluggable cache strategies
				1283	# XXX this stuff is definitely not thread safe
				1284	def __init__(self):
				1285	self.cache = {}
				1286	self.timeout = {}
				1287	self.soonest = 0
				1288	self.delay = 60
				1289	self.max_conns = 16
				1290
				1291	def setTimeout(self, t):
				1292	self.delay = t
				1293
				1294	def setMaxConns(self, m):
				1295	self.max_conns = m
				1296
				1297	def connect_ftp(self, user, passwd, host, port, dirs, timeout):
				1298	key = user, host, port, '/'.join(dirs), timeout
				1299	if key in self.cache:
				1300	self.timeout[key] = time.time() + self.delay
				1301	else:
				1302	self.cache[key] = ftpwrapper(user, passwd, host, port,
				1303	dirs, timeout)
				1304	self.timeout[key] = time.time() + self.delay
				1305	self.check_cache()
				1306	return self.cache[key]
				1307
				1308	def check_cache(self):
				1309	# first check for old ones
				1310	t = time.time()
				1311	if self.soonest <= t:
				1312	for k, v in list(self.timeout.items()):
				1313	if v < t:
				1314	self.cache[k].close()
				1315	del self.cache[k]
				1316	del self.timeout[k]
				1317	self.soonest = min(list(self.timeout.values()))
				1318
				1319	# then check the size
				1320	if len(self.cache) == self.max_conns:
				1321	for k, v in list(self.timeout.items()):
				1322	if v == self.soonest:
				1323	del self.cache[k]
				1324	del self.timeout[k]
				1325	break
				1326	self.soonest = min(list(self.timeout.values()))
				1327
				1328	# Code move from the old urllib module
				1329
				1330	MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
				1331
				1332	# Helper for non-unix systems
				1333	if os.name == 'mac':
				1334	from macurl2path import url2pathname, pathname2url
				1335	elif os.name == 'nt':
				1336	from nturl2path import url2pathname, pathname2url
				1337	else:
				1338	def url2pathname(pathname):
				1339	"""OS-specific conversion from a relative URL of the 'file' scheme
				1340	to a file system path; not recommended for general use."""
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1341	return unquote(pathname)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1342
				1343	def pathname2url(pathname):
				1344	"""OS-specific conversion from a file system path to a relative URL
				1345	of the 'file' scheme; not recommended for general use."""
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1346	return quote(pathname)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1347
				1348	# This really consists of two pieces:
				1349	# (1) a class which handles opening of all sorts of URLs
				1350	# (plus assorted utilities etc.)
				1351	# (2) a set of functions for parsing URLs
				1352	# XXX Should these be separated out into different modules?
				1353
				1354
				1355	ftpcache = {}
				1356	class URLopener:
				1357	"""Class to open URLs.
				1358	This is a class rather than just a subroutine because we may need
				1359	more than one set of global protocol-specific options.
				1360	Note -- this is a base class for those who don't want the
				1361	automatic handling of errors type 302 (relocated) and 401
				1362	(authorization needed)."""
				1363
				1364	__tempfiles = None
				1365
				1366	version = "Python-urllib/%s" % __version__
				1367
				1368	# Constructor
    def __init__(self, proxies=None, **x509):
        """Set up the opener.

        proxies must be a mapping; when it is None the result of
        getproxies() is used.  The optional keyword arguments key_file and
        cert_file are stored on the instance.
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.cert_file = x509.get('cert_file')
        self.key_file = x509.get('key_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__unlink = os.unlink  # See cleanup()
        self.__tempfiles = []
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.tempcache = None
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.
        self.ftpcache = ftpcache
				1391
    def __del__(self):
        """Release the opener's resources by calling close()."""
        self.close()
				1394
    def close(self):
        """Close the opener; this delegates to cleanup()."""
        self.cleanup()
				1397
    def cleanup(self):
        """Delete the recorded temporary files and empty the temp cache."""
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            remove = self.__unlink
            for path in self.__tempfiles:
                try:
                    remove(path)
                except OSError:
                    # Best effort: a file that cannot be removed is skipped.
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()
				1411
    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        # The positional arguments are stored together as one tuple.
        header = args
        self.addheaders.append(header)
				1416
				1417	# External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]")

        # Serve from the temp cache when the URL was retrieved before.
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            return addinfourl(open(filename, 'rb'), headers, fullurl)

        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        proxy = None
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl)  # Signal special case to open_*()

        self.type = urltype
        # The URL type selects the open_<type> method; '-' cannot appear
        # in a method name and is mapped to '_'.
        name = ('open_' + urltype).replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            return getattr(self, name)(url, data)
        except socket.error as msg:
            raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
				1451
    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type.

        The default implementation always raises IOError carrying the
        scheme of fullurl.
        """
        scheme = splittype(fullurl)[0]
        raise IOError('url error', 'unknown url type', scheme)
				1456
    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open a URL through an unusable proxy.

        Called by open() when the proxy's scheme has no open_<scheme>
        handler.  The default implementation always raises IOError naming
        the scheme of fullurl and the offending proxy.
        """
        scheme = splittype(fullurl)[0]
        raise IOError('url error', 'invalid proxy for %s' % scheme, proxy)
				1461
				1462	# External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object.

        Arguments:
        - filename: where to store the download; when omitted a temporary
          file is created and remembered so cleanup() can delete it.
        - reporthook: optional callable invoked as
          reporthook(blocknum, blocksize, totalsize) once before the first
          read and once after every block; totalsize is -1 when the
          response has no Content-Length header.
        - data: passed through to open().

        Raises ContentTooShortError when fewer bytes arrive than the
        Content-Length header announced.
        """
        url = unwrap(to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        scheme, url1 = splittype(url)
        if filename is None and (not scheme or scheme == 'file'):
            try:
                fp = self.open_local_file(url1)
                try:
                    hdrs = fp.info()
                finally:
                    # Only the headers are needed; release the file handle
                    # right away instead of waiting for garbage collection.
                    fp.close()
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError:
                # Not a usable local file; fall back to the generic path.
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                import tempfile
                # Keep the extension of the URL path as the temp-file suffix.
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                # Read the announced length whether or not a hook is given,
                # so the short-read check below always works.
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result
				1528
				1529	# Each method named open_<type> knows how to open that type of URL
				1530
    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_factory.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieve or a (proxyhost, full-url) pair.
        - data is payload for a POST request or None.

        Returns an addinfourl object for 2xx responses; every other status
        is handed to self.http_error().  Raises IOError when no host can be
        determined and URLError on a bad HTTP status line.
        """
        import base64

        user_passwd = None
        proxy_passwd = None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    # Strip the credentials from the URL sent to the proxy.
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

        if not host:
            raise IOError('http error', 'no host given')

        # b64encode() only accepts bytes and returns bytes.  Credentials
        # taken from a URL are percent-encoded text, so unquote and encode
        # them first, then decode the result for use in a header value.
        if proxy_passwd:
            proxy_auth = base64.b64encode(
                unquote(proxy_passwd).encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            auth = base64.b64encode(
                unquote(user_passwd).encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost
        # Headers registered on the opener override the ones set above.
        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)
				1617
    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        factory = http.client.HTTPConnection
        return self._open_generic_http(factory, url, data)
				1621
    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code.

        A specific handler receives data as a sixth argument only when it
        is not None.  If there is no specific handler, or it returns a
        false value, http_error_default() is used.
        """
        extra = () if data is None else (data,)
        try:
            # First check if there's a specific handler for this error
            handler = getattr(self, 'http_error_%d' % errcode)
        except AttributeError:
            pass
        else:
            outcome = handler(url, fp, errcode, errmsg, headers, *extra)
            if outcome:
                return outcome
        return self.http_error_default(url, fp, errcode, errmsg, headers)
				1637
    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        # Read the body (the result is discarded) before closing.
        fp.read()
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1643
    if _have_ssl:
        def _https_connection(self, host):
            """Return an HTTPSConnection to host using this opener's
            key_file and cert_file."""
            return http.client.HTTPSConnection(
                host, key_file=self.key_file, cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            factory = self._https_connection
            return self._open_generic_http(factory, url, data)
				1653
    def open_file(self, url):
        """Use local file or FTP depending on form of URL.

        A URL of the form //host/... whose host part is neither empty nor
        'localhost' is passed to open_ftp(); everything else goes to
        open_local_file().
        """
        if not isinstance(url, str):
            raise URLError('file error', 'proxy support for file protocol currently not implemented')
        names_other_host = (
            url.startswith('//')
            and url[2:3] != '/'
            and url[2:12].lower() != 'localhost/')
        handler = self.open_ftp if names_other_host else self.open_local_file
        return handler(url)
				1662
    def open_local_file(self, url):
        """Use local file.

        Returns an addinfourl object wrapping the file opened in binary
        mode, with synthesized Content-Type, Content-Length and
        Last-modified headers.

        Raises URLError when the file cannot be stat'ed, or when the URL
        names a host (or a port) that is not this machine.
        """
        import mimetypes, email.utils
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            # URLError accepts (reason, filename); passing errno as a third
            # positional argument made the constructor itself fail.
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if host:
            host, port = splitport(host)
            # Only a port-less host that resolves to this machine is local.
            if port or socket.gethostbyname(host) not in (localhost(), thishost()):
                raise URLError('local file error', 'not on local host')
        urlfile = file
        if file[:1] == '/':
            urlfile = 'file://' + file
        return addinfourl(open(localname, 'rb'), headers, urlfile)
				1692
    def open_ftp(self, url):
        """Use FTP protocol.

        Connections are cached in self.ftpcache, keyed by
        (user, host, port, directory).  Returns an addinfourl object whose
        headers carry Content-Type and Content-Length when they can be
        determined.  Errors from the FTP layer are re-raised as URLError.
        """
        if not isinstance(url, str):
            raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = splithost(url)
        if not host:
            raise URLError('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        if dirs and not dirs[0]:
            dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily.  Iterate over a snapshot
            # of the keys: deleting entries while iterating the dict itself
            # raises RuntimeError.
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            # Default transfer type: directory listing when no file name
            # is given, binary ("image") otherwise; ";type=" may override.
            if not file:
                type = 'D'
            else:
                type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as msg:
            raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
				1751
    def open_data(self, url, data=None):
        """Use "data" URL.

        Any POSTed data is ignored.  Returns an addinfourl object whose
        file content is the synthesized header block (Date, Content-type,
        Content-Length) followed by a blank line and the decoded payload.

        Raises IOError when the URL contains no comma.
        """
        if not isinstance(url, str):
            raise URLError('data error', 'proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            mediatype, data = url.split(',', 1)
        except ValueError:
            raise IOError('data error', 'bad data URL')
        if not mediatype:
            mediatype = 'text/plain;charset=US-ASCII'
        # A trailing ";token" without "=" is the transfer encoding, not a
        # media-type parameter.
        semi = mediatype.rfind(';')
        if semi >= 0 and '=' not in mediatype[semi:]:
            encoding = mediatype[semi+1:]
            mediatype = mediatype[:semi]
        else:
            encoding = ''
        msg = []
        # Spell the time out as %H:%M:%S; the %T shorthand is not available
        # on every platform's strftime().
        msg.append('Date: %s' % time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                              time.gmtime(time.time())))
        msg.append('Content-type: %s' % mediatype)
        if encoding == 'base64':
            import base64
            # XXX is this encoding/decoding ok?
            data = base64.decodebytes(data.encode('ascii')).decode('latin1')
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1793
				1794
				1795	class FancyURLopener(URLopener):
				1796	"""Derived class with handlers for errors we can handle (perhaps)."""
				1797
				1798	def __init__(self, args, *kwargs):
				1799	URLopener.__init__(self, args, *kwargs)
				1800	self.auth_cache = {}
				1801	self.tries = 0
				1802	self.maxtries = 10
				1803
				1804	def http_error_default(self, url, fp, errcode, errmsg, headers):
				1805	"""Default error handling -- don't raise an exception."""
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1806	return addinfourl(fp, headers, "http:" + url, errcode)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1807
				1808	def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
				1809	"""Error 302 -- relocated (temporarily)."""
				1810	self.tries += 1
				1811	if self.maxtries and self.tries >= self.maxtries:
				1812	if hasattr(self, "http_error_500"):
				1813	meth = self.http_error_500
				1814	else:
				1815	meth = self.http_error_default
				1816	self.tries = 0
				1817	return meth(url, fp, 500,
				1818	"Internal Server Error: Redirect Recursion", headers)
				1819	result = self.redirect_internal(url, fp, errcode, errmsg, headers,
				1820	data)
				1821	self.tries = 0
				1822	return result
				1823
				1824	def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
				1825	if 'location' in headers:
				1826	newurl = headers['location']
				1827	elif 'uri' in headers:
				1828	newurl = headers['uri']
				1829	else:
				1830	return
				1831	void = fp.read()
				1832	fp.close()
				1833	# In case the server sent a relative URL, join with original:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1834	newurl = urljoin(self.type + ":" + url, newurl)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1835	return self.open(newurl)
				1836
				1837	def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
				1838	"""Error 301 -- also relocated (permanently)."""
				1839	return self.http_error_302(url, fp, errcode, errmsg, headers, data)
				1840
				1841	def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
				1842	"""Error 303 -- also relocated (essentially identical to 302)."""
				1843	return self.http_error_302(url, fp, errcode, errmsg, headers, data)
				1844
				1845	def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
				1846	"""Error 307 -- relocated, but turn POST into error."""
				1847	if data is None:
				1848	return self.http_error_302(url, fp, errcode, errmsg, headers, data)
				1849	else:
				1850	return self.http_error_default(url, fp, errcode, errmsg, headers)
				1851
				1852	def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
				1853	"""Error 401 -- authentication required.
				1854	This function supports Basic authentication only."""
				1855	if not 'www-authenticate' in headers:
				1856	URLopener.http_error_default(self, url, fp,
				1857	errcode, errmsg, headers)
				1858	stuff = headers['www-authenticate']
				1859	import re
				1860	match = re.match('[ \t]([^ \t]+)[ \t]+realm="([^"])"', stuff)
				1861	if not match:
				1862	URLopener.http_error_default(self, url, fp,
				1863	errcode, errmsg, headers)
				1864	scheme, realm = match.groups()
				1865	if scheme.lower() != 'basic':
				1866	URLopener.http_error_default(self, url, fp,
				1867	errcode, errmsg, headers)
				1868	name = 'retry_' + self.type + '_basic_auth'
				1869	if data is None:
				1870	return getattr(self,name)(url, realm)
				1871	else:
				1872	return getattr(self,name)(url, realm, data)
				1873
				1874	def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
				1875	"""Error 407 -- proxy authentication required.
				1876	This function supports Basic authentication only."""
				1877	if not 'proxy-authenticate' in headers:
				1878	URLopener.http_error_default(self, url, fp,
				1879	errcode, errmsg, headers)
				1880	stuff = headers['proxy-authenticate']
				1881	import re
				1882	match = re.match('[ \t]([^ \t]+)[ \t]+realm="([^"])"', stuff)
				1883	if not match:
				1884	URLopener.http_error_default(self, url, fp,
				1885	errcode, errmsg, headers)
				1886	scheme, realm = match.groups()
				1887	if scheme.lower() != 'basic':
				1888	URLopener.http_error_default(self, url, fp,
				1889	errcode, errmsg, headers)
				1890	name = 'retry_proxy_' + self.type + '_basic_auth'
				1891	if data is None:
				1892	return getattr(self,name)(url, realm)
				1893	else:
				1894	return getattr(self,name)(url, realm, data)
				1895
				1896	def retry_proxy_http_basic_auth(self, url, realm, data=None):
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1897	host, selector = splithost(url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1898	newurl = 'http://' + host + selector
				1899	proxy = self.proxies['http']
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1900	urltype, proxyhost = splittype(proxy)
				1901	proxyhost, proxyselector = splithost(proxyhost)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1902	i = proxyhost.find('@') + 1
				1903	proxyhost = proxyhost[i:]
				1904	user, passwd = self.get_user_passwd(proxyhost, realm, i)
				1905	if not (user or passwd): return None
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1906	proxyhost = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1907	quote(passwd, safe=''), proxyhost)
				1908	self.proxies['http'] = 'http://' + proxyhost + proxyselector
				1909	if data is None:
				1910	return self.open(newurl)
				1911	else:
				1912	return self.open(newurl, data)
				1913
				1914	def retry_proxy_https_basic_auth(self, url, realm, data=None):
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1915	host, selector = splithost(url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1916	newurl = 'https://' + host + selector
				1917	proxy = self.proxies['https']
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1918	urltype, proxyhost = splittype(proxy)
				1919	proxyhost, proxyselector = splithost(proxyhost)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1920	i = proxyhost.find('@') + 1
				1921	proxyhost = proxyhost[i:]
				1922	user, passwd = self.get_user_passwd(proxyhost, realm, i)
				1923	if not (user or passwd): return None
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1924	proxyhost = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1925	quote(passwd, safe=''), proxyhost)
				1926	self.proxies['https'] = 'https://' + proxyhost + proxyselector
				1927	if data is None:
				1928	return self.open(newurl)
				1929	else:
				1930	return self.open(newurl, data)
				1931
				1932	def retry_http_basic_auth(self, url, realm, data=None):
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1933	host, selector = splithost(url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1934	i = host.find('@') + 1
				1935	host = host[i:]
				1936	user, passwd = self.get_user_passwd(host, realm, i)
				1937	if not (user or passwd): return None
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1938	host = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1939	quote(passwd, safe=''), host)
				1940	newurl = 'http://' + host + selector
				1941	if data is None:
				1942	return self.open(newurl)
				1943	else:
				1944	return self.open(newurl, data)
				1945
				1946	def retry_https_basic_auth(self, url, realm, data=None):
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1947	host, selector = splithost(url)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1948	i = host.find('@') + 1
				1949	host = host[i:]
				1950	user, passwd = self.get_user_passwd(host, realm, i)
				1951	if not (user or passwd): return None
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	1952	host = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	1953	quote(passwd, safe=''), host)
				1954	newurl = 'https://' + host + selector
				1955	if data is None:
				1956	return self.open(newurl)
				1957	else:
				1958	return self.open(newurl, data)
				1959
				1960	def get_user_passwd(self, host, realm, clear_cache = 0):
				1961	key = realm + '@' + host.lower()
				1962	if key in self.auth_cache:
				1963	if clear_cache:
				1964	del self.auth_cache[key]
				1965	else:
				1966	return self.auth_cache[key]
				1967	user, passwd = self.prompt_user_passwd(host, realm)
				1968	if user or passwd: self.auth_cache[key] = (user, passwd)
				1969	return user, passwd
				1970
				1971	def prompt_user_passwd(self, host, realm):
				1972	"""Override this in a GUI environment!"""
				1973	import getpass
				1974	try:
				1975	user = input("Enter username for %s at %s: " % (realm, host))
				1976	passwd = getpass.getpass("Enter password for %s in %s at %s: " %
				1977	(user, realm, host))
				1978	return user, passwd
				1979	except KeyboardInterrupt:
				1980	print()
				1981	return None, None
				1982
				1983
				1984	# Utility functions
				1985
				1986	_localhost = None
				1987	def localhost():
				1988	"""Return the IP address of the magic hostname 'localhost'."""
				1989	global _localhost
				1990	if _localhost is None:
				1991	_localhost = socket.gethostbyname('localhost')
				1992	return _localhost
				1993
				1994	_thishost = None
				1995	def thishost():
				1996	"""Return the IP address of the current host."""
				1997	global _thishost
				1998	if _thishost is None:
				1999	_thishost = socket.gethostbyname(socket.gethostname())
				2000	return _thishost
				2001
				2002	_ftperrors = None
				2003	def ftperrors():
				2004	"""Return the set of errors raised by the FTP class."""
				2005	global _ftperrors
				2006	if _ftperrors is None:
				2007	import ftplib
				2008	_ftperrors = ftplib.all_errors
				2009	return _ftperrors
				2010
				2011	_noheaders = None
				2012	def noheaders():
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	2013	"""Return an empty email Message object."""
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2014	global _noheaders
				2015	if _noheaders is None:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	2016	_noheaders = email.message_from_string("")
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2017	return _noheaders
				2018
				2019
				2020	# Utility classes
				2021
				2022	class ftpwrapper:
				2023	"""Class used by open_ftp() for cache of open FTP connections."""
				2024
				2025	def __init__(self, user, passwd, host, port, dirs, timeout=None):
				2026	self.user = user
				2027	self.passwd = passwd
				2028	self.host = host
				2029	self.port = port
				2030	self.dirs = dirs
				2031	self.timeout = timeout
				2032	self.init()
				2033
				2034	def init(self):
				2035	import ftplib
				2036	self.busy = 0
				2037	self.ftp = ftplib.FTP()
				2038	self.ftp.connect(self.host, self.port, self.timeout)
				2039	self.ftp.login(self.user, self.passwd)
				2040	for dir in self.dirs:
				2041	self.ftp.cwd(dir)
				2042
				2043	def retrfile(self, file, type):
				2044	import ftplib
				2045	self.endtransfer()
				2046	if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
				2047	else: cmd = 'TYPE ' + type; isdir = 0
				2048	try:
				2049	self.ftp.voidcmd(cmd)
				2050	except ftplib.all_errors:
				2051	self.init()
				2052	self.ftp.voidcmd(cmd)
				2053	conn = None
				2054	if file and not isdir:
				2055	# Try to retrieve as a file
				2056	try:
				2057	cmd = 'RETR ' + file
				2058	conn = self.ftp.ntransfercmd(cmd)
				2059	except ftplib.error_perm as reason:
				2060	if str(reason)[:3] != '550':
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	2061	raise URLError('ftp error', reason).with_traceback(
				2062	sys.exc_info()[2])
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2063	if not conn:
				2064	# Set transfer mode to ASCII!
				2065	self.ftp.voidcmd('TYPE A')
				2066	# Try a directory listing. Verify that directory exists.
				2067	if file:
				2068	pwd = self.ftp.pwd()
				2069	try:
				2070	try:
				2071	self.ftp.cwd(file)
				2072	except ftplib.error_perm as reason:
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	2073	raise URLError('ftp error', reason) from reason
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2074	finally:
				2075	self.ftp.cwd(pwd)
				2076	cmd = 'LIST ' + file
				2077	else:
				2078	cmd = 'LIST'
				2079	conn = self.ftp.ntransfercmd(cmd)
				2080	self.busy = 1
				2081	# Pass back both a suitably decorated object and a retrieval length
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	2082	return (addclosehook(conn[0].makefile('rb'), self.endtransfer), conn[1])
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2083	def endtransfer(self):
				2084	if not self.busy:
				2085	return
				2086	self.busy = 0
				2087	try:
				2088	self.ftp.voidresp()
				2089	except ftperrors():
				2090	pass
				2091
				2092	def close(self):
				2093	self.endtransfer()
				2094	try:
				2095	self.ftp.close()
				2096	except ftperrors():
				2097	pass
				2098
				2099	# Proxy handling
				2100	def getproxies_environment():
				2101	"""Return a dictionary of scheme -> proxy server URL mappings.
				2102
				2103	Scan the environment for variables named <scheme>_proxy;
				2104	this seems to be the standard convention. If you need a
				2105	different way, you can pass a proxies dictionary to the
				2106	[Fancy]URLopener constructor.
				2107
				2108	"""
				2109	proxies = {}
				2110	for name, value in os.environ.items():
				2111	name = name.lower()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2112	if value and name[-6:] == '_proxy':
				2113	proxies[name[:-6]] = value
				2114	return proxies
				2115
				2116	def proxy_bypass_environment(host):
				2117	"""Test if proxies should not be used for a particular host.
				2118
				2119	Checks the environment for a variable named no_proxy, which should
				2120	be a list of DNS suffixes separated by commas, or '*' for all hosts.
				2121	"""
				2122	no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
				2123	# '*' is special case for always bypass
				2124	if no_proxy == '*':
				2125	return 1
				2126	# strip port off host
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	2127	hostonly, port = splitport(host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2128	# check if the host ends with any of the DNS suffixes
				2129	for name in no_proxy.split(','):
				2130	if name and (hostonly.endswith(name) or host.endswith(name)):
				2131	return 1
				2132	# otherwise, don't bypass
				2133	return 0
				2134
				2135
				2136	if sys.platform == 'darwin':
				2137	def getproxies_internetconfig():
				2138	"""Return a dictionary of scheme -> proxy server URL mappings.
				2139
				2140	By convention the mac uses Internet Config to store
				2141	proxies. An HTTP proxy, for instance, is stored under
				2142	the HttpProxy key.
				2143
				2144	"""
				2145	try:
				2146	import ic
				2147	except ImportError:
				2148	return {}
				2149
				2150	try:
				2151	config = ic.IC()
				2152	except ic.error:
				2153	return {}
				2154	proxies = {}
				2155	# HTTP:
				2156	if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
				2157	try:
				2158	value = config['HTTPProxyHost']
				2159	except ic.error:
				2160	pass
				2161	else:
				2162	proxies['http'] = 'http://%s' % value
				2163	# FTP: XXX To be done.
				2164	# Gopher: XXX To be done.
				2165	return proxies
				2166
				2167	def proxy_bypass(host):
				2168	if getproxies_environment():
				2169	return proxy_bypass_environment(host)
				2170	else:
				2171	return 0
				2172
				2173	def getproxies():
				2174	return getproxies_environment() or getproxies_internetconfig()
				2175
				2176	elif os.name == 'nt':
				2177	def getproxies_registry():
				2178	"""Return a dictionary of scheme -> proxy server URL mappings.
				2179
				2180	Win32 uses the registry to store proxies.
				2181
				2182	"""
				2183	proxies = {}
				2184	try:
Georg Brandl	4ed72ac	2009-04-01 04:28:33 +0000	[diff] [blame]	2185	import winreg
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2186	except ImportError:
				2187	# Std module, so should be around - but you never know!
				2188	return proxies
				2189	try:
Georg Brandl	4ed72ac	2009-04-01 04:28:33 +0000	[diff] [blame]	2190	internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2191	r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl	4ed72ac	2009-04-01 04:28:33 +0000	[diff] [blame]	2192	proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2193	'ProxyEnable')[0]
				2194	if proxyEnable:
				2195	# Returned as Unicode but problems if not converted to ASCII
Georg Brandl	4ed72ac	2009-04-01 04:28:33 +0000	[diff] [blame]	2196	proxyServer = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2197	'ProxyServer')[0])
				2198	if '=' in proxyServer:
				2199	# Per-protocol settings
				2200	for p in proxyServer.split(';'):
				2201	protocol, address = p.split('=', 1)
				2202	# See if address has a type:// prefix
				2203	import re
				2204	if not re.match('^([^/:]+)://', address):
				2205	address = '%s://%s' % (protocol, address)
				2206	proxies[protocol] = address
				2207	else:
				2208	# Use one setting for all protocols
				2209	if proxyServer[:5] == 'http:':
				2210	proxies['http'] = proxyServer
				2211	else:
				2212	proxies['http'] = 'http://%s' % proxyServer
				2213	proxies['ftp'] = 'ftp://%s' % proxyServer
				2214	internetSettings.Close()
				2215	except (WindowsError, ValueError, TypeError):
				2216	# Either registry key not found etc, or the value in an
				2217	# unexpected format.
				2218	# proxies already set up to be empty so nothing to do
				2219	pass
				2220	return proxies
				2221
				2222	def getproxies():
				2223	"""Return a dictionary of scheme -> proxy server URL mappings.
				2224
				2225	Returns settings gathered from the environment, if specified,
				2226	or the registry.
				2227
				2228	"""
				2229	return getproxies_environment() or getproxies_registry()
				2230
				2231	def proxy_bypass_registry(host):
				2232	try:
Georg Brandl	4ed72ac	2009-04-01 04:28:33 +0000	[diff] [blame]	2233	import winreg
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2234	import re
				2235	except ImportError:
				2236	# Std modules, so should be around - but you never know!
				2237	return 0
				2238	try:
Georg Brandl	4ed72ac	2009-04-01 04:28:33 +0000	[diff] [blame]	2239	internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2240	r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl	4ed72ac	2009-04-01 04:28:33 +0000	[diff] [blame]	2241	proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2242	'ProxyEnable')[0]
Georg Brandl	4ed72ac	2009-04-01 04:28:33 +0000	[diff] [blame]	2243	proxyOverride = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2244	'ProxyOverride')[0])
				2245	# ^^^^ Returned as Unicode but problems if not converted to ASCII
				2246	except WindowsError:
				2247	return 0
				2248	if not proxyEnable or not proxyOverride:
				2249	return 0
				2250	# try to make a host list from name and IP address.
Georg Brandl	13e8946	2008-07-01 19:56:00 +0000	[diff] [blame]	2251	rawHost, port = splitport(host)
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2252	host = [rawHost]
				2253	try:
				2254	addr = socket.gethostbyname(rawHost)
				2255	if addr != rawHost:
				2256	host.append(addr)
				2257	except socket.error:
				2258	pass
				2259	try:
				2260	fqdn = socket.getfqdn(rawHost)
				2261	if fqdn != rawHost:
				2262	host.append(fqdn)
				2263	except socket.error:
				2264	pass
				2265	# make a check value list from the registry entry: replace the
				2266	# '<local>' string by the localhost entry and the corresponding
				2267	# canonical entry.
				2268	proxyOverride = proxyOverride.split(';')
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2269	# now check if we match one of the registry values.
				2270	for test in proxyOverride:
Senthil Kumaran	4947606	2009-05-01 06:00:23 +0000	[diff] [blame]	2271	if test == '<local>':
				2272	if '.' not in rawHost:
				2273	return 1
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2274	test = test.replace(".", r"\.") # mask dots
				2275	test = test.replace("", r".") # change glob sequence
				2276	test = test.replace("?", r".") # change glob char
				2277	for val in host:
				2278	# print "%s <--> %s" %( test, val )
				2279	if re.match(test, val, re.I):
				2280	return 1
				2281	return 0
				2282
				2283	def proxy_bypass(host):
				2284	"""Return a dictionary of scheme -> proxy server URL mappings.
				2285
				2286	Returns settings gathered from the environment, if specified,
				2287	or the registry.
				2288
				2289	"""
				2290	if getproxies_environment():
				2291	return proxy_bypass_environment(host)
				2292	else:
				2293	return proxy_bypass_registry(host)
				2294
				2295	else:
				2296	# By default use environment variables
				2297	getproxies = getproxies_environment
				2298	proxy_bypass = proxy_bypass_environment