blob: 3776536d387e1a5350e9090ac8f8f177878d8d4d [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
5below). It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
urllib. Pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back. One difference is that you can also pass
20a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers. Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate. If one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
33OpenerDirector --
34
35Request -- An object that encapsulates the state of a request. The
36state can be as simple as the URL. It can also include extra HTTP
37headers, e.g. a User-Agent.
38
39BaseHandler --
40
41internals:
42BaseHandler and parent
43_call_chain conventions
44
45Example usage:
46
Georg Brandl029986a2008-06-23 11:44:14 +000047import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000048
49# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000050authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000051authinfo.add_password(realm='PDQ Application',
52 uri='https://mahler:8092/site-updates.py',
53 user='klem',
54 passwd='geheim$parole')
55
Georg Brandl029986a2008-06-23 11:44:14 +000056proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000057
58# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000059opener = urllib.request.build_opener(proxy_support, authinfo,
60 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000061
62# install it
Georg Brandl029986a2008-06-23 11:44:14 +000063urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000064
Georg Brandl029986a2008-06-23 11:44:14 +000065f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066"""
67
68# XXX issues:
69# If an authentication error handler that tries to perform
70# authentication for some reason but fails, how should the error be
71# signalled? The client needs to know the HTTP error code. But if
72# the handler knows that the problem was, e.g., that it didn't know
# the hash algorithm requested in the challenge, it would be good to
74# pass that information along to the client, too.
75# ftp errors aren't handled cleanly
76# check digest against correct (i.e. non-apache) implementation
77
78# Possible extensions:
79# complex proxies XXX not sure what exactly was meant by this
80# abstract factory for opener
81
82import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000083import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000084import email
85import hashlib
86import http.client
87import io
88import os
89import posixpath
90import random
91import re
92import socket
93import sys
94import time
Jeremy Hylton1afc1692008-06-18 20:49:58 +000095
Georg Brandl13e89462008-07-01 19:56:00 +000096from urllib.error import URLError, HTTPError, ContentTooShortError
97from urllib.parse import (
98 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
99 splittype, splithost, splitport, splituser, splitpasswd,
Facundo Batistaf24802c2008-08-17 03:36:03 +0000100 splitattr, splitquery, splitvalue, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000101from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000102
103# check for SSL
104try:
105 import ssl
106except:
107 _have_ssl = False
108else:
109 _have_ssl = True
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000110
111# used in User-Agent header sent
112__version__ = sys.version[:3]
113
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Open *url* (a URL string or a Request object) and return the response.

    Uses the module-wide opener, creating a default one with
    build_opener() on first use unless install_opener() has already
    installed a custom opener.
    """
    global _opener
    opener = _opener
    if opener is None:
        # Lazily build and cache the shared default opener.
        opener = _opener = build_opener()
    return opener.open(url, data, timeout)
120
def install_opener(opener):
    """Install *opener* as the module-wide opener used by urlopen()."""
    global _opener
    _opener = opener
124
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* to a local file via the legacy FancyURLopener.

    Delegates to FancyURLopener.retrieve; see that method for the
    meaning of *filename*, *reporthook*, *data* and the return value.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        # Create and cache the legacy opener on first use.
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
132
def urlcleanup():
    """Clean up module-global opener state created by urlretrieve()/urlopen()."""
    if _urlopener:
        # Let the legacy opener remove any temporary files it created.
        _urlopener.cleanup()
    global _opener
    if _opener:
        # Drop the cached default opener; urlopen() will rebuild it.
        _opener = None
139
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.
    """
    host = urlparse(request.full_url)[1]
    if not host:
        # URL had no authority component; fall back to the Host header.
        host = request.get_header("Host", "")
    # remove port, if present
    return _cut_port_re.sub("", host, 1).lower()
157
class Request:
    """Encapsulate the state of a single URL request.

    Holds the URL, optional request body (*data*), normal and
    unredirected headers, and RFC 2965 bookkeeping
    (*origin_req_host*, *unverifiable*) used by cookie handling.
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.data = data
        self.headers = {}
        # *headers* defaults to None instead of a shared mutable dict
        # (mutable-default anti-pattern); keys are normalized through
        # add_header so lookups are consistent.
        for key, value in (headers or {}).items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        # Split the URL into scheme (self.type), host and selector
        # (path plus query); raise if no scheme is present.
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return the HTTP method: POST when a body is present, else GET."""
        return "POST" if self.data is not None else "GET"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Direct this request through a proxy: the selector becomes the
        full URL, which is how proxy requests are formed."""
        self.host, self.type = host, type
        self.selector = self.full_url

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication; keys are
        # capitalized so repeated adds overwrite rather than duplicate
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be copied onto a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        # normal headers take precedence over unredirected ones
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return all headers as a list of (name, value) pairs."""
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
248
class OpenerDirector:
    """Manage a chain of handlers and use them to open URLs.

    Handlers are registered with add_handler(); open() routes each
    request through the matching request pre-processors, exactly one
    protocol handler, and the matching response post-processors.
    Error responses are dispatched through error().
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        # Default headers attached to every request.
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []            # all added handlers, sorted by handler_order
        self.handle_open = {}         # scheme -> [handlers with <scheme>_open]
        self.handle_error = {}        # scheme -> {code or 'default': [handlers]}
        self.process_response = {}    # scheme -> [response post-processors]
        self.process_request = {}     # scheme -> [request pre-processors]

    def add_handler(self, handler):
        """Register *handler*, indexing it by its protocol methods.

        Method names of the form <protocol>_open, <protocol>_request,
        <protocol>_response and <protocol>_error_<code> determine which
        lookup tables the handler is entered into.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # Split e.g. "http_open" into protocol "http", condition "open".
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # "http_error_404" -> kind 404; non-numeric kinds such
                # as "default" stay strings.
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                # Keep each chain sorted (BaseHandler.__lt__ compares
                # handler_order).
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in a specific order, the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (URL string or Request) and return the response.

        Runs <protocol>_request pre-processors, dispatches via _open(),
        then runs <protocol>_response post-processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # default_open handlers get first crack, then the protocol's
        # own handlers, then unknown_open as a last resort.
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error response to the <proto> error chain.

        For HTTP(S), *args* is (request, response, code, msg, headers)
        and the chain for the specific status code is tried first, with
        http_error_default as the fallback.
        """
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!  args[2] is the HTTP status code.
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # No handler for the specific status code; fall back to the
            # catch-all chain.
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
387
388# XXX probably also want an abstract factory that knows when it makes
389# sense to skip a superclass in favor of a subclass and when it might
390# make sense to include both
391
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # A default class is skipped when the caller supplied a subclass of
    # it, or an instance of such a subclass.
    skip = {klass for klass in default_classes
            for check in handlers
            if (issubclass(check, klass) if isclass(check)
                else isinstance(check, klass))}
    remaining = [klass for klass in default_classes if klass not in skip]

    for klass in remaining:
        opener.add_handler(klass())
    for handler in handlers:
        # Instantiate classes; instances are added as-is.
        opener.add_handler(handler() if isclass(handler) else handler)
    return opener
429
class BaseHandler:
    """Common base class for all handlers managed by an OpenerDirector."""

    # Sort key within a handler chain; lower values run earlier.
    handler_order = 500

    def add_parent(self, parent):
        """Record the OpenerDirector that owns this handler."""
        self.parent = parent

    def close(self):
        """Compatibility no-op; handlers hold no closable resources."""
        pass

    def __lt__(self, other):
        """Order handlers by handler_order for bisect.insort."""
        if hasattr(other, "handler_order"):
            return self.handler_order < other.handler_order
        # Try to preserve the old behavior of having custom classes
        # inserted after default ones (works only for custom user
        # classes which are not aware of handler_order).
        return True
447
448
449class HTTPErrorProcessor(BaseHandler):
450 """Process HTTP error responses."""
451 handler_order = 1000 # after all other processing
452
453 def http_response(self, request, response):
454 code, msg, hdrs = response.code, response.msg, response.info()
455
456 # According to RFC 2616, "2xx" code indicates that the client's
457 # request was successfully received, understood, and accepted.
458 if not (200 <= code < 300):
459 response = self.parent.error(
460 'http', request, response, code, msg, hdrs)
461
462 return response
463
464 https_response = http_response
465
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: surface unhandled HTTP errors as HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        # No more specific handler dealt with this status code.
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000469
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 redirects, with loop protection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        # Only redirect GET/HEAD for any of the 30x codes, and POST for
        # 301/302/303; anything else is refused as an HTTPError.
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # The redirected request carries no body, so drop the headers
        # that describe one.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Handle 302 (also 301/303/307) by opening the new location."""
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL (empty path component)
        urlparts = urlparse(newurl)
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        # Resolve a relative redirect target against the original URL.
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
562
563
564def _parse_proxy(proxy):
565 """Return (scheme, user, password, host/port) given a URL or an authority.
566
567 If a URL is supplied, it must have an authority (host:port) component.
568 According to RFC 3986, having an authority component means the URL must
569 have two slashes after the scheme:
570
571 >>> _parse_proxy('file:/ftp.example.com/')
572 Traceback (most recent call last):
573 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
574
575 The first three items of the returned tuple may be None.
576
577 Examples of authority parsing:
578
579 >>> _parse_proxy('proxy.example.com')
580 (None, None, None, 'proxy.example.com')
581 >>> _parse_proxy('proxy.example.com:3128')
582 (None, None, None, 'proxy.example.com:3128')
583
584 The authority component may optionally include userinfo (assumed to be
585 username:password):
586
587 >>> _parse_proxy('joe:password@proxy.example.com')
588 (None, 'joe', 'password', 'proxy.example.com')
589 >>> _parse_proxy('joe:password@proxy.example.com:3128')
590 (None, 'joe', 'password', 'proxy.example.com:3128')
591
592 Same examples, but with URLs instead:
593
594 >>> _parse_proxy('http://proxy.example.com/')
595 ('http', None, None, 'proxy.example.com')
596 >>> _parse_proxy('http://proxy.example.com:3128/')
597 ('http', None, None, 'proxy.example.com:3128')
598 >>> _parse_proxy('http://joe:password@proxy.example.com/')
599 ('http', 'joe', 'password', 'proxy.example.com')
600 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
601 ('http', 'joe', 'password', 'proxy.example.com:3128')
602
603 Everything after the authority is ignored:
604
605 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
606 ('ftp', 'joe', 'password', 'proxy.example.com')
607
608 Test for no trailing '/' case:
609
610 >>> _parse_proxy('http://joe:password@proxy.example.com')
611 ('http', 'joe', 'password', 'proxy.example.com')
612
613 """
Georg Brandl13e89462008-07-01 19:56:00 +0000614 scheme, r_scheme = splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000615 if not r_scheme.startswith("/"):
616 # authority
617 scheme = None
618 authority = proxy
619 else:
620 # URL
621 if not r_scheme.startswith("//"):
622 raise ValueError("proxy URL with no authority: %r" % proxy)
623 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
624 # and 3.3.), path is empty or starts with '/'
625 end = r_scheme.find("/", 2)
626 if end == -1:
627 end = None
628 authority = r_scheme[2:end]
Georg Brandl13e89462008-07-01 19:56:00 +0000629 userinfo, hostport = splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000630 if userinfo is not None:
Georg Brandl13e89462008-07-01 19:56:00 +0000631 user, password = splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000632 else:
633 user = password = None
634 return scheme, user, password, hostport
635
class ProxyHandler(BaseHandler):
    """Rewrite requests so they are sent through the configured proxies."""

    # Proxies must be in front of every other handler.
    handler_order = 100

    def __init__(self, proxies=None):
        """Accept a mapping {scheme: proxy URL}; default to getproxies()."""
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Install one <scheme>_open method per configured proxy.  The
        # default arguments freeze the per-scheme values at definition
        # time (avoids the late-binding closure pitfall).
        for scheme, proxy_url in proxies.items():
            def proxy_opener(r, proxy=proxy_url, type=scheme,
                             meth=self.proxy_open):
                return meth(r, proxy, type)
            setattr(self, '%s_open' % scheme, proxy_opener)

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*; restart dispatch if the scheme changed."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        req.set_proxy(unquote(hostport), proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            return None
        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req)
673
class HTTPPasswordMgr:
    """Store (user, password) credentials keyed by realm and URI prefix."""

    def __init__(self):
        # realm -> {(reduced URI, ...): (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register *user*/*passwd* for *realm* at *uri* (a str or sequence)."""
        # uri could be a single URI or a sequence of them
        if isinstance(uri, str):
            uri = [uri]
        domains = self.passwd.setdefault(realm, {})
        # Index under both the port-normalized and the raw authority so
        # lookups match whether or not the default port is spelled out.
        for default_port in (True, False):
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uri)
            domains[reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for *authuri* in *realm*, else (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in (True, False):
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            # Normalize by appending the scheme's well-known port.
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced (authority, path) form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # NOTE: commonprefix is character-wise, not path-segment-wise;
        # kept as-is for backward compatibility.
        common = posixpath.commonprefix((base[1], test[1]))
        return len(common) == len(base[1])
736
737
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to a wildcard (None) realm."""

    def find_user_password(self, realm, authuri):
        """Look up realm-specific credentials, then the default realm."""
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is None:
            # Nothing registered for this realm; try the catch-all entry.
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
746
747
class AbstractBasicAuthHandler:
    """Shared machinery for Basic auth against servers (401) and proxies (407)."""

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        """Use *password_mgr* for credentials, or a fresh HTTPPasswordMgr."""
        self.passwd = HTTPPasswordMgr() if password_mgr is None else password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Parse the challenge header named *authreq* and retry with Basic auth.

        host may be an authority (without userinfo) or a URL with an
        authority.
        """
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        mo = AbstractBasicAuthHandler.rx.search(challenge)
        if not mo:
            return None
        scheme, quote, realm = mo.groups()
        if scheme.lower() != 'basic':
            return None
        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send *req* with an Authorization header, or give up (None)."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        credentials = "%s:%s" % (user, pw)
        auth = "Basic " + base64.b64encode(credentials.encode()).decode("ascii")
        if req.headers.get(self.auth_header, None) == auth:
            # These exact credentials were already sent and rejected.
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
791
792
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 401 responses using HTTP Basic authentication."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # Retry against the full request URL with any stored credentials.
        return self.http_error_auth_reqed('www-authenticate',
                                          req.full_url, req, headers)
801
802
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 407 responses from proxies using Basic authentication."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib.request does
        # not (and should not, RFC 3986 s. 3.2.1) support requests for URLs
        # containing userinfo.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          req.host, req, headers)
815
816
def randombytes(n):
    """Return n random bytes drawn from the OS entropy source."""
    return os.urandom(n)
820
class AbstractDigestAuthHandler:
    """Shared machinery for HTTP Digest access authentication.

    Digest authentication is specified in RFC 2617.  Subclasses provide
    ``auth_header`` (the request header to fill in) and hook
    http_error_auth_reqed() up to the relevant HTTP error code.
    """

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        # passwd: an HTTPPasswordMgr-compatible credential store.
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0       # attempts made for the current challenge
        self.nonce_count = 0   # "nc" counter, incremented per request (RFC 2617)

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry the request with Digest credentials, if the server
        issued a Digest challenge; give up after several failures."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        # auth is the full challenge header value, e.g. 'Digest realm=...'.
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # The very same credentials already failed; don't loop.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization header value from challenge
        dict *chal*, or return None if it cannot be answered."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            # Unrecognized algorithm: decline the challenge.
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop == 'auth':
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) digest helpers for *algorithm*, or (None, None)
        when the algorithm is not recognized.

        Bug fix: the original left H unbound for unknown algorithms, so
        building KD raised UnboundLocalError instead of letting
        get_authorization() decline via its ``if H is None`` check.
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
952
953
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069.

    Digest authentication improves on basic authentication because the
    password is never transmitted in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        netloc = urlparse(req.full_url)[1]
        answer = self.http_error_auth_reqed('www-authenticate',
                                            netloc, req, headers)
        self.reset_retry_count()
        return answer
970
971
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication against an HTTP proxy (407 responses)."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        proxy_host = req.host
        answer = self.http_error_auth_reqed('proxy-authenticate',
                                            proxy_host, req, headers)
        self.reset_retry_count()
        return answer
983
class AbstractHTTPHandler(BaseHandler):
    """Shared machinery for HTTPHandler/HTTPSHandler: request fixup and I/O."""

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        # Normalize the request before sending: default Content-type /
        # Content-length for POST bodies, the Host header, and any
        # opener-wide headers the caller did not set explicitly.
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        sel_host = host
        if request.has_proxy():
            # When proxied, the selector is a full URL; Host must name
            # the origin server, not the proxy.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout) # will parse host:port
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Title-case header names so duplicate spellings collapse.
        headers = dict((name.title(), val) for name, val in headers.items())
        try:
            h.request(req.get_method(), req.selector, req.data, headers)
            r = h.getresponse() # an HTTPResponse instance
        except socket.error as err:
            raise URLError(err)

        r.url = req.full_url
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001058
1059
class HTTPHandler(AbstractHTTPHandler):
    # Plain-HTTP handler; all real work happens in AbstractHTTPHandler.

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_
1066
if hasattr(http.client, 'HTTPSConnection'):
    # HTTPSConnection exists only when the interpreter was built with
    # SSL support, so the https handler is defined conditionally.
    class HTTPSHandler(AbstractHTTPHandler):

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_
1074
class HTTPCookieProcessor(BaseHandler):
    """Handler that stores cookies from responses and attaches the
    matching ones to later requests, via an http.cookiejar.CookieJar."""

    def __init__(self, cookiejar=None):
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        # Attach any stored cookies that apply to this request.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Harvest Set-Cookie headers from the response.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1092
class UnknownHandler(BaseHandler):
    """Catch-all handler: any scheme no other handler claims is an error."""

    def unknown_open(self, req):
        scheme = req.type
        raise URLError('unknown url type: %s' % scheme)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001097
def parse_keqv_list(l):
    """Parse a list of key=value strings where keys are not duplicated.

    Surrounding double quotes are stripped from values.  Robustness fix:
    the original indexed v[0]/v[-1] unconditionally and raised IndexError
    on an empty value (e.g. 'key='); a value shorter than two characters
    is now kept as-is.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Strip quotes only when the value is a complete quoted string.
        if len(v) > 1 and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1107
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    Splits a comma-separated list whose elements may be quoted-strings;
    a quoted-string may itself contain commas, and backslash escapes are
    honoured inside quotes.  Only double quotes count, and they are left
    in place in the result.  Each element is returned stripped.
    """
    items = []
    buf = ''
    pending_escape = False
    in_quotes = False

    for ch in s:
        if pending_escape:
            # Previous char was a backslash inside quotes: take ch literally.
            buf += ch
            pending_escape = False
        elif in_quotes:
            if ch == '\\':
                pending_escape = True
            else:
                if ch == '"':
                    in_quotes = False
                buf += ch
        elif ch == ',':
            # Separator at top level: close out the current element.
            items.append(buf)
            buf = ''
        else:
            if ch == '"':
                in_quotes = True
            buf += ch

    # append last part
    if buf:
        items.append(buf)

    return [item.strip() for item in items]
1150
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/':
            # file://host/... with an authority component: retry as FTP.
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        # Cached on the class: IP addresses that count as "this machine".
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                    socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        import email.utils
        import mimetypes
        host = req.host
        file = req.selector
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(file)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            # Serve the file only when the URL names no host, or names
            # this machine without an explicit port.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                return addinfourl(open(localfile, 'rb'), headers, 'file:'+file)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001196
1197def _safe_gethostbyname(host):
1198 try:
1199 return socket.gethostbyname(host)
1200 except socket.gaierror:
1201 return None
1202
class FTPHandler(BaseHandler):
    # Handler for ftp:// URLs; opens a fresh connection per request
    # (CacheFTPHandler below overrides connect_ftp to reuse them).
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        # Drop the empty component left by the leading '/' of the path.
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Transfer mode: Image (binary) for a file, Directory listing
            # otherwise, unless an explicit ;type= attribute overrides it.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            # Re-raise any ftplib failure as a URLError, keeping the
            # original traceback for debugging.
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # Overridden by CacheFTPHandler to reuse connections.
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
        return fw
1260
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps a bounded, time-limited cache of connections."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}       # key -> ftpwrapper
        self.timeout = {}     # key -> absolute expiry time
        self.soonest = 0      # earliest expiry among cached connections
        self.delay = 60       # lifetime (seconds) granted per use
        self.max_conns = 16   # cache size bound

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            # Cache hit: just refresh the expiry time.
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce the size bound.

        Fixes vs. the original: min() was called on the timeout dict even
        after eviction emptied it (ValueError), and connections evicted
        by the size check were deleted without being closed (socket leak).
        """
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            self.soonest = min(self.timeout.values()) if self.timeout else 0

        # then check the size (>= so a lowered max_conns still trims)
        if len(self.cache) >= self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values()) if self.timeout else 0
1307
# Code move from the old urllib module

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Module-wide FTP connection cache shared by URLopener instances by
# default (see URLopener.__init__); not thread-safe.
ftpcache = {}
1336class URLopener:
1337 """Class to open URLs.
1338 This is a class rather than just a subroutine because we may need
1339 more than one set of global protocol-specific options.
1340 Note -- this is a base class for those who don't want the
1341 automatic handling of errors type 302 (relocated) and 401
1342 (authorization needed)."""
1343
1344 __tempfiles = None
1345
1346 version = "Python-urllib/%s" % __version__
1347
1348 # Constructor
    def __init__(self, proxies=None, **x509):
        # proxies: mapping of scheme -> proxy URL; defaults to the
        # environment's proxy settings.  x509 may carry key_file /
        # cert_file for client-side SSL certificates.
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve(). This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe. Bah.
1371
    def __del__(self):
        # Best-effort cleanup when the opener is garbage collected.
        self.close()
1374
    def close(self):
        # Public counterpart of __del__: remove temp files, clear caches.
        self.cleanup()
1377
    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.  (os.unlink was bound to self.__unlink
        # in __init__ for exactly this reason.)
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()
1391
    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        # Stored as a (name, value) tuple and sent with every HTTP(S)
        # request made through this opener.
        self.addheaders.append(args)
1396
1397 # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(to_bytes(fullurl))
        # Serve from the (optional, opt-in) retrieve() cache when enabled.
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        # Dispatch to the scheme-specific open_<scheme>() method;
        # '-' is not valid in identifiers, so map it to '_'.
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error as msg:
            raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1430
1431 def open_unknown(self, fullurl, data=None):
1432 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001433 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001434 raise IOError('url error', 'unknown url type', type)
1435
1436 def open_unknown_proxy(self, proxy, fullurl, data=None):
1437 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001438 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001439 raise IOError('url error', 'invalid proxy for %s' % type, proxy)
1440
1441 # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        # A local file needs no copy: hand back its own path.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError as msg:
                # Fall through and fetch it like a remote object.
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target given: download into a temp file whose suffix
                # is taken from the URL's path component.
                import tempfile
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if reporthook:
                    if "content-length" in headers:
                        size = int(headers["Content-Length"])
                    # Initial callback before the first block is read.
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result
1507
1508 # Each method named open_<type> knows how to open that type of URL
1509
1510 def _open_generic_http(self, connection_factory, url, data):
1511 """Make an HTTP connection using connection_class.
1512
1513 This is an internal method that should be called from
1514 open_http() or open_https().
1515
1516 Arguments:
1517 - connection_factory should take a host name and return an
1518 HTTPConnection instance.
1519 - url is the url to retrieval or a host, relative-path pair.
1520 - data is payload for a POST request or None.
1521 """
1522
1523 user_passwd = None
1524 proxy_passwd= None
1525 if isinstance(url, str):
Georg Brandl13e89462008-07-01 19:56:00 +00001526 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001527 if host:
Georg Brandl13e89462008-07-01 19:56:00 +00001528 user_passwd, host = splituser(host)
1529 host = unquote(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001530 realhost = host
1531 else:
1532 host, selector = url
1533 # check whether the proxy contains authorization information
Georg Brandl13e89462008-07-01 19:56:00 +00001534 proxy_passwd, host = splituser(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001535 # now we proceed with the url we want to obtain
Georg Brandl13e89462008-07-01 19:56:00 +00001536 urltype, rest = splittype(selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001537 url = rest
1538 user_passwd = None
1539 if urltype.lower() != 'http':
1540 realhost = None
1541 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001542 realhost, rest = splithost(rest)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001543 if realhost:
Georg Brandl13e89462008-07-01 19:56:00 +00001544 user_passwd, realhost = splituser(realhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001545 if user_passwd:
1546 selector = "%s://%s%s" % (urltype, realhost, rest)
1547 if proxy_bypass(realhost):
1548 host = realhost
1549
1550 #print "proxy via http:", host, selector
1551 if not host: raise IOError('http error', 'no host given')
1552
1553 if proxy_passwd:
1554 import base64
1555 proxy_auth = base64.b64encode(proxy_passwd).strip()
1556 else:
1557 proxy_auth = None
1558
1559 if user_passwd:
1560 import base64
1561 auth = base64.b64encode(user_passwd).strip()
1562 else:
1563 auth = None
1564 http_conn = connection_factory(host)
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +00001565## # XXX We should fix urllib so that it works with HTTP/1.1.
1566## http_conn._http_vsn = 10
1567## http_conn._http_vsn_str = "HTTP/1.0"
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001568
1569 headers = {}
1570 if proxy_auth:
1571 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1572 if auth:
1573 headers["Authorization"] = "Basic %s" % auth
1574 if realhost:
1575 headers["Host"] = realhost
1576 for header, value in self.addheaders:
1577 headers[header] = value
1578
1579 if data is not None:
1580 headers["Content-Type"] = "application/x-www-form-urlencoded"
1581 http_conn.request("POST", selector, data, headers)
1582 else:
1583 http_conn.request("GET", selector, headers=headers)
1584
1585 try:
1586 response = http_conn.getresponse()
1587 except http.client.BadStatusLine:
1588 # something went wrong with the HTTP status line
Georg Brandl13e89462008-07-01 19:56:00 +00001589 raise URLError("http protocol error: bad status line")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001590
1591 # According to RFC 2616, "2xx" code indicates that the client's
1592 # request was successfully received, understood, and accepted.
1593 if 200 <= response.status < 300:
Antoine Pitroub353c122009-02-11 00:39:14 +00001594 return addinfourl(response, response.msg, "http:" + url,
Georg Brandl13e89462008-07-01 19:56:00 +00001595 response.status)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001596 else:
1597 return self.http_error(
1598 url, response.fp,
1599 response.status, response.reason, response.msg, data)
1600
    def open_http(self, url, data=None):
        """Use HTTP protocol.

        url is either a URL string or a host/relative-path pair (the
        proxy case); data, if not None, is POSTed.  See
        _open_generic_http for details.
        """
        return self._open_generic_http(http.client.HTTPConnection, url, data)
1604
1605 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1606 """Handle http errors.
1607
1608 Derived class can override this, or provide specific handlers
1609 named http_error_DDD where DDD is the 3-digit error code."""
1610 # First check if there's a specific handler for this error
1611 name = 'http_error_%d' % errcode
1612 if hasattr(self, name):
1613 method = getattr(self, name)
1614 if data is None:
1615 result = method(url, fp, errcode, errmsg, headers)
1616 else:
1617 result = method(url, fp, errcode, errmsg, headers, data)
1618 if result: return result
1619 return self.http_error_default(url, fp, errcode, errmsg, headers)
1620
1621 def http_error_default(self, url, fp, errcode, errmsg, headers):
1622 """Default error handler: close the connection and raise IOError."""
1623 void = fp.read()
1624 fp.close()
Georg Brandl13e89462008-07-01 19:56:00 +00001625 raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001626
    if _have_ssl:
        # These methods exist only when the interpreter has SSL support.
        def _https_connection(self, host):
            # Connection factory that threads the opener's client
            # certificate settings into the HTTPSConnection.
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)
1636
1637 def open_file(self, url):
1638 """Use local file or FTP depending on form of URL."""
1639 if not isinstance(url, str):
1640 raise URLError('file error', 'proxy support for file protocol currently not implemented')
1641 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
1642 return self.open_ftp(url)
1643 else:
1644 return self.open_local_file(url)
1645
    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, email.utils
        from io import StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.errno, e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        # Hostless URL: serve directly.
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = splitport(host)
        # A host is acceptable only if it names this machine and carries
        # no explicit port.
        if (not port
            and socket.gethostbyname(host) in (localhost(), thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error', 'not on local host')
1675
1676 def open_ftp(self, url):
1677 """Use FTP protocol."""
1678 if not isinstance(url, str):
1679 raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
1680 import mimetypes
1681 from io import StringIO
Georg Brandl13e89462008-07-01 19:56:00 +00001682 host, path = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001683 if not host: raise URLError('ftp error', 'no host given')
Georg Brandl13e89462008-07-01 19:56:00 +00001684 host, port = splitport(host)
1685 user, host = splituser(host)
1686 if user: user, passwd = splitpasswd(user)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001687 else: passwd = None
Georg Brandl13e89462008-07-01 19:56:00 +00001688 host = unquote(host)
1689 user = unquote(user or '')
1690 passwd = unquote(passwd or '')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001691 host = socket.gethostbyname(host)
1692 if not port:
1693 import ftplib
1694 port = ftplib.FTP_PORT
1695 else:
1696 port = int(port)
Georg Brandl13e89462008-07-01 19:56:00 +00001697 path, attrs = splitattr(path)
1698 path = unquote(path)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001699 dirs = path.split('/')
1700 dirs, file = dirs[:-1], dirs[-1]
1701 if dirs and not dirs[0]: dirs = dirs[1:]
1702 if dirs and not dirs[0]: dirs[0] = '/'
1703 key = user, host, port, '/'.join(dirs)
1704 # XXX thread unsafe!
1705 if len(self.ftpcache) > MAXFTPCACHE:
1706 # Prune the cache, rather arbitrarily
1707 for k in self.ftpcache.keys():
1708 if k != key:
1709 v = self.ftpcache[k]
1710 del self.ftpcache[k]
1711 v.close()
1712 try:
1713 if not key in self.ftpcache:
1714 self.ftpcache[key] = \
1715 ftpwrapper(user, passwd, host, port, dirs)
1716 if not file: type = 'D'
1717 else: type = 'I'
1718 for attr in attrs:
Georg Brandl13e89462008-07-01 19:56:00 +00001719 attr, value = splitvalue(attr)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001720 if attr.lower() == 'type' and \
1721 value in ('a', 'A', 'i', 'I', 'd', 'D'):
1722 type = value.upper()
1723 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
1724 mtype = mimetypes.guess_type("ftp:" + url)[0]
1725 headers = ""
1726 if mtype:
1727 headers += "Content-Type: %s\n" % mtype
1728 if retrlen is not None and retrlen >= 0:
1729 headers += "Content-Length: %d\n" % retrlen
1730 headers = email.message_from_string(headers)
Georg Brandl13e89462008-07-01 19:56:00 +00001731 return addinfourl(fp, headers, "ftp:" + url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001732 except ftperrors() as msg:
1733 raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1734
1735 def open_data(self, url, data=None):
1736 """Use "data" URL."""
1737 if not isinstance(url, str):
1738 raise URLError('data error', 'proxy support for data protocol currently not implemented')
1739 # ignore POSTed data
1740 #
1741 # syntax of data URLs:
1742 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
1743 # mediatype := [ type "/" subtype ] *( ";" parameter )
1744 # data := *urlchar
1745 # parameter := attribute "=" value
1746 try:
1747 [type, data] = url.split(',', 1)
1748 except ValueError:
1749 raise IOError('data error', 'bad data URL')
1750 if not type:
1751 type = 'text/plain;charset=US-ASCII'
1752 semi = type.rfind(';')
1753 if semi >= 0 and '=' not in type[semi:]:
1754 encoding = type[semi+1:]
1755 type = type[:semi]
1756 else:
1757 encoding = ''
1758 msg = []
1759 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
1760 time.gmtime(time.time())))
1761 msg.append('Content-type: %s' % type)
1762 if encoding == 'base64':
1763 import base64
1764 data = base64.decodestring(data)
1765 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001766 data = unquote(data)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001767 msg.append('Content-Length: %d' % len(data))
1768 msg.append('')
1769 msg.append(data)
1770 msg = '\n'.join(msg)
Georg Brandl13e89462008-07-01 19:56:00 +00001771 headers = email.message_from_string(msg)
1772 f = io.StringIO(msg)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001773 #f.fileno = None # needed for addinfourl
Georg Brandl13e89462008-07-01 19:56:00 +00001774 return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001775
1776
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps).

    Adds automatic following of HTTP redirects (301, 302, 303, 307) and
    interactive Basic authentication for 401/407 responses on top of
    URLopener.
    """

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}    # maps realm + '@' + host -> (user, passwd)
        self.tries = 0          # redirects followed for the current request
        self.maxtries = 10      # redirect limit before reporting an error

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            # Redirect loop: surface it as a 500 instead of recursing
            # forever.
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Follow a redirect to the URL given in the response headers.

        Returns None when no Location/URI header is present.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        # Drain and close the redirected response before reopening.
        fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        if not 'www-authenticate' in headers:
            # No challenge to answer.  Must *return* here: falling
            # through would index the missing header and raise KeyError.
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            # Only Basic auth is supported; anything else gets the
            # default treatment.
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        if not 'proxy-authenticate' in headers:
            # No challenge to answer; see http_error_401 for why the
            # return is required.
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Embed fresh credentials in the http proxy URL and retry."""
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any user:passwd@ prefix already embedded in the proxy.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Embed fresh credentials in the https proxy URL and retry."""
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Embed fresh credentials in the http URL itself and retry."""
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Embed fresh credentials in the https URL itself and retry."""
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return cached or freshly prompted (user, passwd) for host/realm."""
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
1964
1965
1966# Utility functions
1967
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The first lookup is cached in the module-level _localhost global.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
1975
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The first lookup is cached in the module-level _thishost global.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
1983
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported lazily and the result cached in _ftperrors.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
1992
_noheaders = None
def noheaders():
    """Return an empty email Message object (a shared, cached instance)."""
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2000
2001
2002# Utility classes
2003
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        # dirs is the sequence of path components to cwd through after
        # login.  timeout is handed straight to ftplib.FTP.connect;
        # NOTE(review): None presumably means the socket default -- confirm.
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        """(Re)connect, log in and cwd through self.dirs; clears busy flag."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start retrieving *file* (or a directory listing) over FTP.

        type 'd'/'D' forces a listing; any other value is sent as the
        FTP TYPE (e.g. 'I' binary, 'A' ASCII).  Returns a tuple of
        (file-like object that ends the transfer when closed, length or
        None).  Raises URLError on non-550 permission errors.
        """
        import ftplib
        # Only one transfer may be active per connection; finish any
        # previous one first.
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection may have gone stale; reconnect once
            # and retry the TYPE command.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 means "not a plain file"; fall through to try a
                # directory listing instead.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    # Always restore the original working directory.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'), self.endtransfer), conn[1])
    def endtransfer(self):
        """Finish the current transfer, if any, by reading the final reply."""
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            # Best effort: the connection may already be dead.
            pass

    def close(self):
        """End any active transfer and close the FTP connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2080
2081# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    suffix = '_proxy'
    proxies = {}
    for var, value in os.environ.items():
        var = var.lower()
        if value and var.endswith(suffix):
            proxies[var[:-len(suffix)]] = value
    return proxies
2097
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    if no_proxy == '*':
        # '*' is special-cased: always bypass the proxy.
        return 1
    hostonly, _ = splitport(host)   # strip any :port suffix
    # Bypass when the host ends with any of the listed DNS suffixes.
    for suffix in no_proxy.split(','):
        if suffix and (hostonly.endswith(suffix) or host.endswith(suffix)):
            return 1
    # otherwise, don't bypass
    return 0
2116
2117
if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            # Internet Config bindings unavailable; no proxies.
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXX To be done.
        # Gopher: XXX To be done.
        return proxies

    def proxy_bypass(host):
        """Bypass the proxy for *host* per the environment's no_proxy list."""
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return 0

    def getproxies():
        """Environment settings take precedence over Internet Config."""
        return getproxies_environment() or getproxies_internetconfig()

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            # The module is named winreg on Python 3 (the old Python 2
            # name _winreg no longer exists).
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        """Check *host* against the registry's ProxyOverride patterns."""
        try:
            import winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # now check if we match one of the registry values.
        for test in proxyOverride:
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Decide whether to bypass the proxy for *host*.

        Consults the environment's no_proxy setting when any
        <scheme>_proxy variables are present, and the registry's
        ProxyOverride list otherwise.  (The previous docstring was a
        copy-paste of getproxies' and described the wrong contract.)
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment