blob: 17936ee94848d20b42c5257b373a67445624380f [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
5below). It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
urlopen(url, data=None) -- Basic usage is the same as the original
urllib.  Pass the URL and, optionally, data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.
23
build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
Senthil Kumaran1107c5d2009-11-15 06:20:55 +000033
Senthil Kumaran47fff872009-12-20 07:10:31 +000034OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000036
37Request -- An object that encapsulates the state of a request. The
38state can be as simple as the URL. It can also include extra HTTP
39headers, e.g. a User-Agent.
40
41BaseHandler --
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
Georg Brandl029986a2008-06-23 11:44:14 +000049import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000050
51# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000052authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000053authinfo.add_password(realm='PDQ Application',
54 uri='https://mahler:8092/site-updates.py',
55 user='klem',
56 passwd='geheim$parole')
57
Georg Brandl029986a2008-06-23 11:44:14 +000058proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000059
60# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000061opener = urllib.request.build_opener(proxy_support, authinfo,
62 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000063
64# install it
Georg Brandl029986a2008-06-23 11:44:14 +000065urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066
Georg Brandl029986a2008-06-23 11:44:14 +000067f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000068"""
69
# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails for some reason, how should the error be signalled?  The
# client needs to know the HTTP error code.  But if the handler knows
# that the problem was, e.g., that it didn't know the hash algorithm
# requested in the challenge, it would be good to pass that
# information along to the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000085import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000086import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import random
93import re
94import socket
95import sys
96import time
Senthil Kumaran7bc0d872010-12-19 10:49:52 +000097import collections
Jeremy Hylton1afc1692008-06-18 20:49:58 +000098
Georg Brandl13e89462008-07-01 19:56:00 +000099from urllib.error import URLError, HTTPError, ContentTooShortError
100from urllib.parse import (
101 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
102 splittype, splithost, splitport, splituser, splitpasswd,
Senthil Kumarand95cc752010-08-08 11:27:53 +0000103 splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000104from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000105
106# check for SSL
107try:
108 import ssl
Senthil Kumaranc2958622010-11-22 04:48:26 +0000109except ImportError:
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000110 _have_ssl = False
111else:
112 _have_ssl = True
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000113
# Used in the User-Agent header sent by OpenerDirector.  Built from
# sys.version_info because slicing sys.version to three characters
# truncates two-digit minor versions ("3.10.x" would become "3.1").
__version__ = '%d.%d' % sys.version_info[:2]
116
117_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None):
    """Open url and return whatever the selected opener's open() returns.

    url, data and timeout are passed straight through to the opener.

    When cafile and/or capath is given, a one-off opener is built around
    an HTTPSHandler whose SSL context requires a certificate verified
    against those locations, and check_hostname=True is passed to the
    handler; the module-level opener is not touched.  Otherwise the opener
    stored in the module global _opener is used, and it is created with
    build_opener() (and cached) on first use.

    Raises ValueError if a CA location is given but the ssl module could
    not be imported.
    """
    global _opener
    if cafile or capath:
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        # This branch is only entered with a CA location, so certificate
        # verification and host name checking are always requested.
        context.verify_mode = ssl.CERT_REQUIRED
        context.load_verify_locations(cafile, capath)
        https_handler = HTTPSHandler(context=context, check_hostname=True)
        opener = build_opener(https_handler)
    elif _opener is None:
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000139
def install_opener(opener):
    """Store opener in the module-level _opener slot.

    urlopen() uses whatever is stored there when no CA file or path is
    supplied.
    """
    global _opener
    _opener = opener
143
144# TODO(jhylton): Make this work with the same global opener.
145_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Delegate to retrieve() on the shared FancyURLopener.

    The FancyURLopener is created the first time it is needed and kept in
    the module global _urlopener for later calls.
    """
    global _urlopener
    _urlopener = _urlopener or FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
151
def urlcleanup():
    """Call cleanup() on the shared _urlopener and forget the cached opener.

    Both steps are skipped when the corresponding global is unset.
    """
    global _opener
    if _urlopener:
        _urlopener.cleanup()
    if _opener:
        _opener = None
158
159# copied from cookielib.py
# copied from cookielib.py: matches a trailing ":<digits>" port suffix
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return the request-host of request, as defined by RFC 2965.

    The host is the network-location part of request.full_url; when that
    is empty the request's "Host" header is used instead (or "" if there
    is none).  A trailing port is removed.

    Variation from the RFC: the result is lowercased, for convenient
    comparison.
    """
    netloc = urlparse(request.full_url)[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # strip the port, if there is one, then normalise case
    return _cut_port_re.sub("", netloc, 1).lower()
176
class Request:
    """State of one URL request: target URL, payload and headers.

    On construction the URL is unwrapped, its fragment is discarded and
    the remainder is split into ``type`` (the scheme), ``host`` and
    ``selector``.  A URL without a scheme raises ValueError.
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False):
        """Build a request for url.

        data            -- payload; when not None, get_method() is "POST".
        headers         -- optional mapping of header names to values,
                           added through add_header(); None means none.
        origin_req_host -- defaults to request_host(self).
        unverifiable    -- stored as given.
        """
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        # keep only the part before '#'; the fragment is not stored
        self.full_url = splittag(self.full_url)[0]
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        # None (rather than a shared mutable {}) is the "no headers" default
        if headers is not None:
            for key, value in headers.items():
                self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        """Split full_url into type, host and selector."""
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return "POST" if data is set, otherwise "GET"."""
        if self.data is not None:
            return "POST"
        else:
            return "GET"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Redirect this request to the proxy at host.

        The first time an https request is proxied, the original host is
        remembered in _tunnel_host and type/selector are left alone.  In
        every other case the request takes the proxy's type and the full
        URL becomes the selector.
        """
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        """Return True when the selector is the full URL (see set_proxy)."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        """Store a header under key.capitalize()."""
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        """Store a header, under key.capitalize(), in unredirected_hdrs."""
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if header_name is stored in either header dict.

        The name is looked up as given, so it must use the stored
        (capitalize()d) spelling.
        """
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Return the header's value, preferring the regular headers.

        Falls back to unredirected_hdrs, then to default.  The name is
        looked up as given.
        """
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return (name, value) pairs; regular headers win on conflicts."""
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
273
class OpenerDirector:
    """Dispatch requests and errors to the handlers registered with it.

    Handlers are indexed by the names of their methods:
    <protocol>_open, <protocol>_request, <protocol>_response and
    <protocol>_error_<kind>.
    """

    def __init__(self):
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Index handler under every hook method name it exposes.

        Raises TypeError if handler has no add_parent attribute.  A
        handler that exposes no recognised hook is ignored.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        # conditions whose lookup key is simply the protocol
        by_condition = {
            "open": self.handle_open,
            "response": self.process_response,
            "request": self.process_request,
        }

        registered = False
        for name in dir(handler):
            if name in ("redirect_request", "do_open", "proxy_open"):
                # oops, coincidental match
                continue

            sep = name.find("_")
            protocol, condition = name[:sep], name[sep+1:]

            if condition.startswith("error"):
                # everything after the second "_" is the error kind;
                # numeric kinds are stored as ints
                start = condition.find("_") + sep + 1
                kind = name[start+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                table = self.handle_error.setdefault(protocol, {})
            else:
                table = by_condition.get(condition)
                if table is None:
                    continue
                kind = protocol

            # insort() on an empty list just inserts, with no comparison
            bisect.insort(table.setdefault(kind, []), handler)
            registered = True

        if registered:
            # the handlers must work in a specific order, the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        for handler in chain.get(kind, ()):
            result = getattr(handler, meth_name)(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open fullurl (a URL string or a request object).

        The request passes through the <protocol>_request processors, is
        opened by _open(), and the response then passes through the
        <protocol>_response processors before being returned.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        hook = protocol + "_request"
        for processor in self.process_request.get(protocol, []):
            req = getattr(processor, hook)(req)

        response = self._open(req, data)

        # post-process response
        hook = protocol + "_response"
        for processor in self.process_response.get(protocol, []):
            response = getattr(processor, hook)(req, response)

        return response

    def _open(self, req, data=None):
        """Try the 'default', protocol-specific and 'unknown' chains in turn."""
        response = self._call_chain(self.handle_open, 'default',
                                    'default_open', req)
        if not response:
            scheme = req.type
            response = self._call_chain(self.handle_open, scheme,
                                        scheme + '_open', req)
        if not response:
            response = self._call_chain(self.handle_open, 'unknown',
                                        'unknown_open', req)
        return response

    def error(self, proto, *args):
        """Dispatch an error to the handlers registered for proto.

        For 'http' and 'https' the chain is selected by args[2], and
        'http_error_default' is tried when that chain yields nothing.
        """
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            table = self.handle_error['http']  # https is not different than http
            code = args[2]  # YUCK!
            result = self._call_chain(table, code,
                                      'http_error_%s' % code, *args)
            if result:
                return result
            return self._call_chain(table, 'default',
                                    'http_error_default', *args)

        result = self._call_chain(self.handle_error, proto,
                                  proto + '_error', *args)
        if result:
            return result
412
413# XXX probably also want an abstract factory that knows when it makes
414# sense to skip a superclass in favor of a subclass and when it might
415# make sense to include both
416
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    Each argument is either a handler instance or a handler class, which
    is instantiated with no arguments.

    The opener also gets the default handlers, including support for
    HTTP, FTP and, when applicable, HTTPS.  A default handler class is
    left out when one of the arguments is an instance or a subclass of it.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # defaults that a caller-supplied handler replaces
    overridden = {
        klass
        for klass in default_classes
        for check in handlers
        if (issubclass(check, klass) if isclass(check)
            else isinstance(check, klass))
    }

    for klass in [k for k in default_classes if k not in overridden]:
        opener.add_handler(klass())

    for item in handlers:
        opener.add_handler(item() if isclass(item) else item)
    return opener
454
class BaseHandler:
    """Common base for handlers; supplies ordering and the parent hook."""

    # position of the handler in sorted handler lists (lower sorts first)
    handler_order = 500

    def add_parent(self, parent):
        """Remember the object this handler was registered with."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if hasattr(other, "handler_order"):
            return self.handler_order < other.handler_order
        # Try to preserve the old behavior of having custom classes
        # inserted after default ones (works only for custom user
        # classes which are not aware of handler_order).
        return True
472
473
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return 2xx responses untouched; route others to parent.error()."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= code < 300:
            return response
        return self.parent.error('http', request, response, code, msg, hdrs)

    https_response = http_response
490
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort HTTP error handler: always raises HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        url = req.full_url
        raise HTTPError(url, code, msg, hdrs, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000494
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects, with loop protection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # The new request carries no payload, so the headers describing
        # the old payload are dropped.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items()
                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow the redirect named by the Location (or URI) header.

        Returns None when neither header is present or when
        redirect_request() declines.  Raises HTTPError when the target
        uses a scheme other than http, https or ftp, or when the
        redirect limits (max_repeats, max_redirections) are reached.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        urlparts = urlparse(newurl)

        # The target comes from the server, i.e. it is untrusted input.
        # Only http, https and ftp (or a relative URL, which has no
        # scheme) are followed, so a redirect cannot reach e.g. file:.
        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # fix a possible malformed URL
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
587
588
589def _parse_proxy(proxy):
590 """Return (scheme, user, password, host/port) given a URL or an authority.
591
592 If a URL is supplied, it must have an authority (host:port) component.
593 According to RFC 3986, having an authority component means the URL must
594 have two slashes after the scheme:
595
596 >>> _parse_proxy('file:/ftp.example.com/')
597 Traceback (most recent call last):
598 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
599
600 The first three items of the returned tuple may be None.
601
602 Examples of authority parsing:
603
604 >>> _parse_proxy('proxy.example.com')
605 (None, None, None, 'proxy.example.com')
606 >>> _parse_proxy('proxy.example.com:3128')
607 (None, None, None, 'proxy.example.com:3128')
608
609 The authority component may optionally include userinfo (assumed to be
610 username:password):
611
612 >>> _parse_proxy('joe:password@proxy.example.com')
613 (None, 'joe', 'password', 'proxy.example.com')
614 >>> _parse_proxy('joe:password@proxy.example.com:3128')
615 (None, 'joe', 'password', 'proxy.example.com:3128')
616
617 Same examples, but with URLs instead:
618
619 >>> _parse_proxy('http://proxy.example.com/')
620 ('http', None, None, 'proxy.example.com')
621 >>> _parse_proxy('http://proxy.example.com:3128/')
622 ('http', None, None, 'proxy.example.com:3128')
623 >>> _parse_proxy('http://joe:password@proxy.example.com/')
624 ('http', 'joe', 'password', 'proxy.example.com')
625 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
626 ('http', 'joe', 'password', 'proxy.example.com:3128')
627
628 Everything after the authority is ignored:
629
630 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
631 ('ftp', 'joe', 'password', 'proxy.example.com')
632
633 Test for no trailing '/' case:
634
635 >>> _parse_proxy('http://joe:password@proxy.example.com')
636 ('http', 'joe', 'password', 'proxy.example.com')
637
638 """
Georg Brandl13e89462008-07-01 19:56:00 +0000639 scheme, r_scheme = splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000640 if not r_scheme.startswith("/"):
641 # authority
642 scheme = None
643 authority = proxy
644 else:
645 # URL
646 if not r_scheme.startswith("//"):
647 raise ValueError("proxy URL with no authority: %r" % proxy)
648 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
649 # and 3.3.), path is empty or starts with '/'
650 end = r_scheme.find("/", 2)
651 if end == -1:
652 end = None
653 authority = r_scheme[2:end]
Georg Brandl13e89462008-07-01 19:56:00 +0000654 userinfo, hostport = splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000655 if userinfo is not None:
Georg Brandl13e89462008-07-01 19:56:00 +0000656 user, password = splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000657 else:
658 user = password = None
659 return scheme, user, password, hostport
660
class ProxyHandler(BaseHandler):
    """Send requests for the configured URL types through proxies."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """proxies maps a URL type to a proxy URL; defaults to getproxies().

        For every entry a '<type>_open' attribute is set on the instance;
        it forwards the request to proxy_open() with that entry's values.
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, proxy_url in proxies.items():
            # defaults bind the current entry; a plain closure would see
            # only the last one
            forward = (lambda r, proxy=proxy_url, type=scheme,
                       meth=self.proxy_open: meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, forward)

    def proxy_open(self, req, proxy, type):
        """Point req at proxy and decide who opens it.

        Returns None (leaving the request to other handlers) when
        proxy_bypass() says the host is exempt, or when the proxy's type
        equals the request's original type, or that type is 'https'.
        Otherwise the modified request is reopened through the parent.
        """
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        req.set_proxy(unquote(hostport), proxy_type)

        if orig_type in (proxy_type, 'https'):
            # let other handlers take care of it
            return None

        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000702
class HTTPPasswordMgr:
    """Map (realm, URI prefix) pairs to (user, password) credentials."""

    def __init__(self):
        # realm -> {tuple of reduced URIs -> (user, passwd)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for uri (one URI or an iterable of them).

        Each URI is stored twice, with and without the scheme's default
        port filled in.
        """
        # uri could be a single URI or a sequence; materialise it so that
        # both passes below see every entry even for a one-shot iterable
        uris = [uri] if isinstance(uri, str) else list(uri)
        realm_table = self.passwd.setdefault(realm, {})
        for default_port in True, False:
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uris)
            realm_table[reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) registered for authuri in realm.

        Returns (None, None) when nothing registered covers authuri.
        """
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form, i.e. (authority, path)
        pairs as returned by reduce_uri().
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path segments: a character-wise common prefix
        # would wrongly place "/foobar" below "/foo".
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
765
766
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to credentials stored for realm None."""

    def find_user_password(self, realm, authuri):
        """Look authuri up in realm, then in the None realm if no user is found."""
        found = HTTPPasswordMgr.find_user_password(self, realm, authuri)
        if found[0] is None:
            found = HTTPPasswordMgr.find_user_password(self, None, authuri)
        return found
775
776
class AbstractBasicAuthHandler:
    """Shared logic for answering Basic authentication challenges.

    Users of this class must provide the ``auth_header`` attribute (name
    of the request header carrying the credentials) and a ``parent`` with
    an open() method; neither is set here.
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        """password_mgr supplies the credentials; defaults to HTTPPasswordMgr()."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Answer the challenge found in the header named authreq.

        Returns the response of the retried request, or None when the
        header is missing, is not a Basic challenge, or no new credentials
        are available.  Raises HTTPError once the retry counter has
        passed 5.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # too many attempts have been counted: give up
            raise HTTPError(req.full_url, 401, "basic auth failed",
                            headers, None)
        else:
            self.retried += 1

        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, _quote_char, realm = mo.groups()
                if scheme.lower() == 'basic':
                    response = self.retry_http_basic_auth(host, req, realm)
                    if response and response.code != 401:
                        self.retried = 0
                    return response

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send req with the credentials stored for (realm, host).

        Returns None when there is no password for the realm, or when the
        request already carries exactly these credentials.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            # The header is stored with add_unredirected_header() below,
            # so it must be read back through get_header(), which looks at
            # both header dicts; req.headers alone never contains it.
            if req.get_header(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None
835
836
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Retry requests that got a 401 using Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Handle a 401 by answering the 'www-authenticate' challenge.

        The full request URL is what is passed on as the host to match
        credentials against.  The retry counter is reset once the
        attempt has returned.
        """
        target = req.full_url
        result = self.http_error_auth_reqed('www-authenticate',
                                            target, req, headers)
        self.reset_retry_count()
        return result
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000847
848
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Retry requests that got a 407 using Basic proxy credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Handle a 407 by answering the 'proxy-authenticate' challenge.

        The retry counter is reset once the attempt has returned.
        """
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib.request does
        # not (and should not, RFC 3986 s. 3.2.1) support requests for URLs
        # containing userinfo.
        result = self.http_error_auth_reqed('proxy-authenticate',
                                            req.host, req, headers)
        self.reset_retry_count()
        return result
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000863
864
def randombytes(n):
    """Return n random bytes, as produced by os.urandom."""
    data = os.urandom(n)
    return data
868
class AbstractDigestAuthHandler:
    """Shared machinery for the Digest authentication handlers.

    The methods rely on ``self.auth_header`` (the request header that
    carries the credentials) and ``self.parent`` (the opener used to
    re-send the request); neither is defined in this class.
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" support is shaky

    def __init__(self, passwd=None):
        """Set up the password store, retry counter and nonce tracking.

        passwd is the password manager to consult; a new
        HTTPPasswordMgr is created when it is None.
        """
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        """Forget previous authentication attempts."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Answer a Digest challenge found in headers[auth_header].

        Returns the retried response, or None when there is no
        challenge or its scheme is not Digest.  Raises HTTPError once
        the retry counter has gone past five.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Re-send req with an answer to the challenge string auth.

        Returns the new response, or None when no answer could be
        computed or req already carries exactly that answer.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-character hexadecimal client nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the credentials string answering challenge dict chal.

        Returns the text that follows "Digest " in the authorization
        header, or None when the challenge lacks realm or nonce, names
        an unsupported algorithm, or no user is known for the realm.
        Raises URLError for a qop value other than 'auth'.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop == 'auth':
            # The nonce count restarts whenever the server issues a
            # nonce different from the last one answered.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for the named digest algorithm.

        Only 'MD5' and 'SHA' are recognised.  For any other name the
        pair (None, None) is returned, which is the condition
        get_authorization checks for; previously this case raised
        UnboundLocalError because H was never assigned.
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Return the entity digest for data; always None for now."""
        # XXX not implemented yet
        return None
1005
1006
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        """Handle a 401 by answering the 'www-authenticate' challenge.

        The network-location part of the request URL is passed on as
        the host.  The retry counter is reset once the attempt returns.
        """
        netloc = urlparse(req.full_url)[1]
        response = self.http_error_auth_reqed('www-authenticate',
                                              netloc, req, headers)
        self.reset_retry_count()
        return response
1023
1024
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Retry requests that got a 407 using Digest proxy credentials."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        """Handle a 407 by answering the 'proxy-authenticate' challenge.

        The retry counter is reset once the attempt returns.
        """
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              req.host, req, headers)
        self.reset_retry_count()
        return response
1036
class AbstractHTTPHandler(BaseHandler):
    """Common request preparation and sending for HTTP-like handlers."""

    def __init__(self, debuglevel=0):
        """Remember the debug level to apply to connections."""
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Change the debug level applied to later connections."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in the headers a request needs before it is sent.

        Adds Content-type and Content-length for requests that carry
        data, a Host header, and the parent opener's default headers,
        each only when the request does not already have it.  Returns
        the same request object.

        Raises URLError when the request has no host, and ValueError
        when the data is an iterable without a buffer interface and no
        Content-length was supplied.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                try:
                    mv = memoryview(data)
                except TypeError:
                    # No buffer interface: the length cannot be worked
                    # out here, so an iterable body must declare it.
                    if isinstance(data, collections.Iterable):
                        raise ValueError(
                            "Content-Length should be specified "
                            "for iterable data of type %r %r"
                            % (type(data), data))
                else:
                    # Parenthesised so that the byte count is formatted;
                    # without them the formatted string was repeated
                    # itemsize times.
                    request.add_unredirected_header(
                        'Content-length',
                        '%d' % (len(mv) * mv.itemsize))

        sel_host = host
        if request.has_proxy():
            # Through a proxy the selector is a full URL; the Host
            # header must name the origin server found inside it.
            _scheme, sel = splittype(request.selector)
            sel_host, _sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        Extra keyword arguments are passed to the http_class constructor.
        Raises URLError when the request has no host or when sending it
        fails with a socket error.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        # Apply the level given to __init__ / set_http_debuglevel.
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence over regular ones.
        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            h.request(req.get_method(), req.selector, req.data, headers)
            r = h.getresponse()  # an HTTPResponse instance
        except socket.error as err:
            # Release the socket before reporting the failure.
            h.close()
            raise URLError(err)

        r.url = req.full_url
        # The HTTPResponse's .msg attribute is overwritten with the
        # reason phrase, because urllib clients expect to find the
        # reason in .msg.  It would be good to mark this attribute as
        # deprecated and get them to use info() or .headers.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001133
1134
class HTTPHandler(AbstractHTTPHandler):
    """Open http: URLs over http.client.HTTPConnection."""

    # Requests are prepared by the shared AbstractHTTPHandler logic.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Send req and return the response from do_open."""
        connection_class = http.client.HTTPConnection
        return self.do_open(connection_class, req)
1141
if hasattr(http.client, 'HTTPSConnection'):
    import ssl

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https: URLs over http.client.HTTPSConnection.

        context and check_hostname are stored and handed unchanged to
        every HTTPSConnection that gets created.
        """

        # Requests are prepared by the shared AbstractHTTPHandler logic.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            """Send req and return the response from do_open."""
            connection_args = {
                'context': self._context,
                'check_hostname': self._check_hostname,
            }
            return self.do_open(http.client.HTTPSConnection, req,
                                **connection_args)
1157
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to requests and collect them from responses."""

    def __init__(self, cookiejar=None):
        """Use cookiejar, or a new http.cookiejar.CookieJar when None."""
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        """Let the jar add its Cookie header; return the request."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar extract cookies; return the response."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # https traffic is treated exactly like http traffic.
    https_request = http_request
    https_response = http_response
1175
class UnknownHandler(BaseHandler):
    """Reject requests by raising URLError naming their URL type."""

    def unknown_open(self, req):
        """Always raise URLError for req."""
        message = 'unknown url type: %s' % req.type
        raise URLError(message)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001180
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Each element is split on its first '='.  A value enclosed in
    double quotes has the quotes removed.  An empty value ("key=") is
    kept as the empty string; it used to raise IndexError.  An element
    without '=' still raises ValueError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices instead of indexes so that an empty value is safe.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1190
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.

    Returns the list of elements, each stripped of surrounding
    whitespace.
    """
    items = []
    buf = []
    escaped = in_quotes = False

    for ch in s:
        if escaped:
            # The character after a backslash is taken literally.
            buf.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                # The backslash itself is not copied to the output.
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                buf.append(ch)
        elif ch == ',':
            items.append(''.join(buf))
            buf = []
        else:
            if ch == '"':
                in_quotes = True
            buf.append(ch)

    # append last part
    if buf:
        items.append(''.join(buf))

    return [item.strip() for item in items]
1233
class FileHandler(BaseHandler):
    """Open file: URLs that refer to files on the local machine."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open the local file named by req.

        A selector of the form //host/... whose host is neither empty
        nor 'localhost' is refused with URLError unless the host is one
        of the local names from get_names().  (The check used to
        compare the host to the names tuple by identity, which never
        matches, so every such URL was refused.)  Everything else is
        handed to open_local_file, which checks the host again.
        """
        url = req.selector
        names_other_host = (url[:2] == '//' and url[2:3] != '/' and
                            req.host and req.host != 'localhost')
        if names_other_host and req.host not in self.get_names():
            raise URLError("file:// scheme is supported only on localhost")
        return self.open_local_file(req)

    # names for the localhost
    names = None

    def get_names(self):
        """Return the tuple of addresses regarded as this machine.

        The tuple is computed on first use and cached on the class.
        """
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by req.

        The headers carry Content-type (guessed from the name,
        'text/plain' by default), Content-length and Last-modified.
        Raises URLError when the file cannot be accessed or when the
        request's host does not resolve to a local address.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            # port is only bound when host was non-empty; the "not host"
            # test short-circuits before it is read otherwise.
            if not host or \
               (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001285
def _safe_gethostbyname(host):
    """Return socket.gethostbyname(host), or None on socket.gaierror."""
    try:
        address = socket.gethostbyname(host)
    except socket.gaierror:
        return None
    else:
        return address
1291
class FTPHandler(BaseHandler):
    """Open ftp: URLs."""

    def ftp_open(self, req):
        """Retrieve the file or directory named by req over FTP.

        Returns an addinfourl whose headers carry Content-type and
        Content-length when they are known.  Raises URLError when no
        host is given, when the host cannot be resolved, or when the
        FTP exchange fails.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        port = ftplib.FTP_PORT if port is None else int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)

        path, attrs = splitattr(req.selector)
        parts = list(map(unquote, path.split('/')))
        dirs, file = parts[:-1], parts[-1]
        if dirs and not dirs[0]:
            # Drop the empty component produced by a leading '/'.
            dirs = dirs[1:]
        try:
            wrapper = self.connect_ftp(user, passwd, host, port, dirs,
                                       req.timeout)
            # Binary transfer for a file, directory listing otherwise,
            # unless a ";type=" attribute says differently.
            transfer_type = 'I' if file else 'D'
            for raw_attr in attrs:
                attr_name, attr_value = splitvalue(raw_attr)
                if attr_name.lower() == 'type' and \
                   attr_value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    transfer_type = attr_value.upper()
            fp, retrlen = wrapper.retrfile(file, transfer_type)
            header_text = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                header_text += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                header_text += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(header_text)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a new ftpwrapper for the given connection details."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout)
1349
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps ftpwrapper connections for reuse."""

    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}       # key -> ftpwrapper
        self.timeout = {}     # key -> time after which the entry is stale
        self.soonest = 0      # smallest value currently in self.timeout
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the delay added to the current time on each use."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which an entry is dropped."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return the cached ftpwrapper for these details, creating it
        first when there is none, and refresh its expiry time."""
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Close and drop stale entries, then enforce the size limit."""
        # first check for old ones
        now = time.time()
        if self.soonest <= now:
            stale = [k for k, deadline in self.timeout.items()
                     if deadline < now]
            for k in stale:
                self.cache[k].close()
                del self.cache[k]
                del self.timeout[k]
        self.soonest = min(self.timeout.values())

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, deadline in list(self.timeout.items()):
                if deadline == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values())
1396
# Code moved from the old urllib module
1398
1399MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
1400
1401# Helper for non-unix systems
if os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        path = unquote(pathname)
        return path

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        url = quote(pathname)
        return url
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001414
1415# This really consists of two pieces:
1416# (1) a class which handles opening of all sorts of URLs
1417# (plus assorted utilities etc.)
1418# (2) a set of functions for parsing URLs
1419# XXX Should these be separated out into different modules?
1420
1421
1422ftpcache = {}
1423class URLopener:
1424 """Class to open URLs.
1425 This is a class rather than just a subroutine because we may need
1426 more than one set of global protocol-specific options.
1427 Note -- this is a base class for those who don't want the
1428 automatic handling of errors type 302 (relocated) and 401
1429 (authorization needed)."""
1430
1431 __tempfiles = None
1432
1433 version = "Python-urllib/%s" % __version__
1434
1435 # Constructor
def __init__(self, proxies=None, **x509):
    """Set up proxies, client certificate files, headers and caches.

    proxies must be a mapping (it is checked for a 'keys' attribute);
    when it is None the result of getproxies() is used.  The optional
    keyword arguments 'key_file' and 'cert_file' are remembered.
    """
    if proxies is None:
        proxies = getproxies()
    assert hasattr(proxies, 'keys'), "proxies must be a mapping"
    self.proxies = proxies
    self.cert_file = x509.get('cert_file')
    self.key_file = x509.get('key_file')
    self.addheaders = [('User-Agent', self.version)]
    self.__unlink = os.unlink  # See cleanup()
    self.__tempfiles = []
    # Undocumented feature: if you assign {} to tempcache,
    # it is used to cache files retrieved with
    # self.retrieve(). This is not enabled by default
    # since it does not work for changing documents (and I
    # haven't got the logic to check expiration headers
    # yet).
    self.tempcache = None
    # Undocumented feature: you can use a different
    # ftp cache by assigning to the .ftpcache member;
    # in case you want logically independent URL openers
    # XXX This is not threadsafe.  Bah.
    self.ftpcache = ftpcache
1458
def __del__(self):
    """Call close() when the object is being destroyed."""
    self.close()
1461
def close(self):
    """Release what the opener holds by calling cleanup()."""
    self.cleanup()
1464
def cleanup(self):
    """Delete the recorded temporary files and empty the temp cache.

    Files that cannot be removed (OSError) are ignored.
    """
    # This code sometimes runs when the rest of this module
    # has already been deleted, so it can't use any globals
    # or import anything.
    pending = self.__tempfiles
    if pending:
        unlink = self.__unlink
        for path in pending:
            try:
                unlink(path)
            except OSError:
                pass
        # Empty the list in place so the attribute stays the same object.
        del pending[:]
    cache = self.tempcache
    if cache:
        cache.clear()
1478
def addheader(self, *args):
    """Add a header to be used by the HTTP interface only
    e.g. u.addheader('Accept', 'sound/basic')"""
    # The positional arguments are stored together as one tuple.
    header = args
    self.addheaders.append(header)
1483
1484 # External interface
def open(self, fullurl, data=None):
    """Use URLopener().open(file) instead of open(file, 'r').

    The URL is dispatched to the method named open_<scheme> (with '-'
    replaced by '_'); without a scheme 'file' is assumed.  When a
    proxy is configured for the scheme, the proxy's scheme selects the
    method and it receives a (proxy host, full URL) pair.  Socket
    errors raised by that method are re-raised as IOError.
    """
    fullurl = quote(unwrap(to_bytes(fullurl)),
                    safe="%/:=&?~#+!$,;'@()*[]|")
    if self.tempcache and fullurl in self.tempcache:
        cached_name, cached_headers = self.tempcache[fullurl]
        return addinfourl(open(cached_name, 'rb'), cached_headers, fullurl)

    urltype, url = splittype(fullurl)
    urltype = urltype or 'file'
    proxy = None
    if urltype in self.proxies:
        proxy = self.proxies[urltype]
        urltype, proxyhost = splittype(proxy)
        host, selector = splithost(proxyhost)
        url = (host, fullurl)  # Signal special case to open_*()

    method_name = 'open_' + urltype
    self.type = urltype
    method_name = method_name.replace('-', '_')
    if not hasattr(self, method_name):
        if proxy:
            return self.open_unknown_proxy(proxy, fullurl, data)
        return self.open_unknown(fullurl, data)
    try:
        opener = getattr(self, method_name)
        if data is None:
            return opener(url)
        return opener(url, data)
    except socket.error as msg:
        raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1518
def open_unknown(self, fullurl, data=None):
    """Overridable interface to open unknown URL type.

    This implementation always raises IOError naming the URL's type.
    """
    scheme, _rest = splittype(fullurl)
    raise IOError('url error', 'unknown url type', scheme)
1523
def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Overridable interface called when the proxy has an unknown type.

    This implementation always raises IOError naming the URL's type
    and the proxy.
    """
    scheme, _rest = splittype(fullurl)
    raise IOError('url error', 'invalid proxy for %s' % scheme, proxy)
1528
1529 # External interface
def retrieve(self, url, filename=None, reporthook=None, data=None):
    """retrieve(url) returns (filename, headers) for a local object
    or (tempfilename, headers) for a remote object.

    When filename is given the data is written there; otherwise a
    temporary file is created and remembered for cleanup().  If
    reporthook is given it is called as reporthook(blocknum, blocksize,
    totalsize) once before reading and once after each block;
    totalsize is -1 when there is no Content-Length header.

    Raises ContentTooShortError when fewer bytes arrive than the
    Content-Length header announced.  The header is now read whether
    or not a reporthook is supplied; previously the check was silently
    skipped when there was no reporthook.
    """
    url = unwrap(to_bytes(url))
    if self.tempcache and url in self.tempcache:
        return self.tempcache[url]
    type, url1 = splittype(url)
    if filename is None and (not type or type == 'file'):
        try:
            fp = self.open_local_file(url1)
            hdrs = fp.info()
            fp.close()
            return url2pathname(splithost(url1)[1]), hdrs
        except IOError:
            # Not usable as a local file; fall back to self.open().
            pass
    fp = self.open(url, data)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            # Keep the URL path's extension as the temp file suffix.
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        try:
            result = filename, headers
            if self.tempcache is not None:
                self.tempcache[url] = result
            bs = 1024*8
            size = -1
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while 1:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            tfp.close()
    finally:
        fp.close()

    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
1593
1594 # Each method named open_<type> knows how to open that type of URL
1595
def _open_generic_http(self, connection_factory, url, data):
    """Send a GET or POST request over a connection built by a factory.

    This is an internal method that should be called from
    open_http() or open_https().

    Arguments:
    - connection_factory should take a host name and return an
      HTTPConnection instance.
    - url is either a string to retrieve directly, or a
      (host, selector) pair; in the pair form any user info on the
      host is sent as Proxy-Authorization.
    - data is payload for a POST request or None (GET).

    A 2xx response is returned wrapped in addinfourl; every other
    status is handed to self.http_error().
    """

    def _basic_token(credentials):
        # Base64-encode "user:password" for a Basic auth header.
        if not credentials:
            return None
        import base64
        return base64.b64encode(credentials.encode()).decode('ascii')

    user_passwd = None
    proxy_passwd = None
    if not isinstance(url, str):
        host, selector = url
        # check whether the proxy contains authorization information
        proxy_passwd, host = splituser(host)
        # now we proceed with the url we want to obtain
        urltype, rest = splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() == 'http':
            realhost, rest = splithost(rest)
            if realhost:
                user_passwd, realhost = splituser(realhost)
            if user_passwd:
                # Rebuild the selector without the embedded credentials.
                selector = "%s://%s%s" % (urltype, realhost, rest)
            if proxy_bypass(realhost):
                host = realhost
        else:
            realhost = None
    else:
        host, selector = splithost(url)
        if host:
            user_passwd, host = splituser(host)
            host = unquote(host)
        realhost = host

    if not host:
        raise IOError('http error', 'no host given')

    proxy_auth = _basic_token(proxy_passwd)
    auth = _basic_token(user_passwd)

    http_conn = connection_factory(host)
    headers = {}
    if proxy_auth:
        headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
    if auth:
        headers["Authorization"] = "Basic %s" % auth
    if realhost:
        headers["Host"] = realhost
    # Opener-level headers are applied last and so override the above.
    for key, val in self.addheaders:
        headers[key] = val

    if data is None:
        http_conn.request("GET", selector, headers=headers)
    else:
        headers["Content-Type"] = "application/x-www-form-urlencoded"
        http_conn.request("POST", selector, data, headers)

    try:
        response = http_conn.getresponse()
    except http.client.BadStatusLine:
        # something went wrong with the HTTP status line
        raise URLError("http protocol error: bad status line")

    # According to RFC 2616, "2xx" code indicates that the client's
    # request was successfully received, understood, and accepted.
    if not (200 <= response.status < 300):
        return self.http_error(
            url, response.fp,
            response.status, response.reason, response.msg, data)
    return addinfourl(response, response.msg, "http:" + url,
                      response.status)
1682
def open_http(self, url, data=None):
    """Use HTTP protocol.

    Delegates to _open_generic_http() with http.client.HTTPConnection
    as the connection factory.
    """
    factory = http.client.HTTPConnection
    return self._open_generic_http(factory, url, data)
1686
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    """Dispatch an HTTP error response to the matching handler.

    Derived classes can override this, or provide specific handlers
    named http_error_DDD where DDD is the 3-digit error code.  When a
    specific handler exists and returns a true value, that value is
    returned; otherwise http_error_default() is called.  *data* is
    forwarded to the specific handler only when it is not None.
    """
    name = 'http_error_%d' % errcode
    if hasattr(self, name):
        handler = getattr(self, name)
        args = (url, fp, errcode, errmsg, headers)
        if data is not None:
            args += (data,)
        outcome = handler(*args)
        if outcome:
            return outcome
    return self.http_error_default(url, fp, errcode, errmsg, headers)
1702
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handler: read and close *fp*, then raise HTTPError."""
    # The remaining body is read only to be discarded before closing.
    fp.read()
    fp.close()
    raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001708
if _have_ssl:
    def _https_connection(self, host):
        """Return an HTTPSConnection to *host* using this opener's
        key_file and cert_file."""
        tls_options = {'key_file': self.key_file,
                       'cert_file': self.cert_file}
        return http.client.HTTPSConnection(host, **tls_options)

    def open_https(self, url, data=None):
        """Use HTTPS protocol.

        Delegates to _open_generic_http() with _https_connection as
        the connection factory.
        """
        factory = self._https_connection
        return self._open_generic_http(factory, url, data)
1718
def open_file(self, url):
    """Open a file URL, which must refer to the local host.

    A URL of the form //host/... is rejected with ValueError unless the
    host part is empty or 'localhost' (case-insensitive); everything
    else is handed to open_local_file().
    """
    if not isinstance(url, str):
        raise URLError('file error', 'proxy support for file protocol currently not implemented')
    names_other_host = (url[:2] == '//'
                        and url[2:3] != '/'
                        and url[2:12].lower() != 'localhost/')
    if names_other_host:
        raise ValueError("file:// scheme is supported only on localhost")
    return self.open_local_file(url)
1727
def open_local_file(self, url):
    """Open a file on the local filesystem.

    Returns an addinfourl wrapping the file opened in binary mode, with
    synthesized Content-Type, Content-Length and Last-modified headers.

    Raises URLError if the file cannot be stat'ed, or if the URL names
    a host (with a port, or resolving to an address) that is not this
    machine.
    """
    import email.utils
    import mimetypes
    host, file = splithost(url)
    localname = url2pathname(file)
    try:
        stats = os.stat(localname)
    except OSError as e:
        # Every other URLError in this class is built from
        # (reason, filename); the former three-argument call
        # (errno, strerror, filename) did not fit that signature.
        raise URLError(e.strerror, e.filename)
    size = stats.st_size
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    mtype = mimetypes.guess_type(url)[0]
    headers = email.message_from_string(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
        (mtype or 'text/plain', size, modified))
    if host:
        host, port = splitport(host)
        # localhost() returns a single address string while thishost()
        # returns a tuple, so wrap the former before concatenating.
        local_addresses = (localhost(),) + thishost()
        if port or socket.gethostbyname(host) not in local_addresses:
            raise URLError('local file error', 'not on local host')
    urlfile = file
    if file[:1] == '/':
        urlfile = 'file://' + file
    return addinfourl(open(localname, 'rb'), headers, urlfile)
1757
def open_ftp(self, url):
    """Use FTP protocol.

    Parses host, port, credentials, path and ;attr=value suffixes out
    of *url*, reuses or creates a cached ftpwrapper for the
    (user, host, port, directory) combination, and returns an
    addinfourl for the retrieved file or directory listing.

    Raises URLError when no host is given or when an FTP error occurs.
    """
    if not isinstance(url, str):
        raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
    import mimetypes
    host, path = splithost(url)
    if not host:
        raise URLError('ftp error', 'no host given')
    host, port = splitport(host)
    user, host = splituser(host)
    if user:
        user, passwd = splitpasswd(user)
    else:
        passwd = None
    host = unquote(host)
    user = unquote(user or '')
    passwd = unquote(passwd or '')
    host = socket.gethostbyname(host)
    if not port:
        import ftplib
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    path, attrs = splitattr(path)
    path = unquote(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    if dirs and not dirs[0]:
        dirs = dirs[1:]
    if dirs and not dirs[0]:
        dirs[0] = '/'
    key = user, host, port, '/'.join(dirs)
    # XXX thread unsafe!
    if len(self.ftpcache) > MAXFTPCACHE:
        # Prune the cache, rather arbitrarily.  Iterate over a snapshot
        # of the keys: deleting from a dict while iterating over it
        # raises RuntimeError.
        for k in list(self.ftpcache):
            if k != key:
                v = self.ftpcache[k]
                del self.ftpcache[k]
                v.close()
    try:
        if key not in self.ftpcache:
            self.ftpcache[key] = \
                ftpwrapper(user, passwd, host, port, dirs)
        # Default transfer type: directory listing when there is no
        # file component, binary ("image") otherwise.
        if not file:
            type = 'D'
        else:
            type = 'I'
        for attr in attrs:
            attr, value = splitvalue(attr)
            if attr.lower() == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
                type = value.upper()
        (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
        mtype = mimetypes.guess_type("ftp:" + url)[0]
        headers = ""
        if mtype:
            headers += "Content-Type: %s\n" % mtype
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = email.message_from_string(headers)
        return addinfourl(fp, headers, "ftp:" + url)
    except ftperrors() as msg:
        raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1816
def open_data(self, url, data=None):
    """Use "data" URL.

    The *data* argument (POSTed data) is ignored.  The URL is split
    into media type and payload at the first comma; the payload is
    base64-decoded when the media type ends in ";base64" and
    percent-unquoted otherwise.  Raises IOError when the URL contains
    no comma.
    """
    if not isinstance(url, str):
        raise URLError('data error', 'proxy support for data protocol currently not implemented')
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    mediatype, comma, payload = url.partition(',')
    if not comma:
        raise IOError('data error', 'bad data URL')
    if not mediatype:
        mediatype = 'text/plain;charset=US-ASCII'
    # A trailing ";token" without "=" is an encoding marker, not a
    # media type parameter.
    encoding = ''
    semi = mediatype.rfind(';')
    if semi >= 0 and '=' not in mediatype[semi:]:
        encoding = mediatype[semi+1:]
        mediatype = mediatype[:semi]
    lines = [
        'Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                  time.gmtime(time.time())),
        'Content-type: %s' % mediatype,
    ]
    if encoding == 'base64':
        import base64
        # XXX is this encoding/decoding ok?
        payload = base64.decodebytes(payload.encode('ascii')).decode('latin1')
    else:
        payload = unquote(payload)
    lines.append('Content-Length: %d' % len(payload))
    lines.append('')
    lines.append(payload)
    msg = '\n'.join(lines)
    headers = email.message_from_string(msg)
    f = io.StringIO(msg)
    #f.fileno = None     # needed for addinfourl
    return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001858
1859
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps).

    Adds redirect handling (301, 302, 303, 307), Basic authentication
    handlers for 401 and 407, and a default error handler that returns
    the error response instead of raising.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the base opener, the credential cache and the
        redirect counter (at most 10 consecutive redirects)."""
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}
        self.tries, self.maxtries = 0, 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Once self.tries reaches self.maxtries the redirect chain is
        reported as a 500 "Redirect Recursion" error instead of being
        followed.
        """
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                handler = self.http_error_500
            else:
                handler = self.http_error_default
            self.tries = 0
            return handler(url, fp, 500,
                           "Internal Server Error: Redirect Recursion",
                           headers)
        outcome = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                         data)
        self.tries = 0
        return outcome

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the redirect target named by the 'location' (preferred)
        or 'uri' header; return None when neither header is present."""
        for field in ('location', 'uri'):
            if field in headers:
                newurl = headers[field]
                break
        else:
            return
        fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is not None:
            return self.http_error_default(url, fp, errcode, errmsg, headers)
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only.

        Unless *retry* is true and the challenge is a well-formed Basic
        one, the error is passed to URLopener.http_error_default().
        """
        def give_up():
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)

        if 'www-authenticate' not in headers:
            give_up()
        challenge = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', challenge)
        if not match:
            give_up()
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            give_up()
        if not retry:
            give_up()
        retry_method = getattr(self, 'retry_' + self.type + '_basic_auth')
        if data is None:
            return retry_method(url, realm)
        return retry_method(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only.

        Unless *retry* is true and the challenge is a well-formed Basic
        one, the error is passed to URLopener.http_error_default().
        """
        def give_up():
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)

        if 'proxy-authenticate' not in headers:
            give_up()
        challenge = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', challenge)
        if not match:
            give_up()
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            give_up()
        if not retry:
            give_up()
        retry_method = getattr(self,
                               'retry_proxy_' + self.type + '_basic_auth')
        if data is None:
            return retry_method(url, realm)
        return retry_method(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Store credentials in the 'http' proxy URL and reopen *url*.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        target = 'http://' + host + selector
        urltype, proxyhost = splittype(self.proxies['http'])
        proxyhost, proxyselector = splithost(proxyhost)
        # at is non-zero when the proxy URL already carried credentials;
        # it doubles as the clear_cache flag below.
        at = proxyhost.find('@') + 1
        proxyhost = proxyhost[at:]
        user, passwd = self.get_user_passwd(proxyhost, realm, at)
        if not (user or passwd):
            return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(target)
        return self.open(target, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Store credentials in the 'https' proxy URL and reopen *url*.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        target = 'https://' + host + selector
        urltype, proxyhost = splittype(self.proxies['https'])
        proxyhost, proxyselector = splithost(proxyhost)
        # at is non-zero when the proxy URL already carried credentials;
        # it doubles as the clear_cache flag below.
        at = proxyhost.find('@') + 1
        proxyhost = proxyhost[at:]
        user, passwd = self.get_user_passwd(proxyhost, realm, at)
        if not (user or passwd):
            return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(target)
        return self.open(target, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Reopen *url* over http with user:password embedded in the host.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        # at is non-zero when the URL already carried credentials;
        # it doubles as the clear_cache flag below.
        at = host.find('@') + 1
        host = host[at:]
        user, passwd = self.get_user_passwd(host, realm, at)
        if not (user or passwd):
            return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        target = 'http://' + host + selector
        if data is None:
            return self.open(target)
        return self.open(target, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Reopen *url* over https with user:password embedded in the host.

        Returns None when no user name or password is obtained.
        """
        host, selector = splithost(url)
        # at is non-zero when the URL already carried credentials;
        # it doubles as the clear_cache flag below.
        at = host.find('@') + 1
        host = host[at:]
        user, passwd = self.get_user_passwd(host, realm, at)
        if not (user or passwd):
            return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        target = 'https://' + host + selector
        if data is None:
            return self.open(target)
        return self.open(target, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for *realm* at *host*.

        A cached pair is returned unless *clear_cache* is true, in which
        case the cached entry is dropped and the user is prompted again.
        A newly obtained pair is cached when either part is non-empty.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if not clear_cache:
                return self.auth_cache[key]
            del self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd:
            self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!

        Prompts on the terminal and returns (user, passwd), or
        (None, None) if interrupted with Ctrl-C.
        """
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
        except KeyboardInterrupt:
            print()
            return None, None
        return user, passwd
2055
2056
2057# Utility functions
2058
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The address is looked up once and cached in the module-level
    _localhost variable.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2066
_thishost = None
def thishost():
    """Return the IP addresses of the current host.

    The result is a tuple of address strings, looked up once and cached
    in the module-level _thishost variable.
    """
    global _thishost
    if _thishost is None:
        # gethostbyname_ex() returns (hostname, aliaslist, ipaddrlist);
        # index the *result* to get the address list.  The index was
        # previously applied to the host name string itself.
        _thishost = tuple(socket.gethostbyname_ex(socket.gethostname())[2])
    return _thishost
2074
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported lazily on first use and its all_errors value is
    cached in the module-level _ftperrors variable.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2083
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    The same cached instance is returned on every call.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2091
2092
2093# Utility classes
2094
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        """Remember the connection parameters and connect immediately."""
        self.user, self.passwd = user, passwd
        self.host, self.port = host, port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        """(Re)connect, log in and change into the configured directories."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for directory in self.dirs:
            self.ftp.cwd(directory)

    def retrfile(self, file, type):
        """Start retrieving *file* and return (file object, length).

        *type* is the transfer type letter; 'd'/'D' requests a directory
        listing.  If a file retrieval is refused with a 550 reply, a
        directory listing of *file* is attempted instead.  Closing the
        returned file object ends the transfer.
        """
        import ftplib
        self.endtransfer()
        isdir = type in ('d', 'D')
        cmd = 'TYPE A' if isdir else 'TYPE ' + type
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection may have gone away: reconnect once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                if not str(reason).startswith('550'):
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    self.ftp.cwd(file)
                except ftplib.error_perm as reason:
                    raise URLError('ftp error', reason) from reason
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        stream = addclosehook(conn[0].makefile('rb'), self.endtransfer)
        return (stream, conn[1])

    def endtransfer(self):
        """Finish a pending transfer, ignoring FTP errors; no-op if idle."""
        if self.busy:
            self.busy = 0
            try:
                self.ftp.voidresp()
            except ftperrors():
                pass

    def close(self):
        """End any pending transfer and close the connection, ignoring
        FTP errors."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2171
2172# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy (matched
    case-insensitively, empty values ignored); this seems to be the
    standard convention.  If you need a different way, you can pass a
    proxies dictionary to the [Fancy]URLopener constructor.

    """
    lowered = ((name.lower(), value) for name, value in os.environ.items())
    return {name[:-6]: value
            for name, value in lowered
            if value and name[-6:] == '_proxy'}
2188
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy (or NO_PROXY),
    which should be a list of DNS suffixes separated by commas, or '*'
    for all hosts.  Returns 1 to bypass the proxy and 0 otherwise.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host (with or without port) ends with any suffix
    suffixes = [name for name in no_proxy.split(',') if name]
    if any(hostonly.endswith(sfx) or host.endswith(sfx) for sfx in suffixes):
        return 1
    # otherwise, don't bypass
    return 0
2207
2208
2209if sys.platform == 'darwin':
Ronald Oussoren84151202010-04-18 20:46:11 +00002210 from _scproxy import _get_proxy_settings, _get_proxies
2211
def proxy_bypass_macosx_sysconf(host):
    """
    Return True iff this host shouldn't be accessed using a proxy

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.

    Each entry of the 'exceptions' setting is either a (possibly
    partial) dotted IPv4 address with an optional /prefix-length, which
    is compared against the resolved address of the host, or a glob
    pattern matched against the host name.
    """
    import re
    import socket
    from fnmatch import fnmatch

    hostonly, port = splitport(host)

    def ip2num(ipAddr):
        # Pack up to four dotted components into a 32-bit integer,
        # padding missing trailing components with zeros.
        parts = [int(part) for part in ipAddr.split('.')]
        if len(parts) != 4:
            parts = (parts + [0, 0, 0, 0])[:4]
        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

    proxy_settings = _get_proxy_settings()

    # Check for simple host names:
    if '.' not in host:
        if proxy_settings['exclude_simple']:
            return True

    hostIP = None

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value:
            continue

        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
        if m is not None:
            if hostIP is None:
                try:
                    hostIP = socket.gethostbyname(hostonly)
                    hostIP = ip2num(hostIP)
                except socket.error:
                    continue

            base = ip2num(m.group(1))
            prefix = m.group(2)
            if prefix is None:
                # No explicit /n: 8 bits per component that was given.
                prefix = 8 * (m.group(1).count('.') + 1)
            else:
                prefix = int(prefix[1:])

            if prefix > 32:
                # Not a valid IPv4 prefix length; a negative shift
                # below would raise ValueError.
                continue

            # Convert the prefix length into the number of host bits to
            # discard.  This must happen for the implied prefix too;
            # previously the implied prefix was used as the shift count.
            shift = 32 - prefix

            if (hostIP >> shift) == (base >> shift):
                return True

        elif fnmatch(host, value):
            return True

    return False
2270
2271
def getproxies_macosx_sysconf():
    """Return a dictionary of scheme -> proxy server URL mappings.

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information (via _scproxy._get_proxies).
    """
    proxies = _get_proxies()
    return proxies
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002279
Ronald Oussoren84151202010-04-18 20:46:11 +00002280
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002281
def proxy_bypass(host):
    """Test if proxies should not be used for *host*.

    The environment decides when any <scheme>_proxy variable is set;
    otherwise the Mac OS X system configuration is consulted.
    """
    if not getproxies_environment():
        return proxy_bypass_macosx_sysconf(host)
    return proxy_bypass_environment(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002287
def getproxies():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Settings from the environment win when there are any; otherwise
    the Mac OS X system configuration is used.
    """
    from_environment = getproxies_environment()
    if from_environment:
        return from_environment
    return getproxies_macosx_sysconf()
2290
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002291
2292elif os.name == 'nt':
def getproxies_registry():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Win32 uses the registry to store proxies.  An empty dictionary is
    returned when winreg is unavailable, proxies are disabled, or the
    registry values cannot be read or parsed.

    """
    proxies = {}
    try:
        import winreg
    except ImportError:
        # Std module, so should be around - but you never know!
        return proxies
    try:
        settings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
        enabled = winreg.QueryValueEx(settings, 'ProxyEnable')[0]
        if enabled:
            # Returned as Unicode but problems if not converted to ASCII
            server = str(winreg.QueryValueEx(settings, 'ProxyServer')[0])
            if '=' in server:
                # Per-protocol settings
                import re
                for entry in server.split(';'):
                    protocol, address = entry.split('=', 1)
                    # See if address has a type:// prefix
                    if not re.match('^([^/:]+)://', address):
                        address = '%s://%s' % (protocol, address)
                    proxies[protocol] = address
            elif server[:5] == 'http:':
                # Use one setting for all protocols
                proxies['http'] = server
            else:
                proxies['http'] = 'http://%s' % server
                proxies['https'] = 'https://%s' % server
                proxies['ftp'] = 'ftp://%s' % server
        settings.Close()
    except (WindowsError, ValueError, TypeError):
        # Either registry key not found etc, or the value in an
        # unexpected format.
        # proxies already set up to be empty so nothing to do
        pass
    return proxies
2338
def getproxies():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Returns settings gathered from the environment, if specified,
    or the registry.

    """
    from_environment = getproxies_environment()
    if from_environment:
        return from_environment
    return getproxies_registry()
2347
def proxy_bypass_registry(host):
    """Test *host* against the ProxyOverride list in the Windows registry.

    Returns 1 when the host name, its resolved address or its fully
    qualified name matches an override entry (glob syntax,
    case-insensitive), or when the entry '<local>' is present and the
    host name contains no dot.  Returns 0 otherwise, including when
    the registry cannot be read or proxies are disabled.
    """
    try:
        import winreg
        import re
    except ImportError:
        # Std modules, so should be around - but you never know!
        return 0
    try:
        settings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
        enabled = winreg.QueryValueEx(settings, 'ProxyEnable')[0]
        # Returned as Unicode but problems if not converted to ASCII
        override = str(winreg.QueryValueEx(settings, 'ProxyOverride')[0])
    except WindowsError:
        return 0
    if not (enabled and override):
        return 0
    # try to make a host list from name and IP address.
    rawHost, port = splitport(host)
    candidates = [rawHost]
    try:
        addr = socket.gethostbyname(rawHost)
        if addr != rawHost:
            candidates.append(addr)
    except socket.error:
        pass
    try:
        fqdn = socket.getfqdn(rawHost)
        if fqdn != rawHost:
            candidates.append(fqdn)
    except socket.error:
        pass
    # now check if we match one of the registry values.
    for entry in override.split(';'):
        if entry == '<local>' and '.' not in rawHost:
            return 1
        # Translate the glob into a regular expression: mask dots,
        # then map the glob sequence and the glob character.
        pattern = (entry.replace(".", r"\.")
                        .replace("*", r".*")
                        .replace("?", r"."))
        if any(re.match(pattern, name, re.I) for name in candidates):
            return 1
    return 0
2399
def proxy_bypass(host):
    """Test if proxies should not be used for *host*.

    The decision is taken from the environment when any <scheme>_proxy
    variable is set there, and from the registry otherwise.

    """
    if getproxies_environment():
        return proxy_bypass_environment(host)
    else:
        return proxy_bypass_registry(host)
2411
2412else:
2413 # By default use environment variables
2414 getproxies = getproxies_environment
2415 proxy_bypass = proxy_bypass_environment