"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the result as a file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work.  Each Handler implements a particular protocol or
option.  The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL.  For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns.  The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as the original
urllib.  Pass the URL and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, that argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.

Objects of interest:

OpenerDirector -- Sets up the User-Agent as the Python-urllib client and
manages the Handler classes, while dealing with requests and responses.

Request -- An object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler -- The parent class of all Handler classes.

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib.request

# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)

# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('http://www.python.org/')
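
# urlopen() raises URLError on failure (and HTTPError, a subclass that can
# also be treated as a response, for HTTP error codes).  A minimal sketch of
# catching both, assuming urllib.error is imported alongside urllib.request:

import urllib.error

try:
    f = urllib.request.urlopen('http://www.python.org/')
except urllib.error.HTTPError as e:
    print('the server could not fulfill the request:', e.code)
except urllib.error.URLError as e:
    print('failed to reach the server:', e.reason)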
"""

# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails, how should the error be signalled?  The client needs to
# know the HTTP error code.  But if the handler knows what the problem
# was, e.g., that it didn't recognize the hash algorithm requested in
# the challenge, it would be good to pass that information along to
# the client, too.
# FTP errors aren't handled cleanly.
# Check digest against a correct (i.e. non-Apache) implementation.

# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener

import base64
import bisect
import email
import hashlib
import http.client
import io
import os
import posixpath
import random
import re
import socket
import sys
import time
import collections

from urllib.error import URLError, HTTPError, ContentTooShortError
from urllib.parse import (
    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
    splittype, splithost, splitport, splituser, splitpasswd,
    splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
from urllib.response import addinfourl, addclosehook

# check for SSL
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

# used in User-Agent header sent
__version__ = sys.version[:3]

_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None):
    global _opener
    if cafile or capath:
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        if cafile or capath:
            context.verify_mode = ssl.CERT_REQUIRED
            context.load_verify_locations(cafile, capath)
            check_hostname = True
        else:
            check_hostname = False
        https_handler = HTTPSHandler(context=context, check_hostname=check_hostname)
        opener = build_opener(https_handler)
    elif _opener is None:
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)

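# A usage sketch for the keyword-only SSL arguments accepted above (the bundle
# path is a placeholder, not something this module provides):
#
#     f = urlopen("https://www.python.org/", cafile="/path/to/ca-bundle.crt")
#     data = f.read()
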
def install_opener(opener):
    global _opener
    _opener = opener

# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)

def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()
    global _opener
    if _opener:
        _opener = None

# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.full_url
    host = urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = _cut_port_re.sub("", host, 1)
    return host.lower()

class Request:

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.full_url, fragment = splittag(self.full_url)
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        for key, value in headers.items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        if self.data is not None:
            return "POST"
        else:
            return "GET"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())

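# An illustrative sketch of driving the Request class directly (URL, data and
# header values here are placeholders):
#
#     req = Request("http://www.example.com/form",
#                   data=b"name=value",
#                   headers={"User-Agent": "example-agent/1.0"})
#     req.add_unredirected_header("X-Example", "1")
#     f = urlopen(req)    # get_method() returns "POST" because data is set
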
class OpenerDirector:
    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http']  # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

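# A sketch of the method-name convention that add_handler() above relies on
# (ExampleHandler is hypothetical, not part of this module):
#
#     class ExampleHandler(BaseHandler):
#         def http_open(self, req): ...                 # -> handle_open['http']
#         def http_error_404(self, req, fp, code, msg, hdrs): ...
#                                                       # -> handle_error['http'][404]
#         def http_request(self, req): return req       # -> process_request['http']
#         def http_response(self, req, resp): return resp  # -> process_response['http']
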
# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and, when applicable, HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if isclass(check):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isclass(h):
            h = h()
        opener.add_handler(h)
    return opener

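# A brief sketch of the substitution rule described in the docstring above
# (LoggingHTTPHandler is hypothetical):
#
#     class LoggingHTTPHandler(HTTPHandler):
#         def http_open(self, req):
#             print("opening", req.full_url)
#             return super().http_open(req)
#
#     opener = build_opener(LoggingHTTPHandler)  # replaces the default HTTPHandler
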
class BaseHandler:
    handler_order = 500

    def add_parent(self, parent):
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if not hasattr(other, "handler_order"):
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other.handler_order


class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if not (200 <= code < 300):
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response

class HTTPDefaultErrorHandler(BaseHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.full_url, code, msg, hdrs, fp)

class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # Be lenient with URIs containing a space.
        newurl = newurl.replace(' ', '%20')
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if not urlparts.scheme in ('http', 'https', 'ftp'):
            raise HTTPError(newurl, code,
                            msg +
                            " - Redirection to url '%s' is not allowed" %
                            newurl,
                            headers, fp)

        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"


def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, r_scheme = splittype(proxy)
    if not r_scheme.startswith("/"):
        # authority
        scheme = None
        authority = proxy
    else:
        # URL
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
        # and 3.3.), path is empty or starts with '/'
        end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]
    userinfo, hostport = splituser(authority)
    if userinfo is not None:
        user, password = splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport

class ProxyHandler(BaseHandler):
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)

class HTTPPasswordMgr:

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if not realm in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False

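# A worked example of the matching above (illustrative values): with the
# default port filled in, reduce_uri('http://example.com/folder/page.html')
# returns ('example.com:80', '/folder/page.html'), and is_suburi() then
# accepts any path at or below '/folder/' on that same authority.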

class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):

    def find_user_password(self, realm, authuri):
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is not None:
            return user, password
        return HTTPPasswordMgr.find_user_password(self, None, authuri)


class AbstractBasicAuthHandler:

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                            headers, None)
        else:
            self.retried += 1

        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, quote, realm = mo.groups()
                if scheme.lower() == 'basic':
                    response = self.retry_http_basic_auth(host, req, realm)
                    if response and response.code != 401:
                        self.retried = 0
                    return response

    def retry_http_basic_auth(self, host, req, realm):
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None


class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.full_url
        response = self.http_error_auth_reqed('www-authenticate',
                                              url, req, headers)
        self.reset_retry_count()
        return response


class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.host
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              authority, req, headers)
        self.reset_retry_count()
        return response


def randombytes(n):
    """Return n random bytes."""
    return os.urandom(n)

class AbstractDigestAuthHandler:
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" support is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time.  Hm.  Unless the Password Manager is
            # prompting for the information.  Crap.  This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop == 'auth':
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

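        # For reference (a restatement of RFC 2617 matching the code above):
        # A1 = user:realm:password and A2 = method:uri; the request digest is
        # KD(H(A1), nonce:nc:cnonce:qop:H(A2)) when qop=auth, and
        # KD(H(A1), nonce:H(A2)) when the challenge carries no qop.
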
        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # unsupported algorithm: callers check for H being None
            H = None
        # XXX MD5-sess
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None


class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        host = urlparse(req.full_url)[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry


class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.host
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry

class AbstractHTTPHandler(BaseHandler):

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                raise TypeError("POST data should be bytes"
                                " or an iterable of bytes. It cannot be str.")
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                try:
                    mv = memoryview(data)
                except TypeError:
                    if isinstance(data, collections.Iterable):
                        raise ValueError("Content-Length should be specified "
                                "for iterable data of type %r %r" % (type(data),
                                data))
                else:
                    request.add_unredirected_header(
                            'Content-length', '%d' % (len(mv) * mv.itemsize))

        sel_host = host
        if request.has_proxy():
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)

        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            h.request(req.get_method(), req.selector, req.data, headers)
            r = h.getresponse()  # an HTTPResponse instance
        except socket.error as err:
            raise URLError(err)

        r.url = req.full_url
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute as deprecated and get clients to use info() or
        # .headers instead.
        r.msg = r.reason
        return r


class HTTPHandler(AbstractHTTPHandler):

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_

if hasattr(http.client, 'HTTPSConnection'):
    import ssl

    class HTTPSHandler(AbstractHTTPHandler):

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req,
                context=self._context, check_hostname=self._check_hostname)

        https_request = AbstractHTTPHandler.do_request_

class HTTPCookieProcessor(BaseHandler):
    def __init__(self, cookiejar=None):
        import http.cookiejar
        if cookiejar is None:
            cookiejar = http.cookiejar.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response

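# A usage sketch for cookie handling (illustrative only): pair the processor
# with build_opener() so cookies set by responses are resent on later requests.
#
#     import http.cookiejar
#     cj = http.cookiejar.CookieJar()
#     opener = build_opener(HTTPCookieProcessor(cj))
#     opener.open("http://www.example.com/")
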
class UnknownHandler(BaseHandler):
    def unknown_open(self, req):
        type = req.type
        raise URLError('unknown url type: %s' % type)

def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated."""
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        if v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed

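# A worked example of the two parsers on a Digest-style challenge (values are
# illustrative):
#
#     parse_http_list('realm="test", nonce="abc", qop="auth"')
#         -> ['realm="test"', 'nonce="abc"', 'qop="auth"']
#     parse_keqv_list(['realm="test"', 'nonce="abc"', 'qop="auth"'])
#         -> {'realm': 'test', 'nonce': 'abc', 'qop': 'auth'}
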
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    res = []
    part = ''

    escape = quote = False
    for cur in s:
        if escape:
            part += cur
            escape = False
            continue
        if quote:
            if cur == '\\':
                escape = True
                continue
            elif cur == '"':
                quote = False
            part += cur
            continue

        if cur == ',':
            res.append(part)
            part = ''
            continue

        if cur == '"':
            quote = True

        part += cur

    # append last part
    if part:
        res.append(part)

    return [part.strip() for part in res]

class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            if not req.host in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')

def _safe_gethostbyname(host):
    try:
        return socket.gethostbyname(host)
    except socket.gaierror:
        return None

class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
        return fw

class CacheFTPHandler(FTPHandler):
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}
        self.timeout = {}
        self.soonest = 0
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            self.soonest = min(list(self.timeout.values()))

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(list(self.timeout.values()))

1410# Code move from the old urllib module
1411
1412MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
1413
1414# Helper for non-unix systems
Ronald Oussoren94f25282010-05-05 19:11:21 +00001415if os.name == 'nt':
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001416 from nturl2path import url2pathname, pathname2url
1417else:
1418 def url2pathname(pathname):
1419 """OS-specific conversion from a relative URL of the 'file' scheme
1420 to a file system path; not recommended for general use."""
Georg Brandl13e89462008-07-01 19:56:00 +00001421 return unquote(pathname)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001422
1423 def pathname2url(pathname):
1424 """OS-specific conversion from a file system path to a relative URL
1425 of the 'file' scheme; not recommended for general use."""
Georg Brandl13e89462008-07-01 19:56:00 +00001426 return quote(pathname)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001427
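# Illustrative round trip for the generic fallbacks above, which simply
# percent-quote and unquote (on Windows, nturl2path does the real work):
#
#   pathname2url('/tmp/a b.txt')    -> '/tmp/a%20b.txt'
#   url2pathname('/tmp/a%20b.txt')  -> '/tmp/a b.txt'
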
1428# This really consists of two pieces:
1429# (1) a class which handles opening of all sorts of URLs
1430# (plus assorted utilities etc.)
1431# (2) a set of functions for parsing URLs
1432# XXX Should these be separated out into different modules?
1433
1434
1435ftpcache = {}
1436class URLopener:
1437 """Class to open URLs.
1438 This is a class rather than just a subroutine because we may need
1439 more than one set of global protocol-specific options.
1440 Note -- this is a base class for those who don't want the
1441    automatic handling of error types 302 (relocated) and 401
1442 (authorization needed)."""
1443
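    # Minimal usage sketch for this legacy class (urllib.request.urlopen is
    # the preferred interface); the URL and proxy mapping are placeholders:
    #
    #   opener = URLopener(proxies={'http': 'http://proxy.example.com:3128'})
    #   f = opener.open('http://www.example.com/')
    #   print(f.read())
    #   f.close()
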
1444 __tempfiles = None
1445
1446 version = "Python-urllib/%s" % __version__
1447
1448 # Constructor
1449 def __init__(self, proxies=None, **x509):
1450 if proxies is None:
1451 proxies = getproxies()
1452 assert hasattr(proxies, 'keys'), "proxies must be a mapping"
1453 self.proxies = proxies
1454 self.key_file = x509.get('key_file')
1455 self.cert_file = x509.get('cert_file')
1456 self.addheaders = [('User-Agent', self.version)]
1457 self.__tempfiles = []
1458 self.__unlink = os.unlink # See cleanup()
1459 self.tempcache = None
1460 # Undocumented feature: if you assign {} to tempcache,
1461 # it is used to cache files retrieved with
1462 # self.retrieve(). This is not enabled by default
1463 # since it does not work for changing documents (and I
1464 # haven't got the logic to check expiration headers
1465 # yet).
1466 self.ftpcache = ftpcache
1467 # Undocumented feature: you can use a different
1468 # ftp cache by assigning to the .ftpcache member;
1469 # in case you want logically independent URL openers
1470 # XXX This is not threadsafe. Bah.
1471
1472 def __del__(self):
1473 self.close()
1474
1475 def close(self):
1476 self.cleanup()
1477
1478 def cleanup(self):
1479 # This code sometimes runs when the rest of this module
1480 # has already been deleted, so it can't use any globals
1481 # or import anything.
1482 if self.__tempfiles:
1483 for file in self.__tempfiles:
1484 try:
1485 self.__unlink(file)
1486 except OSError:
1487 pass
1488 del self.__tempfiles[:]
1489 if self.tempcache:
1490 self.tempcache.clear()
1491
1492 def addheader(self, *args):
1493 """Add a header to be used by the HTTP interface only
1494 e.g. u.addheader('Accept', 'sound/basic')"""
1495 self.addheaders.append(args)
1496
1497 # External interface
1498 def open(self, fullurl, data=None):
1499 """Use URLopener().open(file) instead of open(file, 'r')."""
Georg Brandl13e89462008-07-01 19:56:00 +00001500 fullurl = unwrap(to_bytes(fullurl))
Senthil Kumaran734f0592010-02-20 22:19:04 +00001501 fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001502 if self.tempcache and fullurl in self.tempcache:
1503 filename, headers = self.tempcache[fullurl]
1504 fp = open(filename, 'rb')
Georg Brandl13e89462008-07-01 19:56:00 +00001505 return addinfourl(fp, headers, fullurl)
1506 urltype, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001507 if not urltype:
1508 urltype = 'file'
1509 if urltype in self.proxies:
1510 proxy = self.proxies[urltype]
Georg Brandl13e89462008-07-01 19:56:00 +00001511 urltype, proxyhost = splittype(proxy)
1512 host, selector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001513 url = (host, fullurl) # Signal special case to open_*()
1514 else:
1515 proxy = None
1516 name = 'open_' + urltype
1517 self.type = urltype
1518 name = name.replace('-', '_')
1519 if not hasattr(self, name):
1520 if proxy:
1521 return self.open_unknown_proxy(proxy, fullurl, data)
1522 else:
1523 return self.open_unknown(fullurl, data)
1524 try:
1525 if data is None:
1526 return getattr(self, name)(url)
1527 else:
1528 return getattr(self, name)(url, data)
1529 except socket.error as msg:
1530 raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1531
1532 def open_unknown(self, fullurl, data=None):
1533 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001534 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001535 raise IOError('url error', 'unknown url type', type)
1536
1537 def open_unknown_proxy(self, proxy, fullurl, data=None):
1538 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001539 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001540 raise IOError('url error', 'invalid proxy for %s' % type, proxy)
1541
1542 # External interface
1543 def retrieve(self, url, filename=None, reporthook=None, data=None):
1544 """retrieve(url) returns (filename, headers) for a local object
1545 or (tempfilename, headers) for a remote object."""
Georg Brandl13e89462008-07-01 19:56:00 +00001546 url = unwrap(to_bytes(url))
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001547 if self.tempcache and url in self.tempcache:
1548 return self.tempcache[url]
Georg Brandl13e89462008-07-01 19:56:00 +00001549 type, url1 = splittype(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001550 if filename is None and (not type or type == 'file'):
1551 try:
1552 fp = self.open_local_file(url1)
1553 hdrs = fp.info()
Philip Jenveycb134d72009-12-03 02:45:01 +00001554 fp.close()
Georg Brandl13e89462008-07-01 19:56:00 +00001555 return url2pathname(splithost(url1)[1]), hdrs
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001556 except IOError as msg:
1557 pass
1558 fp = self.open(url, data)
Benjamin Peterson5f28b7b2009-03-26 21:49:58 +00001559 try:
1560 headers = fp.info()
1561 if filename:
1562 tfp = open(filename, 'wb')
1563 else:
1564 import tempfile
1565 garbage, path = splittype(url)
1566 garbage, path = splithost(path or "")
1567 path, garbage = splitquery(path or "")
1568 path, garbage = splitattr(path or "")
1569 suffix = os.path.splitext(path)[1]
1570 (fd, filename) = tempfile.mkstemp(suffix)
1571 self.__tempfiles.append(filename)
1572 tfp = os.fdopen(fd, 'wb')
1573 try:
1574 result = filename, headers
1575 if self.tempcache is not None:
1576 self.tempcache[url] = result
1577 bs = 1024*8
1578 size = -1
1579 read = 0
1580 blocknum = 0
1581 if reporthook:
1582 if "content-length" in headers:
1583 size = int(headers["Content-Length"])
1584 reporthook(blocknum, bs, size)
1585 while 1:
1586 block = fp.read(bs)
1587 if not block:
1588 break
1589 read += len(block)
1590 tfp.write(block)
1591 blocknum += 1
1592 if reporthook:
1593 reporthook(blocknum, bs, size)
1594 finally:
1595 tfp.close()
1596 finally:
1597 fp.close()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001598
1599 # raise exception if actual size does not match content-length header
1600 if size >= 0 and read < size:
Georg Brandl13e89462008-07-01 19:56:00 +00001601 raise ContentTooShortError(
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001602 "retrieval incomplete: got only %i out of %i bytes"
1603 % (read, size), result)
1604
1605 return result
1606
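    # Sketch of retrieve() with a progress callback; the URL is a placeholder
    # and the callback signature matches the reporthook calls above:
    #
    #   def hook(blocknum, blocksize, totalsize):
    #       print("block %d, %d bytes each, %d bytes total expected"
    #             % (blocknum, blocksize, totalsize))
    #   filename, headers = URLopener().retrieve(
    #       'http://www.example.com/big.bin', reporthook=hook)
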
1607 # Each method named open_<type> knows how to open that type of URL
1608
1609 def _open_generic_http(self, connection_factory, url, data):
1610        """Make an HTTP connection using connection_factory.
1611
1612 This is an internal method that should be called from
1613 open_http() or open_https().
1614
1615 Arguments:
1616 - connection_factory should take a host name and return an
1617 HTTPConnection instance.
1618        - url is the url to retrieve or a (host, relative-path) pair.
1619 - data is payload for a POST request or None.
1620 """
1621
1622 user_passwd = None
1623        proxy_passwd = None
1624 if isinstance(url, str):
Georg Brandl13e89462008-07-01 19:56:00 +00001625 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001626 if host:
Georg Brandl13e89462008-07-01 19:56:00 +00001627 user_passwd, host = splituser(host)
1628 host = unquote(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001629 realhost = host
1630 else:
1631 host, selector = url
1632 # check whether the proxy contains authorization information
Georg Brandl13e89462008-07-01 19:56:00 +00001633 proxy_passwd, host = splituser(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001634 # now we proceed with the url we want to obtain
Georg Brandl13e89462008-07-01 19:56:00 +00001635 urltype, rest = splittype(selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001636 url = rest
1637 user_passwd = None
1638 if urltype.lower() != 'http':
1639 realhost = None
1640 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001641 realhost, rest = splithost(rest)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001642 if realhost:
Georg Brandl13e89462008-07-01 19:56:00 +00001643 user_passwd, realhost = splituser(realhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001644 if user_passwd:
1645 selector = "%s://%s%s" % (urltype, realhost, rest)
1646 if proxy_bypass(realhost):
1647 host = realhost
1648
1649 #print "proxy via http:", host, selector
1650 if not host: raise IOError('http error', 'no host given')
1651
1652 if proxy_passwd:
1653 import base64
Senthil Kumaran5626eec2010-08-04 17:46:23 +00001654 proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001655 else:
1656 proxy_auth = None
1657
1658 if user_passwd:
1659 import base64
Senthil Kumaran5626eec2010-08-04 17:46:23 +00001660 auth = base64.b64encode(user_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001661 else:
1662 auth = None
1663 http_conn = connection_factory(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001664 headers = {}
1665 if proxy_auth:
1666 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1667 if auth:
1668 headers["Authorization"] = "Basic %s" % auth
1669 if realhost:
1670 headers["Host"] = realhost
Senthil Kumarand91ffca2011-03-19 17:25:27 +08001671
1672        # Add "Connection: close", since persistent connections are not yet
1673        # supported; this helps close the socket and avoid a ResourceWarning.
1674
1675 headers["Connection"] = "close"
1676
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001677 for header, value in self.addheaders:
1678 headers[header] = value
1679
1680 if data is not None:
1681 headers["Content-Type"] = "application/x-www-form-urlencoded"
1682 http_conn.request("POST", selector, data, headers)
1683 else:
1684 http_conn.request("GET", selector, headers=headers)
1685
1686 try:
1687 response = http_conn.getresponse()
1688 except http.client.BadStatusLine:
1689 # something went wrong with the HTTP status line
Georg Brandl13e89462008-07-01 19:56:00 +00001690 raise URLError("http protocol error: bad status line")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001691
1692 # According to RFC 2616, "2xx" code indicates that the client's
1693 # request was successfully received, understood, and accepted.
1694 if 200 <= response.status < 300:
Antoine Pitroub353c122009-02-11 00:39:14 +00001695 return addinfourl(response, response.msg, "http:" + url,
Georg Brandl13e89462008-07-01 19:56:00 +00001696 response.status)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001697 else:
1698 return self.http_error(
1699 url, response.fp,
1700 response.status, response.reason, response.msg, data)
1701
1702 def open_http(self, url, data=None):
1703 """Use HTTP protocol."""
1704 return self._open_generic_http(http.client.HTTPConnection, url, data)
1705
1706 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1707 """Handle http errors.
1708
1709 Derived class can override this, or provide specific handlers
1710 named http_error_DDD where DDD is the 3-digit error code."""
1711 # First check if there's a specific handler for this error
1712 name = 'http_error_%d' % errcode
1713 if hasattr(self, name):
1714 method = getattr(self, name)
1715 if data is None:
1716 result = method(url, fp, errcode, errmsg, headers)
1717 else:
1718 result = method(url, fp, errcode, errmsg, headers, data)
1719 if result: return result
1720 return self.http_error_default(url, fp, errcode, errmsg, headers)
1721
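    # Sketch: a subclass can take over one status code by defining an
    # http_error_<code> method as described above; the 404 handler below is
    # purely illustrative and must return a true value, otherwise http_error()
    # falls through to http_error_default():
    #
    #   class LenientOpener(URLopener):
    #       def http_error_404(self, url, fp, errcode, errmsg, headers):
    #           return addinfourl(fp, headers, "http:" + url, errcode)
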
1722 def http_error_default(self, url, fp, errcode, errmsg, headers):
1723 """Default error handler: close the connection and raise IOError."""
1724 void = fp.read()
1725 fp.close()
Georg Brandl13e89462008-07-01 19:56:00 +00001726 raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001727
1728 if _have_ssl:
1729 def _https_connection(self, host):
1730 return http.client.HTTPSConnection(host,
1731 key_file=self.key_file,
1732 cert_file=self.cert_file)
1733
1734 def open_https(self, url, data=None):
1735 """Use HTTPS protocol."""
1736 return self._open_generic_http(self._https_connection, url, data)
1737
1738 def open_file(self, url):
1739 """Use local file or FTP depending on form of URL."""
1740 if not isinstance(url, str):
1741 raise URLError('file error', 'proxy support for file protocol currently not implemented')
1742 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
Senthil Kumaran383c32d2010-10-14 11:57:35 +00001743 raise ValueError("file:// scheme is supported only on localhost")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001744 else:
1745 return self.open_local_file(url)
1746
1747 def open_local_file(self, url):
1748 """Use local file."""
1749 import mimetypes, email.utils
Georg Brandl13e89462008-07-01 19:56:00 +00001751 host, file = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001752 localname = url2pathname(file)
1753 try:
1754 stats = os.stat(localname)
1755 except OSError as e:
1756            raise URLError(e.strerror, e.filename)
1757 size = stats.st_size
1758 modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
1759 mtype = mimetypes.guess_type(url)[0]
1760 headers = email.message_from_string(
1761 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
1762 (mtype or 'text/plain', size, modified))
1763 if not host:
1764 urlfile = file
1765 if file[:1] == '/':
1766 urlfile = 'file://' + file
Georg Brandl13e89462008-07-01 19:56:00 +00001767 return addinfourl(open(localname, 'rb'), headers, urlfile)
1768 host, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001769 if (not port
Senthil Kumaran99b2c8f2009-12-27 10:13:39 +00001770            and socket.gethostbyname(host) in ((localhost(),) + thishost())):
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001771 urlfile = file
1772 if file[:1] == '/':
1773 urlfile = 'file://' + file
Georg Brandl13e89462008-07-01 19:56:00 +00001774 return addinfourl(open(localname, 'rb'), headers, urlfile)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001775 raise URLError('local file error', 'not on local host')
1776
1777 def open_ftp(self, url):
1778 """Use FTP protocol."""
1779 if not isinstance(url, str):
1780 raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
1781 import mimetypes
Georg Brandl13e89462008-07-01 19:56:00 +00001783 host, path = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001784 if not host: raise URLError('ftp error', 'no host given')
Georg Brandl13e89462008-07-01 19:56:00 +00001785 host, port = splitport(host)
1786 user, host = splituser(host)
1787 if user: user, passwd = splitpasswd(user)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001788 else: passwd = None
Georg Brandl13e89462008-07-01 19:56:00 +00001789 host = unquote(host)
1790 user = unquote(user or '')
1791 passwd = unquote(passwd or '')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001792 host = socket.gethostbyname(host)
1793 if not port:
1794 import ftplib
1795 port = ftplib.FTP_PORT
1796 else:
1797 port = int(port)
Georg Brandl13e89462008-07-01 19:56:00 +00001798 path, attrs = splitattr(path)
1799 path = unquote(path)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001800 dirs = path.split('/')
1801 dirs, file = dirs[:-1], dirs[-1]
1802 if dirs and not dirs[0]: dirs = dirs[1:]
1803 if dirs and not dirs[0]: dirs[0] = '/'
1804 key = user, host, port, '/'.join(dirs)
1805 # XXX thread unsafe!
1806 if len(self.ftpcache) > MAXFTPCACHE:
1807 # Prune the cache, rather arbitrarily
1808            for k in list(self.ftpcache):  # copy: entries are deleted below
1809 if k != key:
1810 v = self.ftpcache[k]
1811 del self.ftpcache[k]
1812 v.close()
1813 try:
1814            if key not in self.ftpcache:
1815 self.ftpcache[key] = \
1816 ftpwrapper(user, passwd, host, port, dirs)
1817 if not file: type = 'D'
1818 else: type = 'I'
1819 for attr in attrs:
Georg Brandl13e89462008-07-01 19:56:00 +00001820 attr, value = splitvalue(attr)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001821 if attr.lower() == 'type' and \
1822 value in ('a', 'A', 'i', 'I', 'd', 'D'):
1823 type = value.upper()
1824 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
1825 mtype = mimetypes.guess_type("ftp:" + url)[0]
1826 headers = ""
1827 if mtype:
1828 headers += "Content-Type: %s\n" % mtype
1829 if retrlen is not None and retrlen >= 0:
1830 headers += "Content-Length: %d\n" % retrlen
1831 headers = email.message_from_string(headers)
Georg Brandl13e89462008-07-01 19:56:00 +00001832 return addinfourl(fp, headers, "ftp:" + url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001833 except ftperrors() as msg:
1834 raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1835
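    # Illustrative URL forms that end up in open_ftp(); host and credentials
    # are placeholders.  A trailing ";type=a" or ";type=i" selects ASCII or
    # binary transfer, and a path ending in '/' produces a directory listing:
    #
    #   URLopener().open('ftp://ftp.example.com/pub/')
    #   URLopener().open('ftp://user:secret@ftp.example.com/pub/data.bin;type=i')
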
1836 def open_data(self, url, data=None):
1837 """Use "data" URL."""
1838 if not isinstance(url, str):
1839 raise URLError('data error', 'proxy support for data protocol currently not implemented')
1840 # ignore POSTed data
1841 #
1842 # syntax of data URLs:
1843 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
1844 # mediatype := [ type "/" subtype ] *( ";" parameter )
1845 # data := *urlchar
1846 # parameter := attribute "=" value
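        # Illustrative data URLs matching the grammar above (RFC 2397):
        #   data:,Hello%2C%20World%21
        #   data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==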
1847 try:
1848 [type, data] = url.split(',', 1)
1849 except ValueError:
1850 raise IOError('data error', 'bad data URL')
1851 if not type:
1852 type = 'text/plain;charset=US-ASCII'
1853 semi = type.rfind(';')
1854 if semi >= 0 and '=' not in type[semi:]:
1855 encoding = type[semi+1:]
1856 type = type[:semi]
1857 else:
1858 encoding = ''
1859 msg = []
Senthil Kumaranf6c456d2010-05-01 08:29:18 +00001860 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001861 time.gmtime(time.time())))
1862 msg.append('Content-type: %s' % type)
1863 if encoding == 'base64':
1864 import base64
Georg Brandl706824f2009-06-04 09:42:55 +00001865 # XXX is this encoding/decoding ok?
1866 data = base64.decodebytes(data.encode('ascii')).decode('latin1')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001867 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001868 data = unquote(data)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001869 msg.append('Content-Length: %d' % len(data))
1870 msg.append('')
1871 msg.append(data)
1872 msg = '\n'.join(msg)
Georg Brandl13e89462008-07-01 19:56:00 +00001873 headers = email.message_from_string(msg)
1874 f = io.StringIO(msg)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001875 #f.fileno = None # needed for addinfourl
Georg Brandl13e89462008-07-01 19:56:00 +00001876 return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001877
1878
1879class FancyURLopener(URLopener):
1880 """Derived class with handlers for errors we can handle (perhaps)."""
1881
1882 def __init__(self, *args, **kwargs):
1883 URLopener.__init__(self, *args, **kwargs)
1884 self.auth_cache = {}
1885 self.tries = 0
1886 self.maxtries = 10
1887
1888 def http_error_default(self, url, fp, errcode, errmsg, headers):
1889 """Default error handling -- don't raise an exception."""
Georg Brandl13e89462008-07-01 19:56:00 +00001890 return addinfourl(fp, headers, "http:" + url, errcode)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001891
1892 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
1893 """Error 302 -- relocated (temporarily)."""
1894 self.tries += 1
1895 if self.maxtries and self.tries >= self.maxtries:
1896 if hasattr(self, "http_error_500"):
1897 meth = self.http_error_500
1898 else:
1899 meth = self.http_error_default
1900 self.tries = 0
1901 return meth(url, fp, 500,
1902 "Internal Server Error: Redirect Recursion", headers)
1903 result = self.redirect_internal(url, fp, errcode, errmsg, headers,
1904 data)
1905 self.tries = 0
1906 return result
1907
1908 def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
1909 if 'location' in headers:
1910 newurl = headers['location']
1911 elif 'uri' in headers:
1912 newurl = headers['uri']
1913 else:
1914 return
1915 void = fp.read()
1916 fp.close()
guido@google.coma119df92011-03-29 11:41:02 -07001917
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001918 # In case the server sent a relative URL, join with original:
Georg Brandl13e89462008-07-01 19:56:00 +00001919 newurl = urljoin(self.type + ":" + url, newurl)
guido@google.coma119df92011-03-29 11:41:02 -07001920
1921 urlparts = urlparse(newurl)
1922
1923 # For security reasons, we don't allow redirection to anything other
1924 # than http, https and ftp.
1925
1926 # We are using newer HTTPError with older redirect_internal method
1927        # This older method will be deprecated in 3.3.
1928
1929        if urlparts.scheme not in ('http', 'https', 'ftp'):
1930 raise HTTPError(newurl, errcode,
1931 errmsg +
1932 " Redirection to url '%s' is not allowed." % newurl,
1933 headers, fp)
1934
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001935 return self.open(newurl)
1936
1937 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
1938 """Error 301 -- also relocated (permanently)."""
1939 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
1940
1941 def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
1942 """Error 303 -- also relocated (essentially identical to 302)."""
1943 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
1944
1945 def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
1946 """Error 307 -- relocated, but turn POST into error."""
1947 if data is None:
1948 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
1949 else:
1950 return self.http_error_default(url, fp, errcode, errmsg, headers)
1951
Senthil Kumaran80f1b052010-06-18 15:08:18 +00001952 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
1953 retry=False):
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001954 """Error 401 -- authentication required.
1955 This function supports Basic authentication only."""
1956        if 'www-authenticate' not in headers:
1957 URLopener.http_error_default(self, url, fp,
1958 errcode, errmsg, headers)
1959 stuff = headers['www-authenticate']
1960 import re
1961 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
1962 if not match:
1963 URLopener.http_error_default(self, url, fp,
1964 errcode, errmsg, headers)
1965 scheme, realm = match.groups()
1966 if scheme.lower() != 'basic':
1967 URLopener.http_error_default(self, url, fp,
1968 errcode, errmsg, headers)
Senthil Kumaran80f1b052010-06-18 15:08:18 +00001969 if not retry:
1970 URLopener.http_error_default(self, url, fp, errcode, errmsg,
1971 headers)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001972 name = 'retry_' + self.type + '_basic_auth'
1973 if data is None:
1974 return getattr(self,name)(url, realm)
1975 else:
1976 return getattr(self,name)(url, realm, data)
1977
Senthil Kumaran80f1b052010-06-18 15:08:18 +00001978 def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
1979 retry=False):
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001980 """Error 407 -- proxy authentication required.
1981 This function supports Basic authentication only."""
1982        if 'proxy-authenticate' not in headers:
1983 URLopener.http_error_default(self, url, fp,
1984 errcode, errmsg, headers)
1985 stuff = headers['proxy-authenticate']
1986 import re
1987 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
1988 if not match:
1989 URLopener.http_error_default(self, url, fp,
1990 errcode, errmsg, headers)
1991 scheme, realm = match.groups()
1992 if scheme.lower() != 'basic':
1993 URLopener.http_error_default(self, url, fp,
1994 errcode, errmsg, headers)
Senthil Kumaran80f1b052010-06-18 15:08:18 +00001995 if not retry:
1996 URLopener.http_error_default(self, url, fp, errcode, errmsg,
1997 headers)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001998 name = 'retry_proxy_' + self.type + '_basic_auth'
1999 if data is None:
2000 return getattr(self,name)(url, realm)
2001 else:
2002 return getattr(self,name)(url, realm, data)
2003
2004 def retry_proxy_http_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00002005 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002006 newurl = 'http://' + host + selector
2007 proxy = self.proxies['http']
Georg Brandl13e89462008-07-01 19:56:00 +00002008 urltype, proxyhost = splittype(proxy)
2009 proxyhost, proxyselector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002010 i = proxyhost.find('@') + 1
2011 proxyhost = proxyhost[i:]
2012 user, passwd = self.get_user_passwd(proxyhost, realm, i)
2013 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00002014 proxyhost = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002015 quote(passwd, safe=''), proxyhost)
2016 self.proxies['http'] = 'http://' + proxyhost + proxyselector
2017 if data is None:
2018 return self.open(newurl)
2019 else:
2020 return self.open(newurl, data)
2021
2022 def retry_proxy_https_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00002023 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002024 newurl = 'https://' + host + selector
2025 proxy = self.proxies['https']
Georg Brandl13e89462008-07-01 19:56:00 +00002026 urltype, proxyhost = splittype(proxy)
2027 proxyhost, proxyselector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002028 i = proxyhost.find('@') + 1
2029 proxyhost = proxyhost[i:]
2030 user, passwd = self.get_user_passwd(proxyhost, realm, i)
2031 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00002032 proxyhost = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002033 quote(passwd, safe=''), proxyhost)
2034 self.proxies['https'] = 'https://' + proxyhost + proxyselector
2035 if data is None:
2036 return self.open(newurl)
2037 else:
2038 return self.open(newurl, data)
2039
2040 def retry_http_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00002041 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002042 i = host.find('@') + 1
2043 host = host[i:]
2044 user, passwd = self.get_user_passwd(host, realm, i)
2045 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00002046 host = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002047 quote(passwd, safe=''), host)
2048 newurl = 'http://' + host + selector
2049 if data is None:
2050 return self.open(newurl)
2051 else:
2052 return self.open(newurl, data)
2053
2054 def retry_https_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00002055 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002056 i = host.find('@') + 1
2057 host = host[i:]
2058 user, passwd = self.get_user_passwd(host, realm, i)
2059 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00002060 host = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002061 quote(passwd, safe=''), host)
2062 newurl = 'https://' + host + selector
2063 if data is None:
2064 return self.open(newurl)
2065 else:
2066 return self.open(newurl, data)
2067
Florent Xicluna757445b2010-05-17 17:24:07 +00002068 def get_user_passwd(self, host, realm, clear_cache=0):
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002069 key = realm + '@' + host.lower()
2070 if key in self.auth_cache:
2071 if clear_cache:
2072 del self.auth_cache[key]
2073 else:
2074 return self.auth_cache[key]
2075 user, passwd = self.prompt_user_passwd(host, realm)
2076 if user or passwd: self.auth_cache[key] = (user, passwd)
2077 return user, passwd
2078
2079 def prompt_user_passwd(self, host, realm):
2080 """Override this in a GUI environment!"""
2081 import getpass
2082 try:
2083 user = input("Enter username for %s at %s: " % (realm, host))
2084 passwd = getpass.getpass("Enter password for %s in %s at %s: " %
2085 (user, realm, host))
2086 return user, passwd
2087 except KeyboardInterrupt:
2088 print()
2089 return None, None
2090
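    # Sketch: a GUI front end would override prompt_user_passwd() rather than
    # read from the terminal; ask_credentials() below is a hypothetical helper
    # returning a (user, passwd) pair:
    #
    #   class GuiOpener(FancyURLopener):
    #       def prompt_user_passwd(self, host, realm):
    #           return ask_credentials(host, realm)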
2091
2092# Utility functions
2093
2094_localhost = None
2095def localhost():
2096 """Return the IP address of the magic hostname 'localhost'."""
2097 global _localhost
2098 if _localhost is None:
2099 _localhost = socket.gethostbyname('localhost')
2100 return _localhost
2101
2102_thishost = None
2103def thishost():
Senthil Kumaran99b2c8f2009-12-27 10:13:39 +00002104 """Return the IP addresses of the current host."""
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002105 global _thishost
2106 if _thishost is None:
Senthil Kumaran99b2c8f2009-12-27 10:13:39 +00002107        _thishost = tuple(socket.gethostbyname_ex(socket.gethostname())[2])
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002108 return _thishost
2109
2110_ftperrors = None
2111def ftperrors():
2112 """Return the set of errors raised by the FTP class."""
2113 global _ftperrors
2114 if _ftperrors is None:
2115 import ftplib
2116 _ftperrors = ftplib.all_errors
2117 return _ftperrors
2118
2119_noheaders = None
2120def noheaders():
Georg Brandl13e89462008-07-01 19:56:00 +00002121 """Return an empty email Message object."""
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002122 global _noheaders
2123 if _noheaders is None:
Georg Brandl13e89462008-07-01 19:56:00 +00002124 _noheaders = email.message_from_string("")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002125 return _noheaders
2126
2127
2128# Utility classes
2129
2130class ftpwrapper:
2131 """Class used by open_ftp() for cache of open FTP connections."""
2132
2133 def __init__(self, user, passwd, host, port, dirs, timeout=None):
2134 self.user = user
2135 self.passwd = passwd
2136 self.host = host
2137 self.port = port
2138 self.dirs = dirs
2139 self.timeout = timeout
2140 self.init()
2141
2142 def init(self):
2143 import ftplib
2144 self.busy = 0
2145 self.ftp = ftplib.FTP()
2146 self.ftp.connect(self.host, self.port, self.timeout)
2147 self.ftp.login(self.user, self.passwd)
2148 for dir in self.dirs:
2149 self.ftp.cwd(dir)
2150
2151 def retrfile(self, file, type):
2152 import ftplib
2153 self.endtransfer()
2154 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
2155 else: cmd = 'TYPE ' + type; isdir = 0
2156 try:
2157 self.ftp.voidcmd(cmd)
2158 except ftplib.all_errors:
2159 self.init()
2160 self.ftp.voidcmd(cmd)
2161 conn = None
2162 if file and not isdir:
2163 # Try to retrieve as a file
2164 try:
2165 cmd = 'RETR ' + file
Senthil Kumaran2024acd2011-03-24 11:46:19 +08002166 conn, retrlen = self.ftp.ntransfercmd(cmd)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002167 except ftplib.error_perm as reason:
2168 if str(reason)[:3] != '550':
Georg Brandl13e89462008-07-01 19:56:00 +00002169 raise URLError('ftp error', reason).with_traceback(
2170 sys.exc_info()[2])
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002171 if not conn:
2172 # Set transfer mode to ASCII!
2173 self.ftp.voidcmd('TYPE A')
2174 # Try a directory listing. Verify that directory exists.
2175 if file:
2176 pwd = self.ftp.pwd()
2177 try:
2178 try:
2179 self.ftp.cwd(file)
2180 except ftplib.error_perm as reason:
Georg Brandl13e89462008-07-01 19:56:00 +00002181 raise URLError('ftp error', reason) from reason
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002182 finally:
2183 self.ftp.cwd(pwd)
2184 cmd = 'LIST ' + file
2185 else:
2186 cmd = 'LIST'
Senthil Kumaran2024acd2011-03-24 11:46:19 +08002187 conn, retrlen = self.ftp.ntransfercmd(cmd)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002188 self.busy = 1
Senthil Kumaran2024acd2011-03-24 11:46:19 +08002189
2190 ftpobj = addclosehook(conn.makefile('rb'), self.endtransfer)
2191 conn.close()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002192 # Pass back both a suitably decorated object and a retrieval length
Senthil Kumaran2024acd2011-03-24 11:46:19 +08002193 return (ftpobj, retrlen)
2194
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002195 def endtransfer(self):
2196 if not self.busy:
2197 return
2198 self.busy = 0
2199 try:
2200 self.ftp.voidresp()
2201 except ftperrors():
2202 pass
2203
2204 def close(self):
2205 self.endtransfer()
2206 try:
2207 self.ftp.close()
2208 except ftperrors():
2209 pass
2210
2211# Proxy handling
2212def getproxies_environment():
2213 """Return a dictionary of scheme -> proxy server URL mappings.
2214
2215 Scan the environment for variables named <scheme>_proxy;
2216 this seems to be the standard convention. If you need a
2217 different way, you can pass a proxies dictionary to the
2218 [Fancy]URLopener constructor.
2219
2220 """
2221 proxies = {}
2222 for name, value in os.environ.items():
2223 name = name.lower()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002224 if value and name[-6:] == '_proxy':
2225 proxies[name[:-6]] = value
2226 return proxies
2227
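# Illustrative environment for the helpers above and below (values are
# placeholders); getproxies_environment() lower-cases the variable names, so
# HTTP_PROXY etc. are honored as well:
#
#   http_proxy=http://proxy.example.com:3128
#   ftp_proxy=http://proxy.example.com:3128
#   no_proxy=localhost,.internal.example.com
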
2228def proxy_bypass_environment(host):
2229 """Test if proxies should not be used for a particular host.
2230
2231 Checks the environment for a variable named no_proxy, which should
2232 be a list of DNS suffixes separated by commas, or '*' for all hosts.
2233 """
2234 no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
2235 # '*' is special case for always bypass
2236 if no_proxy == '*':
2237 return 1
2238 # strip port off host
Georg Brandl13e89462008-07-01 19:56:00 +00002239 hostonly, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002240 # check if the host ends with any of the DNS suffixes
2241 for name in no_proxy.split(','):
2242 if name and (hostonly.endswith(name) or host.endswith(name)):
2243 return 1
2244 # otherwise, don't bypass
2245 return 0
2246
2247
Ronald Oussorene72e1612011-03-14 18:15:25 -04002248# This code tests an OSX specific data structure but is testable on all
2249# platforms
2250def _proxy_bypass_macosx_sysconf(host, proxy_settings):
2251 """
2252 Return True iff this host shouldn't be accessed using a proxy
2253
2254 This function uses the MacOSX framework SystemConfiguration
2255 to fetch the proxy information.
2256
2257    proxy_settings comes from _scproxy._get_proxy_settings or is mocked for testing, e.g.:
2258 { 'exclude_simple': bool,
2259 'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
2260 }
2261 """
2262 import re
2263 import socket
2264 from fnmatch import fnmatch
2265
2266 hostonly, port = splitport(host)
2267
2268 def ip2num(ipAddr):
2269 parts = ipAddr.split('.')
2270 parts = list(map(int, parts))
2271 if len(parts) != 4:
2272 parts = (parts + [0, 0, 0, 0])[:4]
2273 return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
2274
2275 # Check for simple host names:
2276 if '.' not in host:
2277 if proxy_settings['exclude_simple']:
2278 return True
2279
2280 hostIP = None
2281
2282 for value in proxy_settings.get('exceptions', ()):
2283 # Items in the list are strings like these: *.local, 169.254/16
2284 if not value: continue
2285
2286 m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
2287 if m is not None:
2288 if hostIP is None:
2289 try:
2290 hostIP = socket.gethostbyname(hostonly)
2291 hostIP = ip2num(hostIP)
2292 except socket.error:
2293 continue
2294
2295 base = ip2num(m.group(1))
2296 mask = m.group(2)
2297 if mask is None:
2298 mask = 8 * (m.group(1).count('.') + 1)
2299 else:
2300 mask = int(mask[1:])
2301 mask = 32 - mask
2302
2303 if (hostIP >> mask) == (base >> mask):
2304 return True
2305
2306 elif fnmatch(host, value):
2307 return True
2308
2309 return False
2310
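# Sketch of exercising the helper above with a mocked settings dict; the host
# names and exception patterns are illustrative:
#
#   settings = {'exclude_simple': True,
#               'exceptions': ['*.local', '169.254/16']}
#   _proxy_bypass_macosx_sysconf('myhost', settings)          # True: no dot
#   _proxy_bypass_macosx_sysconf('169.254.10.1', settings)    # True: CIDR match
#   _proxy_bypass_macosx_sysconf('www.python.org', settings)  # False
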
2311
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002312if sys.platform == 'darwin':
Ronald Oussoren84151202010-04-18 20:46:11 +00002313 from _scproxy import _get_proxy_settings, _get_proxies
2314
2315 def proxy_bypass_macosx_sysconf(host):
Ronald Oussoren84151202010-04-18 20:46:11 +00002316 proxy_settings = _get_proxy_settings()
Ronald Oussorene72e1612011-03-14 18:15:25 -04002317 return _proxy_bypass_macosx_sysconf(host, proxy_settings)
Ronald Oussoren84151202010-04-18 20:46:11 +00002318
2319 def getproxies_macosx_sysconf():
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002320 """Return a dictionary of scheme -> proxy server URL mappings.
2321
Ronald Oussoren84151202010-04-18 20:46:11 +00002322 This function uses the MacOSX framework SystemConfiguration
2323 to fetch the proxy information.
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002324 """
Ronald Oussoren84151202010-04-18 20:46:11 +00002325 return _get_proxies()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002326
Ronald Oussoren84151202010-04-18 20:46:11 +00002327
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002328
2329 def proxy_bypass(host):
2330 if getproxies_environment():
2331 return proxy_bypass_environment(host)
2332 else:
Ronald Oussoren84151202010-04-18 20:46:11 +00002333 return proxy_bypass_macosx_sysconf(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002334
2335 def getproxies():
Ronald Oussoren84151202010-04-18 20:46:11 +00002336 return getproxies_environment() or getproxies_macosx_sysconf()
2337
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002338
2339elif os.name == 'nt':
2340 def getproxies_registry():
2341 """Return a dictionary of scheme -> proxy server URL mappings.
2342
2343 Win32 uses the registry to store proxies.
2344
2345 """
2346 proxies = {}
2347 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002348 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002349 except ImportError:
2350 # Std module, so should be around - but you never know!
2351 return proxies
2352 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002353 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002354 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002355 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002356 'ProxyEnable')[0]
2357 if proxyEnable:
2358 # Returned as Unicode but problems if not converted to ASCII
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002359 proxyServer = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002360 'ProxyServer')[0])
2361 if '=' in proxyServer:
2362 # Per-protocol settings
2363 for p in proxyServer.split(';'):
2364 protocol, address = p.split('=', 1)
2365 # See if address has a type:// prefix
2366 import re
2367 if not re.match('^([^/:]+)://', address):
2368 address = '%s://%s' % (protocol, address)
2369 proxies[protocol] = address
2370 else:
2371 # Use one setting for all protocols
2372 if proxyServer[:5] == 'http:':
2373 proxies['http'] = proxyServer
2374 else:
2375 proxies['http'] = 'http://%s' % proxyServer
Senthil Kumaran04f31b82010-07-14 20:10:52 +00002376 proxies['https'] = 'https://%s' % proxyServer
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002377 proxies['ftp'] = 'ftp://%s' % proxyServer
2378 internetSettings.Close()
2379 except (WindowsError, ValueError, TypeError):
2380 # Either registry key not found etc, or the value in an
2381 # unexpected format.
2382 # proxies already set up to be empty so nothing to do
2383 pass
2384 return proxies
2385
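    # Illustrative registry values read by getproxies_registry(), under
    # HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings
    # (server names are placeholders):
    #
    #   ProxyEnable = 1
    #   ProxyServer = "http=proxy.example.com:3128;ftp=proxy.example.com:3128"
    #   # or a single proxy for every protocol:
    #   ProxyServer = "proxy.example.com:3128"
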
2386 def getproxies():
2387 """Return a dictionary of scheme -> proxy server URL mappings.
2388
2389 Returns settings gathered from the environment, if specified,
2390 or the registry.
2391
2392 """
2393 return getproxies_environment() or getproxies_registry()
2394
2395 def proxy_bypass_registry(host):
2396 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002397 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002398 import re
2399 except ImportError:
2400 # Std modules, so should be around - but you never know!
2401 return 0
2402 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002403 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002404 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002405 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002406 'ProxyEnable')[0]
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002407 proxyOverride = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002408 'ProxyOverride')[0])
2409 # ^^^^ Returned as Unicode but problems if not converted to ASCII
2410 except WindowsError:
2411 return 0
2412 if not proxyEnable or not proxyOverride:
2413 return 0
2414 # try to make a host list from name and IP address.
Georg Brandl13e89462008-07-01 19:56:00 +00002415 rawHost, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002416 host = [rawHost]
2417 try:
2418 addr = socket.gethostbyname(rawHost)
2419 if addr != rawHost:
2420 host.append(addr)
2421 except socket.error:
2422 pass
2423 try:
2424 fqdn = socket.getfqdn(rawHost)
2425 if fqdn != rawHost:
2426 host.append(fqdn)
2427 except socket.error:
2428 pass
2429        # make a check value list from the registry entry; the special
2430        # '<local>' entry matches any host name that does not contain a dot.
2432 proxyOverride = proxyOverride.split(';')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002433 # now check if we match one of the registry values.
2434 for test in proxyOverride:
Senthil Kumaran49476062009-05-01 06:00:23 +00002435 if test == '<local>':
2436 if '.' not in rawHost:
2437 return 1
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002438 test = test.replace(".", r"\.") # mask dots
2439 test = test.replace("*", r".*") # change glob sequence
2440 test = test.replace("?", r".") # change glob char
2441 for val in host:
2442 # print "%s <--> %s" %( test, val )
2443 if re.match(test, val, re.I):
2444 return 1
2445 return 0
2446
2447 def proxy_bypass(host):
2448 """Return a dictionary of scheme -> proxy server URL mappings.
2449
2450 Returns settings gathered from the environment, if specified,
2451 or the registry.
2452
2453 """
2454 if getproxies_environment():
2455 return proxy_bypass_environment(host)
2456 else:
2457 return proxy_bypass_registry(host)
2458
2459else:
2460 # By default use environment variables
2461 getproxies = getproxies_environment
2462 proxy_bypass = proxy_bypass_environment