blob: 4436105a07d0a7ad1af3b0d57d7f8dd2448802d8 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
5below). It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib. pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back. One difference is that you can also pass
20a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers. Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate. If one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
Senthil Kumaran1107c5d2009-11-15 06:20:55 +000033
Senthil Kumaran47fff872009-12-20 07:10:31 +000034OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000036
37Request -- An object that encapsulates the state of a request. The
38state can be as simple as the URL. It can also include extra HTTP
39headers, e.g. a User-Agent.
40
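BaseHandler -- Common base class for handlers; provides add_parent() and
the handler_order attribute used to sort handlers.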
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
Georg Brandl029986a2008-06-23 11:44:14 +000049import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000050
51# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000052authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000053authinfo.add_password(realm='PDQ Application',
54 uri='https://mahler:8092/site-updates.py',
55 user='klem',
56 passwd='geheim$parole')
57
Georg Brandl029986a2008-06-23 11:44:14 +000058proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000059
60# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000061opener = urllib.request.build_opener(proxy_support, authinfo,
62 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000063
64# install it
Georg Brandl029986a2008-06-23 11:44:14 +000065urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066
Georg Brandl029986a2008-06-23 11:44:14 +000067f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000068"""
69
70# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails for some reason, how should the error be signalled?  The
# client needs to know the HTTP error code.  But if the handler knows
# what the problem was, e.g., that it didn't know the hash algorithm
# requested in the challenge, it would be good to pass that information
# along to the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000085import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000086import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import random
93import re
94import socket
95import sys
96import time
Senthil Kumaran7bc0d872010-12-19 10:49:52 +000097import collections
Jeremy Hylton1afc1692008-06-18 20:49:58 +000098
Georg Brandl13e89462008-07-01 19:56:00 +000099from urllib.error import URLError, HTTPError, ContentTooShortError
100from urllib.parse import (
101 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
102 splittype, splithost, splitport, splituser, splitpasswd,
Senthil Kumarand95cc752010-08-08 11:27:53 +0000103 splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000104from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000105
106# check for SSL
107try:
108 import ssl
Senthil Kumaranc2958622010-11-22 04:48:26 +0000109except ImportError:
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000110 _have_ssl = False
111else:
112 _have_ssl = True
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000113
# used in User-Agent header sent
# Built from sys.version_info rather than by slicing sys.version: a slice of
# three characters would turn "3.10" into "3.1".
__version__ = '%d.%d' % sys.version_info[:2]

# Process-wide default opener; created lazily by urlopen() or set explicitly
# through install_opener().
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None):
    """Open *url* and return the response produced by the opener.

    url     -- a URL string or a Request object (forwarded to opener.open).
    data    -- optional request body, forwarded to opener.open.
    timeout -- timeout forwarded to opener.open.
    cafile, capath -- keyword-only; when either is given, a one-off opener
        is built around an HTTPSHandler whose SSL context requires a valid
        certificate and checks the host name.  That opener is not cached.

    Without cafile/capath the module-level default opener is used; it is
    created with build_opener() on first use.

    Raises ValueError if cafile/capath is given but the ssl module could
    not be imported.
    """
    global _opener
    if cafile or capath:
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        # We only get here when a CA location was supplied, so verification
        # is always switched on (the former "no verification" branch could
        # never execute).
        context.verify_mode = ssl.CERT_REQUIRED
        context.load_verify_locations(cafile, capath)
        https_handler = HTTPSHandler(context=context, check_hostname=True)
        opener = build_opener(https_handler)
    elif _opener is None:
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000139
def install_opener(opener):
    """Make *opener* the module-level default opener used by urlopen()."""
    globals()['_opener'] = opener
143
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Delegate to the retrieve() method of the shared FancyURLopener.

    The shared instance is created on first use and kept in the
    module-level _urlopener; all arguments are passed through unchanged.
    """
    global _urlopener
    _urlopener = _urlopener or FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
151
def urlcleanup():
    """Call cleanup() on the shared legacy opener and drop the default opener."""
    global _opener
    legacy = _urlopener
    if legacy:
        legacy.cleanup()
    if _opener:
        _opener = None
158
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return the request-host of *request*, as defined by RFC 2965.

    The host is taken from the network-location part of request.full_url;
    when that is empty the "Host" header is used instead.  A trailing
    ":port" is removed.  Unlike the RFC, the result is lowercased so it
    can be compared directly.
    """
    netloc = urlparse(request.full_url)[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # strip the port, if any, then normalise case
    return _cut_port_re.sub("", netloc, 1).lower()
176
class Request:
    """Encapsulate the state of one URL request.

    Attributes set by the constructor:
      full_url          -- the URL with any '<URL:...>' wrapper and any
                           '#fragment' removed
      type              -- URL scheme
      host              -- unquoted host[:port], or the proxy after set_proxy()
      selector          -- the part of the URL after the host
      data              -- request body, or None
      headers           -- headers that are kept across redirects
      unredirected_hdrs -- headers that are not copied to redirected requests
      origin_req_host   -- request-host of the originating transaction
      unverifiable      -- flag stored as given by the caller
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False):
        """Build a request for *url*.

        headers may be a mapping of header names to values or None (no
        headers); names are normalised with str.capitalize().  When
        origin_req_host is None it is derived with request_host().

        Raises ValueError if the URL has no scheme.
        """
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        # The fragment is discarded; only the remainder is kept.
        self.full_url, _ = splittag(self.full_url)
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        if headers is not None:
            for key, value in headers.items():
                self.add_header(key, value)
        # Must exist before request_host() runs: it may call get_header().
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        """Split full_url into self.type, self.host and self.selector."""
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return "POST" when the request carries data, otherwise "GET"."""
        if self.data is not None:
            return "POST"
        else:
            return "GET"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route the request through the proxy *host* of scheme *type*.

        For the first proxy set on an https request the original host is
        remembered in _tunnel_host and the scheme/selector are left alone;
        otherwise the scheme becomes *type* and the selector becomes the
        full URL.  In both cases self.host becomes *host*.
        """
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        """Return True if the selector is the full URL (see set_proxy)."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if *header_name* is set (exact, capitalized name)."""
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Return a header value; regular headers win over unredirected ones."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return all headers as a list of (name, value) pairs.

        Regular headers override unredirected headers of the same name.
        """
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
273
class OpenerDirector:
    """Dispatch requests to registered handler objects.

    Handlers are sorted into lookup tables according to the names of
    their methods (see add_handler) and are invoked by open() and error().
    """

    def __init__(self):
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* under every hook its method names advertise.

        Attribute names are split at their first underscore into
        <protocol>_<condition>.  Conditions "open", "request" and
        "response" register under the protocol; conditions starting with
        "error" register under the protocol and the error kind (converted
        to int when numeric).  Raises TypeError if the handler has no
        add_parent attribute.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        plain_tables = {
            "open": self.handle_open,
            "response": self.process_response,
            "request": self.process_request,
        }
        registered = False
        for name in dir(handler):
            if name in ("redirect_request", "do_open", "proxy_open"):
                # oops, coincidental match
                continue

            sep = name.find("_")
            protocol = name[:sep]
            condition = name[sep + 1:]

            if condition.startswith("error"):
                tail = condition.find("_") + sep + 1
                kind = name[tail + 1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                table = self.handle_error.setdefault(protocol, {})
            elif condition in plain_tables:
                kind = protocol
                table = plain_tables[condition]
            else:
                continue

            bucket = table.setdefault(kind, [])
            if bucket:
                # keep each bucket ordered by the handlers' own ordering
                bisect.insort(bucket, handler)
            else:
                bucket.append(handler)
            registered = True

        if registered:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        for handler in chain.get(kind, ()):
            outcome = getattr(handler, meth_name)(*args)
            if outcome is not None:
                return outcome

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request) and return the response.

        The request passes through the <protocol>_request processors, the
        open chain, then the <protocol>_response processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        request_hook = protocol + "_request"
        for processor in self.process_request.get(protocol, []):
            req = getattr(processor, request_hook)(req)

        response = self._open(req, data)

        # post-process response
        response_hook = protocol + "_response"
        for processor in self.process_response.get(protocol, []):
            response = getattr(processor, response_hook)(req, response)

        return response

    def _open(self, req, data=None):
        """Try the 'default', protocol-specific and 'unknown' open chains in turn."""
        response = self._call_chain(self.handle_open, 'default',
                                    'default_open', req)
        if not response:
            scheme = req.type
            response = self._call_chain(self.handle_open, scheme,
                                        scheme + '_open', req)
        if not response:
            response = self._call_chain(self.handle_open, 'unknown',
                                        'unknown_open', req)
        return response

    def error(self, proto, *args):
        """Dispatch an error to the matching handler chain.

        For 'http' and 'https' the handlers registered for 'http' are
        used, keyed by the third positional argument (the status code),
        falling back to http_error_default.  Other protocols call
        <proto>_error.
        """
        is_http = proto in ('http', 'https')
        if is_http:
            # XXX http[s] protocols are special-cased
            table = self.handle_error['http']  # https is not different than http
            status = args[2]  # YUCK!
            result = self._call_chain(table, status,
                                      'http_error_%s' % status, *args)
        else:
            table = self.handle_error
            result = self._call_chain(table, proto, proto + '_error', *args)
        if result:
            return result

        if is_http:
            return self._call_chain(table, 'default',
                                    'http_error_default', *args)
411
412# XXX probably also want an abstract factory that knows when it makes
413# sense to skip a superclass in favor of a subclass and when it might
414# make sense to include both
415
def build_opener(*handlers):
    """Return an OpenerDirector populated with default and custom handlers.

    Each argument may be a handler instance or a handler class (which is
    instantiated without arguments).  A default handler class is left
    out when any argument is an instance or subclass of it.  HTTPSHandler
    is among the defaults only when http.client has HTTPSConnection.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    def overrides(candidate, default):
        # True when *candidate* (class or instance) stands in for *default*.
        if isclass(candidate):
            return issubclass(candidate, default)
        return isinstance(candidate, default)

    opener = OpenerDirector()
    defaults = [ProxyHandler, UnknownHandler, HTTPHandler,
                HTTPDefaultErrorHandler, HTTPRedirectHandler,
                FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        defaults.append(HTTPSHandler)

    replaced = {default
                for default in defaults
                for candidate in handlers
                if overrides(candidate, default)}

    for default in defaults:
        if default not in replaced:
            opener.add_handler(default())

    for candidate in handlers:
        opener.add_handler(candidate() if isclass(candidate) else candidate)
    return opener
453
class BaseHandler:
    """Base class for handlers registered with an OpenerDirector."""

    # Sort key used by __lt__; lower values sort first.
    handler_order = 500

    def add_parent(self, parent):
        """Remember the object this handler has been registered with."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        try:
            their_order = other.handler_order
        except AttributeError:
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < their_order
471
472
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return *response* for 2xx codes, else the parent's error() result."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= code < 300:
            return response
        return self.parent.error('http', request, response, code, msg, hdrs)

    https_response = http_response
489
class HTTPDefaultErrorHandler(BaseHandler):
    """Fallback error handler: always raises HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        url = req.full_url
        raise HTTPError(url, code, msg, hdrs, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000493
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        CONTENT_HEADERS = ("content-length", "content-type")
        # The new request carries no body, so body-describing headers go.
        newheaders = {k: v for k, v in req.headers.items()
                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow the redirect named by the Location (or URI) header.

        Returns the response of the redirected request, or None when there
        is no target header or redirect_request() declines.  Raises
        HTTPError when the target uses a scheme other than http, https or
        ftp, or when the loop-detection limits are exceeded.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        urlparts = urlparse(newurl)

        # For security reasons do not follow redirects to anything other
        # than http, https or ftp (e.g. file:).  An empty scheme means a
        # relative URL, which is resolved against the request URL below.
        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # fix a possible malformed URL: an absolute URL without a path.
        # Relative references (no netloc) are left for urljoin to resolve.
        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
586
587
def _parse_proxy(proxy):
    """Split a proxy spec into (scheme, user, password, host/port).

    *proxy* is either a bare authority or a URL.  A URL must contain an
    authority component, i.e. two slashes after the scheme (RFC 3986):

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    Any of the first three items of the result may be None.

    Bare authorities:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    Userinfo, when present, is taken to be username:password:

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    The same with URLs:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Whatever follows the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    A URL without a trailing '/':

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, remainder = splittype(proxy)
    if remainder.startswith("/"):
        # URL
        if not remainder.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
        # and 3.3.), path is empty or starts with '/'
        path_start = remainder.find("/", 2)
        if path_start == -1:
            authority = remainder[2:]
        else:
            authority = remainder[2:path_start]
    else:
        # bare authority: whatever splittype took for a scheme is not one
        scheme = None
        authority = proxy

    userinfo, hostport = splituser(authority)
    if userinfo is None:
        user = password = None
    else:
        user, password = splitpasswd(userinfo)
    return scheme, user, password, hostport
659
class ProxyHandler(BaseHandler):
    """Redirect requests through the proxies configured per URL scheme."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """Create one <scheme>_open method per entry of *proxies*.

        proxies maps a URL scheme to a proxy URL/authority; when None the
        result of getproxies() is used.
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies

        def bound_open(proxy, scheme, meth=self.proxy_open):
            # Freeze the loop values; a bare closure would see only the
            # last iteration's proxy and scheme.
            return lambda r: meth(r, proxy, scheme)

        for scheme, proxy_url in proxies.items():
            setattr(self, '%s_open' % scheme, bound_open(proxy_url, scheme))

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*.

        Returns None to let the remaining handlers continue with the
        (possibly modified) request, or the response of re-opening the
        request when the proxy uses a different scheme.
        """
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            credentials = '%s:%s' % (unquote(user), unquote(password))
            encoded = base64.b64encode(credentials.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + encoded)
        req.set_proxy(unquote(hostport), proxy_type)

        if orig_type in (proxy_type, 'https'):
            # let other handlers take care of it
            return None
        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000701
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and URI prefix."""

    def __init__(self):
        # realm -> {tuple of reduced URIs -> (user, passwd)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* below *uri*.

        uri may be a single URI string or a sequence of them.  Each URI is
        stored twice, with and without the scheme's default port, so that
        lookups match either spelling.
        """
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        realm_entries = self.passwd.setdefault(realm, {})
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            realm_entries[reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) registered for *authuri* in *realm*.

        Returns (None, None) when nothing matches.
        """
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path segments: "/foo" covers "/foo/bar" but must
        # not cover "/foobar", which a character-wise prefix would accept.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
764
765
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the realm None."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first; if no user is found, retry with realm None."""
        found = HTTPPasswordMgr.find_user_password(self, realm, authuri)
        if found[0] is not None:
            return found
        return HTTPPasswordMgr.find_user_password(self, None, authuri)
774
775
class AbstractBasicAuthHandler:
    """Shared logic for answering Basic authentication challenges.

    Subclasses provide `auth_header` (the request header to set) and, by
    being handlers, `parent` (the opener used to retry the request).
    """

    # Matches one `scheme realm="value"` challenge at a time.  The pattern
    # is anchored at the start of the header or at a comma; the previous
    # `(?:.*,)*` prefix backtracked catastrophically on comma-heavy input.
    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:^|,)'      # start of the header value or a ','
                    '[ \t]*'       # optional whitespace
                    '([^ \t,]+)'   # auth-scheme, e.g. "Basic"
                    '[ \t]+'       # mandatory whitespace
                    'realm=(["\'])(.*?)\\2',
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        """Use *password_mgr* for lookups (a new HTTPPasswordMgr if None)."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Answer the challenge found in the *authreq* header, if possible.

        Returns the response of the retried request, or None when the
        header is missing, holds no Basic challenge with a realm, or no
        new credentials are available.  Raises HTTPError once the retry
        counter exceeds 5.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # give up once the retry counter has gone past 5
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                            headers, None)
        else:
            self.retried += 1

        if authreq:
            # Walk every challenge in the header and answer the first
            # Basic one, wherever it appears in the list.
            for mo in AbstractBasicAuthHandler.rx.finditer(authreq):
                scheme, _quote, realm = mo.groups()
                if scheme.lower() == 'basic':
                    response = self.retry_http_basic_auth(host, req, realm)
                    if response and response.code != 401:
                        self.retried = 0
                    return response

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send *req* with credentials for *realm*, or return None.

        None is returned when no password is known or when the request
        already carries exactly these credentials (they were rejected).
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        raw = "%s:%s" % (user, pw)
        auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
        # get_header() also consults the unredirected headers, which is
        # where the credentials are stored below.
        if req.get_header(self.auth_header, None) == auth:
            return None
        req.add_unredirected_header(self.auth_header, auth)
        return self.parent.open(req, timeout=req.timeout)
834
835
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Respond to HTTP 401 by retrying with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Handle a 401 using the 'www-authenticate' challenge header.

        The full request URL is what gets passed on as the host argument.
        """
        result = self.http_error_auth_reqed('www-authenticate',
                                            req.full_url, req, headers)
        self.reset_retry_count()
        return result
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000846
847
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Respond to HTTP 407 by retrying with Basic proxy credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Handle a 407 using the 'proxy-authenticate' challenge header."""
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib.request does
        # not (and should not, RFC 3986 s. 3.2.1) support requests for URLs
        # containing userinfo.
        result = self.http_error_auth_reqed('proxy-authenticate',
                                            req.host, req, headers)
        self.reset_retry_count()
        return result
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000862
863
def randombytes(n):
    """Return n random bytes."""
    data = os.urandom(n)
    return data
867
class AbstractDigestAuthHandler:
    """Client side of HTTP Digest authentication, shared by the handlers.

    Concrete subclasses provide ``auth_header`` (the request header that
    carries the credentials) and a ``parent`` opener through which the
    authenticated request is re-issued.
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        """Create the handler; passwd defaults to a new HTTPPasswordMgr."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        # nonce_count/last_nonce track how often the current server nonce
        # has been used; they feed the "nc" field of the response.
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        """Set the attempt counter back to zero."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Answer a Digest challenge found in headers[auth_header].

        Returns the response of the retried request, or None when there is
        no challenge or it is not for the Digest scheme.  Raises HTTPError
        when the attempt counter is already above five.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Parse the challenge in auth and re-issue req with credentials.

        Returns None when no Authorization value could be computed or
        when req already carries exactly that value.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16 hex-digit client nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest credentials string for req from challenge chal.

        chal is the mapping of challenge parameters.  Returns None when
        the challenge lacks a realm or nonce, names an unsupported
        algorithm, or no user is known for the realm.  Raises URLError
        when the server offers a qop list that does not include "auth".
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in [option.strip() for option in qop.split(',')]:
            # The challenge's qop is a comma-separated list of options
            # (e.g. "auth,auth-int"); "auth" is the one implemented here.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth',
                                           H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) hash helpers for algorithm.

        H hashes an ASCII string to a hex digest and KD(secret, data) is
        H applied to "secret:data".  For an algorithm other than 'MD5' or
        'SHA' the result is (None, None), which get_authorization() treats
        as "cannot authenticate".
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # XXX MD5-sess
            # Previously H was simply left unbound here, so the return
            # statement failed with UnboundLocalError.
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Placeholder for the entity digest; always returns None."""
        # XXX not implemented yet
        return None
1004
1005
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        """Handle a 401 using the 'www-authenticate' challenge header."""
        # Only the network-location part of the request URL is passed on.
        netloc = urlparse(req.full_url)[1]
        outcome = self.http_error_auth_reqed('www-authenticate',
                                             netloc, req, headers)
        self.reset_retry_count()
        return outcome
1022
1023
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Respond to HTTP 407 by retrying with Digest proxy credentials."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        """Handle a 407 using the 'proxy-authenticate' challenge header."""
        outcome = self.http_error_auth_reqed('proxy-authenticate',
                                             req.host, req, headers)
        self.reset_retry_count()
        return outcome
1035
class AbstractHTTPHandler(BaseHandler):
    """Common request preparation and connection handling for HTTP(S)."""

    def __init__(self, debuglevel=0):
        """Remember the debug level to apply to each connection."""
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Change the debug level used for subsequent connections."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in default headers on request and return it.

        Adds Content-type and Content-length for requests carrying data,
        a Host header, and the opener's default headers, each only when
        the request does not already have that header.

        Raises URLError when the request has no host, and ValueError when
        the data is an iterable whose length cannot be determined and no
        Content-length header was supplied.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                try:
                    mv = memoryview(data)
                except TypeError:
                    # Not a buffer: the size cannot be computed here.
                    if isinstance(data, collections.Iterable):
                        # The message is built from adjacent literals; a
                        # backslash continuation inside the string used
                        # to embed the source indentation in the text.
                        raise ValueError(
                            "Content-Length should be specified "
                            "for iterable data of type %r %r"
                            % (type(data), data))
                else:
                    request.add_unredirected_header(
                        'Content-length', '%d' % (len(mv) * mv.itemsize))

        sel_host = host
        if request.has_proxy():
            # Going through a proxy: the selector holds the full URL, so
            # take the Host value from it.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        Extra keyword arguments are passed to the http_class constructor.

        Raises URLError when the request has no host or when sending the
        request / reading the response fails with socket.error; in the
        latter case the connection is closed first.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        # Apply the level given to __init__/set_http_debuglevel; without
        # this the stored value had no effect.
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers win over regular ones with the same key.
        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            h.request(req.get_method(), req.selector, req.data, headers)
            r = h.getresponse()  # an HTTPResponse instance
        except socket.error as err:
            # Release the connection before reporting the failure.
            h.close()
            raise URLError(err)

        r.url = req.full_url
        # urllib clients expect the response to carry the reason phrase
        # in .msg, so overwrite the HTTPResponse's .msg attribute with it.
        # It would be good to mark this attribute as deprecated and get
        # clients to use info() or .headers for the headers instead.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001132
1133
class HTTPHandler(AbstractHTTPHandler):
    """Open http: URLs through http.client.HTTPConnection."""

    # Request pre-processing is the shared implementation.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Open req and return the response from do_open()."""
        connection_class = http.client.HTTPConnection
        return self.do_open(connection_class, req)
1140
if hasattr(http.client, 'HTTPSConnection'):
    import ssl

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https: URLs through http.client.HTTPSConnection."""

        # Request pre-processing is the shared implementation.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            """Store the context and check_hostname passed to connections."""
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context, self._check_hostname = context, check_hostname

        def https_open(self, req):
            """Open req and return the response from do_open()."""
            connection_args = {
                'context': self._context,
                'check_hostname': self._check_hostname,
            }
            return self.do_open(http.client.HTTPSConnection, req,
                                **connection_args)
1156
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to requests and collect them from responses."""

    def __init__(self, cookiejar=None):
        """Use cookiejar, or a new http.cookiejar.CookieJar if None."""
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar() if cookiejar is None
                          else cookiejar)

    def http_request(self, request):
        """Let the jar add its Cookie header, then pass request on."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar read cookies from response, then pass it on."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # The same processing applies to https.
    https_request = http_request
    https_response = http_response
1174
class UnknownHandler(BaseHandler):
    """Reject requests whose URL type reaches unknown_open()."""

    def unknown_open(self, req):
        """Always raise URLError naming the request's type."""
        raise URLError('unknown url type: %s' % req.type)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001179
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict mapping each key to its value, with one pair of
    enclosing double quotes removed from the value when present.  An
    empty value ("key=") is stored as the empty string.  An element
    without '=' raises ValueError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices rather than v[0]/v[-1], so an empty value does not raise
        # IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1189
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    Splits s on commas into a list of stripped elements.  Commas inside
    a double-quoted string do not split; a backslash inside a quoted
    string is dropped and the character after it is taken literally.
    Single quotes have no special meaning.  An empty final element is
    not included in the result.
    """
    elements = []
    current = []          # characters of the element being collected
    in_quotes = False
    escaped = False

    for ch in s:
        if escaped:
            # Character following a backslash: keep it verbatim.
            current.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                current.append(ch)
        elif ch == ',':
            elements.append(''.join(current))
            current = []
        else:
            if ch == '"':
                in_quotes = True
            current.append(ch)

    # append last part
    if current:
        elements.append(''.join(current))

    return [element.strip() for element in elements]
1232
class FileHandler(BaseHandler):
    """Open file: URLs that refer to the local machine."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open req as a local file.

        When the selector starts with '//' (but not '///') and the
        request names a host other than 'localhost', URLError is raised
        unless that host is one of the addresses from get_names(); in the
        accepted case None is returned.  Every other request is passed to
        open_local_file().
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            # Membership test: the former identity comparison between the
            # host string and the names tuple could never succeed.
            if req.host not in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
            return None
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the tuple of addresses considered local.

        The tuple is computed on first use and cached on the FileHandler
        class itself.
        """
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the file named by req.selector.

        The response carries Content-type, Content-length and
        Last-modified headers derived from the file.  Raises URLError
        when an OSError occurs, or when the request names a port or a
        host that does not resolve to one of the local addresses.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            # port is only bound (and only consulted) when host is set.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001284
1285def _safe_gethostbyname(host):
1286 try:
1287 return socket.gethostbyname(host)
1288 except socket.gaierror:
1289 return None
1290
class FTPHandler(BaseHandler):
    """Open ftp: URLs."""

    def ftp_open(self, req):
        """Retrieve the file or directory named by req over FTP.

        Returns an addinfourl whose headers carry Content-type and
        Content-length when they could be determined.  Raises URLError
        when no host is given, the host cannot be resolved, or ftplib
        reports an error.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        port = ftplib.FTP_PORT if port is None else int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)

        path, attrs = splitattr(req.selector)
        segments = [unquote(segment) for segment in path.split('/')]
        dirs, filename = segments[:-1], segments[-1]
        if dirs and not dirs[0]:
            # Drop the empty leading component.
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Default transfer type: 'I' when a file name is present,
            # 'D' otherwise; a ";type=" attribute overrides it.
            transfer_type = 'I' if filename else 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    transfer_type = value.upper()
            fp, retrlen = fw.retrfile(filename, transfer_type)
            header_text = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                header_text += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                header_text += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(header_text)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a new ftpwrapper for the given connection parameters."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout)
1347 return fw
1348
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps connections in a small time-limited cache."""

    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}       # connection key -> ftpwrapper
        self.timeout = {}     # connection key -> expiry time (time.time())
        self.soonest = 0      # smallest expiry time currently in the cache
        self.delay = 60       # seconds added to "now" on each use
        self.max_conns = 16   # size at which the oldest entry is evicted

    def setTimeout(self, t):
        """Set the number of seconds a cached connection stays valid."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which eviction happens."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached ftpwrapper for these parameters, creating one
        if needed, and refresh its expiry time."""
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Close and drop expired connections, then enforce the size limit."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        self.soonest = min(list(self.timeout.values()))

        # then check the size
        # ">=" rather than "==" so the limit is still enforced after
        # setMaxConns() lowered it below the current cache size.
        if len(self.cache) >= self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # Close the evicted connection, as the expiry pass
                    # above does, instead of just dropping the reference.
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(list(self.timeout.values()))
1394 self.soonest = min(list(self.timeout.values()))
1395
# Code moved from the old urllib module
1397
1398MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
1399
1400# Helper for non-unix systems
if os.name != 'nt':
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        path = unquote(pathname)
        return path

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        url = quote(pathname)
        return url
else:
    # On Windows the conversions come from nturl2path.
    from nturl2path import url2pathname, pathname2url
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001413
1414# This really consists of two pieces:
1415# (1) a class which handles opening of all sorts of URLs
1416# (plus assorted utilities etc.)
1417# (2) a set of functions for parsing URLs
1418# XXX Should these be separated out into different modules?
1419
1420
1421ftpcache = {}
1422class URLopener:
1423 """Class to open URLs.
1424 This is a class rather than just a subroutine because we may need
1425 more than one set of global protocol-specific options.
1426 Note -- this is a base class for those who don't want the
1427 automatic handling of errors type 302 (relocated) and 401
1428 (authorization needed)."""
1429
1430 __tempfiles = None
1431
1432 version = "Python-urllib/%s" % __version__
1433
1434 # Constructor
1435 def __init__(self, proxies=None, **x509):
1436 if proxies is None:
1437 proxies = getproxies()
1438 assert hasattr(proxies, 'keys'), "proxies must be a mapping"
1439 self.proxies = proxies
1440 self.key_file = x509.get('key_file')
1441 self.cert_file = x509.get('cert_file')
1442 self.addheaders = [('User-Agent', self.version)]
1443 self.__tempfiles = []
1444 self.__unlink = os.unlink # See cleanup()
1445 self.tempcache = None
1446 # Undocumented feature: if you assign {} to tempcache,
1447 # it is used to cache files retrieved with
1448 # self.retrieve(). This is not enabled by default
1449 # since it does not work for changing documents (and I
1450 # haven't got the logic to check expiration headers
1451 # yet).
1452 self.ftpcache = ftpcache
1453 # Undocumented feature: you can use a different
1454 # ftp cache by assigning to the .ftpcache member;
1455 # in case you want logically independent URL openers
1456 # XXX This is not threadsafe. Bah.
1457
1458 def __del__(self):
1459 self.close()
1460
1461 def close(self):
1462 self.cleanup()
1463
1464 def cleanup(self):
1465 # This code sometimes runs when the rest of this module
1466 # has already been deleted, so it can't use any globals
1467 # or import anything.
1468 if self.__tempfiles:
1469 for file in self.__tempfiles:
1470 try:
1471 self.__unlink(file)
1472 except OSError:
1473 pass
1474 del self.__tempfiles[:]
1475 if self.tempcache:
1476 self.tempcache.clear()
1477
1478 def addheader(self, *args):
1479 """Add a header to be used by the HTTP interface only
1480 e.g. u.addheader('Accept', 'sound/basic')"""
1481 self.addheaders.append(args)
1482
1483 # External interface
1484 def open(self, fullurl, data=None):
1485 """Use URLopener().open(file) instead of open(file, 'r')."""
Georg Brandl13e89462008-07-01 19:56:00 +00001486 fullurl = unwrap(to_bytes(fullurl))
Senthil Kumaran734f0592010-02-20 22:19:04 +00001487 fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001488 if self.tempcache and fullurl in self.tempcache:
1489 filename, headers = self.tempcache[fullurl]
1490 fp = open(filename, 'rb')
Georg Brandl13e89462008-07-01 19:56:00 +00001491 return addinfourl(fp, headers, fullurl)
1492 urltype, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001493 if not urltype:
1494 urltype = 'file'
1495 if urltype in self.proxies:
1496 proxy = self.proxies[urltype]
Georg Brandl13e89462008-07-01 19:56:00 +00001497 urltype, proxyhost = splittype(proxy)
1498 host, selector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001499 url = (host, fullurl) # Signal special case to open_*()
1500 else:
1501 proxy = None
1502 name = 'open_' + urltype
1503 self.type = urltype
1504 name = name.replace('-', '_')
1505 if not hasattr(self, name):
1506 if proxy:
1507 return self.open_unknown_proxy(proxy, fullurl, data)
1508 else:
1509 return self.open_unknown(fullurl, data)
1510 try:
1511 if data is None:
1512 return getattr(self, name)(url)
1513 else:
1514 return getattr(self, name)(url, data)
1515 except socket.error as msg:
1516 raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1517
1518 def open_unknown(self, fullurl, data=None):
1519 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001520 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001521 raise IOError('url error', 'unknown url type', type)
1522
1523 def open_unknown_proxy(self, proxy, fullurl, data=None):
1524 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001525 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001526 raise IOError('url error', 'invalid proxy for %s' % type, proxy)
1527
1528 # External interface
1529 def retrieve(self, url, filename=None, reporthook=None, data=None):
1530 """retrieve(url) returns (filename, headers) for a local object
1531 or (tempfilename, headers) for a remote object."""
Georg Brandl13e89462008-07-01 19:56:00 +00001532 url = unwrap(to_bytes(url))
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001533 if self.tempcache and url in self.tempcache:
1534 return self.tempcache[url]
Georg Brandl13e89462008-07-01 19:56:00 +00001535 type, url1 = splittype(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001536 if filename is None and (not type or type == 'file'):
1537 try:
1538 fp = self.open_local_file(url1)
1539 hdrs = fp.info()
Philip Jenveycb134d72009-12-03 02:45:01 +00001540 fp.close()
Georg Brandl13e89462008-07-01 19:56:00 +00001541 return url2pathname(splithost(url1)[1]), hdrs
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001542 except IOError as msg:
1543 pass
1544 fp = self.open(url, data)
Benjamin Peterson5f28b7b2009-03-26 21:49:58 +00001545 try:
1546 headers = fp.info()
1547 if filename:
1548 tfp = open(filename, 'wb')
1549 else:
1550 import tempfile
1551 garbage, path = splittype(url)
1552 garbage, path = splithost(path or "")
1553 path, garbage = splitquery(path or "")
1554 path, garbage = splitattr(path or "")
1555 suffix = os.path.splitext(path)[1]
1556 (fd, filename) = tempfile.mkstemp(suffix)
1557 self.__tempfiles.append(filename)
1558 tfp = os.fdopen(fd, 'wb')
1559 try:
1560 result = filename, headers
1561 if self.tempcache is not None:
1562 self.tempcache[url] = result
1563 bs = 1024*8
1564 size = -1
1565 read = 0
1566 blocknum = 0
1567 if reporthook:
1568 if "content-length" in headers:
1569 size = int(headers["Content-Length"])
1570 reporthook(blocknum, bs, size)
1571 while 1:
1572 block = fp.read(bs)
1573 if not block:
1574 break
1575 read += len(block)
1576 tfp.write(block)
1577 blocknum += 1
1578 if reporthook:
1579 reporthook(blocknum, bs, size)
1580 finally:
1581 tfp.close()
1582 finally:
1583 fp.close()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001584
1585 # raise exception if actual size does not match content-length header
1586 if size >= 0 and read < size:
Georg Brandl13e89462008-07-01 19:56:00 +00001587 raise ContentTooShortError(
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001588 "retrieval incomplete: got only %i out of %i bytes"
1589 % (read, size), result)
1590
1591 return result
1592
1593 # Each method named open_<type> knows how to open that type of URL
1594
def _open_generic_http(self, connection_factory, url, data):
    """Issue an HTTP request over a connection built by connection_factory.

    This is an internal method that should be called from
    open_http() or open_https().

    Arguments:
    - connection_factory should take a host name and return an
      HTTPConnection instance.
    - url is either a string (the URL to retrieve, without its scheme)
      or a (proxy host, full URL) pair when the request goes through
      a proxy.
    - data is the payload for a POST request, or None for a GET.

    Returns an addinfourl object for a 2xx response; for any other
    status the result of self.http_error() is returned.

    Raises IOError when no host can be determined and URLError when the
    server answers with a malformed status line.
    """

    user_passwd = None
    proxy_passwd= None
    if isinstance(url, str):
        # Direct request: credentials may be embedded as user:pw@host.
        host, selector = splithost(url)
        if host:
            user_passwd, host = splituser(host)
            host = unquote(host)
        realhost = host
    else:
        # Proxied request: url is a (proxy host, target URL) pair.
        host, selector = url
        # check whether the proxy contains authorization information
        proxy_passwd, host = splituser(host)
        # now we proceed with the url we want to obtain
        urltype, rest = splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != 'http':
            realhost = None
        else:
            realhost, rest = splithost(rest)
            if realhost:
                user_passwd, realhost = splituser(realhost)
            if user_passwd:
                # Rebuild the selector without the embedded credentials.
                selector = "%s://%s%s" % (urltype, realhost, rest)
            if proxy_bypass(realhost):
                # Target is exempt from proxying: connect to it directly.
                host = realhost

        #print "proxy via http:", host, selector
    if not host: raise IOError('http error', 'no host given')

    # Credentials are sent as HTTP Basic auth (base64 of "user:password").
    if proxy_passwd:
        import base64
        proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
    else:
        proxy_auth = None

    if user_passwd:
        import base64
        auth = base64.b64encode(user_passwd.encode()).decode('ascii')
    else:
        auth = None
    http_conn = connection_factory(host)
    headers = {}
    if proxy_auth:
        headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
    if auth:
        headers["Authorization"] = "Basic %s" % auth
    if realhost:
        headers["Host"] = realhost
    # Opener-level headers are applied last, so they override the ones above.
    for header, value in self.addheaders:
        headers[header] = value

    if data is not None:
        headers["Content-Type"] = "application/x-www-form-urlencoded"
        http_conn.request("POST", selector, data, headers)
    else:
        http_conn.request("GET", selector, headers=headers)

    try:
        response = http_conn.getresponse()
    except http.client.BadStatusLine:
        # something went wrong with the HTTP status line
        raise URLError("http protocol error: bad status line")

    # According to RFC 2616, "2xx" code indicates that the client's
    # request was successfully received, understood, and accepted.
    if 200 <= response.status < 300:
        return addinfourl(response, response.msg, "http:" + url,
                          response.status)
    else:
        return self.http_error(
            url, response.fp,
            response.status, response.reason, response.msg, data)
1681
def open_http(self, url, data=None):
    """Open *url* over plain HTTP; *data*, if given, is sent as a POST body."""
    connection_factory = http.client.HTTPConnection
    return self._open_generic_http(connection_factory, url, data)
1685
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    """Dispatch an HTTP error response to the matching handler.

    A method named http_error_DDD (DDD being the 3-digit status code) is
    tried first; *data* is forwarded to it only when it is not None.  If no
    such method exists, or it returns a false value, http_error_default()
    handles the response.  Derived classes can override either."""
    specific = 'http_error_%d' % errcode
    if hasattr(self, specific):
        handler = getattr(self, specific)
        arguments = (url, fp, errcode, errmsg, headers)
        if data is not None:
            arguments += (data,)
        outcome = handler(*arguments)
        if outcome:
            return outcome
    return self.http_error_default(url, fp, errcode, errmsg, headers)
1701
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Fallback handler: drain and close the response, then raise HTTPError."""
    # The body is read only to consume it; its content is discarded.
    fp.read()
    fp.close()
    raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001707
if _have_ssl:
    def _https_connection(self, host):
        """Build an HTTPSConnection to *host* with this opener's key/cert files."""
        tls_files = {'key_file': self.key_file, 'cert_file': self.cert_file}
        return http.client.HTTPSConnection(host, **tls_files)

    def open_https(self, url, data=None):
        """Open *url* over HTTPS; *data*, if given, is sent as a POST body."""
        connection_factory = self._https_connection
        return self._open_generic_http(connection_factory, url, data)
1717
def open_file(self, url):
    """Open a file: URL, which must refer to the local machine.

    Raises URLError when *url* is not a string (proxied requests are not
    supported for this scheme) and ValueError when the URL names a host
    other than localhost."""
    if not isinstance(url, str):
        raise URLError('file error', 'proxy support for file protocol currently not implemented')
    # "//host/..." with a non-empty host that is not "localhost" is remote.
    names_other_host = (url[:2] == '//'
                        and url[2:3] != '/'
                        and url[2:12].lower() != 'localhost/')
    if names_other_host:
        raise ValueError("file:// scheme is supported only on localhost")
    return self.open_local_file(url)
1726
def open_local_file(self, url):
    """Open a file on the local filesystem.

    *url* is the part of a file: URL after the scheme, optionally starting
    with "//host".  The response carries synthesized Content-Type,
    Content-Length and Last-modified headers.

    Returns an addinfourl wrapping the file opened in binary mode.

    Raises URLError when the file cannot be stat'ed, or when the URL names
    a host (or any port) that is not this machine.
    """
    import email.utils
    import mimetypes
    host, file = splithost(url)
    localname = url2pathname(file)
    try:
        stats = os.stat(localname)
    except OSError as e:
        # URLError accepts (reason, filename); also passing errno made this
        # line raise TypeError instead of the intended URLError.
        raise URLError(e.strerror, e.filename)
    size = stats.st_size
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    mtype = mimetypes.guess_type(url)[0]
    headers = email.message_from_string(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
        (mtype or 'text/plain', size, modified))
    # Absolute paths are reported back as a full file:// URL.
    urlfile = file
    if file[:1] == '/':
        urlfile = 'file://' + file
    if not host:
        return addinfourl(open(localname, 'rb'), headers, urlfile)
    host, port = splitport(host)
    # localhost() returns a single address string and thishost() a tuple of
    # addresses, so the string must be wrapped before concatenating.
    if (not port
            and socket.gethostbyname(host) in ((localhost(),) + thishost())):
        return addinfourl(open(localname, 'rb'), headers, urlfile)
    raise URLError('local file error', 'not on local host')
1756
def open_ftp(self, url):
    """Open an ftp: URL, reusing cached connections where possible.

    *url* is the part of the URL after the scheme ("//[user[:pw]@]host
    [:port]/path[;type=x]").  Connections are cached in self.ftpcache,
    keyed by (user, host, port, directory).

    Returns an addinfourl whose headers carry Content-Type and
    Content-Length when they can be determined.

    Raises URLError when *url* is not a string (proxying is unsupported),
    when no host is given, or when the FTP layer reports an error.
    """
    if not isinstance(url, str):
        raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
    import mimetypes
    host, path = splithost(url)
    if not host: raise URLError('ftp error', 'no host given')
    host, port = splitport(host)
    user, host = splituser(host)
    if user:
        user, passwd = splitpasswd(user)
    else:
        passwd = None
    host = unquote(host)
    user = unquote(user or '')
    passwd = unquote(passwd or '')
    host = socket.gethostbyname(host)
    if not port:
        import ftplib
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    path, attrs = splitattr(path)
    path = unquote(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    # Drop the empty component produced by the leading '/'; a second empty
    # component (URL path started with '//') denotes the FTP root directory.
    if dirs and not dirs[0]: dirs = dirs[1:]
    if dirs and not dirs[0]: dirs[0] = '/'
    key = user, host, port, '/'.join(dirs)
    # XXX thread unsafe!
    if len(self.ftpcache) > MAXFTPCACHE:
        # Prune the cache, rather arbitrarily.  Iterate over a snapshot of
        # the keys: deleting while iterating the dict itself raises
        # RuntimeError.
        for k in list(self.ftpcache):
            if k != key:
                v = self.ftpcache.pop(k)
                v.close()
    try:
        if key not in self.ftpcache:
            self.ftpcache[key] = \
                ftpwrapper(user, passwd, host, port, dirs)
        # Default transfer type: directory listing when no file name is
        # given, binary ("image") otherwise; ";type=" can override it.
        if not file:
            type = 'D'
        else:
            type = 'I'
        for attr in attrs:
            attr, value = splitvalue(attr)
            if attr.lower() == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
                type = value.upper()
        (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
        mtype = mimetypes.guess_type("ftp:" + url)[0]
        headers = ""
        if mtype:
            headers += "Content-Type: %s\n" % mtype
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = email.message_from_string(headers)
        return addinfourl(fp, headers, "ftp:" + url)
    except ftperrors() as msg:
        raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1815
def open_data(self, url, data=None):
    """Open a "data" URL and return its content as a response object.

    Any POSTed *data* is ignored.  Raises URLError when *url* is not a
    string and IOError when the URL contains no comma separator."""
    if not isinstance(url, str):
        raise URLError('data error', 'proxy support for data protocol currently not implemented')
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    mediatype, comma, payload = url.partition(',')
    if not comma:
        raise IOError('data error', 'bad data URL')
    if not mediatype:
        mediatype = 'text/plain;charset=US-ASCII'
    # A trailing ";token" with no "=" is the content encoding, not a
    # media-type parameter.
    encoding = ''
    semi = mediatype.rfind(';')
    if semi >= 0 and '=' not in mediatype[semi:]:
        encoding = mediatype[semi+1:]
        mediatype = mediatype[:semi]
    lines = [
        'Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                 time.gmtime(time.time())),
        'Content-type: %s' % mediatype,
    ]
    if encoding == 'base64':
        import base64
        # XXX is this encoding/decoding ok?
        payload = base64.decodebytes(payload.encode('ascii')).decode('latin1')
    else:
        payload = unquote(payload)
    lines += ['Content-Length: %d' % len(payload), '', payload]
    msg = '\n'.join(lines)
    headers = email.message_from_string(msg)
    body = io.StringIO(msg)
    return addinfourl(body, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001857
1858
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps).

    Adds redirect following (301, 302, 303, 307) and HTTP Basic
    authentication retries (401, 407) on top of URLopener.  Credentials
    obtained from prompt_user_passwd() are remembered in self.auth_cache.
    """

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}   # "realm@host" -> (user, passwd)
        self.tries = 0         # redirects followed for the current request
        self.maxtries = 10     # redirect limit; a false value disables it

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            # Too many redirects in a row: report it as a 500 error.
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Follow the redirect named by the Location (or URI) header.

        Returns None when the response names no target.  Raises HTTPError
        when the target uses a scheme other than http, https or ftp.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)
        # The target comes from the server, so it is untrusted: refuse to
        # be redirected to local resources such as file: URLs.
        scheme, _rest = splittype(newurl)
        if (scheme or '').lower() not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." %
                            newurl,
                            headers, fp)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def _basic_auth_realm(self, challenge_header, url, fp, errcode, errmsg,
                          headers, retry):
        """Return the realm of a Basic challenge found in challenge_header.

        Every rejection goes through URLopener.http_error_default(), which
        raises HTTPError, so this only returns for a usable challenge.
        """
        import re
        if challenge_header not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers[challenge_header]
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        return realm

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        realm = self._basic_auth_realm('www-authenticate', url, fp, errcode,
                                       errmsg, headers, retry)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        realm = self._basic_auth_realm('proxy-authenticate', url, fp, errcode,
                                       errmsg, headers, retry)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def _retry_proxy_basic_auth(self, scheme, url, realm, data):
        """Re-open url after storing credentials in the proxy for scheme."""
        host, selector = splithost(url)
        newurl = scheme + '://' + host + selector
        proxy = self.proxies[scheme]
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # i is non-zero when the proxy URL already carried credentials; they
        # evidently failed, so the cached entry is discarded.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies[scheme] = scheme + '://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def _retry_basic_auth(self, scheme, url, realm, data):
        """Re-open url with credentials embedded in its host part."""
        host, selector = splithost(url)
        # i is non-zero when the URL already carried credentials; they
        # evidently failed, so the cached entry is discarded.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = scheme + '://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Retry an http request after a 407, with proxy credentials."""
        return self._retry_proxy_basic_auth('http', url, realm, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Retry an https request after a 407, with proxy credentials."""
        return self._retry_proxy_basic_auth('https', url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry an http request after a 401, with user credentials."""
        return self._retry_basic_auth('http', url, realm, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry an https request after a 401, with user credentials."""
        return self._retry_basic_auth('https', url, realm, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for realm at host, prompting if not cached.

        A true clear_cache drops any cached entry first, forcing a prompt.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2054
2055
2056# Utility functions
2057
_localhost = None
def localhost():
    """Return the IP address that the name 'localhost' resolves to.

    The lookup is done once; later calls return the cached address."""
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2065
_thishost = None
def thishost():
    """Return the IP addresses of the current host, as a tuple of strings.

    The lookup is done once; later calls return the cached tuple.  When the
    machine's own host name cannot be resolved, the addresses of
    'localhost' are used instead."""
    global _thishost
    if _thishost is None:
        try:
            # gethostbyname_ex() returns (hostname, aliases, addresses);
            # the index selects the address list of the lookup result.
            _thishost = tuple(socket.gethostbyname_ex(socket.gethostname())[2])
        except socket.gaierror:
            _thishost = tuple(socket.gethostbyname_ex('localhost')[2])
    return _thishost
2073
_ftperrors = None
def ftperrors():
    """Return the exception classes the FTP class may raise.

    ftplib is imported lazily, on the first call only."""
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2082
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    The same object is returned on every call."""
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2090
2091
2092# Utility classes
2093
class ftpwrapper:
    """One logged-in FTP connection, as cached by open_ftp()."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        """Connect, log in and change into the configured directories."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        client = self.ftp
        client.connect(self.host, self.port, self.timeout)
        client.login(self.user, self.passwd)
        for directory in self.dirs:
            client.cwd(directory)

    def retrfile(self, file, type):
        """Start retrieving *file*, or a directory listing.

        Returns a (file object, length) pair; closing the file object
        finishes the transfer on the control connection."""
        import ftplib
        self.endtransfer()
        isdir = type in ('d', 'D')
        if isdir:
            cmd = 'TYPE A'
        else:
            cmd = 'TYPE ' + type
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection may have gone stale: reconnect once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                conn = self.ftp.ntransfercmd('RETR ' + file)
            except ftplib.error_perm as reason:
                # A 550 reply falls through to the listing attempt below.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if not file:
                cmd = 'LIST'
            else:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        data_file = addclosehook(conn[0].makefile('rb'), self.endtransfer)
        return (data_file, conn[1])

    def endtransfer(self):
        """Collect the server's final reply for a transfer in progress."""
        if self.busy:
            self.busy = 0
            try:
                self.ftp.voidresp()
            except ftperrors():
                pass

    def close(self):
        """Finish any pending transfer and close the connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2170
2171# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Every non-empty environment variable named <scheme>_proxy (compared
    case-insensitively) contributes one entry.  If you need a different
    way, you can pass a proxies dictionary to the [Fancy]URLopener
    constructor.

    """
    suffix = '_proxy'
    lowered = ((name.lower(), value) for name, value in os.environ.items())
    return {name[:-len(suffix)]: value
            for name, value in lowered
            if value and name.endswith(suffix)}
2187
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy (or NO_PROXY),
    which should be a list of DNS suffixes separated by commas, or '*' for
    all hosts.  Whitespace around the individual suffixes is ignored.

    *host* may include a ":port" part.  Returns 1 when the proxy should be
    bypassed and 0 otherwise.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host ends with any of the DNS suffixes; entries are
    # stripped so that the common "a.com, b.com" spelling works
    for name in no_proxy.split(','):
        name = name.strip()
        if name and (hostonly.endswith(name) or host.endswith(name)):
            return 1
    # otherwise, don't bypass
    return 0
2206
2207
# Select the platform-specific implementations of getproxies() and
# proxy_bypass().  Explicit environment settings take precedence over the
# system configuration on every platform.
if sys.platform == 'darwin':
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """
        Return True iff this host shouldn't be accessed using a proxy

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        import re
        import socket
        from fnmatch import fnmatch

        hostonly, port = splitport(host)

        def ip2num(ipAddr):
            # Pack a dotted address (padded with zeros to four parts)
            # into a single 32-bit integer.
            parts = ipAddr.split('.')
            parts = list(map(int, parts))
            if len(parts) != 4:
                parts = (parts + [0, 0, 0, 0])[:4]
            return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

        proxy_settings = _get_proxy_settings()

        # Check for simple host names:
        if '.' not in host:
            if proxy_settings['exclude_simple']:
                return True

        hostIP = None

        for value in proxy_settings.get('exceptions', ()):
            # Items in the list are strings like these: *.local, 169.254/16
            if not value: continue

            m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
            if m is not None:
                if hostIP is None:
                    try:
                        hostIP = socket.gethostbyname(hostonly)
                        hostIP = ip2num(hostIP)
                    except socket.error:
                        continue

                base = ip2num(m.group(1))
                mask = m.group(2)
                if mask is None:
                    # No explicit prefix length: 8 bits per written octet.
                    mask = 8 * (m.group(1).count('.') + 1)
                else:
                    mask = int(mask[1:])

                if mask > 32:
                    # An invalid prefix length cannot match anything.
                    continue

                # Convert the prefix length into the number of host bits to
                # shift away.  This must happen for implied prefixes too,
                # otherwise "10" or "192.168.1" compare the wrong bits.
                mask = 32 - mask

                if (hostIP >> mask) == (base >> mask):
                    return True

            elif fnmatch(host, value):
                return True

        return False


    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()



    def proxy_bypass(host):
        """Return true if host should be reached without a proxy.

        Uses the environment when it defines any proxies, otherwise the
        MacOSX system configuration.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the MacOSX system configuration.
        """
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import re
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        """Return 1 if the registry's ProxyOverride list exempts host.

        Returns 0 when proxying is disabled, no override list is set, or
        the registry cannot be read.
        """
        try:
            import winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry; the special
        # '<local>' entry exempts every host name that contains no dot.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                if '.' not in rawHost:
                    return 1
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return true if host should be reached without a proxy.

        Uses the environment when it defines any proxies, otherwise the
        registry.

        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment