blob: dfdbdecb80c8b076b1d34f4ed33b531619a3a281 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
5below). It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib. pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back. One difference is that you can also pass
20a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers. Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate. If one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
Senthil Kumaran1107c5d2009-11-15 06:20:55 +000033
Senthil Kumaran47fff872009-12-20 07:10:31 +000034OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000036
37Request -- An object that encapsulates the state of a request. The
38state can be as simple as the URL. It can also include extra HTTP
39headers, e.g. a User-Agent.
40
41BaseHandler --
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
Georg Brandl029986a2008-06-23 11:44:14 +000049import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000050
51# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000052authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000053authinfo.add_password(realm='PDQ Application',
54 uri='https://mahler:8092/site-updates.py',
55 user='klem',
56 passwd='geheim$parole')
57
Georg Brandl029986a2008-06-23 11:44:14 +000058proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000059
60# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000061opener = urllib.request.build_opener(proxy_support, authinfo,
62 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000063
64# install it
Georg Brandl029986a2008-06-23 11:44:14 +000065urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066
Georg Brandl029986a2008-06-23 11:44:14 +000067f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000068"""
69
70# XXX issues:
# If an authentication error handler tries to perform authentication
# for some reason but fails, how should the error be signalled?  The
# client needs to know the HTTP error code.  But if the handler knows
# that the problem was, e.g., that it didn't know the hash algorithm
# that was requested in the challenge, it would be good to pass that
# information along to the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000085import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000086import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import random
93import re
94import socket
95import sys
96import time
Senthil Kumaran7bc0d872010-12-19 10:49:52 +000097import collections
Jeremy Hylton1afc1692008-06-18 20:49:58 +000098
Georg Brandl13e89462008-07-01 19:56:00 +000099from urllib.error import URLError, HTTPError, ContentTooShortError
100from urllib.parse import (
101 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
102 splittype, splithost, splitport, splituser, splitpasswd,
Senthil Kumarand95cc752010-08-08 11:27:53 +0000103 splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000104from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000105
106# check for SSL
107try:
108 import ssl
Senthil Kumaranc2958622010-11-22 04:48:26 +0000109except ImportError:
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000110 _have_ssl = False
111else:
112 _have_ssl = True
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000113
# Used in the User-Agent header sent.  Built from sys.version_info rather
# than by slicing sys.version: the slice keeps only three characters and so
# truncates two-digit minor versions (e.g. "3.10" would become "3.1").
__version__ = '%d.%d' % sys.version_info[:2]
116
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None):
    """Open *url* and return the result of the opener's open() call.

    url     -- a URL string or a Request object.
    data    -- optional data passed through to the opener.
    timeout -- passed through to the opener.
    cafile, capath -- keyword-only; when either is given, a one-off opener
        is built around an HTTPSHandler whose SSL context requires a valid
        certificate (loaded from these locations) and checks the host name.
        That opener is not stored as the module default.

    Without cafile/capath the module-wide default opener is used; it is
    created with build_opener() on first use.

    Raises ValueError if cafile/capath is given but the ssl module could
    not be imported.
    """
    global _opener
    if cafile or capath:
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        # This branch is only entered when a CA location was supplied, so
        # verification and host name checking are always switched on.
        context.verify_mode = ssl.CERT_REQUIRED
        context.load_verify_locations(cafile, capath)
        https_handler = HTTPSHandler(context=context, check_hostname=True)
        opener = build_opener(https_handler)
    elif _opener is None:
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000139
def install_opener(opener):
    """Store *opener* as the module-wide default opener (``_opener``)."""
    global _opener
    _opener = opener
143
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Delegate to retrieve() on a shared, lazily created FancyURLopener.

    The FancyURLopener instance is created on the first call and kept in
    the module-level ``_urlopener`` for later calls.  All arguments are
    forwarded unchanged and the opener's return value is returned as is.
    """
    global _urlopener
    _urlopener = _urlopener or FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
151
def urlcleanup():
    """Clean up the shared legacy opener and forget the default opener.

    Calls cleanup() on the module-level ``_urlopener`` if one exists, and
    resets the module-level ``_opener`` to None if it is set.
    """
    global _opener
    legacy_opener = _urlopener
    if legacy_opener:
        legacy_opener.cleanup()
    if _opener:
        _opener = None
158
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the network-location part of request.full_url;
    when that is empty, the request's "Host" header is used instead.  A
    trailing ":port" is stripped.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    host = urlparse(request.full_url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # drop a trailing port number, then normalise case
    without_port = _cut_port_re.sub("", host, count=1)
    return without_port.lower()
176
class Request:
    """State of a single URL request: URL parts, data and headers.

    Attributes set by the constructor:
      full_url          -- the unwrapped URL with any '#fragment' removed
      type, host, selector -- scheme, (unquoted) host and the rest of the
                           URL, as split by splittype()/splithost()
      data              -- request data, or None
      headers           -- regular headers, keys stored capitalize()d
      unredirected_hdrs -- headers that are not copied to redirected requests
      origin_req_host   -- given value, or request_host(self) when None
      unverifiable      -- flag stored as given
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        # The fragment is split off and discarded.
        self.full_url, _fragment = splittag(unwrap(url))
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        for name, value in headers.items():
            self.add_header(name, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        """Split full_url into type, host and selector.

        Raises ValueError when the URL has no scheme.
        """
        self.type, remainder = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(remainder)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return "POST" when data is set, otherwise "GET"."""
        return "GET" if self.data is None else "POST"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route the request through the proxy at *host*.

        For an https request without a tunnel host yet, the current host is
        remembered as the tunnel host and type/selector are left alone.
        Otherwise the request takes the proxy's *type* and the selector
        becomes the full URL.  In both cases host becomes *host*.
        """
        if self.type != 'https' or self._tunnel_host:
            self.type = type
            self.selector = self.full_url
        else:
            self._tunnel_host = self.host
        self.host = host

    def has_proxy(self):
        """Return True when the selector equals the full URL."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        name = key.capitalize()
        self.headers[name] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        name = key.capitalize()
        self.unredirected_hdrs[name] = val

    def has_header(self, header_name):
        """Return True if *header_name* is stored in either header table.

        The name is looked up exactly as given (no capitalisation).
        """
        if header_name in self.headers:
            return True
        return header_name in self.unredirected_hdrs

    def get_header(self, header_name, default=None):
        """Return the header value, preferring regular headers."""
        fallback = self.unredirected_hdrs.get(header_name, default)
        return self.headers.get(header_name, fallback)

    def header_items(self):
        """Return a list of (name, value) pairs from both header tables.

        Regular headers override unredirected ones with the same name.
        """
        merged = self.unredirected_hdrs.copy()
        merged.update(self.headers)
        return list(merged.items())
273
class OpenerDirector:
    """Dispatch requests to registered handlers.

    Handlers are indexed by the names of their methods:
      <protocol>_open      -> handle_open[protocol]
      <protocol>_error_<k> -> handle_error[protocol][k]  (k is an int when
                              it parses as one, otherwise the string)
      <protocol>_request   -> process_request[protocol]
      <protocol>_response  -> process_response[protocol]
    """

    def __init__(self):
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* under every method name that fits the scheme.

        Raises TypeError if the object has no add_parent attribute.  The
        handler is added to self.handlers, and given this director as its
        parent, only if at least one of its methods was registered.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        # Tables whose key is simply the protocol part of the method name.
        protocol_tables = {
            "open": self.handle_open,
            "response": self.process_response,
            "request": self.process_request,
        }

        added = False
        for meth in dir(handler):
            if meth in ("redirect_request", "do_open", "proxy_open"):
                # oops, coincidental match
                continue

            # Split at the first underscore.  find() is used on purpose:
            # its -1 result for names without an underscore is part of the
            # established behaviour.
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # Everything after "<protocol>_error_" is the error kind.
                kind = meth[condition.find("_") + i + 2:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.setdefault(protocol, {})
            elif condition in protocol_tables:
                kind = protocol
                lookup = protocol_tables[condition]
            else:
                continue

            # Keep each handler list ordered (handlers define __lt__).
            bisect.insort(lookup.setdefault(kind, []), handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        for handler in chain.get(kind, ()):
            result = getattr(handler, meth_name)(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request) and return the response.

        The request is run through the registered <protocol>_request
        processors, opened via _open(), and the result is run through the
        registered <protocol>_response processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        request_hook = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            req = getattr(processor, request_hook)(req)

        response = self._open(req, data)

        # post-process response
        response_hook = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            response = getattr(processor, response_hook)(req, response)

        return response

    def _open(self, req, data=None):
        """Try default_open, then <protocol>_open, then unknown_open.

        Returns the first true result, or whatever the unknown_open chain
        returns when the first two produce nothing.
        """
        return (
            self._call_chain(self.handle_open, 'default',
                             'default_open', req)
            or self._call_chain(self.handle_open, req.type,
                                req.type + '_open', req)
            or self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)
        )

    def error(self, proto, *args):
        """Dispatch an error to the matching handler chain.

        For 'http'/'https' the third positional argument is the status
        code: http_error_<code> handlers are tried first, then
        http_error_default.  For other protocols the <proto>_error
        handlers are tried.
        """
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            table = self.handle_error['http'] # https is not different than http
            code = args[2]  # YUCK!
            result = self._call_chain(table, code,
                                      'http_error_%s' % code, *args)
            if result:
                return result
            return self._call_chain(table, 'default',
                                    'http_error_default', *args)

        result = self._call_chain(self.handle_error, proto,
                                  proto + '_error', *args)
        if result:
            return result
411
412# XXX probably also want an abstract factory that knows when it makes
413# sense to skip a superclass in favor of a subclass and when it might
414# make sense to include both
415
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    Each argument may be a handler instance or a handler class (which is
    instantiated without arguments).  A default handler class is left out
    when any argument is an instance or subclass of it.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # Work out which defaults are superseded by a caller-supplied handler.
    superseded = set()
    for candidate in handlers:
        matches = issubclass if isclass(candidate) else isinstance
        for klass in default_classes:
            if matches(candidate, klass):
                superseded.add(klass)

    # Remaining defaults first, in their listed order ...
    for klass in default_classes:
        if klass not in superseded:
            opener.add_handler(klass())

    # ... then the caller's handlers, instantiating classes on the way.
    for supplied in handlers:
        opener.add_handler(supplied() if isclass(supplied) else supplied)
    return opener
453
class BaseHandler:
    """Base class for handlers; provides ordering and parent bookkeeping."""

    # Handlers compare by this value; lower values sort first.
    handler_order = 500

    def add_parent(self, parent):
        """Remember the object this handler has been added to."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        """Order handlers by handler_order."""
        try:
            other_order = other.handler_order
        except AttributeError:
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other_order
471
472
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return 2xx responses unchanged; route others to parent.error()."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= code < 300:
            return response

        return self.parent.error('http', request, response, code, msg, hdrs)

    https_response = http_response
489
class HTTPDefaultErrorHandler(BaseHandler):
    """Fallback HTTP error handler: always raises HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        """Raise an HTTPError built from the request URL and the response."""
        failure = HTTPError(req.full_url, code, msg, hdrs, fp)
        raise failure
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000493
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects.

    The target URL is taken from the "location" (or "uri") response header.
    Only http, https, ftp and scheme-less (relative) targets are followed;
    any other scheme raises HTTPError.
    """
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # The new request carries no data, so the headers describing the
        # old body are dropped.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Handle a redirect response by opening the new location.

        Returns None when the response names no target or when
        redirect_request() declines.  Raises HTTPError when the target
        uses a disallowed scheme or when a redirect loop is detected.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.  The target comes from the server, so
        # without this check a response could point us at e.g. a file: URL.
        # An empty scheme is a relative URL and inherits the request's.
        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # fix a possible malformed URL
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
586
587
588def _parse_proxy(proxy):
589 """Return (scheme, user, password, host/port) given a URL or an authority.
590
591 If a URL is supplied, it must have an authority (host:port) component.
592 According to RFC 3986, having an authority component means the URL must
593 have two slashes after the scheme:
594
595 >>> _parse_proxy('file:/ftp.example.com/')
596 Traceback (most recent call last):
597 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
598
599 The first three items of the returned tuple may be None.
600
601 Examples of authority parsing:
602
603 >>> _parse_proxy('proxy.example.com')
604 (None, None, None, 'proxy.example.com')
605 >>> _parse_proxy('proxy.example.com:3128')
606 (None, None, None, 'proxy.example.com:3128')
607
608 The authority component may optionally include userinfo (assumed to be
609 username:password):
610
611 >>> _parse_proxy('joe:password@proxy.example.com')
612 (None, 'joe', 'password', 'proxy.example.com')
613 >>> _parse_proxy('joe:password@proxy.example.com:3128')
614 (None, 'joe', 'password', 'proxy.example.com:3128')
615
616 Same examples, but with URLs instead:
617
618 >>> _parse_proxy('http://proxy.example.com/')
619 ('http', None, None, 'proxy.example.com')
620 >>> _parse_proxy('http://proxy.example.com:3128/')
621 ('http', None, None, 'proxy.example.com:3128')
622 >>> _parse_proxy('http://joe:password@proxy.example.com/')
623 ('http', 'joe', 'password', 'proxy.example.com')
624 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
625 ('http', 'joe', 'password', 'proxy.example.com:3128')
626
627 Everything after the authority is ignored:
628
629 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
630 ('ftp', 'joe', 'password', 'proxy.example.com')
631
632 Test for no trailing '/' case:
633
634 >>> _parse_proxy('http://joe:password@proxy.example.com')
635 ('http', 'joe', 'password', 'proxy.example.com')
636
637 """
Georg Brandl13e89462008-07-01 19:56:00 +0000638 scheme, r_scheme = splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000639 if not r_scheme.startswith("/"):
640 # authority
641 scheme = None
642 authority = proxy
643 else:
644 # URL
645 if not r_scheme.startswith("//"):
646 raise ValueError("proxy URL with no authority: %r" % proxy)
647 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
648 # and 3.3.), path is empty or starts with '/'
649 end = r_scheme.find("/", 2)
650 if end == -1:
651 end = None
652 authority = r_scheme[2:end]
Georg Brandl13e89462008-07-01 19:56:00 +0000653 userinfo, hostport = splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000654 if userinfo is not None:
Georg Brandl13e89462008-07-01 19:56:00 +0000655 user, password = splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000656 else:
657 user = password = None
658 return scheme, user, password, hostport
659
class ProxyHandler(BaseHandler):
    """Send requests through proxies, configured per URL scheme.

    For every (scheme, proxy URL) pair in the *proxies* mapping the instance
    gets a '<scheme>_open' attribute that forwards to proxy_open().
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, proxy_url in proxies.items():
            # Defaults bind the current loop values to each lambda.
            forwarder = (
                lambda r, proxy=proxy_url, type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, forwarder)

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*.

        Returns None when the host is bypassed or when the remaining
        handlers can deal with the modified request; otherwise re-opens
        the request through the parent and returns that result.
        """
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            credentials = '%s:%s' % (unquote(user), unquote(password))
            encoded = base64.b64encode(credentials.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + encoded)
        req.set_proxy(unquote(hostport), proxy_type)

        if orig_type in (proxy_type, 'https'):
            # let other handlers take care of it
            return None

        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000701
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and URI.

    self.passwd maps realm -> {tuple of reduced URIs: (user, passwd)},
    where a reduced URI is the (authority, path) pair from reduce_uri().
    """

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* and one URI or a sequence of URIs.

        Each URI is stored twice: once with the scheme's default port made
        explicit and once as given, so later lookups match either spelling.
        """
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        realm_entries = self.passwd.setdefault(realm, {})
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            realm_entries[reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) registered for *authuri* in *realm*.

        A stored URI matches when *authuri* is equal to it or below it (see
        is_suburi).  Returns (None, None) when nothing matches.
        """
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            # Make the scheme's well-known port explicit.
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path segments: a plain character prefix would treat
        # '/foobar' as lying below '/foo' and hand out its credentials.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
764
765
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to credentials stored for realm None."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first; if no user is found, retry with realm None."""
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is None:
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
774
775
class AbstractBasicAuthHandler:
    """Shared logic for answering Basic authentication challenges.

    Subclasses provide ``auth_header`` (the request header to send the
    credentials in) and, as handlers, a ``parent`` to re-open requests.
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    #
    # The leading group is optional rather than repeated: "(?:.*,)*" and
    # "(?:.*,)?" accept the same prefixes (nothing, or any text ending in a
    # comma) and try them in the same order, but the repeated form
    # backtracks exponentially on a challenge containing many commas.
    rx = re.compile('(?:.*,)?[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        """Use *password_mgr* for lookups; default is a new HTTPPasswordMgr."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Answer the challenge found in header *authreq* of *headers*.

        Returns the response of the retried request for a Basic challenge
        with a realm, otherwise None.  Raises HTTPError once the retry
        counter has exceeded its limit.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                            headers, None)
        else:
            self.retried += 1

        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, _quote_char, realm = mo.groups()
                if scheme.lower() == 'basic':
                    response = self.retry_http_basic_auth(host, req, realm)
                    # Anything other than another 401 ends this retry run.
                    if response and response.code != 401:
                        self.retried = 0
                    return response

    def retry_http_basic_auth(self, host, req, realm):
        """Re-open *req* with Basic credentials for *realm* and *host*.

        Returns None when no password is known, or when the request
        already carries exactly these credentials in its regular headers.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None
834
835
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 401 responses using the Basic authentication scheme."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry req with credentials for the 'www-authenticate' challenge."""
        response = self.http_error_auth_reqed('www-authenticate',
                                              req.full_url, req, headers)
        # The attempt is over; the next 401 starts with a fresh counter.
        self.reset_retry_count()
        return response
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000846
847
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 407 responses using the Basic authentication scheme."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry req with credentials for the 'proxy-authenticate' challenge."""
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib.request does not
        # (and should not, RFC 3986 s. 3.2.1) support requests for URLs
        # containing userinfo.
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              req.host, req, headers)
        self.reset_retry_count()
        return response
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000862
863
def randombytes(n):
    """Return a bytes object of length n taken from os.urandom()."""
    random_block = os.urandom(n)
    return random_block
867
class AbstractDigestAuthHandler:
    """Shared implementation of Digest authentication (RFC 2617).

    Concrete handlers supply `auth_header` and `parent` and call
    http_error_auth_reqed() from their http_error_NNN method.
    """

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        """Store the password manager (a new HTTPPasswordMgr by default)."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        """Zero the counter of authentication attempts."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry req if headers[auth_header] carries a Digest challenge.

        Returns the retried response, or None when the challenge is
        missing or uses another scheme.  Raises HTTPError once more than
        five attempts have been counted.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Re-send req with an Authorization value built from challenge auth.

        Returns the new response, or None when no authorization could be
        built or the identical value is already on the request.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-hex-digit client nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the credentials string answering the parsed challenge chal.

        chal is a mapping of challenge parameters ('realm', 'nonce' and
        optionally 'qop', 'algorithm', 'opaque').  Returns the text that
        follows "Digest " in the authorization header, or None when the
        challenge is incomplete, the algorithm is unsupported, or no user
        is known for the realm.  Raises URLError when the server offers
        only quality-of-protection values other than 'auth'.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in [token.strip() for token in qop.split(',')]:
            # The server may offer several qop values ("auth,auth-int");
            # 'auth' is the one implemented here, so select it whenever
            # it is among the offers.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth',
                                           H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) function pair for algorithm.

        'MD5' and 'SHA' are supported.  For any other name (None, None)
        is returned so the caller's `H is None` check can decline the
        challenge instead of failing on an unbound name.
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Placeholder for the entity digest; always returns None."""
        # XXX not implemented yet
        return None
1004
1005
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Handle 401 responses using Digest authentication.

    Unlike Basic authentication, the Digest scheme does not send the
    password itself over the wire.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry req with credentials for the 'www-authenticate' challenge."""
        authority = urlparse(req.full_url)[1]
        response = self.http_error_auth_reqed('www-authenticate',
                                              authority, req, headers)
        self.reset_retry_count()
        return response
1022
1023
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Handle 407 responses using Digest authentication."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry req with credentials for the 'proxy-authenticate' challenge."""
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              req.host, req, headers)
        self.reset_retry_count()
        return response
1035
class AbstractHTTPHandler(BaseHandler):
    """Request preparation and connection logic shared by HTTP(S) handlers."""

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Store the debug level for later use."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in the headers an HTTP request needs and return the request.

        Adds Content-type and Content-length for requests carrying data,
        a Host header, and the opener's default headers, each only when
        the request does not already have that header.

        Raises URLError when the request has no host, TypeError when the
        data is a str, and ValueError when the data is an iterable whose
        length cannot be determined and no Content-length was given.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                raise TypeError("POST data should be bytes"
                                " or an iterable of bytes. It cannot be str.")
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                try:
                    mv = memoryview(data)
                except TypeError:
                    # Not a buffer: the length cannot be computed here.
                    if isinstance(data, collections.Iterable):
                        raise ValueError("Content-Length should be specified "
                                "for iterable data of type %r %r" % (type(data),
                                data))
                else:
                    request.add_unredirected_header(
                        'Content-length', '%d' % (len(mv) * mv.itemsize))

        sel_host = host
        if request.has_proxy():
            # Through a proxy the selector is a full URL; the Host header
            # must name the host found inside it.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        Extra keyword arguments are passed on to http_class.

        Raises URLError when the request has no host or when sending the
        request or reading the response fails with socket.error; in the
        latter case the connection is closed before the error is raised.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)

        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            h.request(req.get_method(), req.selector, req.data, headers)
            r = h.getresponse()  # an HTTPResponse instance
        except socket.error as err:
            # Release the socket now rather than leaving a half-open
            # connection behind for the garbage collector.
            h.close()
            raise URLError(err)

        r.url = req.full_url
        # Overwrite the response's .msg attribute with the reason
        # phrase, because urllib clients expect to find the reason in
        # .msg.  It would be good to mark this attribute as deprecated
        # and get them to use info() or .headers.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001135
1136
class HTTPHandler(AbstractHTTPHandler):
    """Handler that opens http:// URLs."""

    # Request pre-processing is taken unchanged from the abstract base.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Open req over an http.client.HTTPConnection."""
        return self.do_open(http.client.HTTPConnection, req)
1143
if hasattr(http.client, 'HTTPSConnection'):
    import ssl

    class HTTPSHandler(AbstractHTTPHandler):
        """Handler that opens https:// URLs."""

        # Request pre-processing is taken unchanged from the abstract base.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            """Remember the connection options handed to HTTPSConnection."""
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            """Open req over an http.client.HTTPSConnection."""
            conn_options = {'context': self._context,
                            'check_hostname': self._check_hostname}
            return self.do_open(http.client.HTTPSConnection, req,
                                **conn_options)
1159
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to outgoing requests and collect them from responses."""

    def __init__(self, cookiejar=None):
        """Use cookiejar, or a new http.cookiejar.CookieJar when omitted."""
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        """Let the jar add its cookie header, then hand the request back."""
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar extract cookies, then hand the response back."""
        self.cookiejar.extract_cookies(response, request)
        return response

    # HTTPS traffic is processed exactly like HTTP traffic.
    https_request = http_request
    https_response = http_response
1177
class UnknownHandler(BaseHandler):
    """Handler that rejects requests by raising URLError."""

    def unknown_open(self, req):
        """Raise URLError naming the request's URL type."""
        raise URLError('unknown url type: %s' % req.type)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001182
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    l is an iterable of 'key=value' strings.  A value wrapped in double
    quotes has the quotes removed.  An empty value ('key=') is kept as
    the empty string.  Returns a dict mapping each key to its value.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices rather than indexing, so that an empty value does not
        # raise IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1192
def parse_http_list(s):
    """Split a comma-separated header value into its elements.

    Follows the list syntax of RFC 2068 Section 2: a comma inside a
    double-quoted string does not separate elements, and a backslash
    inside a quoted string escapes the next character (the backslash
    itself is dropped).  Single quotes have no special meaning.  Each
    element is returned with surrounding whitespace stripped.
    """
    items = []
    chars = []
    in_quotes = False
    escaped = False

    for ch in s:
        if escaped:
            # Character following a backslash: taken literally.
            chars.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                chars.append(ch)
        elif ch == ',':
            items.append(''.join(chars))
            chars = []
        else:
            if ch == '"':
                in_quotes = True
            chars.append(ch)

    # A trailing element is only kept when it has any characters at all.
    if chars:
        items.append(''.join(chars))

    return [item.strip() for item in items]
1235
class FileHandler(BaseHandler):
    """Handler that opens file: URLs referring to the local machine."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open a file: request, refusing hosts that are not this machine.

        Raises URLError when the selector starts with '//' and the
        request names a host that is neither 'localhost' nor one of the
        addresses returned by get_names().
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            # Membership test: the host has to be one of the local
            # addresses.  (An identity comparison against the tuple
            # itself could never succeed.)
            if req.host not in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None

    def get_names(self):
        """Return the tuple of addresses regarded as this machine.

        The tuple is computed on first use and stored on the FileHandler
        class for every later call.
        """
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl wrapping the local file named by req.

        The response carries Content-type, Content-length and
        Last-modified headers derived from the file.  Raises URLError
        when the file cannot be accessed or when the request's host does
        not resolve to this machine.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001287
def _safe_gethostbyname(host):
    """Resolve host with socket.gethostbyname(); None on socket.gaierror."""
    try:
        address = socket.gethostbyname(host)
    except socket.gaierror:
        address = None
    return address
1293
class FTPHandler(BaseHandler):
    """Handler that opens ftp:// URLs."""

    def ftp_open(self, req):
        """Fetch the file or directory listing named by req over FTP.

        Returns an addinfourl whose headers carry the guessed content
        type and, when known, the content length.  Raises URLError when
        no host is given, the host cannot be resolved, or the FTP
        exchange fails.
        """
        import ftplib
        import mimetypes

        authority = req.host
        if not authority:
            raise URLError('ftp error: no host given')
        authority, port = splitport(authority)
        port = ftplib.FTP_PORT if port is None else int(port)

        # username/password handling
        userinfo, hostname = splituser(authority)
        if userinfo:
            user, passwd = splitpasswd(userinfo)
        else:
            user, passwd = userinfo, None
        hostname = unquote(hostname)
        user = user or ''
        passwd = passwd or ''

        try:
            address = socket.gethostbyname(hostname)
        except socket.error as msg:
            raise URLError(msg)

        path, attrs = splitattr(req.selector)
        segments = [unquote(segment) for segment in path.split('/')]
        dirs, file = segments[:-1], segments[-1]
        # An absolute path yields an empty leading segment; drop it.
        if dirs and not dirs[0]:
            dirs = dirs[1:]

        try:
            fw = self.connect_ftp(user, passwd, address, port, dirs,
                                  req.timeout)
            # Binary transfer for a file, directory listing otherwise,
            # unless a ";type=" attribute overrides the choice.
            transfer_type = 'I' if file else 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    transfer_type = value.upper()
            fp, retrlen = fw.retrfile(file, transfer_type)

            header_lines = []
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                header_lines.append("Content-type: %s\n" % mtype)
            if retrlen is not None and retrlen >= 0:
                header_lines.append("Content-length: %d\n" % retrlen)
            headers = email.message_from_string("".join(header_lines))
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a new ftpwrapper for the given connection parameters."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout)
1351
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps ftpwrapper connections in a cache for reuse."""

    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}      # connection key -> ftpwrapper
        self.timeout = {}    # connection key -> expiry time (time.time())
        self.soonest = 0     # smallest expiry time in self.timeout, 0 if none
        self.delay = 60      # seconds added to time.time() on each use
        self.max_conns = 16  # cache size at which one entry is evicted

    def setTimeout(self, t):
        """Set the number of seconds a connection stays cached after use."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which an entry is evicted."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached ftpwrapper for these parameters, creating one
        if needed, and refresh its expiry time."""
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
        # Keep our own reference: check_cache() may remove this very
        # entry from the cache (for instance when max_conns is 1).
        conn = self.cache[key]
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return conn

    def check_cache(self):
        """Close expired connections and evict one entry if the cache is full."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        # min() of an empty sequence raises ValueError, so fall back to
        # the initial value when nothing is cached any more.
        self.soonest = min(self.timeout.values()) if self.timeout else 0

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values()) if self.timeout else 0
1398
1399# Code move from the old urllib module
1400
1401MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
1402
1403# Helper for non-unix systems
if os.name != 'nt':
    def url2pathname(pathname):
        """Convert the path part of a 'file' URL to a file system path.

        OS-specific; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """Convert a file system path to the path part of a 'file' URL.

        OS-specific; not recommended for general use."""
        return quote(pathname)
else:
    # Windows paths need their own conversion rules.
    from nturl2path import url2pathname, pathname2url
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001416
1417# This really consists of two pieces:
1418# (1) a class which handles opening of all sorts of URLs
1419# (plus assorted utilities etc.)
1420# (2) a set of functions for parsing URLs
1421# XXX Should these be separated out into different modules?
1422
1423
1424ftpcache = {}
1425class URLopener:
1426 """Class to open URLs.
1427 This is a class rather than just a subroutine because we may need
1428 more than one set of global protocol-specific options.
1429 Note -- this is a base class for those who don't want the
1430 automatic handling of errors type 302 (relocated) and 401
1431 (authorization needed)."""
1432
1433 __tempfiles = None
1434
1435 version = "Python-urllib/%s" % __version__
1436
1437 # Constructor
def __init__(self, proxies=None, **x509):
    """Create an opener.

    proxies -- mapping from URL scheme to proxy URL; getproxies() is
        consulted when it is omitted.
    x509 -- optional 'key_file' and 'cert_file' keyword arguments.
    """
    if proxies is None:
        proxies = getproxies()
    assert hasattr(proxies, 'keys'), "proxies must be a mapping"
    self.proxies = proxies

    self.cert_file = x509.get('cert_file')
    self.key_file = x509.get('key_file')
    self.addheaders = [('User-Agent', self.version)]

    # Temporary files created by retrieve(), and the function used to
    # remove them again -- see cleanup() for why it is stored here.
    self.__tempfiles = []
    self.__unlink = os.unlink

    # Undocumented feature: if you assign {} to tempcache,
    # it is used to cache files retrieved with
    # self.retrieve().  This is not enabled by default
    # since it does not work for changing documents (and I
    # haven't got the logic to check expiration headers
    # yet).
    self.tempcache = None

    # Undocumented feature: you can use a different
    # ftp cache by assigning to the .ftpcache member;
    # in case you want logically independent URL openers
    # XXX This is not threadsafe.  Bah.
    self.ftpcache = ftpcache
1460
def __del__(self):
    """Finalizer: delegate to close()."""
    self.close()
1463
def close(self):
    """Close the opener by running cleanup()."""
    self.cleanup()
1466
def cleanup(self):
    """Delete the recorded temporary files and empty the temp cache.

    Files that cannot be removed (OSError) are skipped silently.
    """
    # This code sometimes runs when the rest of this module
    # has already been deleted, so it can't use any globals
    # or import anything.
    pending = self.__tempfiles
    if pending:
        for path in pending:
            try:
                self.__unlink(path)
            except OSError:
                pass
        pending[:] = []
    cache = self.tempcache
    if cache:
        cache.clear()
1480
def addheader(self, *args):
    """Register an extra header; it is used by the HTTP interface only.

    The positional arguments are stored together as one tuple, e.g.
    u.addheader('Accept', 'sound/basic')"""
    self.addheaders.append(args)
1485
1486 # External interface
def open(self, fullurl, data=None):
    """Use URLopener().open(file) instead of open(file, 'r').

    Dispatches to the open_<scheme> method matching the URL (going
    through a configured proxy when there is one) and returns its
    result.  socket.error raised by that method is re-raised as IOError.
    """
    fullurl = quote(unwrap(to_bytes(fullurl)),
                    safe="%/:=&?~#+!$,;'@()*[]|")

    # Serve from the temp cache when this URL was retrieved before.
    cache = self.tempcache
    if cache and fullurl in cache:
        filename, headers = cache[fullurl]
        return addinfourl(open(filename, 'rb'), headers, fullurl)

    urltype, url = splittype(fullurl)
    if not urltype:
        urltype = 'file'

    proxy = None
    if urltype in self.proxies:
        proxy = self.proxies[urltype]
        urltype, proxyhost = splittype(proxy)
        host, selector = splithost(proxyhost)
        url = (host, fullurl)  # Signal special case to open_*()

    self.type = urltype
    name = ('open_' + urltype).replace('-', '_')
    if not hasattr(self, name):
        if proxy:
            return self.open_unknown_proxy(proxy, fullurl, data)
        return self.open_unknown(fullurl, data)

    # The data argument is only passed along when the caller gave one.
    call_args = (url,) if data is None else (url, data)
    try:
        return getattr(self, name)(*call_args)
    except socket.error as msg:
        raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1520
def open_unknown(self, fullurl, data=None):
    """Overridable hook for URL types without an open_<type> method.

    This implementation always raises IOError."""
    scheme = splittype(fullurl)[0]
    raise IOError('url error', 'unknown url type', scheme)
1525
def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Overridable hook for a proxy whose URL type has no open_<type> method.

    This implementation always raises IOError."""
    scheme = splittype(fullurl)[0]
    raise IOError('url error', 'invalid proxy for %s' % scheme, proxy)
1530
1531 # External interface
def retrieve(self, url, filename=None, reporthook=None, data=None):
    """retrieve(url) returns (filename, headers) for a local object
    or (tempfilename, headers) for a remote object.

    url        -- the URL to fetch
    filename   -- where to store the data; a temporary file is created
                  (and remembered for cleanup()) when it is omitted
    reporthook -- optional callable invoked as
                  reporthook(blocknum, blocksize, totalsize) before the
                  first block and after each block; totalsize is -1 when
                  the response has no Content-Length header
    data       -- optional payload passed on to open()

    Raises ContentTooShortError when fewer bytes arrive than the
    Content-Length header announced, whether or not a reporthook was
    supplied.
    """
    url = unwrap(to_bytes(url))
    if self.tempcache and url in self.tempcache:
        return self.tempcache[url]
    urltype, url1 = splittype(url)
    if filename is None and (not urltype or urltype == 'file'):
        try:
            fp = self.open_local_file(url1)
            hdrs = fp.info()
            fp.close()
            return url2pathname(splithost(url1)[1]), hdrs
        except IOError:
            # Not readable as a local file; fall back to the generic
            # download path below.
            pass
    fp = self.open(url, data)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            # Derive the temporary file's suffix from the URL path.
            _, path = splittype(url)
            _, path = splithost(path or "")
            path, _ = splitquery(path or "")
            path, _ = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        try:
            result = filename, headers
            if self.tempcache is not None:
                self.tempcache[url] = result
            bs = 1024*8
            size = -1
            read = 0
            blocknum = 0
            # Read the announced size regardless of reporthook, so that
            # the completeness check after the download always applies.
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while 1:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            tfp.close()
    finally:
        fp.close()

    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
1595
1596 # Each method named open_<type> knows how to open that type of URL
1597
def _open_generic_http(self, connection_factory, url, data):
    """Issue a GET or POST over a connection built by connection_factory.

    This is an internal helper shared by open_http() and open_https().

    Arguments:
    - connection_factory takes a host name and returns an
      HTTPConnection-like object.
    - url is either the URL string to retrieve or a (host, selector)
      pair, in which case host may carry proxy credentials.
    - data is the payload for a POST request, or None for a GET.

    A 2xx response is returned wrapped in addinfourl; any other status
    is handed to self.http_error().  IOError is raised when no host is
    given and URLError when the status line is rejected by http.client.
    """
    import base64

    def _basic_token(credentials):
        # Base64 form of the "user:password" text, or None without any.
        if not credentials:
            return None
        return base64.b64encode(credentials.encode()).decode('ascii')

    user_passwd = proxy_passwd = None
    if not isinstance(url, str):
        host, selector = url
        # The proxy host itself may embed authorization information.
        proxy_passwd, host = splituser(host)
        # From here on work with the URL that is actually wanted.
        urltype, rest = splittype(selector)
        url = rest
        realhost = None
        if urltype.lower() == 'http':
            realhost, rest = splithost(rest)
            if realhost:
                user_passwd, realhost = splituser(realhost)
            if user_passwd:
                # Rebuild the selector without the embedded credentials.
                selector = "%s://%s%s" % (urltype, realhost, rest)
            if proxy_bypass(realhost):
                host = realhost
    else:
        host, selector = splithost(url)
        if host:
            user_passwd, host = splituser(host)
            host = unquote(host)
        realhost = host

    if not host:
        raise IOError('http error', 'no host given')

    proxy_auth = _basic_token(proxy_passwd)
    auth = _basic_token(user_passwd)
    http_conn = connection_factory(host)

    headers = {}
    if proxy_auth:
        headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
    if auth:
        headers["Authorization"] = "Basic %s" % auth
    if realhost:
        headers["Host"] = realhost
    # Opener-wide headers go in last, so they win over the ones above.
    headers.update((name, value) for name, value in self.addheaders)

    if data is None:
        http_conn.request("GET", selector, headers=headers)
    else:
        headers["Content-Type"] = "application/x-www-form-urlencoded"
        http_conn.request("POST", selector, data, headers)

    try:
        response = http_conn.getresponse()
    except http.client.BadStatusLine:
        # something went wrong with the HTTP status line
        raise URLError("http protocol error: bad status line")

    # According to RFC 2616, a "2xx" code indicates that the client's
    # request was successfully received, understood, and accepted.
    if not 200 <= response.status < 300:
        return self.http_error(
            url, response.fp,
            response.status, response.reason, response.msg, data)
    return addinfourl(response, response.msg, "http:" + url,
                      response.status)
1684
def open_http(self, url, data=None):
    """Open url over plain HTTP, passing data through as the request payload."""
    factory = http.client.HTTPConnection
    return self._open_generic_http(factory, url, data)
1688
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    """Dispatch an HTTP error response to the matching handler.

    A method named http_error_DDD (DDD being the 3-digit status code)
    is tried first when it exists; data is forwarded to it only when it
    is not None.  If there is no such method, or it returns a false
    value, http_error_default() produces the result.  Derived classes
    can override this method or provide http_error_DDD handlers.
    """
    handler_name = 'http_error_%d' % errcode
    if hasattr(self, handler_name):
        handler = getattr(self, handler_name)
        call_args = (url, fp, errcode, errmsg, headers)
        if data is not None:
            call_args += (data,)
        outcome = handler(*call_args)
        if outcome:
            return outcome
    return self.http_error_default(url, fp, errcode, errmsg, headers)
1704
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handler: read and close fp, then raise HTTPError.

    HTTPError is raised unconditionally, carrying url, errcode, errmsg
    and headers.  No file object is attached to it because fp has
    already been closed by then.
    """
    try:
        # Read whatever is left of the response before closing.
        fp.read()
    finally:
        # Close even when the read fails so the response is not leaked.
        fp.close()
    raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001710
if _have_ssl:
    def _https_connection(self, host):
        """Return an HTTPSConnection to host using this opener's key_file and cert_file."""
        tls_files = {'key_file': self.key_file, 'cert_file': self.cert_file}
        return http.client.HTTPSConnection(host, **tls_files)

    def open_https(self, url, data=None):
        """Open url over HTTPS, passing data through as the request payload."""
        factory = self._https_connection
        return self._open_generic_http(factory, url, data)
1720
def open_file(self, url):
    """Open a file: URL, which is supported for the local host only.

    Raises URLError when url is not a string (the proxy form) and
    ValueError when the URL names a host other than localhost;
    otherwise the work is delegated to open_local_file().
    """
    if not isinstance(url, str):
        raise URLError('file error', 'proxy support for file protocol currently not implemented')
    # "//host/..." carries a host part, "///..." does not.
    names_host = url[:2] == '//' and url[2:3] != '/'
    if names_host and url[2:12].lower() != 'localhost/':
        raise ValueError("file:// scheme is supported only on localhost")
    return self.open_local_file(url)
1729
def open_local_file(self, url):
    """Open the local file named by the path part of url.

    Returns an addinfourl object wrapping the file opened in binary
    mode, with synthesized Content-Type, Content-Length and
    Last-modified headers.  Raises URLError when the file cannot be
    stat'ed, and when url names a port or a host that does not resolve
    to one of this machine's addresses.
    """
    import email.utils
    import mimetypes

    host, file = splithost(url)
    localname = url2pathname(file)
    try:
        stats = os.stat(localname)
    except OSError as e:
        # Build the error from (reason, filename), the same two-argument
        # form the other URLError calls in this class use.
        raise URLError(e.strerror, e.filename)
    size = stats.st_size
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    mtype = mimetypes.guess_type(url)[0]
    headers = email.message_from_string(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
        (mtype or 'text/plain', size, modified))
    if host:
        host, port = splitport(host)
        # localhost() is one address string while thishost() is a
        # sequence of addresses, so wrap the former before joining them.
        local_addresses = (localhost(),) + tuple(thishost())
        if port or socket.gethostbyname(host) not in local_addresses:
            raise URLError('local file error', 'not on local host')
    urlfile = file
    if file[:1] == '/':
        urlfile = 'file://' + file
    return addinfourl(open(localname, 'rb'), headers, urlfile)
1759
def open_ftp(self, url):
    """Use FTP protocol.

    url is the part of an ftp: URL after the scheme.  Connections are
    kept in self.ftpcache keyed by (user, host, port, directory path)
    and reused.  Returns an addinfourl object for the retrieved file or
    directory listing.  Raises URLError for the proxy (non-string) form
    of url, for a missing host, and for any error reported by ftplib.
    """
    if not isinstance(url, str):
        raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
    import mimetypes
    host, path = splithost(url)
    if not host:
        raise URLError('ftp error', 'no host given')
    host, port = splitport(host)
    user, host = splituser(host)
    if user:
        user, passwd = splitpasswd(user)
    else:
        passwd = None
    host = unquote(host)
    user = unquote(user or '')
    passwd = unquote(passwd or '')
    host = socket.gethostbyname(host)
    if not port:
        import ftplib
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    path, attrs = splitattr(path)
    path = unquote(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    # A leading empty component comes from the "/" after the host; a
    # second one (from "//") is turned into the directory "/".
    if dirs and not dirs[0]:
        dirs = dirs[1:]
    if dirs and not dirs[0]:
        dirs[0] = '/'
    key = user, host, port, '/'.join(dirs)
    # XXX thread unsafe!
    if len(self.ftpcache) > MAXFTPCACHE:
        # Prune the cache, rather arbitrarily.  Iterate over a snapshot
        # of the keys: deleting entries while walking the live keys view
        # raises RuntimeError.
        for k in list(self.ftpcache):
            if k != key:
                self.ftpcache.pop(k).close()
    try:
        if key not in self.ftpcache:
            self.ftpcache[key] = \
                ftpwrapper(user, passwd, host, port, dirs)
        # No file name means a directory listing; otherwise binary.
        transfer_type = 'I' if file else 'D'
        for attr in attrs:
            attr, value = splitvalue(attr)
            if attr.lower() == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
                transfer_type = value.upper()
        (fp, retrlen) = self.ftpcache[key].retrfile(file, transfer_type)
        mtype = mimetypes.guess_type("ftp:" + url)[0]
        headers = ""
        if mtype:
            headers += "Content-Type: %s\n" % mtype
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = email.message_from_string(headers)
        return addinfourl(fp, headers, "ftp:" + url)
    except ftperrors() as msg:
        raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1818
def open_data(self, url, data=None):
    """Open a "data" URL and return its content as an addinfourl object.

    The data argument is ignored.  Raises URLError for the proxy
    (non-string) form of url and IOError when url contains no comma.
    """
    if not isinstance(url, str):
        raise URLError('data error', 'proxy support for data protocol currently not implemented')
    # ignore POSTed data
    #
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    try:
        mediatype, payload = url.split(',', 1)
    except ValueError:
        raise IOError('data error', 'bad data URL')
    if not mediatype:
        mediatype = 'text/plain;charset=US-ASCII'
    # A trailing ";token" without "=" names the encoding, not a parameter.
    encoding = ''
    cut = mediatype.rfind(';')
    if cut >= 0 and '=' not in mediatype[cut:]:
        encoding = mediatype[cut+1:]
        mediatype = mediatype[:cut]
    if encoding == 'base64':
        import base64
        # XXX is this encoding/decoding ok?
        payload = base64.decodebytes(payload.encode('ascii')).decode('latin1')
    else:
        payload = unquote(payload)
    stamp = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime(time.time()))
    lines = [
        'Date: %s' % stamp,
        'Content-type: %s' % mediatype,
        'Content-Length: %d' % len(payload),
        '',
        payload,
    ]
    text = '\n'.join(lines)
    headers = email.message_from_string(text)
    return addinfourl(io.StringIO(text), headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001860
1861
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps).

    Adds redirect following (301, 302, 303, 307) and Basic
    authentication retries (401, 407) on top of URLopener.
    """

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # Maps "realm@host" to the (user, passwd) pair obtained for it.
        self.auth_cache = {}
        # Redirects followed in the chain currently in progress.
        self.tries = 0
        # Redirect limit; a false value switches the check off.
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Follows the redirect unless self.maxtries redirects have already
        been followed, in which case the response is reported as a 500
        "Redirect Recursion" error instead.
        """
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the URL named by the Location (or URI) response header.

        Returns None when neither header is present.  The new URL is
        opened without data.  Raises HTTPError when the target uses a
        scheme other than http, https or ftp.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)
        # The target comes from the server and is therefore untrusted:
        # do not let a redirect reach other schemes such as file:.
        from urllib.parse import urlparse
        if urlparse(newurl).scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." %
                            newurl,
                            headers, None)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def _basic_auth_realm(self, challenge_header, url, fp, errcode, errmsg,
                          headers, retry):
        # Return the realm of a Basic challenge found in challenge_header.
        # URLopener.http_error_default() is invoked (it raises) when the
        # header is missing or unparsable, names another scheme, or
        # retry is false.
        import re
        match = None
        if challenge_header in headers:
            match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"',
                             headers[challenge_header])
        if not match or match.group(1).lower() != 'basic' or not retry:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        return match.group(2)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        realm = self._basic_auth_realm('www-authenticate', url, fp,
                                       errcode, errmsg, headers, retry)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        realm = self._basic_auth_realm('proxy-authenticate', url, fp,
                                       errcode, errmsg, headers, retry)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def _retry_proxy_basic_auth(self, scheme, url, realm, data):
        # Store credentials in the proxy URL for scheme, then reopen url.
        host, selector = splithost(url)
        newurl = scheme + '://' + host + selector
        proxy = self.proxies[scheme]
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # A non-zero i means the proxy URL already carried credentials;
        # get_user_passwd() then drops its cached entry and asks again.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd):
            return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies[scheme] = scheme + '://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def _retry_basic_auth(self, scheme, url, realm, data):
        # Reopen url with "user:passwd@" placed in front of its host.
        host, selector = splithost(url)
        # A non-zero i means the URL already carried credentials;
        # get_user_passwd() then drops its cached entry and asks again.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd):
            return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = scheme + '://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Retry an http request after adding credentials to the http proxy URL."""
        return self._retry_proxy_basic_auth('http', url, realm, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Retry an https request after adding credentials to the https proxy URL."""
        return self._retry_proxy_basic_auth('https', url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry an http request with credentials embedded in the URL."""
        return self._retry_basic_auth('http', url, realm, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry an https request with credentials embedded in the URL."""
        return self._retry_basic_auth('https', url, realm, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for realm at host, prompting if needed.

        A cached pair is returned unless clear_cache is true, in which
        case the cached entry is dropped and the user is asked again.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd:
            self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2057
2058
2059# Utility functions
2060
_localhost = None
def localhost():
    """Return the IP address that the name 'localhost' resolves to.

    The lookup is done on the first call only; the result is kept in
    the module-level _localhost and returned by later calls.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2068
_thishost = None
def thishost():
    """Return the IP addresses of the current host as a tuple of strings.

    The addresses are looked up once and cached in the module-level
    _thishost.  If the machine's own host name cannot be resolved, the
    addresses of 'localhost' are used instead.
    """
    global _thishost
    if _thishost is None:
        try:
            # Index [2] of the gethostbyname_ex() result is the address
            # list; it must be applied to the result, not to the name.
            addresses = socket.gethostbyname_ex(socket.gethostname())[2]
        except socket.gaierror:
            addresses = socket.gethostbyname_ex('localhost')[2]
        _thishost = tuple(addresses)
    return _thishost
2076
_ftperrors = None
def ftperrors():
    """Return the exceptions raised by the FTP class (ftplib.all_errors).

    ftplib is imported on first use; the value is then kept in the
    module-level _ftperrors.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    from ftplib import all_errors
    _ftperrors = all_errors
    return _ftperrors
2085
_noheaders = None
def noheaders():
    """Return a shared email Message object parsed from an empty string."""
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2093
2094
2095# Utility classes
2096
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        """Remember the connection parameters, then connect and log in."""
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        """(Re)connect, log in and change into each directory of self.dirs."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dirname in self.dirs:
            self.ftp.cwd(dirname)

    def retrfile(self, file, type):
        """Start retrieving file and return (file object, length).

        type is the transfer type letter; 'd' or 'D' requests a
        directory listing.  When file cannot be retrieved because of a
        550 reply, a listing of it is attempted instead.  The length is
        whatever ntransfercmd() reported.  Raises URLError for permanent
        FTP errors.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'):
            cmd = 'TYPE A'
            isdir = 1
        else:
            cmd = 'TYPE ' + type
            isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # Reconnect once and repeat the command on the new connection.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # A 550 reply falls through to the listing attempt below.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        sock, retrlen = conn
        fileobj = sock.makefile('rb')
        # Drop our own reference to the data socket; the file object made
        # from it stays usable, and the connection is then released when
        # that file object is closed instead of lingering until collected.
        sock.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(fileobj, self.endtransfer), retrlen)

    def endtransfer(self):
        """Finish a pending transfer by reading its final reply, ignoring FTP errors."""
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        """End any pending transfer and close the connection, ignoring FTP errors."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2173
2174# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Every non-empty environment variable whose lower-cased name ends in
    "_proxy" contributes an entry keyed by the part of the name before
    that suffix.  If you need a different way, you can pass a proxies
    dictionary to the [Fancy]URLopener constructor.

    """
    suffix = '_proxy'
    return {
        name.lower()[:-len(suffix)]: value
        for name, value in os.environ.items()
        if value and name.lower().endswith(suffix)
    }
2190
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy (or NO_PROXY),
    which should be a list of DNS suffixes separated by commas, or '*'
    for all hosts.  Whitespace around the entries is ignored.  Returns 1
    when host, with or without its port, ends with one of the suffixes,
    and 0 otherwise.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    no_proxy = no_proxy.strip()
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        # "a, b" is a common way to write the list; without stripping,
        # the entry " b" could never match.
        name = name.strip()
        if name and (hostonly.endswith(name) or host.endswith(name)):
            return 1
    # otherwise, don't bypass
    return 0
2209
2210
2211if sys.platform == 'darwin':
Ronald Oussoren84151202010-04-18 20:46:11 +00002212 from _scproxy import _get_proxy_settings, _get_proxies
2213
def proxy_bypass_macosx_sysconf(host, proxy_settings=None):
    """
    Return True iff this host shouldn't be accessed using a proxy

    proxy_settings is a mapping with the keys 'exclude_simple' and
    (optionally) 'exceptions'; when it is None the settings are fetched
    with _get_proxy_settings(), i.e. from the MacOSX framework
    SystemConfiguration.
    """
    import re
    import socket
    from fnmatch import fnmatch

    if proxy_settings is None:
        proxy_settings = _get_proxy_settings()

    hostonly, port = splitport(host)

    def ip2num(ipAddr):
        # Pack up to four dotted decimal parts into one 32-bit number,
        # padding missing trailing parts with zero.
        parts = ipAddr.split('.')
        parts = list(map(int, parts))
        if len(parts) != 4:
            parts = (parts + [0, 0, 0, 0])[:4]
        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

    # Check for simple host names:
    if '.' not in host:
        if proxy_settings['exclude_simple']:
            return True

    hostIP = None

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value:
            continue

        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
        if m is not None:
            if hostIP is None:
                try:
                    hostIP = socket.gethostbyname(hostonly)
                    hostIP = ip2num(hostIP)
                except socket.error:
                    continue

            base = ip2num(m.group(1))
            prefix = m.group(2)
            if prefix is None:
                # No "/N": the prefix covers the octets that were written.
                prefix = 8 * (m.group(1).count('.') + 1)
            else:
                prefix = int(prefix[1:])

            if prefix < 0 or prefix > 32:
                # Not a usable IPv4 prefix length; skip this entry.
                continue

            # Convert the prefix length into the number of low bits to
            # ignore.  This has to happen for both branches above, or a
            # full four-octet address would be compared after shifting
            # every bit away.
            shift = 32 - prefix

            if (hostIP >> shift) == (base >> shift):
                return True

        elif fnmatch(host, value):
            return True

    return False
2272
2273
def getproxies_macosx_sysconf():
    """Return a dictionary of scheme -> proxy server URL mappings.

    The mapping is whatever _get_proxies() reports, i.e. the proxy
    information of the MacOSX framework SystemConfiguration.
    """
    system_proxies = _get_proxies()
    return system_proxies
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002281
Ronald Oussoren84151202010-04-18 20:46:11 +00002282
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002283
def proxy_bypass(host):
    """Tell whether host should be reached without a proxy.

    When proxies are configured through the environment, the no_proxy
    environment rule decides; otherwise the MacOSX system configuration
    is consulted.
    """
    if not getproxies_environment():
        return proxy_bypass_macosx_sysconf(host)
    return proxy_bypass_environment(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002289
def getproxies():
    """Return the environment's proxy mapping, or the MacOSX system one if that is empty."""
    env_proxies = getproxies_environment()
    if env_proxies:
        return env_proxies
    return getproxies_macosx_sysconf()
2292
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002293
2294elif os.name == 'nt':
def getproxies_registry():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Win32 uses the registry to store proxies.  An empty dictionary is
    returned when winreg is unavailable, when proxies are disabled, or
    when the registry values cannot be read or parsed.

    """
    proxies = {}
    try:
        import winreg
    except ImportError:
        # Std module, so should be around - but you never know!
        return proxies
    import re
    try:
        internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
        try:
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
        finally:
            # Release the key on the error paths as well.
            internetSettings.Close()
    except (WindowsError, ValueError, TypeError):
        # Either registry key not found etc, or the value in an
        # unexpected format.
        # proxies already set up to be empty so nothing to do
        pass
    return proxies
2340
def getproxies():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Settings found in the environment take precedence; the registry is
    consulted only when the environment provides none.

    """
    env_proxies = getproxies_environment()
    if env_proxies:
        return env_proxies
    return getproxies_registry()
2349
def proxy_bypass_registry(host):
    """Return 1 if the registry says host should bypass the proxy, else 0.

    host is checked, together with its resolved address and fully
    qualified name when they differ, against the ';'-separated
    ProxyOverride patterns.  The entry '<local>' matches host names
    without a dot.  0 is returned when winreg is unavailable, the
    registry cannot be read, or proxies are disabled.
    """
    try:
        import winreg
        import re
    except ImportError:
        # Std modules, so should be around - but you never know!
        return 0
    try:
        internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
        try:
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        finally:
            # The key is only needed for the two queries above.
            internetSettings.Close()
    except WindowsError:
        return 0
    if not proxyEnable or not proxyOverride:
        return 0
    # try to make a host list from name and IP address.
    rawHost, port = splitport(host)
    host = [rawHost]
    try:
        addr = socket.gethostbyname(rawHost)
        if addr != rawHost:
            host.append(addr)
    except socket.error:
        pass
    try:
        fqdn = socket.getfqdn(rawHost)
        if fqdn != rawHost:
            host.append(fqdn)
    except socket.error:
        pass
    # make a check value list from the registry entry
    proxyOverride = proxyOverride.split(';')
    # now check if we match one of the registry values.
    for test in proxyOverride:
        if test == '<local>':
            if '.' not in rawHost:
                return 1
        test = test.replace(".", r"\.")     # mask dots
        test = test.replace("*", r".*")     # change glob sequence
        test = test.replace("?", r".")      # change glob char
        for val in host:
            if re.match(test, val, re.I):
                return 1
    return 0
2401
def proxy_bypass(host):
    """Tell whether host should be reached without a proxy.

    When proxies are configured through the environment, the no_proxy
    environment rule decides; otherwise the registry settings do.

    """
    if not getproxies_environment():
        return proxy_bypass_registry(host)
    return proxy_bypass_environment(host)
2413
2414else:
2415 # By default use environment variables
2416 getproxies = getproxies_environment
2417 proxy_bypass = proxy_bypass_environment