blob: 732c112a0abe1977b845c9c7cd2f7d43eede2048 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
below). It opens the URL and returns the results as a file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib. pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back. One difference is that you can also pass
20a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers. Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate. If one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
Senthil Kumaran1107c5d2009-11-15 06:20:55 +000033
Senthil Kumaran47fff872009-12-20 07:10:31 +000034OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000036
37Request -- An object that encapsulates the state of a request. The
38state can be as simple as the URL. It can also include extra HTTP
39headers, e.g. a User-Agent.
40
41BaseHandler --
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
Georg Brandl029986a2008-06-23 11:44:14 +000049import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000050
51# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000052authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000053authinfo.add_password(realm='PDQ Application',
54 uri='https://mahler:8092/site-updates.py',
55 user='klem',
56 passwd='geheim$parole')
57
Georg Brandl029986a2008-06-23 11:44:14 +000058proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000059
60# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000061opener = urllib.request.build_opener(proxy_support, authinfo,
62 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000063
64# install it
Georg Brandl029986a2008-06-23 11:44:14 +000065urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066
Georg Brandl029986a2008-06-23 11:44:14 +000067f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000068"""
69
70# XXX issues:
71# If an authentication error handler that tries to perform
72# authentication for some reason but fails, how should the error be
73# signalled? The client needs to know the HTTP error code. But if
# the handler knows that the problem was, e.g., that it didn't know
# the hash algorithm requested in the challenge, it would be good to
76# pass that information along to the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000085import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000086import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import random
93import re
94import socket
95import sys
96import time
Senthil Kumaran7bc0d872010-12-19 10:49:52 +000097import collections
Jeremy Hylton1afc1692008-06-18 20:49:58 +000098
Georg Brandl13e89462008-07-01 19:56:00 +000099from urllib.error import URLError, HTTPError, ContentTooShortError
100from urllib.parse import (
101 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
102 splittype, splithost, splitport, splituser, splitpasswd,
Senthil Kumarand95cc752010-08-08 11:27:53 +0000103 splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000104from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000105
# check for SSL
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

# Used in the User-Agent header sent with each request.  Built from
# sys.version_info rather than by slicing sys.version, because slicing the
# string truncates two-digit minor versions ("3.10.4"[:3] == "3.1").
__version__ = '%d.%d' % sys.version_info[:2]
116
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None):
    """Open *url* (a string or Request object) and return the response.

    *data* and *timeout* are passed straight through to the opener's
    open() method.

    When *cafile* or *capath* is given, a one-off opener is built around an
    HTTPSHandler whose SSL context requires certificate verification against
    those locations and checks the host name; that opener is not cached.
    Otherwise the module-wide default opener is used, and created on first
    use if none has been installed.

    Raises ValueError if CA locations are given but the ssl module could
    not be imported.
    """
    global _opener
    if cafile or capath:
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        # We only get here when a CA location was supplied, so verification
        # is always required; there is no "unverified" variant of this path.
        context.verify_mode = ssl.CERT_REQUIRED
        context.load_verify_locations(cafile, capath)
        https_handler = HTTPSHandler(context=context, check_hostname=True)
        opener = build_opener(https_handler)
    elif _opener is None:
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000139
def install_opener(opener):
    """Make *opener* the module-wide default opener used by urlopen()."""
    global _opener
    _opener = opener
143
144# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* through the shared legacy opener.

    The FancyURLopener instance is created lazily on first use and kept in
    the module-level _urlopener; all arguments are forwarded to its
    retrieve() method, whose result is returned.
    """
    global _urlopener
    legacy = _urlopener
    if not legacy:
        legacy = _urlopener = FancyURLopener()
    return legacy.retrieve(url, filename, reporthook, data)
151
def urlcleanup():
    """Clean up the legacy opener, if any, and forget the default opener."""
    global _opener
    legacy = _urlopener
    if legacy:
        legacy.cleanup()
    if _opener:
        _opener = None
158
159# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host comes from the network-location part of request.full_url, or
    from the request's "Host" header when the URL has none.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urlparse(request.full_url)[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    return _cut_port_re.sub("", netloc, 1).lower()
176
class Request:
    """Encapsulate the state of a single URL request.

    Holds the target URL (split into type, host and selector), optional
    request data, and two sets of headers: normal ones and "unredirected"
    ones, which are kept in a separate mapping.
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False):
        """Build a request for *url*.

        url -- URL string; a '<URL:...>' wrapper and any '#fragment' are
            stripped before it is stored as full_url.
        data -- request payload; when not None, get_method() reports POST.
        headers -- optional mapping of header names to values (None means
            no extra headers); names are normalised with str.capitalize().
        origin_req_host -- defaults to request_host(self).
        unverifiable -- stored as-is on the instance.

        Raises ValueError if the URL has no scheme.
        """
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        # The fragment is not kept on the request object.
        self.full_url, _fragment = splittag(self.full_url)
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        # A None default avoids sharing one mutable dict between all calls.
        if headers is not None:
            for key, value in headers.items():
                self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        """Split full_url into self.type, self.host and self.selector."""
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return "POST" when the request carries data, otherwise "GET"."""
        if self.data is not None:
            return "POST"
        else:
            return "GET"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route the request through the proxy at *host*.

        For an https request with no tunnel host recorded yet, the original
        host is remembered in _tunnel_host and type/selector are left
        alone.  Otherwise the request type becomes *type* and the selector
        becomes the full URL.  In both cases self.host becomes *host*.
        """
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        """Return True if the selector is the full URL (set by set_proxy)."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if *header_name* is in either header mapping.

        The name is looked up exactly as given; it is not capitalised here.
        """
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Return the header value, preferring normal over unredirected."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return a list of (name, value) pairs from both header mappings.

        Normal headers override unredirected ones of the same name.
        """
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
273
class OpenerDirector:
    """Dispatch requests to a collection of ordered handler objects.

    Handlers are indexed by inspecting their method names:
    <protocol>_open, <protocol>_request, <protocol>_response and
    <protocol>_error_<kind> each land in a separate lookup table.
    """

    def __init__(self):
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* under every hook its method names advertise.

        Raises TypeError if the object has no add_parent attribute.  A
        handler exposing no recognised hook is ignored.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        # Hooks whose table is keyed directly by protocol name.
        plain_tables = {
            "open": self.handle_open,
            "response": self.process_response,
            "request": self.process_request,
        }

        registered = False
        for name in dir(handler):
            if name in ("redirect_request", "do_open", "proxy_open"):
                # oops, coincidental match
                continue

            sep = name.find("_")
            protocol = name[:sep]
            condition = name[sep + 1:]

            if condition.startswith("error"):
                cut = condition.find("_") + sep + 1
                kind = name[cut + 1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                table = self.handle_error.setdefault(protocol, {})
            elif condition in plain_tables:
                kind = protocol
                table = plain_tables[condition]
            else:
                continue

            bucket = table.setdefault(kind, [])
            if bucket:
                bisect.insort(bucket, handler)
            else:
                bucket.append(handler)
            registered = True

        if registered:
            # the handlers must work in a specific order, the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        for handler in chain.get(kind, ()):
            outcome = getattr(handler, meth_name)(*args)
            if outcome is not None:
                return outcome

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a request object).

        Runs the <protocol>_request processors, opens the request through
        the handler chain, then runs the <protocol>_response processors and
        returns the resulting response.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        hook = protocol + "_request"
        for processor in self.process_request.get(protocol, []):
            req = getattr(processor, hook)(req)

        response = self._open(req, data)

        # post-process response
        hook = protocol + "_response"
        for processor in self.process_response.get(protocol, []):
            response = getattr(processor, hook)(req, response)

        return response

    def _open(self, req, data=None):
        """Try default_open, then <protocol>_open, then unknown_open."""
        found = self._call_chain(self.handle_open, 'default',
                                 'default_open', req)
        if found:
            return found

        protocol = req.type
        found = self._call_chain(self.handle_open, protocol,
                                 protocol + '_open', req)
        if found:
            return found

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the matching handler chain.

        For http/https the chain is keyed by args[2] (the status code) and
        falls back to http_error_default when no handler returns a truthy
        result.
        """
        is_http = proto in ('http', 'https')
        if is_http:
            # XXX http[s] protocols are special-cased
            table = self.handle_error['http']  # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
        else:
            table = self.handle_error
            meth_name = proto + '_error'

        outcome = self._call_chain(table, proto, meth_name, *args)
        if outcome:
            return outcome

        if is_http:
            return self._call_chain(table, 'default', 'http_error_default',
                                    *args)
412
413# XXX probably also want an abstract factory that knows when it makes
414# sense to skip a superclass in favor of a subclass and when it might
415# make sense to include both
416
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    def overrides(candidate, default):
        # A class overrides by subclassing, an instance by being one.
        if isclass(candidate):
            return issubclass(candidate, default)
        return isinstance(candidate, default)

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # Defaults for which the caller supplied a replacement are dropped.
    skip = {klass for klass in default_classes
            if any(overrides(given, klass) for given in handlers)}

    for klass in default_classes:
        if klass not in skip:
            opener.add_handler(klass())

    for given in handlers:
        opener.add_handler(given() if isclass(given) else given)
    return opener
454
class BaseHandler:
    """Common base for handlers: parent link and ordering."""

    # Position used when handlers are sorted; lower values sort first.
    handler_order = 500

    def add_parent(self, parent):
        """Remember the object this handler has been registered with."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        """Order handlers by their handler_order attribute."""
        if hasattr(other, "handler_order"):
            return self.handler_order < other.handler_order
        # Try to preserve the old behavior of having custom classes
        # inserted after default ones (works only for custom user
        # classes which are not aware of handler_order).
        return True
472
473
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return *response* unchanged for 2xx codes, else delegate.

        Any other status is handed to self.parent.error(), whose result is
        returned instead.
        """
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= code < 300:
            return response

        return self.parent.error('http', request, response, code, msg, hdrs)

    https_response = http_response
490
class HTTPDefaultErrorHandler(BaseHandler):
    """Fallback handler: turn any HTTP error into an HTTPError exception."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        failure = HTTPError(req.full_url, code, msg, hdrs, fp)
        raise failure
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000494
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects, with loop detection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        method = req.get_method()
        idempotent_ok = (code in (301, 302, 303, 307)
                         and method in ("GET", "HEAD"))
        post_ok = code in (301, 302, 303) and method == "POST"
        if not (idempotent_ok or post_ok):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # The new request carries no body, so body-describing headers go.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {name: value for name, value in req.headers.items()
                      if name.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow the redirect named by the Location (or URI) header.

        Returns None when there is no target header or redirect_request()
        declines; raises HTTPError when the loop limits are exceeded.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        for header_name in ("location", "uri"):
            if header_name in headers:
                newurl = headers[header_name]
                break
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if not hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict = {}
        else:
            visited = new.redirect_dict = req.redirect_dict
            too_many_repeats = visited.get(newurl, 0) >= self.max_repeats
            if too_many_repeats or len(visited) >= self.max_redirections:
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302
587
588
589def _parse_proxy(proxy):
590 """Return (scheme, user, password, host/port) given a URL or an authority.
591
592 If a URL is supplied, it must have an authority (host:port) component.
593 According to RFC 3986, having an authority component means the URL must
594 have two slashes after the scheme:
595
596 >>> _parse_proxy('file:/ftp.example.com/')
597 Traceback (most recent call last):
598 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
599
600 The first three items of the returned tuple may be None.
601
602 Examples of authority parsing:
603
604 >>> _parse_proxy('proxy.example.com')
605 (None, None, None, 'proxy.example.com')
606 >>> _parse_proxy('proxy.example.com:3128')
607 (None, None, None, 'proxy.example.com:3128')
608
609 The authority component may optionally include userinfo (assumed to be
610 username:password):
611
612 >>> _parse_proxy('joe:password@proxy.example.com')
613 (None, 'joe', 'password', 'proxy.example.com')
614 >>> _parse_proxy('joe:password@proxy.example.com:3128')
615 (None, 'joe', 'password', 'proxy.example.com:3128')
616
617 Same examples, but with URLs instead:
618
619 >>> _parse_proxy('http://proxy.example.com/')
620 ('http', None, None, 'proxy.example.com')
621 >>> _parse_proxy('http://proxy.example.com:3128/')
622 ('http', None, None, 'proxy.example.com:3128')
623 >>> _parse_proxy('http://joe:password@proxy.example.com/')
624 ('http', 'joe', 'password', 'proxy.example.com')
625 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
626 ('http', 'joe', 'password', 'proxy.example.com:3128')
627
628 Everything after the authority is ignored:
629
630 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
631 ('ftp', 'joe', 'password', 'proxy.example.com')
632
633 Test for no trailing '/' case:
634
635 >>> _parse_proxy('http://joe:password@proxy.example.com')
636 ('http', 'joe', 'password', 'proxy.example.com')
637
638 """
Georg Brandl13e89462008-07-01 19:56:00 +0000639 scheme, r_scheme = splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000640 if not r_scheme.startswith("/"):
641 # authority
642 scheme = None
643 authority = proxy
644 else:
645 # URL
646 if not r_scheme.startswith("//"):
647 raise ValueError("proxy URL with no authority: %r" % proxy)
648 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
649 # and 3.3.), path is empty or starts with '/'
650 end = r_scheme.find("/", 2)
651 if end == -1:
652 end = None
653 authority = r_scheme[2:end]
Georg Brandl13e89462008-07-01 19:56:00 +0000654 userinfo, hostport = splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000655 if userinfo is not None:
Georg Brandl13e89462008-07-01 19:56:00 +0000656 user, password = splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000657 else:
658 user = password = None
659 return scheme, user, password, hostport
660
class ProxyHandler(BaseHandler):
    """Redirect requests through the proxies configured per URL scheme."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """Install a <scheme>_open method for every entry in *proxies*.

        *proxies* maps a URL scheme to a proxy URL; when None, the mapping
        returned by getproxies() is used.
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, proxy_url in proxies.items():
            # Defaults freeze the current loop values inside each lambda.
            opener = lambda r, proxy=proxy_url, type=scheme, \
                meth=self.proxy_open: meth(r, proxy, type)
            setattr(self, '%s_open' % scheme, opener)

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*, adding Basic proxy credentials if given.

        Returns None when the host is bypassed or when the other handlers
        can process the rewritten request; otherwise re-opens the request
        through the parent and returns that result.
        """
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        req.set_proxy(unquote(hostport), proxy_type)

        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None

        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000702
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and URI prefix."""

    def __init__(self):
        # realm -> {tuple of reduced URIs -> (user, passwd)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* at one URI or several.

        Each URI is stored twice, once with the scheme's default port made
        explicit and once as given, so lookups match either spelling.
        """
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) registered for *authuri* in *realm*.

        Returns (None, None) when nothing matches.
        """
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path segments: a plain character-wise common prefix
        # would wrongly treat '/foobar' as being below '/foo'.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
765
766
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the realm None."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first; if no user is found, retry with None."""
        lookup = HTTPPasswordMgr.find_user_password
        user, password = lookup(self, realm, authuri)
        if user is None:
            return lookup(self, None, authuri)
        return user, password
775
776
class AbstractBasicAuthHandler:
    """Shared logic for answering Basic authentication challenges.

    Subclasses supply `auth_header` (the request header that carries the
    credentials) and a `parent` opener used to re-send the request.
    """

    # Matches one `scheme realm="value"` challenge at the start of the
    # header or after a comma.  The anchor replaces an unbounded
    # `(?:.*,)*` prefix, which backtracked exponentially on headers made
    # of many commas -- and the header is controlled by the server.
    # Double- and single-quoted realm values are allowed (single quotes are
    # a violation of the RFC, but appear in the wild).
    rx = re.compile('(?:^|,)'       # start of header or separating comma
                    '[ \t]*'        # optional whitespace
                    '([^ \t,]+)'    # auth-scheme, e.g. "Basic"
                    '[ \t]+'        # mandatory whitespace
                    'realm=(["\'])(.*?)\\2',
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        """Use *password_mgr* for lookups, or a fresh HTTPPasswordMgr."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        self.retried = 0

    def _basic_realm(self, challenge):
        """Return the realm of the last Basic challenge, or None.

        All challenges in the header are scanned, so a Basic challenge is
        found even when another scheme is listed after it.
        """
        realm = None
        for mo in AbstractBasicAuthHandler.rx.finditer(challenge):
            scheme, _quote, value = mo.groups()
            if scheme.lower() == 'basic':
                realm = value
        return realm

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Answer the challenge found in the *authreq* header, if possible.

        Returns the response of the retried request, or None when the
        header is missing, carries no Basic challenge, or no new
        credentials are available.  Raises HTTPError once more than five
        attempts have been counted.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.full_url, 401, "basic auth failed",
                            headers, None)
        else:
            self.retried += 1

        if authreq:
            realm = self._basic_realm(authreq)
            if realm is not None:
                response = self.retry_http_basic_auth(host, req, realm)
                if response and response.code != 401:
                    self.retried = 0
                return response

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send *req* with Basic credentials for *realm* at *host*.

        Returns None when no password is known, or when exactly these
        credentials are already on the request (sending them again cannot
        succeed).
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            # get_header() also consults the unredirected headers, which is
            # where this handler stores the credentials below.
            if req.get_header(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None
835
836
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Respond to 401 errors using the shared Basic authentication logic."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Handle a 401 by consulting the 'www-authenticate' header.

        The request's full URL is passed as the host argument of
        http_error_auth_reqed(); the retry counter is reset afterwards
        and the result of that call is returned.
        """
        outcome = self.http_error_auth_reqed('www-authenticate',
                                             req.full_url, req, headers)
        self.reset_retry_count()
        return outcome
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000847
848
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Respond to 407 errors using the shared Basic authentication logic."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Handle a 407 by consulting the 'proxy-authenticate' header.

        The retry counter is reset after the attempt and the result of
        http_error_auth_reqed() is returned.
        """
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib.request does
        # not (and should not, RFC 3986 s. 3.2.1) support requests for URLs
        # containing userinfo.
        outcome = self.http_error_auth_reqed('proxy-authenticate',
                                             req.host, req, headers)
        self.reset_retry_count()
        return outcome
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000863
864
def randombytes(n):
    """Return a bytes object of length n obtained from os.urandom()."""
    random_block = os.urandom(n)
    return random_block
868
class AbstractDigestAuthHandler:
    """Shared Digest authentication logic for the 401 and 407 handlers.

    Subclasses provide auth_header (the request header to fill in) and
    call http_error_auth_reqed() from their http_error_NNN method.
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX support for qop="auth-int" is shaky

    def __init__(self, passwd=None):
        """Store the password manager (a new HTTPPasswordMgr by default)."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        # nonce_count counts the requests sent with last_nonce; it feeds
        # the "nc" value of the Authorization header.
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        """Reset the counter of authentication attempts to zero."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry req with Digest credentials if the challenge asks for them.

        auth_header names the challenge header to read from headers.
        Returns the result of retry_http_digest_auth() for a 'digest'
        challenge and None otherwise.  Raises HTTPError once more than
        five attempts have been counted in self.retried.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time.  Unless the Password Manager is
            # prompting for the information, that is.  This isn't great
            # but it's better than a 'repeat until recursion depth
            # exceeded' approach.
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Re-issue req with an Authorization value answering challenge auth.

        Returns the result of self.parent.open(), or None when no
        authorization could be computed or req already carries the
        identical header value.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-character hexadecimal client nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest credentials string for the parsed challenge chal.

        Returns None when the challenge lacks a realm or nonce, when its
        algorithm is not supported, or when no user is known for the
        realm.  Raises URLError when the challenge offers a qop but
        'auth' is not among the offered values.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in [offered.strip() for offered in qop.split(',')]:
            # The challenge may offer several qop values separated by
            # commas (e.g. "auth,auth-int"); 'auth' is the one implemented
            # here, so select it whenever it is on offer.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth',
                                           H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) digest functions for algorithm.

        'MD5' and 'SHA' are supported.  For any other algorithm the pair
        (None, None) is returned so that callers can test "H is None"
        instead of hitting an unbound local.
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Return the entity digest for data; always None for now."""
        # XXX not implemented yet
        return None
1005
1006
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        """Handle a 401 by consulting the 'www-authenticate' header.

        The network-location part of the request URL is passed as host;
        the retry counter is reset after the attempt and the result of
        http_error_auth_reqed() is returned.
        """
        netloc = urlparse(req.full_url).netloc
        outcome = self.http_error_auth_reqed('www-authenticate',
                                             netloc, req, headers)
        self.reset_retry_count()
        return outcome
1023
1024
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Respond to 407 errors using the shared Digest authentication logic."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        """Handle a 407 by consulting the 'proxy-authenticate' header.

        The retry counter is reset after the attempt and the result of
        http_error_auth_reqed() is returned.
        """
        outcome = self.http_error_auth_reqed('proxy-authenticate',
                                             req.host, req, headers)
        self.reset_retry_count()
        return outcome
1036
class AbstractHTTPHandler(BaseHandler):
    """Request preparation and transport shared by the HTTP(S) handlers."""

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Store level as the handler's debug level."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in default headers on request and return it.

        Adds Content-type and Content-length for requests carrying data,
        a Host header, and the opener's default headers, each only when
        the request does not already have that header.

        Raises URLError when the request has no host, and ValueError when
        the data is an iterable without buffer support and no
        Content-length header was supplied.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                try:
                    mv = memoryview(data)
                except TypeError:
                    if isinstance(data, collections.Iterable):
                        raise ValueError("Content-Length should be specified "
                                "for iterable data of type %r %r" % (type(data),
                                data))
                else:
                    # Parenthesised on purpose: "%" binds tighter than "*",
                    # so without the parentheses the formatted length
                    # would be repeated itemsize times.
                    request.add_unredirected_header(
                        'Content-length', '%d' % (len(mv) * mv.itemsize))

        sel_host = host
        if request.has_proxy():
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        Extra keyword arguments are passed to the http_class constructor.
        Raises URLError when the request has no host or when sending the
        request or reading the response fails with a socket error.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)

        # Unredirected headers take precedence over regular ones.
        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            h.request(req.get_method(), req.selector, req.data, headers)
            r = h.getresponse()  # an HTTPResponse instance
        except socket.error as err:
            # Release the connection instead of leaking it on failure.
            h.close()
            raise URLError(err)

        r.url = req.full_url
        # Overwrite the .msg attribute of the HTTPResponse with the reason
        # phrase, because urllib clients expect the response to have the
        # reason in .msg.  It would be good to mark this attribute as
        # deprecated and get clients to use info() or .headers instead.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001134
1135
class HTTPHandler(AbstractHTTPHandler):
    """Handler for the http scheme."""

    # Request preparation is the shared AbstractHTTPHandler logic.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Open req through do_open() using http.client.HTTPConnection."""
        connection_class = http.client.HTTPConnection
        return self.do_open(connection_class, req)
1142
if hasattr(http.client, 'HTTPSConnection'):
    import ssl

    class HTTPSHandler(AbstractHTTPHandler):
        """Handler for the https scheme.

        Only defined when http.client provides HTTPSConnection.
        """

        # Request preparation is the shared AbstractHTTPHandler logic.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            """Remember the options later handed to HTTPSConnection."""
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            """Open req through do_open() using HTTPSConnection."""
            tls_options = {
                'context': self._context,
                'check_hostname': self._check_hostname,
            }
            return self.do_open(http.client.HTTPSConnection, req,
                                **tls_options)
1158
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to requests and collect them from responses."""

    def __init__(self, cookiejar=None):
        """Use cookiejar, or a new http.cookiejar.CookieJar if None."""
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        """Let the cookie jar add its Cookie header; return the request."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the cookie jar read the response; return the response."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # https traffic is processed exactly like http traffic.
    https_request = http_request
    https_response = http_response
1176
class UnknownHandler(BaseHandler):
    """Handler that rejects every request it is asked to open."""

    def unknown_open(self, req):
        """Raise URLError naming the request's URL type."""
        message = 'unknown url type: %s' % req.type
        raise URLError(message)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001181
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Each element is split at its first '='.  A value enclosed in double
    quotes has the quotes removed; an empty value ('key=') is kept as an
    empty string.  Returns a dict mapping keys to values.  An element
    without '=' raises ValueError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slice instead of index so that an empty value does not raise
        # IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1191
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.

    Returns the list of elements with surrounding whitespace stripped.
    """
    items = []
    current = []        # characters of the element being collected
    in_quotes = False
    escaped = False

    for ch in s:
        if escaped:
            # The character after a backslash is taken literally.
            current.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                # The backslash itself is dropped.
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                current.append(ch)
        elif ch == ',':
            items.append(''.join(current))
            current = []
        else:
            if ch == '"':
                in_quotes = True
            current.append(ch)

    # append last part
    if current:
        items.append(''.join(current))

    return [item.strip() for item in items]
1234
class FileHandler(BaseHandler):
    """Open file: URLs that refer to the local machine."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open req as a local file.

        When the selector looks like '//host...' and the request names a
        host other than 'localhost', the host must be one of the
        addresses returned by get_names(); otherwise URLError is raised.
        In that host-qualified case nothing is opened and None is
        returned.  Every other request is handed to open_local_file().
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            # Membership test: get_names() returns a tuple of addresses,
            # so an identity comparison could never succeed.
            if req.host not in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
            return None
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the tuple of addresses considered local.

        The result is computed once and cached on the class.
        """
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by req.selector.

        The response carries Content-type, Content-length and
        Last-modified headers derived from the file.  Raises URLError
        when the file cannot be accessed or when the request's host does
        not resolve to a local address.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            # "port" is only bound when a host was given; the "not host"
            # test short-circuits before it is read otherwise.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001286
def _safe_gethostbyname(host):
    """Return socket.gethostbyname(host), or None on socket.gaierror."""
    try:
        address = socket.gethostbyname(host)
    except socket.gaierror:
        address = None
    return address
1292
class FTPHandler(BaseHandler):
    """Open ftp: URLs through an ftpwrapper connection."""

    def ftp_open(self, req):
        """Retrieve the file or directory listing named by req.

        User name, password and port are taken from req.host, the
        directory path and file name from req.selector.  A ';type=X'
        attribute (X one of a, i, d in either case) selects the transfer
        type; otherwise 'I' is used when a file name is present and 'D'
        when it is not.

        Returns an addinfourl whose headers carry Content-type and
        Content-length when they are known.  Raises URLError when no
        host is given, when the host cannot be resolved, or when an
        ftplib error occurs.
        """
        import ftplib
        import mimetypes

        netloc = req.host
        if not netloc:
            raise URLError('ftp error: no host given')
        netloc, port = splitport(netloc)
        port = ftplib.FTP_PORT if port is None else int(port)

        # username/password handling
        user, hostname = splituser(netloc)
        passwd = None
        if user:
            user, passwd = splitpasswd(user)
        hostname = unquote(hostname)
        user = user or ''
        passwd = passwd or ''

        try:
            address = socket.gethostbyname(hostname)
        except socket.error as msg:
            raise URLError(msg)

        path, attrs = splitattr(req.selector)
        # Everything before the last '/' is the directory path, the rest
        # is the file name (empty for a directory listing).
        *dirs, target = [unquote(part) for part in path.split('/')]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, address, port, dirs,
                                  req.timeout)
            transfer_type = 'I' if target else 'D'
            for attr in attrs:
                attr_name, attr_value = splitvalue(attr)
                if (attr_name.lower() == 'type' and
                        attr_value in ('a', 'A', 'i', 'I', 'd', 'D')):
                    transfer_type = attr_value.upper()
            fp, retrlen = fw.retrfile(target, transfer_type)
            header_lines = []
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                header_lines.append("Content-type: %s\n" % mtype)
            if retrlen is not None and retrlen >= 0:
                header_lines.append("Content-length: %d\n" % retrlen)
            headers = email.message_from_string("".join(header_lines))
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a new ftpwrapper for the given connection parameters."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout)
1350
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps ftpwrapper connections in a cache."""

    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}         # connection key -> ftpwrapper
        self.timeout = {}       # connection key -> expiry timestamp
        self.soonest = 0        # smallest expiry timestamp seen last
        self.delay = 60         # seconds added to "now" on each use
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the number of seconds a connection stays cached."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which one entry is evicted."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return the cached ftpwrapper for these parameters.

        A new ftpwrapper is created on a cache miss.  The entry's expiry
        time is refreshed and the cache is cleaned up on every call.
        """
        key = (user, host, port, '/'.join(dirs), timeout)
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Close expired connections and evict one entry when full."""
        # first check for old ones
        now = time.time()
        if self.soonest <= now:
            expired = [key for key, deadline in self.timeout.items()
                       if deadline < now]
            for key in expired:
                self.cache[key].close()
                del self.cache[key]
                del self.timeout[key]
        self.soonest = min(self.timeout.values())

        # then check the size
        if len(self.cache) == self.max_conns:
            for key, deadline in list(self.timeout.items()):
                if deadline == self.soonest:
                    del self.cache[key]
                    del self.timeout[key]
                    break
            self.soonest = min(self.timeout.values())
1397
# Code moved from the old urllib module
1399
1400MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
1401
1402# Helper for non-unix systems
if os.name != 'nt':
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        local_path = unquote(pathname)
        return local_path

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        url_path = quote(pathname)
        return url_path
else:
    # Windows has its own conversion rules, implemented in nturl2path.
    from nturl2path import url2pathname, pathname2url
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001415
1416# This really consists of two pieces:
1417# (1) a class which handles opening of all sorts of URLs
1418# (plus assorted utilities etc.)
1419# (2) a set of functions for parsing URLs
1420# XXX Should these be separated out into different modules?
1421
1422
1423ftpcache = {}
1424class URLopener:
1425 """Class to open URLs.
1426 This is a class rather than just a subroutine because we may need
1427 more than one set of global protocol-specific options.
1428 Note -- this is a base class for those who don't want the
1429 automatic handling of errors type 302 (relocated) and 401
1430 (authorization needed)."""
1431
1432 __tempfiles = None
1433
1434 version = "Python-urllib/%s" % __version__
1435
1436 # Constructor
def __init__(self, proxies=None, **x509):
    """Create an opener.

    proxies is a mapping looked up by URL type in open(); when omitted
    it is obtained from getproxies().  The optional 'key_file' and
    'cert_file' keyword arguments are stored on the instance.
    """
    if proxies is None:
        proxies = getproxies()
    assert hasattr(proxies, 'keys'), "proxies must be a mapping"
    self.proxies = proxies
    self.key_file = x509.get('key_file')
    self.cert_file = x509.get('cert_file')
    self.addheaders = [('User-Agent', self.version)]
    # Temporary files created by retrieve(); removed again by cleanup().
    self.__tempfiles = []
    self.__unlink = os.unlink # See cleanup()
    self.tempcache = None
    # Undocumented feature: if you assign {} to tempcache,
    # it is used to cache files retrieved with
    # self.retrieve().  This is not enabled by default
    # since it does not work for changing documents (and I
    # haven't got the logic to check expiration headers
    # yet).
    self.ftpcache = ftpcache
    # Undocumented feature: you can use a different
    # ftp cache by assigning to the .ftpcache member;
    # in case you want logically independent URL openers
    # XXX This is not threadsafe.  Bah.
1459
def __del__(self):
    """Call close() when the opener is finalized."""
    self.close()
1462
def close(self):
    """Release the opener's resources by calling cleanup()."""
    self.cleanup()
1465
def cleanup(self):
    """Delete the recorded temporary files and clear tempcache."""
    # This code sometimes runs when the rest of this module
    # has already been deleted, so it can't use any globals
    # or import anything.
    if self.__tempfiles:
        for file in self.__tempfiles:
            try:
                # __unlink is the os.unlink reference saved in __init__.
                self.__unlink(file)
            except OSError:
                # Best effort: a file that cannot be removed is skipped.
                pass
        del self.__tempfiles[:]
    if self.tempcache:
        self.tempcache.clear()
1479
def addheader(self, *args):
    """Add a header to be used by the HTTP interface only
    e.g. u.addheader('Accept', 'sound/basic')"""
    # The positional arguments are stored together as one tuple.
    self.addheaders.append(args)
1484
1485 # External interface
def open(self, fullurl, data=None):
    """Use URLopener().open(file) instead of open(file, 'r').

    The URL is dispatched to the method named 'open_<type>' (with '-'
    replaced by '_'); when a proxy is configured for the URL type, the
    proxy's type is used instead and the method receives a
    (proxy host, full URL) pair.  URLs without a type are treated as
    'file'.  A socket error from the method is re-raised as IOError.
    """
    fullurl = quote(unwrap(to_bytes(fullurl)),
                    safe="%/:=&?~#+!$,;'@()*[]|")
    if self.tempcache and fullurl in self.tempcache:
        cached_name, cached_headers = self.tempcache[fullurl]
        return addinfourl(open(cached_name, 'rb'), cached_headers, fullurl)

    urltype, url = splittype(fullurl)
    urltype = urltype or 'file'
    proxy = None
    if urltype in self.proxies:
        proxy = self.proxies[urltype]
        urltype, proxyhost = splittype(proxy)
        host, _selector = splithost(proxyhost)
        url = (host, fullurl)  # Signal special case to open_*()

    method_name = ('open_' + urltype).replace('-', '_')
    self.type = urltype
    if not hasattr(self, method_name):
        if proxy:
            return self.open_unknown_proxy(proxy, fullurl, data)
        return self.open_unknown(fullurl, data)
    try:
        opener = getattr(self, method_name)
        return opener(url) if data is None else opener(url, data)
    except socket.error as msg:
        raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1519
def open_unknown(self, fullurl, data=None):
    """Overridable interface to open unknown URL type.

    This implementation always raises IOError naming the URL type.
    """
    urltype, _rest = splittype(fullurl)
    raise IOError('url error', 'unknown url type', urltype)
1524
def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Overridable interface to open a URL through an unusable proxy.

    This implementation always raises IOError naming the URL type and
    the proxy.
    """
    urltype, _rest = splittype(fullurl)
    raise IOError('url error', 'invalid proxy for %s' % urltype, proxy)
1529
1530 # External interface
def retrieve(self, url, filename=None, reporthook=None, data=None):
    """retrieve(url) returns (filename, headers) for a local object
    or (tempfilename, headers) for a remote object.

    When filename is given the data is written there; otherwise a
    temporary file is created and remembered for cleanup().  reporthook,
    if given, is called as reporthook(blocknum, blocksize, totalsize)
    once before the transfer and once after every block; totalsize is -1
    when the response has no Content-Length header.

    Raises ContentTooShortError when fewer bytes were received than the
    Content-Length header announced, whether or not a reporthook was
    supplied.
    """
    url = unwrap(to_bytes(url))
    if self.tempcache and url in self.tempcache:
        return self.tempcache[url]
    urltype, url1 = splittype(url)
    if filename is None and (not urltype or urltype == 'file'):
        try:
            fp = self.open_local_file(url1)
            hdrs = fp.info()
            fp.close()
            return url2pathname(splithost(url1)[1]), hdrs
        except IOError:
            # Not available as a local file; fall back to open() below.
            pass
    fp = self.open(url, data)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            # Reuse the extension of the URL's path for the temporary file.
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        try:
            result = filename, headers
            if self.tempcache is not None:
                self.tempcache[url] = result
            bs = 1024*8
            size = -1
            read = 0
            blocknum = 0
            # Read the announced size independently of reporthook so that
            # the completeness check after the transfer always has it.
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while 1:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            tfp.close()
    finally:
        fp.close()

    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
1594
1595 # Each method named open_<type> knows how to open that type of URL
1596
1597 def _open_generic_http(self, connection_factory, url, data):
1598 """Make an HTTP connection using connection_class.
1599
1600 This is an internal method that should be called from
1601 open_http() or open_https().
1602
1603 Arguments:
1604 - connection_factory should take a host name and return an
1605 HTTPConnection instance.
1606 - url is the url to retrieval or a host, relative-path pair.
1607 - data is payload for a POST request or None.
1608 """
1609
1610 user_passwd = None
1611 proxy_passwd= None
1612 if isinstance(url, str):
Georg Brandl13e89462008-07-01 19:56:00 +00001613 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001614 if host:
Georg Brandl13e89462008-07-01 19:56:00 +00001615 user_passwd, host = splituser(host)
1616 host = unquote(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001617 realhost = host
1618 else:
1619 host, selector = url
1620 # check whether the proxy contains authorization information
Georg Brandl13e89462008-07-01 19:56:00 +00001621 proxy_passwd, host = splituser(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001622 # now we proceed with the url we want to obtain
Georg Brandl13e89462008-07-01 19:56:00 +00001623 urltype, rest = splittype(selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001624 url = rest
1625 user_passwd = None
1626 if urltype.lower() != 'http':
1627 realhost = None
1628 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001629 realhost, rest = splithost(rest)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001630 if realhost:
Georg Brandl13e89462008-07-01 19:56:00 +00001631 user_passwd, realhost = splituser(realhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001632 if user_passwd:
1633 selector = "%s://%s%s" % (urltype, realhost, rest)
1634 if proxy_bypass(realhost):
1635 host = realhost
1636
1637 #print "proxy via http:", host, selector
1638 if not host: raise IOError('http error', 'no host given')
1639
1640 if proxy_passwd:
1641 import base64
Senthil Kumaran5626eec2010-08-04 17:46:23 +00001642 proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001643 else:
1644 proxy_auth = None
1645
1646 if user_passwd:
1647 import base64
Senthil Kumaran5626eec2010-08-04 17:46:23 +00001648 auth = base64.b64encode(user_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001649 else:
1650 auth = None
1651 http_conn = connection_factory(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001652 headers = {}
1653 if proxy_auth:
1654 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1655 if auth:
1656 headers["Authorization"] = "Basic %s" % auth
1657 if realhost:
1658 headers["Host"] = realhost
1659 for header, value in self.addheaders:
1660 headers[header] = value
1661
1662 if data is not None:
1663 headers["Content-Type"] = "application/x-www-form-urlencoded"
1664 http_conn.request("POST", selector, data, headers)
1665 else:
1666 http_conn.request("GET", selector, headers=headers)
1667
1668 try:
1669 response = http_conn.getresponse()
1670 except http.client.BadStatusLine:
1671 # something went wrong with the HTTP status line
Georg Brandl13e89462008-07-01 19:56:00 +00001672 raise URLError("http protocol error: bad status line")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001673
1674 # According to RFC 2616, "2xx" code indicates that the client's
1675 # request was successfully received, understood, and accepted.
1676 if 200 <= response.status < 300:
Antoine Pitroub353c122009-02-11 00:39:14 +00001677 return addinfourl(response, response.msg, "http:" + url,
Georg Brandl13e89462008-07-01 19:56:00 +00001678 response.status)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001679 else:
1680 return self.http_error(
1681 url, response.fp,
1682 response.status, response.reason, response.msg, data)
1683
def open_http(self, url, data=None):
    """Open *url* over plain HTTP and return the response object."""
    plain_connection = http.client.HTTPConnection
    return self._open_generic_http(plain_connection, url, data)
1687
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    """Dispatch an HTTP error response to the matching handler.

    A method named http_error_DDD (DDD being the 3-digit status code) is
    tried first; *data* is forwarded to it only when it is not None.  If
    no such method exists, or it returns a false value, the call falls
    through to http_error_default().
    """
    handler_name = 'http_error_%d' % errcode
    if hasattr(self, handler_name):
        handler = getattr(self, handler_name)
        extra = () if data is None else (data,)
        outcome = handler(url, fp, errcode, errmsg, headers, *extra)
        if outcome:
            return outcome
    return self.http_error_default(url, fp, errcode, errmsg, headers)
1703
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Fallback handler: drain and close *fp*, then raise HTTPError."""
    fp.read()   # consume whatever is left of the response body
    fp.close()
    failure = HTTPError(url, errcode, errmsg, headers, None)
    raise failure
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001709
if _have_ssl:
    def _https_connection(self, host):
        """Return an HTTPSConnection to *host* using this opener's
        key_file and cert_file."""
        tls_files = {'key_file': self.key_file,
                     'cert_file': self.cert_file}
        return http.client.HTTPSConnection(host, **tls_files)

    def open_https(self, url, data=None):
        """Open *url* over HTTPS and return the response object."""
        secure_connection = self._https_connection
        return self._open_generic_http(secure_connection, url, data)
1719
def open_file(self, url):
    """Open a file: URL, which must refer to the local machine.

    Raises URLError when *url* is not a string (the proxied form) and
    ValueError when it names a host other than localhost.
    """
    if not isinstance(url, str):
        raise URLError('file error', 'proxy support for file protocol currently not implemented')
    # "//host/..." names a host, whereas "///..." has an empty host part.
    names_a_host = url[:2] == '//' and url[2:3] != '/'
    if names_a_host and url[2:12].lower() != 'localhost/':
        raise ValueError("file:// scheme is supported only on localhost")
    return self.open_local_file(url)
1728
def open_local_file(self, url):
    """Open a file on the local machine and return an addinfourl object.

    The response carries synthesized Content-Type, Content-Length and
    Last-modified headers built from the file's stat() data.

    Raises URLError if the file cannot be stat()ed, or if *url* names a
    host (or a port) that does not resolve to this machine.
    """
    import mimetypes
    import email.utils
    host, file = splithost(url)
    localname = url2pathname(file)
    try:
        stats = os.stat(localname)
    except OSError as e:
        # URLError takes (reason, filename); passing errno as a third
        # argument would itself raise TypeError.
        raise URLError(e.strerror, e.filename)
    size = stats.st_size
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    mtype = mimetypes.guess_type(url)[0]
    headers = email.message_from_string(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
        (mtype or 'text/plain', size, modified))
    # URL reported back to the caller via the response object.
    urlfile = file
    if file[:1] == '/':
        urlfile = 'file://' + file
    if not host:
        return addinfourl(open(localname, 'rb'), headers, urlfile)
    host, port = splitport(host)
    # localhost() returns one address string and thishost() a tuple, so
    # wrap the former before concatenating.
    if (not port
            and socket.gethostbyname(host) in ((localhost(),) + thishost())):
        return addinfourl(open(localname, 'rb'), headers, urlfile)
    raise URLError('local file error', 'not on local host')
1758
def open_ftp(self, url):
    """Open an ftp: URL and return an addinfourl object.

    Connections are kept in self.ftpcache, keyed by
    (user, host, port, directory path), and reused for later requests.
    A ";type=a|i|d" attribute on the path selects the transfer type;
    without one, "D" (directory listing) is used when the path has no
    file component and "I" otherwise.

    Raises URLError for proxied (non-string) URLs, a missing host, or
    any error reported by ftplib.
    """
    if not isinstance(url, str):
        raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
    import mimetypes
    host, path = splithost(url)
    if not host: raise URLError('ftp error', 'no host given')
    host, port = splitport(host)
    user, host = splituser(host)
    if user: user, passwd = splitpasswd(user)
    else: passwd = None
    host = unquote(host)
    user = unquote(user or '')
    passwd = unquote(passwd or '')
    host = socket.gethostbyname(host)
    if not port:
        import ftplib
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    path, attrs = splitattr(path)
    path = unquote(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    if dirs and not dirs[0]: dirs = dirs[1:]
    if dirs and not dirs[0]: dirs[0] = '/'
    key = user, host, port, '/'.join(dirs)
    # XXX thread unsafe!
    if len(self.ftpcache) > MAXFTPCACHE:
        # Prune the cache, rather arbitrarily.  Iterate over a snapshot
        # of the keys: deleting from a dict while iterating over it
        # raises RuntimeError.
        for k in list(self.ftpcache):
            if k != key:
                stale = self.ftpcache.pop(k)
                stale.close()
    try:
        if key not in self.ftpcache:
            self.ftpcache[key] = \
                ftpwrapper(user, passwd, host, port, dirs)
        if not file: transfer_type = 'D'
        else: transfer_type = 'I'
        for attr in attrs:
            attr, value = splitvalue(attr)
            if attr.lower() == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
                transfer_type = value.upper()
        (fp, retrlen) = self.ftpcache[key].retrfile(file, transfer_type)
        mtype = mimetypes.guess_type("ftp:" + url)[0]
        headers = ""
        if mtype:
            headers += "Content-Type: %s\n" % mtype
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = email.message_from_string(headers)
        return addinfourl(fp, headers, "ftp:" + url)
    except ftperrors() as msg:
        raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1817
def open_data(self, url, data=None):
    """Open a "data" URL and return an addinfourl object.

    Any POSTed *data* is ignored.  Raises URLError for proxied
    (non-string) URLs and IOError when the URL contains no comma.
    """
    if not isinstance(url, str):
        raise URLError('data error', 'proxy support for data protocol currently not implemented')
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    mediatype, comma, payload = url.partition(',')
    if not comma:
        raise IOError('data error', 'bad data URL')
    if not mediatype:
        mediatype = 'text/plain;charset=US-ASCII'
    # A trailing ";token" with no "=" is an encoding marker rather than
    # a media type parameter.
    encoding = ''
    last_semi = mediatype.rfind(';')
    if last_semi >= 0 and '=' not in mediatype[last_semi:]:
        encoding = mediatype[last_semi+1:]
        mediatype = mediatype[:last_semi]
    lines = [
        'Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                 time.gmtime(time.time())),
        'Content-type: %s' % mediatype,
    ]
    if encoding == 'base64':
        import base64
        # XXX is this encoding/decoding ok?
        payload = base64.decodebytes(payload.encode('ascii')).decode('latin1')
    else:
        payload = unquote(payload)
    lines.append('Content-Length: %d' % len(payload))
    lines.append('')
    lines.append(payload)
    message_text = '\n'.join(lines)
    headers = email.message_from_string(message_text)
    body = io.StringIO(message_text)
    return addinfourl(body, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001859
1860
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps).

    Redirects (301, 302, 303 and, for requests without data, 307) are
    followed through open(); 401 and 407 responses can be retried with
    Basic credentials; any other HTTP error is handed back to the caller
    as a response object instead of being raised.
    """

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}    # "realm@host" -> (user, passwd)
        self.tries = 0          # redirects followed for the current request
        self.maxtries = 10      # redirect limit; a false value disables it

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Once maxtries redirects have been followed, the response is
        reported as a 500 through http_error_500 (if defined) or
        http_error_default.
        """
        self.tries += 1
        try:
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            return self.redirect_internal(url, fp, errcode, errmsg,
                                          headers, data)
        finally:
            # Reset even when following the redirect raises, so a failed
            # request does not eat into the next request's budget.
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Follow the Location (or URI) header of a redirect response.

        Returns None when the response names no target.  Raises HTTPError
        when the target uses a scheme other than http, https or ftp.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)
        # For security reasons, do not let a server redirect us to
        # anything other than http, https and ftp (e.g. file:).
        scheme = splittype(newurl)[0]
        if scheme is not None and scheme.lower() not in ('http', 'https', 'ftp'):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, None)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def _basic_realm(self, challenge_header, url, fp, errcode, errmsg,
                     headers, retry):
        """Return the realm of a Basic challenge in *challenge_header*.

        URLopener.http_error_default() is called -- and raises HTTPError --
        when the header is missing or malformed, the scheme is not Basic,
        or *retry* is false.
        """
        import re
        match = None
        if challenge_header in headers:
            match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"',
                             headers[challenge_header])
        if (match is None
                or match.group(1).lower() != 'basic'
                or not retry):
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        return match.group(2)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        realm = self._basic_realm('www-authenticate', url, fp,
                                  errcode, errmsg, headers, retry)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        realm = self._basic_realm('proxy-authenticate', url, fp,
                                  errcode, errmsg, headers, retry)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def _host_with_credentials(self, host, realm):
        """Return *host* prefixed with quoted "user:passwd@", or None.

        Credentials already present in *host* are dropped and evicted
        from the cache before new ones are requested.  None is returned
        when no user name or password is obtained.
        """
        i = host.find('@') + 1
        host = host[i:]
        # A non-zero i means the URL already carried credentials; it is
        # passed as clear_cache so that cached pair is discarded.
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        return "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)

    def _reopen(self, newurl, data):
        """Call open() for *newurl*, passing *data* only when it is set."""
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def _retry_proxy_basic_auth(self, scheme, url, realm, data):
        """Store credentials in self.proxies[scheme] and reopen *url*."""
        host, selector = splithost(url)
        newurl = scheme + '://' + host + selector
        proxy = self.proxies[scheme]
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        proxyhost = self._host_with_credentials(proxyhost, realm)
        if proxyhost is None: return None
        self.proxies[scheme] = scheme + '://' + proxyhost + proxyselector
        return self._reopen(newurl, data)

    def _retry_basic_auth(self, scheme, url, realm, data):
        """Reopen *url* with credentials embedded in its host part."""
        host, selector = splithost(url)
        host = self._host_with_credentials(host, realm)
        if host is None: return None
        newurl = scheme + '://' + host + selector
        return self._reopen(newurl, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Retry an HTTP request with Basic credentials for the proxy."""
        return self._retry_proxy_basic_auth('http', url, realm, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Retry an HTTPS request with Basic credentials for the proxy."""
        return self._retry_proxy_basic_auth('https', url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry an HTTP request with Basic credentials for the server."""
        return self._retry_basic_auth('http', url, realm, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry an HTTPS request with Basic credentials for the server."""
        return self._retry_basic_auth('https', url, realm, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for *realm* at *host*.

        A cached pair is returned unless *clear_cache* is true, in which
        case it is discarded and prompt_user_passwd() is asked again.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2056
2057
2058# Utility functions
2059
_localhost = None
def localhost():
    """Return the IP address that 'localhost' resolves to.

    The lookup is done once; later calls return the cached value.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2067
_thishost = None
def thishost():
    """Return the IP addresses of the current host as a tuple of strings.

    The result is computed once and cached.  If the machine's own host
    name cannot be resolved, the addresses of 'localhost' are used.
    """
    global _thishost
    if _thishost is None:
        try:
            # Index the (hostname, aliases, addresses) result, not the
            # host name string itself.
            _thishost = tuple(
                socket.gethostbyname_ex(socket.gethostname())[2])
        except socket.gaierror:
            _thishost = tuple(socket.gethostbyname_ex('localhost')[2])
    return _thishost
2075
_ftperrors = None
def ftperrors():
    """Return ftplib.all_errors, importing ftplib only on first use."""
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2084
_noheaders = None
def noheaders():
    """Return a shared email Message object that has no headers.

    The object is created on the first call and reused afterwards.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2092
2093
2094# Utility classes
2095
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        """Connect, log in and change into self.dirs one step at a time."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start retrieving *file* and return (file object, length).

        *type* is the FTP transfer type; 'd' or 'D' requests a directory
        listing.  If *file* cannot be retrieved because of a 550 reply,
        a listing of it is attempted instead.  Closing the returned file
        object calls endtransfer().  The length is whatever
        ntransfercmd() reported.

        Raises URLError for permission errors other than 550 and when
        *file* is not a directory that can be entered.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection may have gone away; reconnect once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                if str(reason)[:3] != '550':
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        ftpobj = addclosehook(conn[0].makefile('rb'), self.endtransfer)
        # The file object keeps the connection usable; closing the socket
        # object here lets the connection be released as soon as the
        # caller closes that file instead of lingering until collection.
        conn[0].close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, conn[1])

    def endtransfer(self):
        """Read the final reply of a pending transfer, ignoring FTP errors."""
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        """Finish any pending transfer and close the control connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2172
2173# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    Lowercase variable names take precedence over other spellings, and
    an empty lowercase variable removes the scheme.  When REQUEST_METHOD
    is set (which usually indicates a CGI script), HTTP_PROXY is ignored
    because in CGI it is filled in from the client's "Proxy:" header.
    """
    proxies = {}
    # First pass: accept the variables regardless of case.
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    # Drop the value that may have come from an untrusted request header;
    # a genuine lowercase http_proxy is restored by the pass below.
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    # Second pass: lowercase names override whatever was collected.
    for name, value in os.environ.items():
        if name[-6:] == '_proxy':
            name = name.lower()
            if value:
                proxies[name[:-6]] = value
            else:
                proxies.pop(name[:-6], None)
    return proxies
2189
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy (or NO_PROXY),
    which should be a list of DNS suffixes separated by commas, or '*'
    for all hosts.  Whitespace around the entries is ignored.

    Returns 1 if *host* should bypass the proxy, 0 otherwise.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    no_proxy = no_proxy.strip()
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        # Lists are commonly written as "a.com, b.com"; without the
        # strip() the second entry would never match.
        name = name.strip()
        if name and (hostonly.endswith(name) or host.endswith(name)):
            return 1
    # otherwise, don't bypass
    return 0
2208
2209
2210if sys.platform == 'darwin':
Ronald Oussoren84151202010-04-18 20:46:11 +00002211 from _scproxy import _get_proxy_settings, _get_proxies
2212
def proxy_bypass_macosx_sysconf(host):
    """
    Return True iff this host shouldn't be accessed using a proxy

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.

    Exception entries that look like an address with an optional
    "/prefix" are compared against the host's resolved address; any
    other entry is matched against *host* as an fnmatch pattern.
    """
    import re
    import socket
    from fnmatch import fnmatch

    hostonly, port = splitport(host)

    def ip2num(ipAddr):
        # Pack up to four dotted decimal parts into one integer,
        # padding missing trailing parts with zeros.
        parts = ipAddr.split('.')
        parts = list(map(int, parts))
        if len(parts) != 4:
            parts = (parts + [0, 0, 0, 0])[:4]
        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

    proxy_settings = _get_proxy_settings()

    # Check for simple host names:
    if '.' not in host:
        if proxy_settings['exclude_simple']:
            return True

    hostIP = None

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value: continue

        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
        if m is not None:
            if hostIP is None:
                try:
                    hostIP = socket.gethostbyname(hostonly)
                    hostIP = ip2num(hostIP)
                except socket.error:
                    continue

            base = ip2num(m.group(1))
            mask = m.group(2)
            if mask is None:
                # No explicit prefix: 8 bits per address part given.
                mask = 8 * (m.group(1).count('.') + 1)
            else:
                mask = int(mask[1:])

            # A prefix longer than 32 bits would give a negative shift
            # count below (ValueError); skip such entries.
            if not 0 <= mask <= 32:
                continue

            # Turn the prefix length into the number of host bits to drop.
            mask = 32 - mask

            if (hostIP >> mask) == (base >> mask):
                return True

        elif fnmatch(host, value):
            return True

    return False
2271
2272
def getproxies_macosx_sysconf():
    """Return a dictionary of scheme -> proxy server URL mappings.

    The mapping is obtained from _get_proxies(), which reads the MacOSX
    SystemConfiguration framework.
    """
    system_proxies = _get_proxies()
    return system_proxies
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002280
Ronald Oussoren84151202010-04-18 20:46:11 +00002281
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002282
def proxy_bypass(host):
    """Tell whether *host* should be reached without a proxy.

    Proxy variables in the environment take priority; the system
    configuration is consulted only when none are set.
    """
    if not getproxies_environment():
        return proxy_bypass_macosx_sysconf(host)
    return proxy_bypass_environment(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002288
def getproxies():
    """Return the proxies from the environment, or else from the
    system configuration."""
    from_environment = getproxies_environment()
    if from_environment:
        return from_environment
    return getproxies_macosx_sysconf()
2291
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002292
2293elif os.name == 'nt':
def getproxies_registry():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Win32 uses the registry to store proxies.

    An empty dictionary is returned when winreg cannot be imported or
    the proxy is disabled; registry and parsing errors are ignored and
    whatever was collected up to that point is returned.
    """
    proxies = {}
    try:
        import winreg
    except ImportError:
        # Std module, so should be around - but you never know!
        return proxies
    import re
    try:
        internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
        try:
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
        finally:
            # Release the key even when a query or the parsing fails.
            internetSettings.Close()
    except (WindowsError, ValueError, TypeError):
        # Either registry key not found etc, or the value in an
        # unexpected format.
        # proxies already set up to be empty so nothing to do
        pass
    return proxies
2339
def getproxies():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Settings found in the environment win; the registry is consulted
    only when the environment specifies no proxies.
    """
    from_environment = getproxies_environment()
    if from_environment:
        return from_environment
    return getproxies_registry()
2348
def proxy_bypass_registry(host):
    """Test against the registry whether *host* should bypass the proxy.

    Reads ProxyEnable and ProxyOverride from the current user's Internet
    Settings key.  Each ';'-separated override entry is a glob pattern
    ('*' and '?') matched against the host name, its resolved address
    and its fully qualified name; the special entry '<local>' matches
    host names that contain no dot.

    Returns 1 to bypass the proxy, 0 otherwise (including when winreg is
    unavailable or the registry cannot be read).
    """
    try:
        import winreg
        import re
    except ImportError:
        # Std modules, so should be around - but you never know!
        return 0
    try:
        internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
        try:
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        finally:
            # Always release the registry key.
            internetSettings.Close()
    except WindowsError:
        return 0
    if not proxyEnable or not proxyOverride:
        return 0
    # try to make a host list from name and IP address.
    rawHost, port = splitport(host)
    host = [rawHost]
    try:
        addr = socket.gethostbyname(rawHost)
        if addr != rawHost:
            host.append(addr)
    except socket.error:
        pass
    try:
        fqdn = socket.getfqdn(rawHost)
        if fqdn != rawHost:
            host.append(fqdn)
    except socket.error:
        pass
    # now check if we match one of the registry values.
    for test in proxyOverride.split(';'):
        test = test.strip()
        if not test:
            # An empty entry (e.g. from a trailing ';') would become the
            # regular expression '' and match every host.
            continue
        if test == '<local>':
            if '.' not in rawHost:
                return 1
        test = test.replace(".", r"\.")     # mask dots
        test = test.replace("*", r".*")     # change glob sequence
        test = test.replace("?", r".")      # change glob char
        for val in host:
            if re.match(test, val, re.I):
                return 1
    return 0
2400
def proxy_bypass(host):
    """Tell whether *host* should be reached without a proxy.

    Proxy variables in the environment take priority; the registry is
    consulted only when none are set.
    """
    if not getproxies_environment():
        return proxy_bypass_registry(host)
    return proxy_bypass_environment(host)
2412
2413else:
# On every other platform the environment variables are the only source
# of proxy settings.
getproxies, proxy_bypass = getproxies_environment, proxy_bypass_environment