blob: 53e8107c564cc56457e38dcf975c59b5a1986572 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
5below). It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib. pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back. One difference is that you can also pass
20a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers. Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
Senthil Kumaran1107c5d2009-11-15 06:20:55 +000033
Senthil Kumaran47fff872009-12-20 07:10:31 +000034OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000036
37Request -- An object that encapsulates the state of a request. The
38state can be as simple as the URL. It can also include extra HTTP
39headers, e.g. a User-Agent.
40
41BaseHandler --
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
Georg Brandl029986a2008-06-23 11:44:14 +000049import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000050
51# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000052authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000053authinfo.add_password(realm='PDQ Application',
54 uri='https://mahler:8092/site-updates.py',
55 user='klem',
56 passwd='geheim$parole')
57
Georg Brandl029986a2008-06-23 11:44:14 +000058proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000059
60# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000061opener = urllib.request.build_opener(proxy_support, authinfo,
62 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000063
64# install it
Georg Brandl029986a2008-06-23 11:44:14 +000065urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066
Georg Brandl029986a2008-06-23 11:44:14 +000067f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000068"""
69
70# XXX issues:
71# If an authentication error handler that tries to perform
72# authentication for some reason but fails, how should the error be
73# signalled? The client needs to know the HTTP error code. But if
74# the handler knows that the problem was, e.g., that it didn't know
# the hash algorithm requested in the challenge, it would be good to
76# pass that information along to the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000085import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000086import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import random
93import re
94import socket
95import sys
96import time
Senthil Kumaran7bc0d872010-12-19 10:49:52 +000097import collections
Jeremy Hylton1afc1692008-06-18 20:49:58 +000098
Georg Brandl13e89462008-07-01 19:56:00 +000099from urllib.error import URLError, HTTPError, ContentTooShortError
100from urllib.parse import (
101 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
102 splittype, splithost, splitport, splituser, splitpasswd,
Senthil Kumarand95cc752010-08-08 11:27:53 +0000103 splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000104from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000105
106# check for SSL
107try:
108 import ssl
Senthil Kumaranc2958622010-11-22 04:48:26 +0000109except ImportError:
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000110 _have_ssl = False
111else:
112 _have_ssl = True
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000113
# used in User-Agent header sent
# Built from sys.version_info rather than by slicing sys.version, so that a
# two-digit minor version (e.g. "3.10") is not truncated to "3.1".
__version__ = '%d.%d' % sys.version_info[:2]
116
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None):
    """Open *url* and return the response from the selected opener.

    url     -- a URL string or a Request object (passed to opener.open()).
    data    -- optional request body, passed through to opener.open().
    timeout -- passed through to opener.open().
    cafile, capath -- keyword-only; when either is given, a one-off opener
        with a certificate-verifying HTTPSHandler is built for this call
        and the module-wide opener is neither used nor modified.

    Without cafile/capath the module-wide opener is used; it is created
    with build_opener() on first use.

    Raises ValueError if cafile/capath is given but the ssl module could
    not be imported.
    """
    global _opener
    if cafile or capath:
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        # This branch is only reached when CA material was supplied, so
        # verification and hostname checking are always switched on.
        context.verify_mode = ssl.CERT_REQUIRED
        context.load_verify_locations(cafile, capath)
        https_handler = HTTPSHandler(context=context, check_hostname=True)
        opener = build_opener(https_handler)
    elif _opener is None:
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000139
def install_opener(opener):
    """Store *opener* in the module-level ``_opener`` used by urlopen()."""
    global _opener
    _opener = opener
143
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Delegate to the retrieve() method of the shared FancyURLopener.

    The module-level ``_urlopener`` is created on first use and reused by
    later calls; all four arguments are forwarded positionally.
    """
    global _urlopener
    retriever = _urlopener
    if not retriever:
        retriever = _urlopener = FancyURLopener()
    return retriever.retrieve(url, filename, reporthook, data)
151
def urlcleanup():
    """Call cleanup() on the shared URL opener and drop the default opener.

    Both steps are skipped when the corresponding module-level object is
    falsy.
    """
    global _opener
    if _urlopener:
        _urlopener.cleanup()
    if _opener:
        _opener = None
157 _opener = None
158
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the network-location part of request.full_url;
    when that is empty, the request's "Host" header is used instead
    (defaulting to the empty string).  A trailing ":port" is stripped.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    host = urlparse(request.full_url).netloc or request.get_header("Host", "")
    # remove port, if present, then normalise case
    return _cut_port_re.sub("", host, 1).lower()
176
class Request:
    """Encapsulate the state of one URL request.

    Attributes set by the constructor: full_url (fragment removed), data,
    headers, unredirected_hdrs, origin_req_host, unverifiable, and the
    parsed pieces type, host and selector.
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        # The '#fragment' part is split off and discarded.
        self.full_url, _fragment = splittag(unwrap(url))
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        for name, value in headers.items():
            self.add_header(name, value)
        self.unredirected_hdrs = {}
        # Derive the origin host from this request when none was supplied.
        self.origin_req_host = (request_host(self)
                                if origin_req_host is None
                                else origin_req_host)
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        """Split full_url into type, host and selector.

        Raises ValueError when the URL has no scheme.
        """
        scheme, remainder = splittype(self.full_url)
        self.type = scheme
        if scheme is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        host, self.selector = splithost(remainder)
        self.host = unquote(host) if host else host

    def get_method(self):
        """Return "POST" when the request carries data, otherwise "GET"."""
        return "GET" if self.data is None else "POST"

    # Begin deprecated methods

    def add_data(self, data):
        """Replace the request data."""
        self.data = data

    def has_data(self):
        """Return True when data is not None."""
        return self.data is not None

    def get_data(self):
        """Return the request data."""
        return self.data

    def get_full_url(self):
        """Return full_url."""
        return self.full_url

    def get_type(self):
        """Return the URL scheme."""
        return self.type

    def get_host(self):
        """Return the host part."""
        return self.host

    def get_selector(self):
        """Return the selector part."""
        return self.selector

    def is_unverifiable(self):
        """Return the unverifiable flag."""
        return self.unverifiable

    def get_origin_req_host(self):
        """Return origin_req_host."""
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route the request through the proxy at *host*.

        For an https request that has no tunnel host yet, the current host
        is remembered in _tunnel_host; otherwise the type becomes *type*
        and the selector becomes the full URL.  In both cases host is
        replaced by *host*.
        """
        needs_tunnel = self.type == 'https' and not self._tunnel_host
        if needs_tunnel:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        """Return True when the selector equals the full URL."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        """Store a header under the capitalize()d form of *key*."""
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        """Store a header that is kept separately from normal headers."""
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if *header_name* is a key of either header table.

        The name is looked up exactly as given (no capitalisation).
        """
        tables = (self.headers, self.unredirected_hdrs)
        return any(header_name in table for table in tables)

    def get_header(self, header_name, default=None):
        """Look *header_name* up in headers, then in unredirected headers.

        Returns *default* when neither table contains the name.
        """
        fallback = self.unredirected_hdrs.get(header_name, default)
        return self.headers.get(header_name, fallback)

    def header_items(self):
        """Return a list of (name, value) pairs from both header tables.

        Normal headers override unredirected ones of the same name.
        """
        merged = dict(self.unredirected_hdrs)
        merged.update(self.headers)
        return list(merged.items())
273
class OpenerDirector:
    """Dispatch requests to registered handlers.

    Handlers are indexed by the names of their methods:
    <protocol>_open, <protocol>_request, <protocol>_response and
    <protocol>_error_<kind>.
    """

    def __init__(self):
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* under every matching method name it exposes.

        Raises TypeError if *handler* has no add_parent attribute.  The
        handler is added to self.handlers, and add_parent(self) is called,
        only when at least one of its methods was registered.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        # condition suffix -> table keyed directly by protocol
        simple_tables = {
            "open": self.handle_open,
            "response": self.process_response,
            "request": self.process_request,
        }

        registered = False
        for attr_name in dir(handler):
            if attr_name in ("redirect_request", "do_open", "proxy_open"):
                # oops, coincidental match
                continue

            split_at = attr_name.find("_")
            protocol = attr_name[:split_at]
            condition = attr_name[split_at + 1:]

            if condition.startswith("error"):
                # Whatever follows "error_" is the kind; numeric kinds
                # (HTTP status codes) are stored as ints.
                suffix = attr_name[condition.find("_") + split_at + 2:]
                try:
                    kind = int(suffix)
                except ValueError:
                    kind = suffix
                table = self.handle_error.setdefault(protocol, {})
            elif condition in simple_tables:
                kind = protocol
                table = simple_tables[condition]
            else:
                continue

            chain = table.setdefault(kind, [])
            if chain:
                bisect.insort(chain, handler)
            else:
                chain.append(handler)
            registered = True

        if registered:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        """Do nothing."""
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        """Return the first non-None result from the handlers for *kind*.

        Returns None when no handler produced a result.
        """
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        for handler in chain.get(kind, ()):
            outcome = getattr(handler, meth_name)(*args)
            if outcome is not None:
                return outcome
        return None

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request) and return the response.

        The request passes through the <protocol>_request processors, then
        _open(), then the <protocol>_response processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        request_hook = protocol + "_request"
        for processor in self.process_request.get(protocol, []):
            req = getattr(processor, request_hook)(req)

        response = self._open(req, data)

        # post-process response
        response_hook = protocol + "_response"
        for processor in self.process_response.get(protocol, []):
            response = getattr(processor, response_hook)(req, response)

        return response

    def _open(self, req, data=None):
        """Try the 'default', protocol-specific and 'unknown' open chains.

        Returns the first truthy result; the result of the 'unknown' chain
        is returned as-is.
        """
        opened = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if opened:
            return opened

        scheme = req.type
        opened = self._call_chain(self.handle_open, scheme,
                                  scheme + '_open', req)
        if opened:
            return opened

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the matching error handlers.

        For 'http' and 'https' the handlers registered for the status code
        found in args[2] are tried first, then the 'default' handlers.
        For other protocols only the '<proto>_error' chain is tried.
        """
        is_http = proto in ('http', 'https')
        if is_http:
            # XXX http[s] protocols are special-cased
            table = self.handle_error['http']  # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
        else:
            table = self.handle_error
            meth_name = proto + '_error'

        outcome = self._call_chain(table, proto, meth_name, *args)
        if outcome:
            return outcome

        if is_http:
            return self._call_chain(table, 'default', 'http_error_default',
                                    *args)
411
412# XXX probably also want an abstract factory that knows when it makes
413# sense to skip a superclass in favor of a subclass and when it might
414# make sense to include both
415
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    Each argument may be a handler instance or a handler class (classes
    are instantiated with no arguments).  A default handler class is left
    out when any argument is an instance or subclass of it.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    def replaces(candidate, default_class):
        # True when the user-supplied handler stands in for default_class.
        if isclass(candidate):
            return issubclass(candidate, default_class)
        return isinstance(candidate, default_class)

    opener = OpenerDirector()
    defaults = [ProxyHandler, UnknownHandler, HTTPHandler,
                HTTPDefaultErrorHandler, HTTPRedirectHandler,
                FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        defaults.append(HTTPSHandler)

    # Every (default, argument) pair is checked before anything is built.
    retained = [klass for klass in defaults
                if not any([replaces(given, klass) for given in handlers])]

    for klass in retained:
        opener.add_handler(klass())

    for given in handlers:
        opener.add_handler(given() if isclass(given) else given)
    return opener
453
class BaseHandler:
    """Base class for handlers; provides ordering and parent registration."""

    # Position used when handlers are sorted; lower values sort first.
    handler_order = 500

    def add_parent(self, parent):
        """Remember the object this handler was added to."""
        self.parent = parent

    def close(self):
        """Do nothing."""
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        """Order handlers by handler_order.

        An *other* without a handler_order attribute always compares as
        greater.
        """
        try:
            other_order = other.handler_order
        except AttributeError:
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other_order
471
472
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return *response* unchanged for 2xx codes.

        Any other status code is passed to the parent's error() method and
        its result is returned instead.
        """
        status, reason, headers = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= status < 300:
            return response
        return self.parent.error(
            'http', request, response, status, reason, headers)

    https_response = http_response
489
class HTTPDefaultErrorHandler(BaseHandler):
    """Turn any HTTP error into an HTTPError exception."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        """Raise HTTPError built from the request URL and the error details."""
        failure = HTTPError(req.full_url, code, msg, hdrs, fp)
        raise failure
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000493
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.

        The new request carries the original headers except
        Content-Length and Content-Type, carries no data, and is marked
        unverifiable.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow the redirect named by the Location (or URI) header.

        Returns None when the response carries neither header or when
        redirect_request() declines.  Raises HTTPError when the target
        uses a scheme other than http, https or ftp, or when the redirect
        limits (max_repeats / max_redirections) are exceeded.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        urlparts = urlparse(newurl)

        # The target comes from the server and is therefore untrusted.
        # Only follow it to http, https or ftp (an empty scheme means a
        # relative URL); this keeps a server from steering the client to
        # e.g. a local file: URL.
        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # fix a possible malformed URL
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
586
587
def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, remainder = splittype(proxy)
    if remainder.startswith("/"):
        # URL
        if not remainder.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
        # and 3.3.), path is empty or starts with '/'.  The authority is
        # whatever sits between the leading "//" and the next "/".
        authority = remainder[2:].split("/", 1)[0]
    else:
        # bare authority: whatever splittype took for a scheme is not one
        scheme = None
        authority = proxy

    userinfo, hostport = splituser(authority)
    if userinfo is None:
        user = password = None
    else:
        user, password = splitpasswd(userinfo)
    return scheme, user, password, hostport
659
class ProxyHandler(BaseHandler):
    """Route requests through the proxies given as {scheme: proxy_url}."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """Install one <scheme>_open method per entry in *proxies*.

        When *proxies* is None the mapping returned by getproxies() is
        used.
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            # Defaults bind the current url/type to each generated method.
            opener = lambda r, proxy=url, type=type, meth=self.proxy_open: \
                meth(r, proxy, type)
            setattr(self, '%s_open' % type, opener)

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*.

        Returns None when the request host is bypassed, or when other
        handlers can carry on with the rewritten request; otherwise the
        request is re-opened through the parent.
        """
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            credentials = '%s:%s' % (unquote(user), unquote(password))
            encoded = base64.b64encode(credentials.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + encoded)

        req.set_proxy(unquote(hostport), proxy_type)

        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None

        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000701
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and URI prefix."""

    def __init__(self):
        # realm -> {tuple of reduced URIs -> (user, passwd)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* under one URI or several.

        Each URI is stored twice: once with the scheme's default port
        filled in and once as given.
        """
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) registered for *authuri* in *realm*.

        Returns (None, None) when nothing matches.
        """
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form, i.e. (authority, path)
        pairs.  The paths are compared on segment boundaries, so a base
        path of "/a" covers "/a" and "/a/b" but not "/ab".
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # A plain character-wise common prefix would treat "/ab" as lying
        # below "/a"; require the match to end at a "/" instead.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
764
765
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to credentials stored for realm None."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first, then the None realm when no user is found."""
        lookup = HTTPPasswordMgr.find_user_password
        user, password = lookup(self, realm, authuri)
        if user is None:
            return lookup(self, None, authuri)
        return user, password
774
775
class AbstractBasicAuthHandler:
    """Shared logic for answering Basic authentication challenges.

    Subclasses provide ``auth_header`` (the request header to set) and a
    ``parent`` with an open() method.
    """

    # Matches one "<scheme> realm=<quoted value>" challenge that starts at
    # the beginning of the header or directly after a comma.  The pattern
    # is anchored this way, instead of skipping an arbitrary "(?:.*,)*"
    # prefix, because that prefix backtracks catastrophically on headers
    # containing many commas.
    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile(r'(?:^|,)[ \t]*([^ \t,]+)[ \t]+'
                    r'realm=(["\'])(.*?)\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        """Use *password_mgr*, or a fresh HTTPPasswordMgr when None."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        """Reset the retry counter to zero."""
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Answer the challenge found in the *authreq* header.

        authreq -- name of the challenge header to read from *headers*.
        host    -- authority or URL used to look up credentials.

        Returns the response of the retried request, or None when the
        header is missing, contains no Basic challenge, or no new
        credentials are available.  Raises HTTPError once the retry
        counter exceeds 5.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # stop resending username:password once the counter exceeds 5
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                            headers, None)
        else:
            self.retried += 1

        if authreq:
            # Scan every challenge in the header and answer the first
            # Basic one.
            for mo in AbstractBasicAuthHandler.rx.finditer(authreq):
                scheme, _quote_char, realm = mo.groups()
                if scheme.lower() == 'basic':
                    response = self.retry_http_basic_auth(host, req, realm)
                    if response and response.code != 401:
                        self.retried = 0
                    return response

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send *req* with Basic credentials for *realm*.

        Returns None when no password is known or when the same
        credentials are already present in req.headers.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None
834
835
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses using the 'www-authenticate' challenge."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry req with Basic credentials, keyed by its full URL."""
        result = self.http_error_auth_reqed('www-authenticate',
                                            req.full_url, req, headers)
        # The attempt finished without raising; clear the retry counter.
        self.reset_retry_count()
        return result
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000846
847
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses using the 'proxy-authenticate' challenge."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry req with Basic proxy credentials, keyed by req.host."""
        # http_error_auth_reqed requires that there is no userinfo
        # component in authority.  Assume there isn't one, since
        # urllib.request does not (and should not, RFC 3986 s. 3.2.1)
        # support requests for URLs containing userinfo.
        result = self.http_error_auth_reqed('proxy-authenticate',
                                            req.host, req, headers)
        self.reset_retry_count()
        return result
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000862
863
def randombytes(n):
    """Return a bytes object of length n obtained from os.urandom()."""
    data = os.urandom(n)
    return data
867
class AbstractDigestAuthHandler:
    """Shared implementation of HTTP Digest authentication.

    Subclasses provide ``auth_header`` (the request header to fill in) and
    ``parent`` (the opener used to re-send the request).
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        """Create the handler; passwd defaults to a new HTTPPasswordMgr."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        # nonce_count / last_nonce track how often the current server nonce
        # has been used, for the "nc" field of qop=auth responses.
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        """Zero the counter of authentication attempts."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry req with Digest credentials if the challenge asks for them.

        auth_header names the challenge header to read from headers.
        Returns the retried response, or None when there is no Digest
        challenge.  Raises HTTPError after too many attempts.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Parse the challenge in auth and re-send req with a response.

        Returns the new response, or None when no authorization could be
        computed or the identical header was already sent.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-hex-digit client nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the value of the Digest authorization header.

        chal is the parsed challenge (a dict).  Returns the header value
        without the leading "Digest ", or None when the challenge lacks
        realm/nonce, names an unsupported algorithm, or no user is known.
        Raises URLError when the server offers a qop without "auth".
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in [option.strip() for option in qop.split(',')]:
            # The challenge may list several options ("auth,auth-int");
            # answer with plain "auth" whenever it is among them.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth',
                                           H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for algorithm.

        H hashes an ASCII string to a hex digest and KD(secret, data) is
        H(secret ":" data).  Returns (None, None) for an algorithm other
        than 'MD5' or 'SHA' so that get_authorization() can decline.
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Placeholder for the entity digest; always returns None."""
        # XXX not implemented yet
        return None
1004
1005
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication handler for 401 responses (RFC 2069).

    Unlike basic authentication, digest authentication does not send
    the password over the wire in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry req against the 'www-authenticate' challenge."""
        # Element 1 of the urlparse() result is the network location.
        netloc = urlparse(req.full_url)[1]
        response = self.http_error_auth_reqed('www-authenticate',
                                              netloc, req, headers)
        self.reset_retry_count()
        return response
1022
1023
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication handler for 407 (proxy) responses."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry req against the 'proxy-authenticate' challenge."""
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              req.host, req, headers)
        self.reset_retry_count()
        return response
1035
class AbstractHTTPHandler(BaseHandler):
    """Common request preparation and sending for HTTP-like handlers."""

    def __init__(self, debuglevel=0):
        """Remember the debug level to apply to connections."""
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Change the debug level used for subsequent connections."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in default headers on request and return it.

        Adds Content-type and Content-length for requests carrying data,
        a Host header, and the opener's default headers, each only when
        the request does not already have that header.

        Raises URLError when the request has no host, TypeError when the
        data is a str, and ValueError when the data is an iterable whose
        length cannot be determined and no Content-length was given.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                raise TypeError("POST data should be bytes"
                                " or an iterable of bytes. It cannot be str.")
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                try:
                    mv = memoryview(data)
                except TypeError:
                    # NOTE(review): collections.Iterable is an alias that
                    # newer Pythons only provide as collections.abc.Iterable
                    # - confirm the supported versions before changing it.
                    if isinstance(data, collections.Iterable):
                        raise ValueError(
                            "Content-Length should be specified "
                            "for iterable data of type %r %r" % (type(data),
                                                                 data))
                else:
                    # Buffer-like data: size in bytes is items * itemsize.
                    request.add_unredirected_header(
                        'Content-length', '%d' % (len(mv) * mv.itemsize))

        sel_host = host
        if request.has_proxy():
            # Through a proxy the selector is a full URL; the Host header
            # must name the host found in that URL, not the proxy.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        Extra keyword arguments are passed to the http_class constructor.
        Raises URLError when the request has no host or a socket error
        occurs; the connection is closed whenever sending fails.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        # Apply the level stored by __init__ / set_http_debuglevel.
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers)
                r = h.getresponse()  # an HTTPResponse instance
            except socket.error as err:
                raise URLError(err)
        except BaseException:
            # No response will be handed back, so nobody else can release
            # the connection; close it here before propagating the error.
            h.close()
            raise

        r.url = req.full_url
        # Replace the .msg attribute of the HTTPResponse (which holds the
        # headers) with the reason phrase, because urllib clients expect
        # the response to have the reason in .msg.  It would be good to
        # mark this attribute as deprecated and get them to use info() or
        # .headers.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001135
1136
class HTTPHandler(AbstractHTTPHandler):
    """Open http: URLs through http.client.HTTPConnection."""

    # Requests are pre-processed by the shared implementation.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Send req over a plain HTTP connection and return the response."""
        connection_class = http.client.HTTPConnection
        return self.do_open(connection_class, req)
1143
if hasattr(http.client, 'HTTPSConnection'):
    import ssl

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https: URLs through http.client.HTTPSConnection."""

        # Requests are pre-processed by the shared implementation.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            """Store the context and check_hostname passed to connections."""
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._check_hostname = check_hostname
            self._context = context

        def https_open(self, req):
            """Send req over an HTTPS connection and return the response."""
            connection_args = {
                'context': self._context,
                'check_hostname': self._check_hostname,
            }
            return self.do_open(http.client.HTTPSConnection, req,
                                **connection_args)
1159
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to requests and collect them from responses."""

    def __init__(self, cookiejar=None):
        """Use cookiejar, or a new http.cookiejar.CookieJar if None."""
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        """Let the jar add its Cookie header, then return request."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar extract cookies from response, then return it."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # HTTPS traffic is processed exactly like HTTP traffic.
    https_request = http_request
    https_response = http_response
1177
class UnknownHandler(BaseHandler):
    """Reject requests whose URL scheme has no handler."""

    def unknown_open(self, req):
        """Always raise URLError naming the request's scheme."""
        scheme = req.type
        raise URLError('unknown url type: %s' % scheme)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001182
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict mapping each key to its value, with one pair of
    surrounding double quotes removed from the value.  An element with an
    empty value ("key=") maps to the empty string.  Raises ValueError for
    an element that contains no '='.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices instead of indexes so an empty value cannot raise
        # IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1192
def parse_http_list(s):
    """Split a comma-separated header value (RFC 2068 Section 2).

    Elements may contain double-quoted strings; a comma inside such a
    string does not end the element.  Inside quotes a backslash escapes
    the following character (the backslash itself is dropped).  Single
    quotes have no special meaning.  Each returned element is stripped of
    surrounding whitespace.
    """
    items = []
    current = []
    escaped = False
    in_quotes = False

    for ch in s:
        if escaped:
            # Character following a backslash: take it literally.
            current.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                current.append(ch)
        elif ch == ',':
            items.append(''.join(current))
            current = []
        else:
            if ch == '"':
                in_quotes = True
            current.append(ch)

    # A trailing element is kept only when it is non-empty.
    if current:
        items.append(''.join(current))

    return [item.strip() for item in items]
1235
class FileHandler(BaseHandler):
    """Open file: URLs that refer to files on the local machine."""

    def file_open(self, req):
        """Open the local file named by req.

        Raises URLError when the selector starts with exactly two slashes
        and the request's host is neither 'localhost' nor one of the
        entries returned by get_names().
        """
        url = req.selector
        names_other_host = (url[:2] == '//' and url[2:3] != '/' and
                            req.host and req.host != 'localhost')
        # Membership test: the host must be one of the known local names.
        if names_other_host and req.host not in self.get_names():
            raise URLError("file:// scheme is supported only on localhost")
        # open_local_file() performs its own check of the host.
        return self.open_local_file(req)

    # names for the localhost
    names = None

    def get_names(self):
        """Return the tuple of addresses regarded as the local host.

        The result is computed once and cached on the FileHandler class.
        """
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the file req.selector points at.

        The response carries Content-type, Content-length and
        Last-modified headers built from the file's stat data.  Raises
        URLError when the file cannot be accessed or when the request's
        host is not the local machine.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            # Serve the file only when there is no host, or the host has
            # no port and resolves to one of the local addresses.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001287
def _safe_gethostbyname(host):
    """Resolve host with socket.gethostbyname(); None on socket.gaierror."""
    try:
        address = socket.gethostbyname(host)
    except socket.gaierror:
        address = None
    return address
1293
class FTPHandler(BaseHandler):
    """Open ftp: URLs, creating a new ftpwrapper for every request."""

    def ftp_open(self, req):
        """Retrieve the file or directory listing named by req.

        Returns an addinfourl whose headers carry Content-type and
        Content-length when they are known.  Raises URLError when no host
        is given, the host cannot be resolved, or the transfer fails.
        """
        import ftplib
        import mimetypes

        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        port = ftplib.FTP_PORT if port is None else int(port)

        # username/password handling
        user, host = splituser(host)
        passwd = None
        if user:
            user, passwd = splitpasswd(user)
        host = unquote(host)
        if not user:
            user = ''
        if not passwd:
            passwd = ''

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)

        path, attrs = splitattr(req.selector)
        segments = [unquote(segment) for segment in path.split('/')]
        dirs = segments[:-1]
        filename = segments[-1]
        # Drop the empty component produced by the leading slash.
        if dirs and not dirs[0]:
            dirs = dirs[1:]

        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Image (binary) transfer for files, directory listing otherwise;
            # a ";type=" attribute on the URL overrides the default.
            transfer_type = 'I' if filename else 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    transfer_type = value.upper()
            fp, retrlen = fw.retrfile(filename, transfer_type)

            header_text = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                header_text += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                header_text += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(header_text)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a new ftpwrapper for the given connection parameters."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout)
1351
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps ftpwrapper connections for reuse."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        """Create empty caches with a 60 second lifetime, 16 entries max."""
        self.cache = {}      # connection key -> ftpwrapper
        self.timeout = {}    # connection key -> expiry time (time.time())
        self.soonest = 0     # earliest expiry time among cached entries
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the lifetime, in seconds, given to a connection on use."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which an entry is dropped."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached ftpwrapper for these parameters, or a new one.

        Using a connection pushes its expiry time self.delay seconds into
        the future.
        """
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
        self.timeout[key] = time.time() + self.delay
        # Keep a reference before pruning: check_cache() may drop this very
        # entry, and looking it up afterwards would raise KeyError.
        connection = self.cache[key]
        self.check_cache()
        return connection

    def check_cache(self):
        """Close expired connections and enforce the size limit."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        self.soonest = self._soonest_expiry()

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = self._soonest_expiry()

    def _soonest_expiry(self):
        """Return the earliest expiry time, or 0 when nothing is cached."""
        # min() of an empty sequence raises ValueError, so handle the
        # empty cache explicitly; 0 is the value __init__ starts with.
        return min(self.timeout.values()) if self.timeout else 0
1398
1399# Code move from the old urllib module
1400
1401MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
1402
1403# Helper for non-unix systems
if os.name != 'nt':
    def url2pathname(pathname):
        """Convert the path part of a 'file' URL to a file system path.

        OS-specific; not recommended for general use.  This version
        simply undoes the percent-encoding.
        """
        return unquote(pathname)

    def pathname2url(pathname):
        """Convert a file system path to the path part of a 'file' URL.

        OS-specific; not recommended for general use.  This version
        simply percent-encodes the path.
        """
        return quote(pathname)
else:
    # Windows paths need drive-letter and separator handling.
    from nturl2path import url2pathname, pathname2url
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001416
1417# This really consists of two pieces:
1418# (1) a class which handles opening of all sorts of URLs
1419# (plus assorted utilities etc.)
1420# (2) a set of functions for parsing URLs
1421# XXX Should these be separated out into different modules?
1422
1423
1424ftpcache = {}
1425class URLopener:
1426 """Class to open URLs.
1427 This is a class rather than just a subroutine because we may need
1428 more than one set of global protocol-specific options.
1429 Note -- this is a base class for those who don't want the
1430 automatic handling of errors type 302 (relocated) and 401
1431 (authorization needed)."""
1432
1433 __tempfiles = None
1434
1435 version = "Python-urllib/%s" % __version__
1436
1437 # Constructor
def __init__(self, proxies=None, **x509):
    """Create an opener.

    proxies maps URL schemes to proxy URLs and defaults to the result
    of getproxies().  The optional key_file and cert_file keyword
    arguments are stored on the instance.
    """
    if proxies is None:
        proxies = getproxies()
    assert hasattr(proxies, 'keys'), "proxies must be a mapping"
    self.proxies = proxies

    self.key_file = x509.get('key_file')
    self.cert_file = x509.get('cert_file')
    self.addheaders = [('User-Agent', self.version)]

    # Temporary files created by retrieve(); removed again by cleanup().
    self.__tempfiles = []
    self.__unlink = os.unlink  # See cleanup()

    # Undocumented feature: if you assign {} to tempcache,
    # it is used to cache files retrieved with
    # self.retrieve().  This is not enabled by default
    # since it does not work for changing documents (and I
    # haven't got the logic to check expiration headers
    # yet).
    self.tempcache = None

    # Undocumented feature: you can use a different
    # ftp cache by assigning to the .ftpcache member;
    # in case you want logically independent URL openers
    # XXX This is not threadsafe.  Bah.
    self.ftpcache = ftpcache
1460
def __del__(self):
    """Release the opener's resources by calling close()."""
    self.close()
1463
def close(self):
    """Close the opener; this simply runs cleanup()."""
    self.cleanup()
1466
def cleanup(self):
    """Delete the recorded temporary files and empty the temp cache."""
    # This code sometimes runs when the rest of this module
    # has already been deleted, so it can't use any globals
    # or import anything.
    pending = self.__tempfiles
    if pending:
        for path in pending:
            try:
                self.__unlink(path)
            except OSError:
                # Best effort: a file that cannot be removed is skipped.
                pass
        del pending[:]
    cache = self.tempcache
    if cache:
        cache.clear()
1480
def addheader(self, *args):
    """Append a header, given as positional arguments, to addheaders.

    Used by the HTTP interface only, e.g.
    u.addheader('Accept', 'sound/basic')
    """
    header = args
    self.addheaders.append(header)
1485
1486 # External interface
def open(self, fullurl, data=None):
    """Use URLopener().open(file) instead of open(file, 'r')."""
    fullurl = quote(unwrap(to_bytes(fullurl)),
                    safe="%/:=&?~#+!$,;'@()*[]|")

    # Serve from the temp cache when the URL was retrieved before.
    if self.tempcache and fullurl in self.tempcache:
        filename, headers = self.tempcache[fullurl]
        return addinfourl(open(filename, 'rb'), headers, fullurl)

    urltype, url = splittype(fullurl)
    if not urltype:
        urltype = 'file'

    proxy = None
    if urltype in self.proxies:
        proxy = self.proxies[urltype]
        urltype, proxyhost = splittype(proxy)
        host, selector = splithost(proxyhost)
        url = (host, fullurl)  # Signal special case to open_*()

    self.type = urltype
    # The handler for a scheme is the method named open_<scheme>.
    name = ('open_' + urltype).replace('-', '_')
    if not hasattr(self, name):
        if proxy:
            return self.open_unknown_proxy(proxy, fullurl, data)
        return self.open_unknown(fullurl, data)

    try:
        opener = getattr(self, name)
        if data is None:
            return opener(url)
        return opener(url, data)
    except socket.error as msg:
        raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1520
def open_unknown(self, fullurl, data=None):
    """Overridable hook for URL types without a handler; raises IOError."""
    scheme, _rest = splittype(fullurl)
    raise IOError('url error', 'unknown url type', scheme)
1525
def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Overridable hook for proxies of an unknown type; raises IOError."""
    scheme, _rest = splittype(fullurl)
    raise IOError('url error', 'invalid proxy for %s' % scheme, proxy)
1530
1531 # External interface
def retrieve(self, url, filename=None, reporthook=None, data=None):
    """retrieve(url) returns (filename, headers) for a local object
    or (tempfilename, headers) for a remote object.

    When filename is given the data is written there; otherwise a
    temporary file is created and remembered for cleanup().  reporthook,
    if given, is called as reporthook(blocknum, blocksize, totalsize)
    once before the first read and once after every block; totalsize is
    -1 when the response has no Content-Length header.

    Raises ContentTooShortError when fewer bytes arrive than the
    Content-Length header announced.
    """
    url = unwrap(to_bytes(url))
    if self.tempcache and url in self.tempcache:
        return self.tempcache[url]
    type, url1 = splittype(url)
    if filename is None and (not type or type == 'file'):
        try:
            fp = self.open_local_file(url1)
            hdrs = fp.info()
            fp.close()
            return url2pathname(splithost(url1)[1]), hdrs
        except IOError:
            # Not usable as a local file; fall back to a normal open().
            pass
    fp = self.open(url, data)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            # Derive the temp file's suffix from the path part of the URL.
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        try:
            result = filename, headers
            if self.tempcache is not None:
                self.tempcache[url] = result
            bs = 1024*8
            size = -1
            read = 0
            blocknum = 0
            # Read the announced size whether or not a reporthook was
            # given, so the short-read check below always applies.
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while 1:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            tfp.close()
    finally:
        fp.close()

    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
1595
1596 # Each method named open_<type> knows how to open that type of URL
1597
def _open_generic_http(self, connection_factory, url, data):
    """Make an HTTP connection using connection_factory and fetch url.

    This is an internal method that should be called from
    open_http() or open_https().

    Arguments:
    - connection_factory should take a host name and return an
      HTTPConnection instance.
    - url is the url to retrieve (a string), or a (proxy host, full url)
      pair when the request is sent through a proxy.
    - data is payload for a POST request or None.

    Returns an addinfourl object for a 2xx response; any other status is
    delegated to self.http_error().  Raises IOError when no host can be
    determined and URLError when the status line cannot be parsed.
    """
    import base64

    def basic_credentials(user_passwd):
        # Encode "user:password" for a Basic auth header; None if absent.
        if not user_passwd:
            return None
        return base64.b64encode(user_passwd.encode()).decode('ascii')

    user_passwd = None
    proxy_passwd = None
    if isinstance(url, str):
        host, selector = splithost(url)
        if host:
            user_passwd, host = splituser(host)
            host = unquote(host)
        realhost = host
    else:
        host, selector = url
        # check whether the proxy contains authorization information
        proxy_passwd, host = splituser(host)
        # now we proceed with the url we want to obtain
        urltype, rest = splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != 'http':
            realhost = None
        else:
            realhost, rest = splithost(rest)
            if realhost:
                user_passwd, realhost = splituser(realhost)
            if user_passwd:
                # Rebuild the selector without the embedded credentials.
                selector = "%s://%s%s" % (urltype, realhost, rest)
            if proxy_bypass(realhost):
                host = realhost

    if not host:
        raise IOError('http error', 'no host given')

    proxy_auth = basic_credentials(proxy_passwd)
    auth = basic_credentials(user_passwd)

    http_conn = connection_factory(host)
    headers = {}
    if proxy_auth:
        headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
    if auth:
        headers["Authorization"] = "Basic %s" % auth
    if realhost:
        headers["Host"] = realhost

    # Add Connection:close as we don't support persistent connections yet.
    # This helps in closing the socket and avoiding ResourceWarning
    headers["Connection"] = "close"

    # Caller-configured headers are applied last, so they override the
    # ones computed above.
    for header, value in self.addheaders:
        headers[header] = value

    if data is not None:
        headers["Content-Type"] = "application/x-www-form-urlencoded"
        http_conn.request("POST", selector, data, headers)
    else:
        http_conn.request("GET", selector, headers=headers)

    try:
        response = http_conn.getresponse()
    except http.client.BadStatusLine:
        # something went wrong with the HTTP status line
        raise URLError("http protocol error: bad status line")

    # According to RFC 2616, "2xx" code indicates that the client's
    # request was successfully received, understood, and accepted.
    if 200 <= response.status < 300:
        return addinfourl(response, response.msg, "http:" + url,
                          response.status)
    return self.http_error(
        url, response.fp,
        response.status, response.reason, response.msg, data)
1690
def open_http(self, url, data=None):
    """Use HTTP protocol.

    Delegates to _open_generic_http() with http.client.HTTPConnection as
    the connection factory and returns whatever that call returns.
    """
    return self._open_generic_http(http.client.HTTPConnection, url, data)
1694
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    """Handle http errors.

    Derived class can override this, or provide specific handlers
    named http_error_DDD where DDD is the 3-digit error code.

    A specific handler is called with data only when data is not None.
    If it returns a true value that value is returned; otherwise (or
    when no specific handler exists) http_error_default() is used.
    """
    # First check if there's a specific handler for this error
    name = 'http_error_%d' % errcode
    if hasattr(self, name):
        handler = getattr(self, name)
        if data is None:
            result = handler(url, fp, errcode, errmsg, headers)
        else:
            result = handler(url, fp, errcode, errmsg, headers, data)
        if result:
            return result
    return self.http_error_default(url, fp, errcode, errmsg, headers)
1710
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handler: close the connection and raise HTTPError."""
    # Drain the response body before closing; the content is discarded.
    fp.read()
    fp.close()
    raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001716
if _have_ssl:
    def _https_connection(self, host):
        """Return an HTTPSConnection to host using self.key_file/cert_file."""
        return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

    def open_https(self, url, data=None):
        """Use HTTPS protocol."""
        return self._open_generic_http(self._https_connection, url, data)
1726
def open_file(self, url):
    """Open a file: URL, which must refer to the local host.

    Raises URLError when url is not a string (i.e. a proxy pair) and
    ValueError when the URL names a host other than localhost; otherwise
    returns the result of open_local_file().
    """
    if not isinstance(url, str):
        raise URLError('file error', 'proxy support for file protocol currently not implemented')
    # "//host/..." with a non-empty host that is not "localhost" is remote.
    names_remote_host = (url[:2] == '//'
                         and url[2:3] != '/'
                         and url[2:12].lower() != 'localhost/')
    if names_remote_host:
        raise ValueError("file:// scheme is supported only on localhost")
    return self.open_local_file(url)
1735
def open_local_file(self, url):
    """Use local file.

    Returns an addinfourl wrapping the file opened in binary mode, with
    synthesized Content-Type, Content-Length and Last-modified headers.
    Raises URLError when the file cannot be stat'ed or when the URL names
    a host (or port) that is not this machine.
    """
    import email.utils
    import mimetypes

    host, file = splithost(url)
    localname = url2pathname(file)
    try:
        stats = os.stat(localname)
    except OSError as e:
        # URLError is called as (reason, filename) like every other call
        # in this file; the original passed three positional arguments.
        raise URLError(e.strerror, e.filename)
    size = stats.st_size
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    mtype = mimetypes.guess_type(url)[0]
    headers = email.message_from_string(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
        (mtype or 'text/plain', size, modified))

    # URL reported on the returned object: absolute paths get "file://".
    urlfile = file
    if file[:1] == '/':
        urlfile = 'file://' + file

    if not host:
        return addinfourl(open(localname, 'rb'), headers, urlfile)
    host, port = splitport(host)
    # localhost() returns a single address string while thishost()
    # returns a tuple, so wrap the former before concatenating.
    if (not port
            and socket.gethostbyname(host) in ((localhost(),) + thishost())):
        return addinfourl(open(localname, 'rb'), headers, urlfile)
    raise URLError('local file error', 'not on local host')
1765
def open_ftp(self, url):
    """Use FTP protocol.

    Connections are cached in self.ftpcache keyed by
    (user, host, port, directory).  Returns an addinfourl whose headers
    carry Content-Type and Content-Length when they can be determined.
    Raises URLError for proxy pairs, a missing host, or any FTP error.
    """
    if not isinstance(url, str):
        raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
    import mimetypes

    host, path = splithost(url)
    if not host:
        raise URLError('ftp error', 'no host given')
    host, port = splitport(host)
    user, host = splituser(host)
    if user:
        user, passwd = splitpasswd(user)
    else:
        passwd = None
    host = unquote(host)
    user = unquote(user or '')
    passwd = unquote(passwd or '')
    host = socket.gethostbyname(host)
    if not port:
        import ftplib
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    path, attrs = splitattr(path)
    path = unquote(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    if dirs and not dirs[0]:
        dirs = dirs[1:]
    if dirs and not dirs[0]:
        dirs[0] = '/'
    key = user, host, port, '/'.join(dirs)
    # XXX thread unsafe!
    if len(self.ftpcache) > MAXFTPCACHE:
        # Prune the cache, rather arbitrarily.  Iterate over a snapshot of
        # the keys: deleting from a dict while iterating its live key view
        # raises RuntimeError.
        for stale_key in list(self.ftpcache):
            if stale_key != key:
                self.ftpcache.pop(stale_key).close()
    try:
        if key not in self.ftpcache:
            self.ftpcache[key] = \
                ftpwrapper(user, passwd, host, port, dirs)
        # No file component means a directory listing; otherwise binary.
        transfer_type = 'I' if file else 'D'
        for attr in attrs:
            attr, value = splitvalue(attr)
            if attr.lower() == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
                transfer_type = value.upper()
        (fp, retrlen) = self.ftpcache[key].retrfile(file, transfer_type)
        mtype = mimetypes.guess_type("ftp:" + url)[0]
        headers = ""
        if mtype:
            headers += "Content-Type: %s\n" % mtype
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = email.message_from_string(headers)
        return addinfourl(fp, headers, "ftp:" + url)
    except ftperrors() as msg:
        raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1824
def open_data(self, url, data=None):
    """Use "data" URL.

    The data argument (POSTed data) is ignored.  Returns an addinfourl
    over a text stream holding synthesized Date, Content-type and
    Content-Length headers followed by the decoded payload.  Raises
    URLError for proxy pairs and IOError when the URL has no comma.
    """
    if not isinstance(url, str):
        raise URLError('data error', 'proxy support for data protocol currently not implemented')
    # ignore POSTed data
    #
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    try:
        [mediatype, data] = url.split(',', 1)
    except ValueError:
        raise IOError('data error', 'bad data URL')
    if not mediatype:
        mediatype = 'text/plain;charset=US-ASCII'
    # A trailing ";token" without "=" is the encoding, not a parameter.
    semi = mediatype.rfind(';')
    if semi >= 0 and '=' not in mediatype[semi:]:
        encoding = mediatype[semi+1:]
        mediatype = mediatype[:semi]
    else:
        encoding = ''
    msg = []
    msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                        time.gmtime(time.time())))
    msg.append('Content-type: %s' % mediatype)
    if encoding == 'base64':
        import base64
        # XXX is this encoding/decoding ok?
        data = base64.decodebytes(data.encode('ascii')).decode('latin1')
    else:
        data = unquote(data)
    msg.append('Content-Length: %d' % len(data))
    msg.append('')
    msg.append(data)
    msg = '\n'.join(msg)
    headers = email.message_from_string(msg)
    f = io.StringIO(msg)
    return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001866
1867
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # "realm@host" -> (user, passwd), filled by get_user_passwd().
        self.auth_cache = {}
        # Consecutive redirects followed so far, and the limit at which
        # http_error_302() gives up.
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            # Too many redirects in a row: report a synthetic 500 instead.
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Follow the Location (or URI) header; return None if neither is set.

        The redirected request is opened without data.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        # Drain and close the old response; its body is discarded.
        fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def _basic_auth_error(self, challenge_header, retry_prefix, url, fp,
                          errcode, errmsg, headers, data, retry):
        # Shared body of http_error_401/407: validate the Basic challenge
        # in challenge_header, then call the retry_* method selected by
        # retry_prefix and self.type.
        import re

        def give_up():
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)

        if challenge_header not in headers:
            give_up()
        stuff = headers[challenge_header]
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            give_up()
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            give_up()
        # With the default retry=False the error is reported rather than
        # retried with credentials.
        if not retry:
            give_up()
        name = retry_prefix + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
                       retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        return self._basic_auth_error('www-authenticate', 'retry_',
                                      url, fp, errcode, errmsg, headers,
                                      data, retry)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
                       retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        return self._basic_auth_error('proxy-authenticate', 'retry_proxy_',
                                      url, fp, errcode, errmsg, headers,
                                      data, retry)

    def _retry_proxy_basic_auth(self, scheme, url, realm, data):
        # Store credentials in self.proxies[scheme], then reopen url.
        host, selector = splithost(url)
        newurl = scheme + '://' + host + selector
        proxy = self.proxies[scheme]
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # i is non-zero when the proxy URL already carried credentials;
        # it doubles as the clear_cache flag so stale ones are dropped.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd):
            return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies[scheme] = scheme + '://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def _retry_basic_auth(self, scheme, url, realm, data):
        # Reopen url with user:passwd@ embedded in the host part.
        host, selector = splithost(url)
        # i is non-zero when url already carried credentials; it doubles
        # as the clear_cache flag so stale ones are dropped.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd):
            return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = scheme + '://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Retry an http request after adding credentials to the http proxy."""
        return self._retry_proxy_basic_auth('http', url, realm, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Retry an https request after adding credentials to the https proxy."""
        return self._retry_proxy_basic_auth('https', url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry an http request with credentials embedded in the URL."""
        return self._retry_basic_auth('http', url, realm, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry an https request with credentials embedded in the URL."""
        return self._retry_basic_auth('https', url, realm, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for realm at host, prompting if not cached.

        A true clear_cache discards any cached entry first.  Only a
        non-empty answer is stored in the cache.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd:
            self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2063
2064
2065# Utility functions
2066
# Cached result of localhost(); None until the first call.
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is done once and the result is cached in _localhost.
    """
    global _localhost
    if _localhost is None:
        _localhost = socket.gethostbyname('localhost')
    return _localhost
2074
# Cached result of thishost(); None until the first call.
_thishost = None
def thishost():
    """Return the IP addresses of the current host as a tuple of strings.

    The lookup is done once and the result is cached in _thishost.
    """
    global _thishost
    if _thishost is None:
        # gethostbyname_ex() returns (hostname, aliases, addresses); the
        # [2] must index that result, not the host name string.
        try:
            addresses = socket.gethostbyname_ex(socket.gethostname())[2]
        except socket.gaierror:
            # The machine's own name does not resolve; fall back to the
            # addresses of 'localhost'.
            addresses = socket.gethostbyname_ex('localhost')[2]
        _thishost = tuple(addresses)
    return _thishost
2082
# Cached result of ftperrors(); None until the first call.
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported lazily on the first call and ftplib.all_errors is
    cached in _ftperrors.
    """
    global _ftperrors
    if _ftperrors is None:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors
2091
# Cached result of noheaders(); None until the first call.
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    The same object is returned on every call.
    """
    global _noheaders
    if _noheaders is None:
        _noheaders = email.message_from_string("")
    return _noheaders
2099
2100
2101# Utility classes
2102
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        """Connect, log in and change into each directory of self.dirs."""
        import ftplib
        # busy is 1 while a transfer started by retrfile() is outstanding.
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for directory in self.dirs:
            self.ftp.cwd(directory)

    def retrfile(self, file, type):
        """Start retrieving file and return (file object, length).

        type 'd'/'D' requests a directory listing; otherwise it is sent
        to the server as the TYPE argument.  When RETR fails with a 550
        reply the name is listed as a directory instead.  The length is
        whatever ntransfercmd() reported.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'):
            cmd = 'TYPE A'
            isdir = 1
        else:
            cmd = 'TYPE ' + type
            isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The command failed; reconnect once and try again.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # A 550 reply falls through to the listing attempt below.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    # Always return to the original working directory.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        # endtransfer() is registered as the close hook of the file object.
        ftpobj = addclosehook(conn.makefile('rb'), self.endtransfer)
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        """Finish an outstanding transfer, ignoring FTP errors."""
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        """Finish any transfer and close the connection, ignoring FTP errors."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2183
2184# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    Variable names are matched case-insensitively and variables with an
    empty value are ignored.
    """
    suffix = '_proxy'
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if value and name.endswith(suffix):
            proxies[name[:-len(suffix)]] = value
    return proxies
2200
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    Whitespace around each suffix is ignored.  Returns 1 to bypass the
    proxy and 0 otherwise.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        # "a.com, b.com" is a common way to write the list; without
        # stripping, " b.com" would never match.
        name = name.strip()
        if name and (hostonly.endswith(name) or host.endswith(name)):
            return 1
    # otherwise, don't bypass
    return 0
2219
2220
Ronald Oussorene72e1612011-03-14 18:15:25 -04002221# This code tests an OSX specific data structure but is testable on all
2222# platforms
def _proxy_bypass_macosx_sysconf(host, proxy_settings):
    """
    Return True iff this host shouldn't be accessed using a proxy

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.

    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
    { 'exclude_simple': bool,
      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
    }

    Exceptions that start with digits are treated as IPv4 networks and
    compared against the resolved address of host; all others are
    matched against host as fnmatch patterns.
    """
    import re
    import socket
    from fnmatch import fnmatch

    hostonly, port = splitport(host)

    def ip2num(ipAddr):
        # Pack a dotted address into an int, padding missing components
        # with zeros and ignoring any beyond the fourth.
        parts = ipAddr.split('.')
        parts = list(map(int, parts))
        if len(parts) != 4:
            parts = (parts + [0, 0, 0, 0])[:4]
        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

    # Check for simple host names:
    if '.' not in host:
        if proxy_settings['exclude_simple']:
            return True

    hostIP = None

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value:
            continue

        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
        if m is not None:
            if hostIP is None:
                try:
                    hostIP = socket.gethostbyname(hostonly)
                    hostIP = ip2num(hostIP)
                except socket.error:
                    continue

            base = ip2num(m.group(1))
            prefix = m.group(2)
            if prefix is None:
                # No explicit prefix: 8 bits per address component given.
                prefix = 8 * (m.group(1).count('.') + 1)
            else:
                prefix = int(prefix[1:])
            if prefix > 32:
                # A prefix longer than an IPv4 address would produce a
                # negative shift count below; skip the invalid entry.
                continue
            shift = 32 - prefix

            if (hostIP >> shift) == (base >> shift):
                return True

        elif fnmatch(host, value):
            return True

    return False
2283
2284
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002285if sys.platform == 'darwin':
Ronald Oussoren84151202010-04-18 20:46:11 +00002286 from _scproxy import _get_proxy_settings, _get_proxies
2287
def proxy_bypass_macosx_sysconf(host):
    """Return True iff host should bypass the proxy.

    Fetches the current settings with _get_proxy_settings() and applies
    _proxy_bypass_macosx_sysconf() to them.
    """
    proxy_settings = _get_proxy_settings()
    return _proxy_bypass_macosx_sysconf(host, proxy_settings)
Ronald Oussoren84151202010-04-18 20:46:11 +00002291
def getproxies_macosx_sysconf():
    """Return a dictionary of scheme -> proxy server URL mappings.

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.
    """
    return _get_proxies()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002299
Ronald Oussoren84151202010-04-18 20:46:11 +00002300
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002301
def proxy_bypass(host):
    """Return a true value if host should be reached without a proxy.

    Proxy settings in the environment take precedence: when any are
    present only the environment's no_proxy list is consulted, otherwise
    the system configuration is used.
    """
    if getproxies_environment():
        return proxy_bypass_environment(host)
    else:
        return proxy_bypass_macosx_sysconf(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002307
def getproxies():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Returns settings gathered from the environment, if any, otherwise
    those from the system configuration.
    """
    return getproxies_environment() or getproxies_macosx_sysconf()
2310
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002311
2312elif os.name == 'nt':
def getproxies_registry():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Win32 uses the registry to store proxies.

    Returns an empty dictionary when winreg is unavailable, the proxy is
    disabled, or the registry values cannot be read or parsed.
    """
    proxies = {}
    try:
        import winreg
    except ImportError:
        # Std module, so should be around - but you never know!
        return proxies
    import re
    try:
        internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
        try:
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
        finally:
            # Release the key even when a query or the parsing fails.
            internetSettings.Close()
    except (OSError, ValueError, TypeError):
        # Either registry key not found etc, or the value in an
        # unexpected format.
        # Entries parsed before the failure are kept.
        pass
    return proxies
2358
def getproxies():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Returns settings gathered from the environment, if specified,
    or the registry.
    """
    return getproxies_environment() or getproxies_registry()
2367
def proxy_bypass_registry(host):
    """Return 1 if *host* matches the registry proxy override list, else 0.

    Reads 'ProxyEnable' and 'ProxyOverride' from the current user's
    "Internet Settings" registry key.  'ProxyOverride' is split on ';'
    and every entry is treated as a glob ('*' matches any run of
    characters, '?' a single character) that is compared, ignoring
    case, against the host name with its port removed, the address that
    name resolves to, and its fully qualified name.  The special entry
    '<local>' matches any host name that contains no dot.

    Returns 0 when winreg cannot be imported, when the registry values
    cannot be read, or when the proxy is disabled or the override list
    is empty.

    """
    try:
        import winreg
        import re
    except ImportError:
        # Std modules, so should be around - but you never know!
        return 0
    try:
        # The key object is a context manager, so the handle is released
        # even if one of the queries below raises.
        with winreg.OpenKey(
                winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings'
                ) as internetSettings:
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            # Returned as Unicode but problems if not converted to ASCII
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
    except WindowsError:
        return 0
    if not proxyEnable or not proxyOverride:
        return 0
    # try to make a host list from name and IP address.
    rawHost, port = splitport(host)
    candidates = [rawHost]
    try:
        addr = socket.gethostbyname(rawHost)
        if addr != rawHost:
            candidates.append(addr)
    except socket.error:
        pass
    try:
        fqdn = socket.getfqdn(rawHost)
        if fqdn != rawHost:
            candidates.append(fqdn)
    except socket.error:
        pass
    # now check if we match one of the registry values.
    for test in proxyOverride.split(';'):
        test = test.strip()
        if not test:
            # An empty entry (e.g. produced by a trailing ';') would
            # become an empty pattern, which matches every host.
            continue
        if test == '<local>':
            if '.' not in rawHost:
                return 1
            # '<local>' is a keyword, not a host pattern.
            continue
        # Escape everything first so that characters such as '+', '['
        # or '(' are taken literally, then re-enable the two glob
        # wildcards.
        pattern = re.escape(test)
        pattern = pattern.replace(r'\*', '.*')   # change glob sequence
        pattern = pattern.replace(r'\?', '.')    # change glob char
        for val in candidates:
            # re.match anchors only at the start, so an entry also
            # matches hosts that merely begin with it (kept as before).
            if re.match(pattern, val, re.I):
                return 1
    return 0
2419
def proxy_bypass(host):
    """Return a true value if *host* should bypass the proxy.

    When getproxies_environment() reports any proxy settings the
    decision is delegated to proxy_bypass_environment(); otherwise it
    is taken from the registry via proxy_bypass_registry().

    """
    if getproxies_environment():
        return proxy_bypass_environment(host)
    return proxy_bypass_registry(host)
2431
2432else:
2433 # By default use environment variables
2434 getproxies = getproxies_environment
2435 proxy_bypass = proxy_bypass_environment