blob: fe66a678eb455d9b10bdab7eab8a1214467e1462 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib. pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back. One difference is that you can also pass
20a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers. Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
Senthil Kumaran1107c5d2009-11-15 06:20:55 +000033
Senthil Kumaran47fff872009-12-20 07:10:31 +000034OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
Jeremy Hylton1afc1692008-06-18 20:49:58 +000036
37Request -- An object that encapsulates the state of a request. The
38state can be as simple as the URL. It can also include extra HTTP
39headers, e.g. a User-Agent.
40
41BaseHandler --
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
Georg Brandl029986a2008-06-23 11:44:14 +000049import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000050
51# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000052authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000053authinfo.add_password(realm='PDQ Application',
54 uri='https://mahler:8092/site-updates.py',
55 user='klem',
56 passwd='geheim$parole')
57
Georg Brandl029986a2008-06-23 11:44:14 +000058proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000059
60# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000061opener = urllib.request.build_opener(proxy_support, authinfo,
62 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000063
64# install it
Georg Brandl029986a2008-06-23 11:44:14 +000065urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066
Georg Brandl029986a2008-06-23 11:44:14 +000067f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000068"""
69
70# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails for some reason, how should the error be signalled?  The
# client needs to know the HTTP error code.  But if the handler knows
# what the problem was, e.g., that it didn't recognize the hash
# algorithm requested in the challenge, it would be good to pass that
# information along to the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000085import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000086import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import random
93import re
94import socket
95import sys
96import time
Jeremy Hylton1afc1692008-06-18 20:49:58 +000097
Georg Brandl13e89462008-07-01 19:56:00 +000098from urllib.error import URLError, HTTPError, ContentTooShortError
99from urllib.parse import (
100 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
101 splittype, splithost, splitport, splituser, splitpasswd,
Senthil Kumarand95cc752010-08-08 11:27:53 +0000102 splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000103from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000104
105# check for SSL
106try:
107 import ssl
Senthil Kumaranc2958622010-11-22 04:48:26 +0000108except ImportError:
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000109 _have_ssl = False
110else:
111 _have_ssl = True
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000112
# Used in the User-Agent header that OpenerDirector sends.  Built from
# sys.version_info rather than by slicing sys.version, because a three
# character slice truncates two-digit minor versions ("3.10" -> "3.1").
__version__ = '%d.%d' % sys.version_info[:2]
115
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None):
    """Open *url* (a string or a Request object) and return the response.

    *data* and *timeout* are passed through unchanged to the opener's
    open() method.

    When *cafile* or *capath* is given, a one-off opener is built around an
    HTTPSHandler whose SSL context requires a certificate verified against
    those locations and checks the host name.  Otherwise the module-wide
    default opener is used; it is created on first use.

    Raises ValueError if CA locations are supplied but the ssl module
    could not be imported.
    """
    global _opener
    if cafile or capath:
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        # This branch is only entered when CA locations were supplied, so
        # verification is always required here.
        context.verify_mode = ssl.CERT_REQUIRED
        context.load_verify_locations(cafile, capath)
        https_handler = HTTPSHandler(context=context, check_hostname=True)
        opener = build_opener(https_handler)
    elif _opener is None:
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000138
def install_opener(opener):
    """Make *opener* the module-wide default opener used by urlopen()."""
    global _opener
    _opener = opener
142
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* through the shared legacy FancyURLopener.

    The opener is created lazily on the first call and reused afterwards;
    all arguments are forwarded to its retrieve() method and its result is
    returned unchanged.
    """
    global _urlopener
    retriever = _urlopener
    if not retriever:
        retriever = _urlopener = FancyURLopener()
    return retriever.retrieve(url, filename, reporthook, data)
150
def urlcleanup():
    """Clean up the legacy URL opener and forget the default opener.

    Calls cleanup() on the opener created by urlretrieve(), if there is
    one, and resets the module-wide default opener to None.
    """
    global _opener
    legacy = _urlopener
    if legacy:
        legacy.cleanup()
    if _opener:
        _opener = None
157
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return the request-host of *request*, as defined by RFC 2965.

    The host is taken from the network location of the request's full URL,
    falling back to its "Host" header when the URL has none.  A trailing
    ":port" is stripped.

    Variation from the RFC: the result is lowercased for convenient
    comparison.
    """
    netloc = urlparse(request.full_url)[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # drop a trailing port, if present
    return _cut_port_re.sub("", netloc, 1).lower()
175
class Request:
    """Encapsulate the state of a single URL request.

    Attributes set by the constructor:

    full_url         -- the unwrapped URL with any '#fragment' removed
    type             -- the URL scheme
    host, selector   -- authority and remainder of the URL; both are
                        rewritten by set_proxy()
    data             -- request body, or None
    headers          -- headers that are kept when a request is redirected
    unredirected_hdrs-- headers that are not copied to a redirected request
    origin_req_host  -- request-host of the originating transaction
    unverifiable     -- whether the request is unverifiable (RFC 2965)
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False):
        """Initialise the request from *url*.

        *headers* is an optional mapping of header names to values; each
        entry is stored through add_header().  When *origin_req_host* is
        None it is derived from the URL with request_host().

        Raises ValueError if the URL has no scheme.
        """
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        # The fragment is not part of what gets requested; discard it.
        self.full_url, _fragment = splittag(self.full_url)
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        # None (the default) means "no extra headers"; a shared mutable
        # default is deliberately avoided.
        if headers is not None:
            for key, value in headers.items():
                self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        """Split full_url into type, host and selector."""
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return "POST" when the request carries data, else "GET"."""
        if self.data is not None:
            return "POST"
        else:
            return "GET"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route the request through the proxy at *host*.

        For an https request that has no tunnel host yet, the original host
        is remembered in _tunnel_host and the scheme and selector are left
        alone.  Otherwise the scheme becomes *type* and the selector becomes
        the full URL.  In both cases host is replaced by *host*.
        """
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        """Return True if the selector is the full URL (see set_proxy)."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        """Store a header; the name is normalised with str.capitalize()."""
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        """Store a header that will not be added to a redirected request."""
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if *header_name* is set in either header table.

        The name is looked up as given, so it must already be in the
        capitalised form used by add_header().
        """
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Return a header value, preferring the regular headers."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return all headers as a list of (name, value) pairs.

        Regular headers override unredirected ones of the same name.
        """
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
272
class OpenerDirector:
    """Dispatch requests to the handlers registered with add_handler().

    Handlers are indexed by the names of their methods:
    <protocol>_open, <protocol>_request, <protocol>_response and
    <protocol>_error[_<kind>].
    """

    def __init__(self):
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # every registered handler, kept sorted
        self.handlers = []
        # per-purpose lookup tables filled in by add_handler()
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* under every protocol method it defines.

        Raises TypeError if the object has no add_parent attribute.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        # conditions whose "kind" is simply the protocol name
        simple_tables = {
            "open": self.handle_open,
            "response": self.process_response,
            "request": self.process_request,
        }

        registered = False
        for name in dir(handler):
            if name in ("redirect_request", "do_open", "proxy_open"):
                # oops, coincidental match
                continue

            sep = name.find("_")
            protocol = name[:sep]
            condition = name[sep + 1:]

            if condition.startswith("error"):
                # <protocol>_error_<kind>: kind is an int when it parses
                # as one, otherwise the raw string
                kind_sep = condition.find("_") + sep + 1
                kind = name[kind_sep + 1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                table = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = table
            elif condition in simple_tables:
                kind = protocol
                table = simple_tables[condition]
            else:
                continue

            bucket = table.setdefault(kind, [])
            if bucket:
                bisect.insort(bucket, handler)
            else:
                bucket.append(handler)
            registered = True

        if registered:
            # the handlers must work in a specific order; the order
            # comes from how the handler objects compare
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call *meth_name* on each handler for *kind* until one answers.

        Handlers raise an exception if no one else should try to handle
        the request, or return None if they can't but another handler
        could.  Otherwise, they return the response.
        """
        for handler in chain.get(kind, ()):
            result = getattr(handler, meth_name)(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request) and return the response.

        The request is passed through the <protocol>_request processors,
        opened, and the response passed through the <protocol>_response
        processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        request_hook = protocol + "_request"
        for processor in self.process_request.get(protocol, []):
            req = getattr(processor, request_hook)(req)

        response = self._open(req, data)

        # post-process response
        response_hook = protocol + "_response"
        for processor in self.process_response.get(protocol, []):
            response = getattr(processor, response_hook)(req, response)

        return response

    def _open(self, req, data=None):
        """Try default_open, then <protocol>_open, then unknown_open."""
        opened = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if opened:
            return opened

        protocol = req.type
        opened = self._call_chain(self.handle_open, protocol,
                                  protocol + '_open', req)
        if opened:
            return opened

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the handlers registered for *proto*.

        For http and https the handlers are looked up by the status code
        found in args[2], falling back to http_error_default.
        """
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            # https is not different than http
            table = self.handle_error['http']
            code = args[2]  # YUCK!
            result = self._call_chain(table, code,
                                      'http_error_%s' % code, *args)
            if result:
                return result
            return self._call_chain(table, 'default',
                                    'http_error_default', *args)

        result = self._call_chain(self.handle_error, proto,
                                  proto + '_error', *args)
        if result:
            return result
411
412# XXX probably also want an abstract factory that knows when it makes
413# sense to skip a superclass in favor of a subclass and when it might
414# make sense to include both
415
def build_opener(*handlers):
    """Return an OpenerDirector populated from *handlers*.

    Each argument may be a handler instance or a handler class (which is
    instantiated without arguments).  The default handlers for HTTP, FTP
    and, when applicable, HTTPS are installed as well, except for any
    default whose class is matched by a supplied handler: a supplied
    subclass, or an instance of the default class, replaces it.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    director = OpenerDirector()
    defaults = [ProxyHandler, UnknownHandler, HTTPHandler,
                HTTPDefaultErrorHandler, HTTPRedirectHandler,
                FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        defaults.append(HTTPSHandler)

    # work out which defaults are superseded by the caller's handlers
    overridden = set()
    for default in defaults:
        for supplied in handlers:
            if isclass(supplied):
                matches = issubclass(supplied, default)
            else:
                matches = isinstance(supplied, default)
            if matches:
                overridden.add(default)

    # surviving defaults first, in their original order
    for default in defaults:
        if default not in overridden:
            director.add_handler(default())

    # then the caller's handlers, instantiating classes on the way
    for supplied in handlers:
        if isclass(supplied):
            supplied = supplied()
        director.add_handler(supplied)
    return director
453
class BaseHandler:
    """Base class for handlers registered with an OpenerDirector."""

    # sort key; handlers compare by this value (see __lt__)
    handler_order = 500

    def add_parent(self, parent):
        """Remember the object this handler was registered with."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        """Order handlers by handler_order."""
        try:
            other_order = other.handler_order
        except AttributeError:
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other_order
471
472
class HTTPErrorProcessor(BaseHandler):
    """Route non-2xx HTTP responses to the parent's error handlers."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code = response.code
        msg = response.msg
        hdrs = response.info()

        # According to RFC 2616, a "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= code < 300:
            return response

        return self.parent.error('http', request, response, code, msg, hdrs)

    https_response = http_response
489
class HTTPDefaultErrorHandler(BaseHandler):
    """Fallback error handler: turn the HTTP error into an exception."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        # Raised against the request's full URL; fp is handed to the
        # exception.
        url = req.full_url
        raise HTTPError(url, code, msg, hdrs, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000493
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # The new request carries no body, so the headers that describe
        # the old body are dropped.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow the redirect named by the Location (or URI) header.

        Returns the response of the redirected request, or None when the
        response names no target or redirect_request() declines.

        Raises HTTPError when the target uses a scheme other than http,
        https or ftp, or when a redirect loop is detected.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        urlparts = urlparse(newurl)

        # The redirect target is chosen by the server.  For security
        # reasons only follow it to http, https or ftp; an empty scheme is
        # a relative reference that is resolved against the request URL.
        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # fix a possible malformed URL: an absolute target with no path
        # gets "/".  Relative references (no netloc) are left untouched so
        # that e.g. "?page=2" still resolves against the current path.
        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
586
587
588def _parse_proxy(proxy):
589 """Return (scheme, user, password, host/port) given a URL or an authority.
590
591 If a URL is supplied, it must have an authority (host:port) component.
592 According to RFC 3986, having an authority component means the URL must
593 have two slashes after the scheme:
594
595 >>> _parse_proxy('file:/ftp.example.com/')
596 Traceback (most recent call last):
597 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
598
599 The first three items of the returned tuple may be None.
600
601 Examples of authority parsing:
602
603 >>> _parse_proxy('proxy.example.com')
604 (None, None, None, 'proxy.example.com')
605 >>> _parse_proxy('proxy.example.com:3128')
606 (None, None, None, 'proxy.example.com:3128')
607
608 The authority component may optionally include userinfo (assumed to be
609 username:password):
610
611 >>> _parse_proxy('joe:password@proxy.example.com')
612 (None, 'joe', 'password', 'proxy.example.com')
613 >>> _parse_proxy('joe:password@proxy.example.com:3128')
614 (None, 'joe', 'password', 'proxy.example.com:3128')
615
616 Same examples, but with URLs instead:
617
618 >>> _parse_proxy('http://proxy.example.com/')
619 ('http', None, None, 'proxy.example.com')
620 >>> _parse_proxy('http://proxy.example.com:3128/')
621 ('http', None, None, 'proxy.example.com:3128')
622 >>> _parse_proxy('http://joe:password@proxy.example.com/')
623 ('http', 'joe', 'password', 'proxy.example.com')
624 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
625 ('http', 'joe', 'password', 'proxy.example.com:3128')
626
627 Everything after the authority is ignored:
628
629 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
630 ('ftp', 'joe', 'password', 'proxy.example.com')
631
632 Test for no trailing '/' case:
633
634 >>> _parse_proxy('http://joe:password@proxy.example.com')
635 ('http', 'joe', 'password', 'proxy.example.com')
636
637 """
Georg Brandl13e89462008-07-01 19:56:00 +0000638 scheme, r_scheme = splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000639 if not r_scheme.startswith("/"):
640 # authority
641 scheme = None
642 authority = proxy
643 else:
644 # URL
645 if not r_scheme.startswith("//"):
646 raise ValueError("proxy URL with no authority: %r" % proxy)
647 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
648 # and 3.3.), path is empty or starts with '/'
649 end = r_scheme.find("/", 2)
650 if end == -1:
651 end = None
652 authority = r_scheme[2:end]
Georg Brandl13e89462008-07-01 19:56:00 +0000653 userinfo, hostport = splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000654 if userinfo is not None:
Georg Brandl13e89462008-07-01 19:56:00 +0000655 user, password = splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000656 else:
657 user = password = None
658 return scheme, user, password, hostport
659
class ProxyHandler(BaseHandler):
    """Send requests through the proxies configured per URL scheme."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """Install one <scheme>_open method per entry of *proxies*.

        *proxies* maps a URL scheme to a proxy URL; when it is None the
        mapping returned by getproxies() is used.
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, proxy_url in proxies.items():
            # bind the current values as defaults so that each generated
            # method keeps its own proxy and scheme
            opener = (lambda r, proxy=proxy_url, type=scheme,
                      meth=self.proxy_open: meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, opener)

    def proxy_open(self, req, proxy, type):
        """Redirect *req* to *proxy*.

        Returns None when the host is bypassed or when the remaining
        handlers can deal with the rewritten request; otherwise reopens
        the request through the parent and returns its response.
        """
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            credentials = '%s:%s' % (unquote(user), unquote(password))
            encoded = base64.b64encode(credentials.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + encoded)
        req.set_proxy(unquote(hostport), proxy_type)

        must_restart = orig_type != proxy_type and orig_type != 'https'
        if not must_restart:
            # let other handlers take care of it
            return None

        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req, timeout=req.timeout)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000701
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and URI prefix."""

    def __init__(self):
        # realm -> {tuple of reduced URIs: (user, passwd)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* below *uri*.

        *uri* is a single URI string or a sequence of URIs.  Each one is
        stored twice, with and without the scheme's default port filled
        in, so that either spelling can be found later.
        """
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        by_uris = self.passwd.setdefault(realm, {})
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            by_uris[reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) registered for *authuri*.

        Returns (None, None) when nothing registered for *realm* covers
        the URI.
        """
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path segments.  A plain character prefix would let
        # credentials registered for "/foo" match the sibling "/foobar".
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
764
765
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that treats the realm None as a catch-all."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first, then fall back to the None realm."""
        lookup = HTTPPasswordMgr.find_user_password
        user, password = lookup(self, realm, authuri)
        if user is None:
            return lookup(self, None, authuri)
        return user, password
774
775
class AbstractBasicAuthHandler:
    """Shared logic for answering HTTP Basic authentication challenges.

    Subclasses provide ``auth_header`` (the name of the request header that
    carries the credentials) and a ``parent`` used to reopen the request.
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    # NOTE(review): the leading '(?:.*,)*' nests two quantifiers and may
    # backtrack heavily on long header values -- worth confirming.
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        """Use *password_mgr* for lookups, or a fresh HTTPPasswordMgr."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Answer the challenge found in the *authreq* header, if possible.

        *host* may be an authority (without userinfo) or a URL with an
        authority.  Returns the response of the retried request, or None
        when the challenge is missing, is not a Basic challenge, or no
        suitable credentials are known.

        Raises HTTPError once more than five attempts have been made.
        """
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                            headers, None)
        else:
            self.retried += 1

        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, quote, realm = mo.groups()
                if scheme.lower() == 'basic':
                    response = self.retry_http_basic_auth(host, req, realm)
                    if response and response.code != 401:
                        self.retried = 0
                    return response

    def retry_http_basic_auth(self, host, req, realm):
        """Reopen *req* with the credentials known for *realm* and *host*.

        Returns None when no password is known, or when the request
        already carries exactly these credentials (they were evidently
        rejected, so sending them again is pointless).
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            # The credentials are stored as an unredirected header below,
            # so look them up through get_header(), which consults both
            # header tables, rather than in req.headers alone.
            if req.get_header(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None
834
835
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle HTTP 401 responses by retrying with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry req against the 'www-authenticate' challenge.

        The request's full URL is passed as the host argument.  The retry
        counter is reset after http_error_auth_reqed returns.
        """
        result = self.http_error_auth_reqed(
            'www-authenticate', req.full_url, req, headers)
        self.reset_retry_count()
        return result
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000846
847
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle HTTP 407 responses by retrying with Basic proxy credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry req against the 'proxy-authenticate' challenge."""
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib.request does
        # not (and should not, RFC 3986 s. 3.2.1) support requests for URLs
        # containing userinfo.
        result = self.http_error_auth_reqed(
            'proxy-authenticate', req.host, req, headers)
        self.reset_retry_count()
        return result
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000862
863
def randombytes(n):
    """Produce n random bytes using os.urandom."""
    data = os.urandom(n)
    return data
867
class AbstractDigestAuthHandler:
    """Shared implementation of HTTP Digest authentication (RFC 2617).

    Subclasses supply the auth_header attribute and a `parent` opener;
    this class builds the Digest credentials and re-sends the request.
    """

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" support is shaky

    def __init__(self, passwd=None):
        """Store the password manager (a new HTTPPasswordMgr by default)
        and initialise the retry and nonce bookkeeping."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        """Set the retry counter back to zero."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry req if headers carry a Digest challenge in auth_header.

        Returns the retried response, or None when no Digest challenge is
        present.  Raises HTTPError once self.retried exceeds 5.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Parse the challenge in auth and re-send req with credentials.

        Returns None when no credentials could be built or when the same
        header value is already present on the request.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-hex-digit client nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest credentials string for req from challenge chal.

        chal is a mapping of challenge directives.  Returns None when the
        challenge lacks realm/nonce, names an unsupported algorithm, or no
        user is known for the realm.  Raises URLError for a qop value other
        than 'auth' (or no qop at all).
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop == 'auth':
            # The nonce count restarts at 1 whenever the server's nonce
            # changes.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for the named digest algorithm.

        H hashes an ASCII string to a hex digest; KD(secret, data) is
        H("secret:data").  For an algorithm other than 'MD5' or 'SHA' the
        result is (None, None), which get_authorization treats as "cannot
        authenticate".
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # XXX MD5-sess
            # Previously H was left unbound here and the return statement
            # raised UnboundLocalError instead of signalling "unsupported".
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Placeholder for the entity digest; always returns None."""
        # XXX not implemented yet
        return None
1004
1005
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry req against the 'www-authenticate' Digest challenge.

        The second component of the parsed full URL is passed as host.
        The retry counter is reset after the attempt returns.
        """
        netloc = urlparse(req.full_url)[1]
        response = self.http_error_auth_reqed(
            'www-authenticate', netloc, req, headers)
        self.reset_retry_count()
        return response
1022
1023
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Handle HTTP 407 responses by retrying with Digest proxy credentials."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry req against the 'proxy-authenticate' Digest challenge."""
        response = self.http_error_auth_reqed(
            'proxy-authenticate', req.host, req, headers)
        self.reset_retry_count()
        return response
1035
class AbstractHTTPHandler(BaseHandler):
    """Common request preparation and connection logic for HTTP(S) handlers."""

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Store a new debug level on the handler."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in default headers on request and return it.

        Adds Content-type/Content-length when the request carries data,
        a Host header, and any of the opener's addheaders that are not
        already present.  Raises URLError when the request has no host.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        sel_host = host
        if request.has_proxy():
            # Behind a proxy the selector is a full URL; the Host header
            # must name the host taken from that URL.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        Extra keyword arguments are passed to the http_class constructor.
        Raises URLError when the request has no host or when sending the
        request / reading the response fails with socket.error; in the
        latter case the connection is closed first.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)

        # Unredirected headers take precedence over regular ones.
        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            h.request(req.get_method(), req.selector, req.data, headers)
            r = h.getresponse()  # an HTTPResponse instance
        except socket.error as err:
            # Release the connection instead of leaking it on failure.
            h.close()
            raise URLError(err)

        r.url = req.full_url
        # urllib clients expect the response to have the reason in .msg,
        # so overwrite the HTTPResponse's .msg with .reason here.  It
        # would be good to mark this attribute as deprecated and get
        # callers to use info() or .headers instead.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001124
1125
class HTTPHandler(AbstractHTTPHandler):
    """Open http: URLs through http.client.HTTPConnection."""

    # Request pre-processing is inherited unchanged.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Open req via do_open using a plain HTTP connection class."""
        connection_class = http.client.HTTPConnection
        return self.do_open(connection_class, req)
1132
if hasattr(http.client, 'HTTPSConnection'):
    import ssl

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https: URLs through http.client.HTTPSConnection."""

        # Request pre-processing is inherited unchanged.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            """Remember the context and check_hostname values that are
            later passed to the connection class."""
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            """Open req via do_open using the HTTPS connection class."""
            conn_args = {
                'context': self._context,
                'check_hostname': self._check_hostname,
            }
            return self.do_open(http.client.HTTPSConnection, req, **conn_args)
1148
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to outgoing requests and collect them from responses."""

    def __init__(self, cookiejar=None):
        """Use the given cookie jar, or a new http.cookiejar.CookieJar."""
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        """Let the jar add its cookie header to request; return request."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar extract cookies from response; return response."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # HTTPS traffic is processed exactly like HTTP traffic.
    https_request = http_request
    https_response = http_response
1166
class UnknownHandler(BaseHandler):
    """Handler whose unknown_open always raises URLError."""

    def unknown_open(self, req):
        """Raise URLError naming the request's URL type."""
        raise URLError('unknown url type: %s' % req.type)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001171
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict mapping each key to its value; a value enclosed in
    double quotes has the quotes removed.  An empty value ("key=") is
    kept as the empty string.  Raises ValueError for an element without
    an '=' sign.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices instead of v[0]/v[-1] so an empty value cannot raise
        # IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1181
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    Split s on commas that are outside double-quoted strings and return
    the pieces with surrounding whitespace stripped.  Inside a quoted
    string a backslash is dropped and the character after it is taken
    literally.  Only double-quotes count, not single-quotes.  A trailing
    empty piece is not returned.
    """
    items = []
    buf = []
    escaped = False
    in_quotes = False

    for ch in s:
        if escaped:
            # Character following a backslash inside quotes: keep verbatim.
            buf.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                buf.append(ch)
        elif ch == ',':
            items.append(''.join(buf))
            buf = []
        else:
            if ch == '"':
                in_quotes = True
            buf.append(ch)

    # append last part
    if buf:
        items.append(''.join(buf))

    return [item.strip() for item in items]
1224
class FileHandler(BaseHandler):
    """Open file: URLs that refer to the local host."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open req as a local file.

        Raises URLError when the selector starts with '//' (but not
        '///') and the request's host is neither 'localhost' nor one of
        the names returned by get_names().
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            # Membership test: the previous identity comparison of a
            # string against the names tuple could never succeed.
            if req.host not in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None

    def get_names(self):
        """Return the cached tuple of addresses for the local host.

        The tuple is computed once and stored on the FileHandler class.
        """
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by req.selector.

        The response headers carry Content-type, Content-length and
        Last-modified.  Raises URLError when the file cannot be accessed
        or when the host does not resolve to this machine.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001276
def _safe_gethostbyname(host):
    """Resolve host with socket.gethostbyname.

    Returns None instead of raising when the lookup fails with
    socket.gaierror.
    """
    try:
        address = socket.gethostbyname(host)
    except socket.gaierror:
        return None
    return address
1282
class FTPHandler(BaseHandler):
    """Open ftp: URLs through an ftpwrapper connection."""

    def ftp_open(self, req):
        """Retrieve the file or directory listing named by req.

        Returns an addinfourl whose headers carry Content-type and
        Content-length when they are known.  Raises URLError when the
        host is missing or cannot be resolved, and for any ftplib error.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        port = ftplib.FTP_PORT if port is None else int(port)

        # username/password handling
        user, host = splituser(host)
        passwd = None
        if user:
            user, passwd = splitpasswd(user)
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)

        path, attrs = splitattr(req.selector)
        # The last path component is the file; everything before it is
        # the directory chain.
        *dirs, filename = [unquote(part) for part in path.split('/')]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # 'I' when a file name is present, 'D' otherwise; a
            # ;type= attribute overrides the default.
            kind = 'I' if filename else 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if (attr.lower() == 'type'
                        and value in ('a', 'A', 'i', 'I', 'd', 'D')):
                    kind = value.upper()
            fp, retrlen = fw.retrfile(filename, kind)
            header_lines = []
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                header_lines.append("Content-type: %s\n" % mtype)
            if retrlen is not None and retrlen >= 0:
                header_lines.append("Content-length: %d\n" % retrlen)
            headers = email.message_from_string("".join(header_lines))
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Create and return a new ftpwrapper for the given endpoint."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout)
1340
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps ftpwrapper connections in a cache.

    Connections are keyed on (user, host, port, joined dirs, timeout).
    Each use pushes an entry's expiry time forward by self.delay seconds.
    """

    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}        # key -> ftpwrapper
        self.timeout = {}      # key -> expiry time
        self.soonest = 0       # earliest expiry time among cached entries
        self.delay = 60        # added to time.time() on every use
        self.max_conns = 16    # cache size at which one entry is evicted

    def setTimeout(self, t):
        """Set the delay added to the current time when an entry is used."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which an entry gets evicted."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached ftpwrapper for the endpoint, creating it if needed."""
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
        # Keep a reference: check_cache() may evict this very entry (for
        # example when max_conns is 1), which previously ended in KeyError.
        conn = self.cache[key]
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return conn

    def check_cache(self):
        """Close expired connections and evict one entry when the cache is full."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        self.soonest = self._earliest_expiry()

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = self._earliest_expiry()

    def _earliest_expiry(self):
        """Return the smallest expiry time, or 0 when the cache is empty."""
        # min() on an empty sequence raises ValueError; 0 matches the
        # initial value of self.soonest.
        return min(self.timeout.values()) if self.timeout else 0
1387
# Code moved from the old urllib module
1389
1390MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
1391
1392# Helper for non-unix systems
if os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use.

        On this branch the conversion is just percent-decoding."""
        decoded = unquote(pathname)
        return decoded

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use.

        On this branch the conversion is just percent-encoding."""
        encoded = quote(pathname)
        return encoded
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001405
1406# This really consists of two pieces:
1407# (1) a class which handles opening of all sorts of URLs
1408# (plus assorted utilities etc.)
1409# (2) a set of functions for parsing URLs
1410# XXX Should these be separated out into different modules?
1411
1412
1413ftpcache = {}
1414class URLopener:
1415 """Class to open URLs.
1416 This is a class rather than just a subroutine because we may need
1417 more than one set of global protocol-specific options.
1418 Note -- this is a base class for those who don't want the
1419 automatic handling of errors type 302 (relocated) and 401
1420 (authorization needed)."""
1421
1422 __tempfiles = None
1423
1424 version = "Python-urllib/%s" % __version__
1425
1426 # Constructor
def __init__(self, proxies=None, **x509):
    """Initialise the opener.

    proxies -- mapping of URL type to proxy URL; getproxies() is used
    when it is omitted.
    x509 -- optional 'key_file' and 'cert_file' keyword arguments.
    """
    if proxies is None:
        proxies = getproxies()
    assert hasattr(proxies, 'keys'), "proxies must be a mapping"
    self.proxies = proxies

    self.key_file = x509.get('key_file')
    self.cert_file = x509.get('cert_file')
    self.addheaders = [('User-Agent', self.version)]

    self.__tempfiles = []
    # Kept as an attribute so cleanup() does not need module globals.
    self.__unlink = os.unlink

    # Undocumented feature: if you assign {} to tempcache,
    # it is used to cache files retrieved with
    # self.retrieve().  This is not enabled by default
    # since it does not work for changing documents (and I
    # haven't got the logic to check expiration headers
    # yet).
    self.tempcache = None

    # Undocumented feature: you can use a different
    # ftp cache by assigning to the .ftpcache member;
    # in case you want logically independent URL openers
    # XXX This is not threadsafe.  Bah.
    self.ftpcache = ftpcache
1449
1450 def __del__(self):
1451 self.close()
1452
def close(self):
    """Close the opener by running cleanup()."""
    self.cleanup()
1455
def cleanup(self):
    """Remove the recorded temporary files and empty the temp cache."""
    # This code sometimes runs when the rest of this module
    # has already been deleted, so it can't use any globals
    # or import anything.
    if self.__tempfiles:
        for path in self.__tempfiles:
            try:
                self.__unlink(path)
            except OSError:
                # Best effort: failing to delete a file is not reported.
                pass
        del self.__tempfiles[:]
    if self.tempcache:
        self.tempcache.clear()
1469
def addheader(self, *args):
    """Add a header to be used by the HTTP interface only
    e.g. u.addheader('Accept', 'sound/basic')

    The positional arguments are stored together as one tuple."""
    header = args
    self.addheaders.append(header)
1474
1475 # External interface
def open(self, fullurl, data=None):
    """Use URLopener().open(file) instead of open(file, 'r').

    Dispatches to the open_<type> method matching the URL type (or the
    proxy's type when a proxy is configured for it).  A socket.error
    raised by that method is re-raised as IOError('socket error', ...).
    """
    fullurl = unwrap(to_bytes(fullurl))
    fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")

    # Serve from the temp cache when the URL was retrieved before.
    if self.tempcache and fullurl in self.tempcache:
        filename, headers = self.tempcache[fullurl]
        return addinfourl(open(filename, 'rb'), headers, fullurl)

    urltype, url = splittype(fullurl)
    if not urltype:
        urltype = 'file'

    proxy = None
    if urltype in self.proxies:
        proxy = self.proxies[urltype]
        urltype, proxyhost = splittype(proxy)
        host, selector = splithost(proxyhost)
        url = (host, fullurl)  # Signal special case to open_*()

    self.type = urltype
    name = ('open_' + urltype).replace('-', '_')
    if not hasattr(self, name):
        if proxy:
            return self.open_unknown_proxy(proxy, fullurl, data)
        return self.open_unknown(fullurl, data)

    try:
        if data is None:
            return getattr(self, name)(url)
        return getattr(self, name)(url, data)
    except socket.error as msg:
        raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1509
def open_unknown(self, fullurl, data=None):
    """Overridable interface to open unknown URL type.

    This implementation always raises IOError naming the URL type."""
    scheme, _rest = splittype(fullurl)
    raise IOError('url error', 'unknown url type', scheme)
1514
def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Overridable interface to open unknown URL type.

    This implementation always raises IOError naming the proxy that
    cannot be used for the URL's type."""
    scheme, _rest = splittype(fullurl)
    raise IOError('url error', 'invalid proxy for %s' % scheme, proxy)
1519
1520 # External interface
def retrieve(self, url, filename=None, reporthook=None, data=None):
    """retrieve(url) returns (filename, headers) for a local object
    or (tempfilename, headers) for a remote object.

    reporthook, when given, is called as reporthook(blocknum, blocksize,
    totalsize) once before the first block and after every block read;
    totalsize is -1 when no Content-Length header is present.

    Raises ContentTooShortError when fewer bytes were read than the
    Content-Length header announced, whether or not a reporthook was
    supplied.
    """
    url = unwrap(to_bytes(url))
    if self.tempcache and url in self.tempcache:
        return self.tempcache[url]
    type, url1 = splittype(url)
    if filename is None and (not type or type == 'file'):
        try:
            fp = self.open_local_file(url1)
            hdrs = fp.info()
            fp.close()
            return url2pathname(splithost(url1)[1]), hdrs
        except IOError:
            # Not a readable local file: fall through to the generic path.
            pass
    fp = self.open(url, data)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            # Derive the temp file's suffix from the URL's path component.
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        try:
            result = filename, headers
            if self.tempcache is not None:
                self.tempcache[url] = result
            bs = 1024*8
            size = -1
            read = 0
            blocknum = 0
            # Read the announced size regardless of reporthook so that
            # the truncation check below is not skipped.
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while 1:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            tfp.close()
    finally:
        fp.close()

    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
1584
1585 # Each method named open_<type> knows how to open that type of URL
1586
def _open_generic_http(self, connection_factory, url, data):
    """Open an HTTP(S) URL through a connection built by connection_factory.

    This is an internal method that should be called from
    open_http() or open_https().

    Arguments:
    - connection_factory takes a host name and returns an
      HTTPConnection-like instance.
    - url is either the URL to retrieve (a string, starting at the
      "//host" part) or a (proxy host, full URL) pair when the request
      goes through a proxy.
    - data is the payload for a POST request, or None for a GET.

    Returns an addinfourl object for 2xx responses; every other status
    is handed to self.http_error().  Raises IOError when no host can be
    determined and URLError when the status line cannot be parsed.
    """
    import base64

    user_passwd = None
    proxy_passwd = None
    if isinstance(url, str):
        host, selector = splithost(url)
        if host:
            user_passwd, host = splituser(host)
            host = unquote(host)
        realhost = host
    else:
        host, selector = url
        # check whether the proxy contains authorization information
        proxy_passwd, host = splituser(host)
        # now we proceed with the url we want to obtain
        urltype, rest = splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != 'http':
            realhost = None
        else:
            realhost, rest = splithost(rest)
            if realhost:
                user_passwd, realhost = splituser(realhost)
            if user_passwd:
                # Credentials travel in a header, so drop them from the
                # selector that is sent to the proxy.
                selector = "%s://%s%s" % (urltype, realhost, rest)
            if proxy_bypass(realhost):
                host = realhost

    if not host:
        raise IOError('http error', 'no host given')

    # Credentials embedded in a URL are percent-encoded; decode them before
    # building the Basic auth token, otherwise special characters are sent
    # in their quoted form and authentication fails.
    if proxy_passwd:
        proxy_passwd = unquote(proxy_passwd)
        proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
    else:
        proxy_auth = None

    if user_passwd:
        user_passwd = unquote(user_passwd)
        auth = base64.b64encode(user_passwd.encode()).decode('ascii')
    else:
        auth = None

    http_conn = connection_factory(host)
    headers = {}
    if proxy_auth:
        headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
    if auth:
        headers["Authorization"] = "Basic %s" % auth
    if realhost:
        headers["Host"] = realhost
    # Opener-level headers are applied last, so they win on conflicts.
    for header, value in self.addheaders:
        headers[header] = value

    if data is not None:
        headers["Content-Type"] = "application/x-www-form-urlencoded"
        http_conn.request("POST", selector, data, headers)
    else:
        http_conn.request("GET", selector, headers=headers)

    try:
        response = http_conn.getresponse()
    except http.client.BadStatusLine:
        # something went wrong with the HTTP status line
        raise URLError("http protocol error: bad status line")

    # According to RFC 2616, "2xx" code indicates that the client's
    # request was successfully received, understood, and accepted.
    if 200 <= response.status < 300:
        return addinfourl(response, response.msg, "http:" + url,
                          response.status)
    else:
        return self.http_error(
            url, response.fp,
            response.status, response.reason, response.msg, data)
1673
def open_http(self, url, data=None):
    """Open *url* over plain HTTP, POSTing *data* when it is given."""
    connection_factory = http.client.HTTPConnection
    return self._open_generic_http(connection_factory, url, data)
1677
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    """Dispatch an HTTP error response to the matching handler.

    A method named http_error_DDD (DDD being the numeric status code) is
    tried first when the instance has one; *data* is forwarded to it only
    when it is not None.  If there is no such method, or it returns a
    false value, http_error_default() handles the response.
    """
    handler_name = 'http_error_%d' % errcode
    if hasattr(self, handler_name):
        handler = getattr(self, handler_name)
        call_args = (url, fp, errcode, errmsg, headers)
        if data is not None:
            call_args += (data,)
        outcome = handler(*call_args)
        if outcome:
            return outcome
    return self.http_error_default(url, fp, errcode, errmsg, headers)
1693
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Fallback error handler: drain and close *fp*, then raise HTTPError."""
    # The body is read only to consume it; its content is discarded.
    fp.read()
    fp.close()
    raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001699
if _have_ssl:
    def _https_connection(self, host):
        """Build an HTTPSConnection to *host* with this opener's key/cert files."""
        key_path = self.key_file
        cert_path = self.cert_file
        return http.client.HTTPSConnection(host,
                                           key_file=key_path,
                                           cert_file=cert_path)

    def open_https(self, url, data=None):
        """Open *url* over HTTPS, POSTing *data* when it is given."""
        connection_factory = self._https_connection
        return self._open_generic_http(connection_factory, url, data)
1709
def open_file(self, url):
    """Open a file: URL that refers to this machine.

    A URL of the form //host/... is accepted only when host is
    "localhost"; any other host raises ValueError.  Everything else is
    delegated to open_local_file().  A non-string url (the form used for
    proxied requests) raises URLError.
    """
    if not isinstance(url, str):
        raise URLError('file error', 'proxy support for file protocol currently not implemented')
    names_other_host = (url[:2] == '//'
                        and url[2:3] != '/'
                        and url[2:12].lower() != 'localhost/')
    if names_other_host:
        raise ValueError("file:// scheme is supported only on localhost")
    return self.open_local_file(url)
1718
def open_local_file(self, url):
    """Open a file on the local filesystem named by a file: URL.

    Returns an addinfourl wrapping the file opened in binary mode, with
    Content-Type, Content-Length and Last-modified headers derived from
    the file.  Raises URLError when the file cannot be stat'ed, or when
    the URL names a host (or a port) that is not this machine.
    """
    import email.utils
    import mimetypes

    host, file = splithost(url)
    localname = url2pathname(file)
    try:
        stats = os.stat(localname)
    except OSError as e:
        # URLError takes (reason, filename); passing errno as a third
        # positional argument raised TypeError instead of URLError.
        raise URLError(e.strerror, e.filename) from e
    size = stats.st_size
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    mtype = mimetypes.guess_type(url)[0]
    headers = email.message_from_string(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
        (mtype or 'text/plain', size, modified))
    if host:
        host, port = splitport(host)
        # localhost() returns a single address while thishost() returns a
        # tuple, so wrap the former before concatenating.
        local_addresses = (localhost(),) + thishost()
        if port or socket.gethostbyname(host) not in local_addresses:
            raise URLError('local file error', 'not on local host')
    urlfile = file
    if file[:1] == '/':
        urlfile = 'file://' + file
    return addinfourl(open(localname, 'rb'), headers, urlfile)
1748
def open_ftp(self, url):
    """Open an ftp: URL and return an addinfourl for the transfer.

    Connections are kept in self.ftpcache, keyed by
    (user, host, port, directory path); once the cache holds more than
    MAXFTPCACHE entries, every connection other than the one needed now
    is closed and dropped.  A ";type=a|i|d" URL attribute selects the
    transfer type; without it a URL with an empty file part is treated as
    a directory listing ('D') and anything else as a binary file ('I').
    Raises URLError for proxied (non-string) urls, a missing host, or any
    ftplib error.
    """
    if not isinstance(url, str):
        raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
    import ftplib
    import mimetypes

    host, path = splithost(url)
    if not host:
        raise URLError('ftp error', 'no host given')
    host, port = splitport(host)
    user, host = splituser(host)
    if user:
        user, passwd = splitpasswd(user)
    else:
        passwd = None
    host = unquote(host)
    user = unquote(user or '')
    passwd = unquote(passwd or '')
    host = socket.gethostbyname(host)
    port = int(port) if port else ftplib.FTP_PORT
    path, attrs = splitattr(path)
    path = unquote(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    if dirs and not dirs[0]:
        dirs = dirs[1:]
    if dirs and not dirs[0]:
        dirs[0] = '/'
    key = user, host, port, '/'.join(dirs)
    # XXX thread unsafe!
    if len(self.ftpcache) > MAXFTPCACHE:
        # Prune the cache, rather arbitrarily.  Iterate over a snapshot of
        # the keys: deleting from a dict while iterating it raises
        # RuntimeError.
        for k in list(self.ftpcache):
            if k != key:
                stale = self.ftpcache.pop(k)
                stale.close()
    try:
        if key not in self.ftpcache:
            self.ftpcache[key] = ftpwrapper(user, passwd, host, port, dirs)
        transfer_type = 'I' if file else 'D'
        for attr in attrs:
            attr, value = splitvalue(attr)
            if (attr.lower() == 'type'
                    and value in ('a', 'A', 'i', 'I', 'd', 'D')):
                transfer_type = value.upper()
        (fp, retrlen) = self.ftpcache[key].retrfile(file, transfer_type)
        mtype = mimetypes.guess_type("ftp:" + url)[0]
        headers = ""
        if mtype:
            headers += "Content-Type: %s\n" % mtype
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = email.message_from_string(headers)
        return addinfourl(fp, headers, "ftp:" + url)
    except ftperrors() as msg:
        raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1807
def open_data(self, url, data=None):
    """Open a "data" URL and return its content as an addinfourl.

    The *data* argument (POSTed data) is ignored.  The returned object
    reads back a generated header block (Date, Content-type,
    Content-Length) followed by the decoded payload.
    """
    if not isinstance(url, str):
        raise URLError('data error', 'proxy support for data protocol currently not implemented')
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    try:
        mediatype, payload = url.split(',', 1)
    except ValueError:
        raise IOError('data error', 'bad data URL')
    if not mediatype:
        mediatype = 'text/plain;charset=US-ASCII'
    # A trailing ";token" without "=" is the transfer encoding, not a
    # media-type parameter.
    semi = mediatype.rfind(';')
    if semi >= 0 and '=' not in mediatype[semi:]:
        encoding = mediatype[semi+1:]
        mediatype = mediatype[:semi]
    else:
        encoding = ''
    timestamp = time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                              time.gmtime(time.time()))
    if encoding == 'base64':
        import base64
        # XXX is this encoding/decoding ok?
        payload = base64.decodebytes(payload.encode('ascii')).decode('latin1')
    else:
        payload = unquote(payload)
    msg = '\n'.join([
        'Date: %s' % timestamp,
        'Content-type: %s' % mediatype,
        'Content-Length: %d' % len(payload),
        '',
        payload,
    ])
    headers = email.message_from_string(msg)
    body = io.StringIO(msg)
    return addinfourl(body, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001849
1850
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps).

    Adds redirect following (301/302/303/307) and Basic authentication
    retries (401/407) on top of URLopener.  Credentials obtained from
    prompt_user_passwd() are cached per "realm@host" in self.auth_cache.
    """

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}
        # Redirect counter and its limit; see http_error_302().
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Follows the redirect unless self.maxtries redirects have already
        been followed, in which case the response is reported as a 500
        "Redirect Recursion" error.
        """
        self.tries += 1
        try:
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            return self.redirect_internal(url, fp, errcode, errmsg,
                                          headers, data)
        finally:
            # Reset even when the redirect raises, so a failed redirect
            # does not count against later, unrelated requests.
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the redirect target named by the Location (or URI) header.

        Returns None when neither header is present.  Raises HTTPError
        when the target uses a scheme other than http, https or ftp.
        """
        from urllib.parse import urlparse

        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)
        # The target comes from the server, so do not let it steer us to
        # other handlers such as file:.
        if urlparse(newurl).scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." %
                            newurl,
                            headers, None)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
                       retry=False):
        """Error 401 -- authentication required.

        This function supports Basic authentication only.  The request is
        retried with credentials only when *retry* is true; in every other
        case the response is passed to URLopener.http_error_default().
        """
        # NOTE(review): URLopener.http_error_default is expected to raise
        # HTTPError, so each call below ends this method -- confirm against
        # the base class.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
                       retry=False):
        """Error 407 -- proxy authentication required.

        This function supports Basic authentication only.  The request is
        retried with credentials only when *retry* is true; in every other
        case the response is passed to URLopener.http_error_default().
        """
        # NOTE(review): URLopener.http_error_default is expected to raise
        # HTTPError, so each call below ends this method -- confirm against
        # the base class.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Store proxy credentials in self.proxies['http'] and reopen url."""
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # i is non-zero when the proxy URL already carried credentials;
        # those were just rejected, so the cached entry is discarded.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd):
            return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Store proxy credentials in self.proxies['https'] and reopen url."""
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # See retry_proxy_http_basic_auth() for the meaning of i.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd):
            return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Reopen url over http with user:password embedded in the host."""
        host, selector = splithost(url)
        # i is non-zero when the URL already carried credentials; those
        # were just rejected, so the cached entry is discarded.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd):
            return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Reopen url over https with user:password embedded in the host."""
        host, selector = splithost(url)
        # See retry_http_basic_auth() for the meaning of i.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd):
            return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for realm at host, prompting if not cached.

        A true *clear_cache* drops any cached entry first, forcing a new
        prompt.  The answer is cached unless both parts are empty.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd:
            self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2046
2047
2048# Utility functions
2049
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is done once and the answer kept in the module-level
    _localhost variable.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2057
_thishost = None
def thishost():
    """Return the IP addresses of the current host as a tuple of strings.

    The lookup is done once and the answer kept in the module-level
    _thishost variable.
    """
    global _thishost
    if _thishost is None:
        # gethostbyname_ex() returns (hostname, aliases, addresses); the
        # address list is element [2] of that result, not of the host name.
        try:
            addresses = socket.gethostbyname_ex(socket.gethostname())[2]
        except socket.gaierror:
            # The machine's own name may not resolve; fall back to the
            # addresses of 'localhost'.
            addresses = socket.gethostbyname_ex('localhost')[2]
        _thishost = tuple(addresses)
    return _thishost
2065
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported on first use and its all_errors tuple is kept in
    the module-level _ftperrors variable.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2074
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    The same object, kept in the module-level _noheaders variable, is
    returned on every call.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2082
2083
2084# Utility classes
2085
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        """Connect, log in and change into each directory of self.dirs."""
        import ftplib
        # busy is non-zero while a transfer started by retrfile() is open.
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start retrieving *file* and return (file object, length).

        *type* is the FTP transfer type letter; 'd'/'D' requests a
        directory listing.  When *file* cannot be retrieved as a file
        (server reply 550), a listing of it is attempted instead.  The
        length is whatever ntransfercmd() reported.  Closing the returned
        object ends the transfer via endtransfer().  FTP permission
        errors are re-raised as URLError.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'):
            cmd = 'TYPE A'
            isdir = 1
        else:
            cmd = 'TYPE ' + type
            isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection may have gone stale; reconnect once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 is tolerated so that the listing below is tried.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        data_sock, retrlen = conn
        reader = addclosehook(data_sock.makefile('rb'), self.endtransfer)
        # The file object returned by makefile() keeps the connection
        # usable; close our own handle so the socket is released as soon
        # as the reader is closed instead of leaking.
        data_sock.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (reader, retrlen)

    def endtransfer(self):
        """Finish a pending transfer, ignoring FTP errors."""
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        """End any pending transfer and close the connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2162
2163# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Every non-empty environment variable whose name ends in "_proxy"
    (compared case-insensitively) contributes an entry keyed by the
    lower-cased part of its name before that suffix.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    return {
        name.lower()[:-6]: value
        for name, value in os.environ.items()
        if value and name.lower()[-6:] == '_proxy'
    }
2179
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy (falling back to
    NO_PROXY), which should be a list of DNS suffixes separated by commas,
    or '*' for all hosts.  Whitespace around each suffix is ignored.
    Returns 1 when *host*, with or without its port, ends with one of the
    suffixes, and 0 otherwise.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        # Lists are commonly written "a.com, b.com"; without stripping,
        # every entry after the first would never match.
        name = name.strip()
        if name and (hostonly.endswith(name) or host.endswith(name)):
            return 1
    # otherwise, don't bypass
    return 0
2198
2199
if sys.platform == 'darwin':
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """
        Return True iff this host shouldn't be accessed using a proxy

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        import re
        import socket
        from fnmatch import fnmatch

        hostonly, port = splitport(host)

        def ip2num(ipAddr):
            # Pack up to four dotted decimal parts into one integer,
            # padding missing trailing parts with zeros.
            parts = ipAddr.split('.')
            parts = list(map(int, parts))
            if len(parts) != 4:
                parts = (parts + [0, 0, 0, 0])[:4]
            return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

        proxy_settings = _get_proxy_settings()

        # Check for simple host names:
        if '.' not in host:
            if proxy_settings['exclude_simple']:
                return True

        # Resolved lazily, the first time a numeric exception is seen.
        hostIP = None

        for value in proxy_settings.get('exceptions', ()):
            # Items in the list are strings like these: *.local, 169.254/16
            if not value:
                continue

            m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
            if m is not None:
                if hostIP is None:
                    try:
                        hostIP = socket.gethostbyname(hostonly)
                        hostIP = ip2num(hostIP)
                    except socket.error:
                        continue

                base = ip2num(m.group(1))
                prefix = m.group(2)
                if prefix is None:
                    # No explicit "/n": each dotted part given counts as
                    # eight prefix bits.
                    prefix = 8 * (m.group(1).count('.') + 1)
                else:
                    prefix = int(prefix[1:])

                if prefix < 0 or prefix > 32:
                    # Not a usable IPv4 prefix length; a negative shift
                    # below would raise ValueError.
                    continue

                # Turn the prefix length into the number of host bits to
                # discard; this applies to both forms of the entry.
                shift = 32 - prefix

                if (hostIP >> shift) == (base >> shift):
                    return True

            elif fnmatch(host, value):
                return True

        return False

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()

    def proxy_bypass(host):
        """Return true if *host* should be reached without a proxy.

        Environment settings are used when any <scheme>_proxy variable is
        set; otherwise the system configuration decides.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        """Return proxies from the environment or, failing that, the system."""
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    import re
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        """Return 1 if the registry's ProxyOverride list excludes *host*.

        Returns 0 when the registry cannot be read, proxying is disabled,
        or no override entry matches.
        """
        try:
            import winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        candidates = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                candidates.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                candidates.append(fqdn)
        except socket.error:
            pass
        # The registry value is a ';'-separated list of glob patterns;
        # the special entry '<local>' matches any dot-less host name.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                if '.' not in rawHost:
                    return 1
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in candidates:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return true if *host* should be reached without a proxy.

        Environment settings are used when any <scheme>_proxy variable is
        set; otherwise the registry decides.

        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment