"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below). It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work. Each Handler implements a particular protocol or
option. The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL. For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns. The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as original
urllib. Pass the URL and optionally data to post to an HTTP URL, and
get a file-like object back. One difference is that you can also pass
a Request instance instead of a URL. Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers. Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate. If one of the arguments is a subclass of a default
handler, the argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.

objects of interest:

OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
the Handler classes, while dealing with requests and responses.

Request -- An object that encapsulates the state of a request. The
state can be as simple as the URL. It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler --

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib.request

# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)

# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('http://www.python.org/')
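
# urlopen raises URLError for failures and HTTPError (a subclass that can
# also be read like a response) for HTTP error statuses.  A minimal,
# illustrative way to handle both; the URL below is a placeholder:
import urllib.error

try:
    f = urllib.request.urlopen('http://www.python.org/nonexistent')
except urllib.error.HTTPError as e:
    print('server error:', e.code)
except urllib.error.URLError as e:
    print('failed to reach server:', e.reason)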
"""

# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails, how should the error be signalled?  The client needs to
# know the HTTP error code.  But if the handler knows what the problem
# was, e.g., that it did not recognize the hash algorithm requested in
# the challenge, it would be good to pass that information along to
# the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation

# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener

import base64
import bisect
import email
import hashlib
import http.client
import io
import os
import posixpath
import random
import re
import socket
import sys
import time

from urllib.error import URLError, HTTPError, ContentTooShortError
from urllib.parse import (
    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
    splittype, splithost, splitport, splituser, splitpasswd,
    splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
from urllib.response import addinfourl, addclosehook

# check for SSL
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

# used in User-Agent header sent
__version__ = sys.version[:3]

_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    global _opener
    if _opener is None:
        _opener = build_opener()
    return _opener.open(url, data, timeout)

def install_opener(opener):
    global _opener
    _opener = opener

# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)

def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()
    global _opener
    if _opener:
        _opener = None

# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.full_url
    host = urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = _cut_port_re.sub("", host, 1)
    return host.lower()

class Request:

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.full_url, self.fragment = splittag(self.full_url)
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        for key, value in headers.items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        if self.data is not None:
            return "POST"
        else:
            return "GET"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        if self.fragment:
            return '%s#%s' % (self.full_url, self.fragment)
        else:
            return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())

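# A minimal illustration of Request (URL and header values below are
# placeholders): extra headers are passed as a dict, and get_method()
# reports "POST" whenever data is supplied.
#
#   req = Request("http://www.example.com/",
#                 data=b"spam=1",
#                 headers={"User-Agent": "example-agent/0.1"})
#   req.add_header("Accept", "text/html")
#   assert req.get_method() == "POST"
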
class OpenerDirector:
    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if isclass(check):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isclass(h):
            h = h()
        opener.add_handler(h)
    return opener

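# A minimal sketch of composing an opener (handler and URL names below are
# placeholders).  Passing a subclass of a default handler, such as the
# DebugHTTPHandler here, replaces that default rather than duplicating it;
# handler instances and handler classes may be mixed freely.
#
#   class DebugHTTPHandler(HTTPHandler):
#       def http_open(self, req):
#           print("opening", req.full_url)
#           return super().http_open(req)
#
#   opener = build_opener(DebugHTTPHandler, HTTPCookieProcessor())
#   install_opener(opener)      # optional: make it the default for urlopen()
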
class BaseHandler:
    handler_order = 500

    def add_parent(self, parent):
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if not hasattr(other, "handler_order"):
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other.handler_order


class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if not (200 <= code < 300):
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response

class HTTPDefaultErrorHandler(BaseHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.full_url, code, msg, hdrs, fp)

class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be lenient with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if not urlparts.scheme in ('http', 'https', 'ftp'):
            raise HTTPError(newurl, code,
                            msg +
                            " - Redirection to url '%s' is not allowed" %
                            newurl,
                            headers, fp)

        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"


def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, r_scheme = splittype(proxy)
    if not r_scheme.startswith("/"):
        # authority
        scheme = None
        authority = proxy
    else:
        # URL
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.2.2
        # and 3.3.), path is empty or starts with '/'
        end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]
    userinfo, hostport = splituser(authority)
    if userinfo is not None:
        user, password = splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport

class ProxyHandler(BaseHandler):
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)

class HTTPPasswordMgr:

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if not realm in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False

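# A short illustration of the matching rules above (realm, URL and
# credentials are placeholders): credentials added for a URI also apply to
# URIs below it in the path tree, with or without the scheme's default port.
#
#   mgr = HTTPPasswordMgr()
#   mgr.add_password('PDQ Application', 'http://www.example.com/site/',
#                    'klem', 'password')
#   mgr.find_user_password('PDQ Application',
#                          'http://www.example.com:80/site/updates.py')
#   # -> ('klem', 'password')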

class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):

    def find_user_password(self, realm, authuri):
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is not None:
            return user, password
        return HTTPPasswordMgr.find_user_password(self, None, authuri)


class AbstractBasicAuthHandler:

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                            headers, None)
        else:
            self.retried += 1

        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, quote, realm = mo.groups()
                if scheme.lower() == 'basic':
                    response = self.retry_http_basic_auth(host, req, realm)
                    if response and response.code != 401:
                        self.retried = 0
                    return response

    def retry_http_basic_auth(self, host, req, realm):
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None


class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.full_url
        response = self.http_error_auth_reqed('www-authenticate',
                                              url, req, headers)
        self.reset_retry_count()
        return response


class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.host
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              authority, req, headers)
        self.reset_retry_count()
        return response


def randombytes(n):
    """Return n random bytes."""
    return os.urandom(n)

class AbstractDigestAuthHandler:
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" support is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
        resp = self.parent.open(req, timeout=req.timeout)
        return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop == 'auth':
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None


class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        host = urlparse(req.full_url)[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry


class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.host
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry
1032
1033class AbstractHTTPHandler(BaseHandler):
1034
1035 def __init__(self, debuglevel=0):
1036 self._debuglevel = debuglevel
1037
1038 def set_http_debuglevel(self, level):
1039 self._debuglevel = level
1040
1041 def do_request_(self, request):
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +00001042 host = request.host
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001043 if not host:
Georg Brandl13e89462008-07-01 19:56:00 +00001044 raise URLError('no host given')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001045
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +00001046 if request.data is not None: # POST
1047 data = request.data
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001048 if not request.has_header('Content-type'):
1049 request.add_unredirected_header(
1050 'Content-type',
1051 'application/x-www-form-urlencoded')
1052 if not request.has_header('Content-length'):
1053 request.add_unredirected_header(
1054 'Content-length', '%d' % len(data))
1055
Facundo Batista72dc1ea2008-08-16 14:44:32 +00001056 sel_host = host
1057 if request.has_proxy():
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +00001058 scheme, sel = splittype(request.selector)
Facundo Batista72dc1ea2008-08-16 14:44:32 +00001059 sel_host, sel_path = splithost(sel)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001060 if not request.has_header('Host'):
Facundo Batista72dc1ea2008-08-16 14:44:32 +00001061 request.add_unredirected_header('Host', sel_host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001062 for name, value in self.parent.addheaders:
1063 name = name.capitalize()
1064 if not request.has_header(name):
1065 request.add_unredirected_header(name, value)
1066
1067 return request
1068
1069 def do_open(self, http_class, req):
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +00001070 """Return an HTTPResponse object for the request, using http_class.
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001071
1072 http_class must implement the HTTPConnection API from http.client.
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001073 """
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +00001074 host = req.host
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001075 if not host:
Georg Brandl13e89462008-07-01 19:56:00 +00001076 raise URLError('no host given')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001077
1078 h = http_class(host, timeout=req.timeout) # will parse host:port
Senthil Kumaran603ca412010-09-27 01:28:10 +00001079
1080 headers = dict(req.unredirected_hdrs)
1081 headers.update(dict((k, v) for k, v in req.headers.items()
1082 if k not in headers))
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001083
1084 # TODO(jhylton): Should this be redesigned to handle
1085 # persistent connections?
1086
1087 # We want to make an HTTP/1.1 request, but the addinfourl
1088 # class isn't prepared to deal with a persistent connection.
1089 # It will try to read all remaining data from the socket,
1090 # which will block while the server waits for the next request.
1091 # So make sure the connection gets closed after the (only)
1092 # request.
1093 headers["Connection"] = "close"
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +00001094 headers = dict((name.title(), val) for name, val in headers.items())
Senthil Kumaran0ac1f832009-07-26 12:39:47 +00001095
1096 if req._tunnel_host:
Senthil Kumaran4b9fbeb2009-12-20 07:18:22 +00001097 tunnel_headers = {}
1098 proxy_auth_hdr = "Proxy-Authorization"
1099 if proxy_auth_hdr in headers:
1100 tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
1101 # Proxy-Authorization should not be sent to origin
1102 # server.
1103 del headers[proxy_auth_hdr]
1104 h._set_tunnel(req._tunnel_host, headers=tunnel_headers)
Senthil Kumaran0ac1f832009-07-26 12:39:47 +00001105
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001106 try:
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +00001107 h.request(req.get_method(), req.selector, req.data, headers)
1108 r = h.getresponse() # an HTTPResponse instance
1109 except socket.error as err:
Georg Brandl13e89462008-07-01 19:56:00 +00001110 raise URLError(err)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001111
Senthil Kumaran26430412011-04-13 07:01:19 +08001112 r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute as deprecated and get clients to use info() or
        # .headers instead.
        r.msg = r.reason
        return r


class HTTPHandler(AbstractHTTPHandler):

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_

if hasattr(http.client, 'HTTPSConnection'):
    class HTTPSHandler(AbstractHTTPHandler):

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_

class HTTPCookieProcessor(BaseHandler):
    def __init__(self, cookiejar=None):
        import http.cookiejar
        if cookiejar is None:
            cookiejar = http.cookiejar.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response

class UnknownHandler(BaseHandler):
    def unknown_open(self, req):
        type = req.type
        raise URLError('unknown url type: %s' % type)

def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated."""
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        if v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed

def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    res = []
    part = ''

    escape = quote = False
    for cur in s:
        if escape:
            part += cur
            escape = False
            continue
        if quote:
            if cur == '\\':
                escape = True
                continue
            elif cur == '"':
                quote = False
            part += cur
            continue

        if cur == ',':
            res.append(part)
            part = ''
            continue

        if cur == '"':
            quote = True

        part += cur

    # append last part
    if part:
        res.append(part)

    return [part.strip() for part in res]

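# For illustration: parse_http_list('text/html, text/plain;q=0.5, "a, b"')
# yields ['text/html', 'text/plain;q=0.5', '"a, b"'] (the quoted comma does
# not split the last element), and parse_keqv_list can then map key=value
# items, as in a Digest challenge, onto a dict with surrounding quotes removed.
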
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')

def _safe_gethostbyname(host):
    try:
        return socket.gethostbyname(host)
    except socket.gaierror:
        return None

class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
        return fw

class CacheFTPHandler(FTPHandler):
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}
        self.timeout = {}
        self.soonest = 0
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            self.soonest = min(list(self.timeout.values()))

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(list(self.timeout.values()))

# Code moved from the old urllib module

MAXFTPCACHE = 10  # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


1403ftpcache = {}
1404class URLopener:
1405 """Class to open URLs.
1406 This is a class rather than just a subroutine because we may need
1407 more than one set of global protocol-specific options.
1408 Note -- this is a base class for those who don't want the
1409 automatic handling of errors type 302 (relocated) and 401
1410 (authorization needed)."""
1411
1412 __tempfiles = None
1413
1414 version = "Python-urllib/%s" % __version__
1415
1416 # Constructor
1417 def __init__(self, proxies=None, **x509):
1418 if proxies is None:
1419 proxies = getproxies()
1420 assert hasattr(proxies, 'keys'), "proxies must be a mapping"
1421 self.proxies = proxies
1422 self.key_file = x509.get('key_file')
1423 self.cert_file = x509.get('cert_file')
1424 self.addheaders = [('User-Agent', self.version)]
1425 self.__tempfiles = []
1426 self.__unlink = os.unlink # See cleanup()
1427 self.tempcache = None
1428 # Undocumented feature: if you assign {} to tempcache,
1429 # it is used to cache files retrieved with
1430 # self.retrieve(). This is not enabled by default
1431 # since it does not work for changing documents (and I
1432 # haven't got the logic to check expiration headers
1433 # yet).
1434 self.ftpcache = ftpcache
1435 # Undocumented feature: you can use a different
1436 # ftp cache by assigning to the .ftpcache member;
1437 # in case you want logically independent URL openers
1438 # XXX This is not threadsafe. Bah.
1439
1440 def __del__(self):
1441 self.close()
1442
1443 def close(self):
1444 self.cleanup()
1445
1446 def cleanup(self):
1447 # This code sometimes runs when the rest of this module
1448 # has already been deleted, so it can't use any globals
1449 # or import anything.
1450 if self.__tempfiles:
1451 for file in self.__tempfiles:
1452 try:
1453 self.__unlink(file)
1454 except OSError:
1455 pass
1456 del self.__tempfiles[:]
1457 if self.tempcache:
1458 self.tempcache.clear()
1459
1460 def addheader(self, *args):
1461 """Add a header to be used by the HTTP interface only
1462 e.g. u.addheader('Accept', 'sound/basic')"""
1463 self.addheaders.append(args)
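# Illustrative sketch (proxy URL and host are placeholders): an explicit proxy
# mapping and extra default headers can be set per opener instead of relying
# on the environment:
#
# opener = URLopener(proxies={'http': 'http://proxy.example.org:3128'})
# opener.addheader('Accept-Language', 'en')
# f = opener.open('http://www.example.org/')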
1464
1465 # External interface
1466 def open(self, fullurl, data=None):
1467 """Use URLopener().open(file) instead of open(file, 'r')."""
Georg Brandl13e89462008-07-01 19:56:00 +00001468 fullurl = unwrap(to_bytes(fullurl))
Senthil Kumaran0e7e9ae2010-02-20 22:30:21 +00001469 fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001470 if self.tempcache and fullurl in self.tempcache:
1471 filename, headers = self.tempcache[fullurl]
1472 fp = open(filename, 'rb')
Georg Brandl13e89462008-07-01 19:56:00 +00001473 return addinfourl(fp, headers, fullurl)
1474 urltype, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001475 if not urltype:
1476 urltype = 'file'
1477 if urltype in self.proxies:
1478 proxy = self.proxies[urltype]
Georg Brandl13e89462008-07-01 19:56:00 +00001479 urltype, proxyhost = splittype(proxy)
1480 host, selector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001481 url = (host, fullurl) # Signal special case to open_*()
1482 else:
1483 proxy = None
1484 name = 'open_' + urltype
1485 self.type = urltype
1486 name = name.replace('-', '_')
1487 if not hasattr(self, name):
1488 if proxy:
1489 return self.open_unknown_proxy(proxy, fullurl, data)
1490 else:
1491 return self.open_unknown(fullurl, data)
1492 try:
1493 if data is None:
1494 return getattr(self, name)(url)
1495 else:
1496 return getattr(self, name)(url, data)
1497 except socket.error as msg:
1498 raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1499
1500 def open_unknown(self, fullurl, data=None):
1501 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001502 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001503 raise IOError('url error', 'unknown url type', type)
1504
1505 def open_unknown_proxy(self, proxy, fullurl, data=None):
1506 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001507 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001508 raise IOError('url error', 'invalid proxy for %s' % type, proxy)
1509
1510 # External interface
1511 def retrieve(self, url, filename=None, reporthook=None, data=None):
1512 """retrieve(url) returns (filename, headers) for a local object
1513 or (tempfilename, headers) for a remote object."""
Georg Brandl13e89462008-07-01 19:56:00 +00001514 url = unwrap(to_bytes(url))
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001515 if self.tempcache and url in self.tempcache:
1516 return self.tempcache[url]
Georg Brandl13e89462008-07-01 19:56:00 +00001517 type, url1 = splittype(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001518 if filename is None and (not type or type == 'file'):
1519 try:
1520 fp = self.open_local_file(url1)
1521 hdrs = fp.info()
1522 del fp
Georg Brandl13e89462008-07-01 19:56:00 +00001523 return url2pathname(splithost(url1)[1]), hdrs
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001524 except IOError as msg:
1525 pass
1526 fp = self.open(url, data)
Benjamin Peterson5f28b7b2009-03-26 21:49:58 +00001527 try:
1528 headers = fp.info()
1529 if filename:
1530 tfp = open(filename, 'wb')
1531 else:
1532 import tempfile
1533 garbage, path = splittype(url)
1534 garbage, path = splithost(path or "")
1535 path, garbage = splitquery(path or "")
1536 path, garbage = splitattr(path or "")
1537 suffix = os.path.splitext(path)[1]
1538 (fd, filename) = tempfile.mkstemp(suffix)
1539 self.__tempfiles.append(filename)
1540 tfp = os.fdopen(fd, 'wb')
1541 try:
1542 result = filename, headers
1543 if self.tempcache is not None:
1544 self.tempcache[url] = result
1545 bs = 1024*8
1546 size = -1
1547 read = 0
1548 blocknum = 0
1549 if reporthook:
1550 if "content-length" in headers:
1551 size = int(headers["Content-Length"])
1552 reporthook(blocknum, bs, size)
1553 while 1:
1554 block = fp.read(bs)
1555 if not block:
1556 break
1557 read += len(block)
1558 tfp.write(block)
1559 blocknum += 1
1560 if reporthook:
1561 reporthook(blocknum, bs, size)
1562 finally:
1563 tfp.close()
1564 finally:
1565 fp.close()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001566 del fp
1567 del tfp
1568
1569 # raise exception if actual size does not match content-length header
1570 if size >= 0 and read < size:
Georg Brandl13e89462008-07-01 19:56:00 +00001571 raise ContentTooShortError(
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001572 "retrieval incomplete: got only %i out of %i bytes"
1573 % (read, size), result)
1574
1575 return result
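# Illustrative sketch (URL and filename are placeholders): retrieve() writes
# the body to the given file, or to a generated temporary file, and can report
# progress through a callable taking (blocknum, blocksize, totalsize):
#
# def hook(blocknum, blocksize, totalsize):
#     print('block %d (%d bytes each), total %d' % (blocknum, blocksize, totalsize))
#
# filename, headers = URLopener().retrieve('http://www.example.org/index.html',
#                                          'index.html', reporthook=hook)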
1576
1577 # Each method named open_<type> knows how to open that type of URL
1578
1579 def _open_generic_http(self, connection_factory, url, data):
1580 """Make an HTTP connection using connection_class.
1581
1582 This is an internal method that should be called from
1583 open_http() or open_https().
1584
1585 Arguments:
1586 - connection_factory should take a host name and return an
1587 HTTPConnection instance.
1588 - url is the URL to retrieve or a (host, relative-path) pair.
1589 - data is payload for a POST request or None.
1590 """
1591
1592 user_passwd = None
1593 proxy_passwd = None
1594 if isinstance(url, str):
Georg Brandl13e89462008-07-01 19:56:00 +00001595 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001596 if host:
Georg Brandl13e89462008-07-01 19:56:00 +00001597 user_passwd, host = splituser(host)
1598 host = unquote(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001599 realhost = host
1600 else:
1601 host, selector = url
1602 # check whether the proxy contains authorization information
Georg Brandl13e89462008-07-01 19:56:00 +00001603 proxy_passwd, host = splituser(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001604 # now we proceed with the url we want to obtain
Georg Brandl13e89462008-07-01 19:56:00 +00001605 urltype, rest = splittype(selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001606 url = rest
1607 user_passwd = None
1608 if urltype.lower() != 'http':
1609 realhost = None
1610 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001611 realhost, rest = splithost(rest)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001612 if realhost:
Georg Brandl13e89462008-07-01 19:56:00 +00001613 user_passwd, realhost = splituser(realhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001614 if user_passwd:
1615 selector = "%s://%s%s" % (urltype, realhost, rest)
1616 if proxy_bypass(realhost):
1617 host = realhost
1618
1619 #print "proxy via http:", host, selector
1620 if not host: raise IOError('http error', 'no host given')
1621
1622 if proxy_passwd:
1623 import base64
Senthil Kumaranfe2f4ec2010-08-04 17:49:13 +00001624 proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001625 else:
1626 proxy_auth = None
1627
1628 if user_passwd:
1629 import base64
Senthil Kumaranfe2f4ec2010-08-04 17:49:13 +00001630 auth = base64.b64encode(user_passwd.encode()).decode('ascii')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001631 else:
1632 auth = None
1633 http_conn = connection_factory(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001634 headers = {}
1635 if proxy_auth:
1636 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1637 if auth:
1638 headers["Authorization"] = "Basic %s" % auth
1639 if realhost:
1640 headers["Host"] = realhost
1641 for header, value in self.addheaders:
1642 headers[header] = value
1643
1644 if data is not None:
1645 headers["Content-Type"] = "application/x-www-form-urlencoded"
1646 http_conn.request("POST", selector, data, headers)
1647 else:
1648 http_conn.request("GET", selector, headers=headers)
1649
1650 try:
1651 response = http_conn.getresponse()
1652 except http.client.BadStatusLine:
1653 # something went wrong with the HTTP status line
Georg Brandl13e89462008-07-01 19:56:00 +00001654 raise URLError("http protocol error: bad status line")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001655
1656 # According to RFC 2616, a "2xx" code indicates that the client's
1657 # request was successfully received, understood, and accepted.
1658 if 200 <= response.status < 300:
Antoine Pitroub353c122009-02-11 00:39:14 +00001659 return addinfourl(response, response.msg, "http:" + url,
Georg Brandl13e89462008-07-01 19:56:00 +00001660 response.status)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001661 else:
1662 return self.http_error(
1663 url, response.fp,
1664 response.status, response.reason, response.msg, data)
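# Illustrative note (placeholder URL): passing `data` turns the request into a
# form-encoded POST, so
#
# URLopener().open('http://www.example.org/search', b'q=python&lang=en')
#
# sends "POST /search" with Content-Type: application/x-www-form-urlencoded,
# while omitting `data` issues a plain GET. The body is shown as bytes, which
# http.client passes through unchanged.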
1665
1666 def open_http(self, url, data=None):
1667 """Use HTTP protocol."""
1668 return self._open_generic_http(http.client.HTTPConnection, url, data)
1669
1670 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1671 """Handle http errors.
1672
1673 Derived class can override this, or provide specific handlers
1674 named http_error_DDD where DDD is the 3-digit error code."""
1675 # First check if there's a specific handler for this error
1676 name = 'http_error_%d' % errcode
1677 if hasattr(self, name):
1678 method = getattr(self, name)
1679 if data is None:
1680 result = method(url, fp, errcode, errmsg, headers)
1681 else:
1682 result = method(url, fp, errcode, errmsg, headers, data)
1683 if result: return result
1684 return self.http_error_default(url, fp, errcode, errmsg, headers)
1685
1686 def http_error_default(self, url, fp, errcode, errmsg, headers):
1687 """Default error handler: close the connection and raise IOError."""
1688 void = fp.read()
1689 fp.close()
Georg Brandl13e89462008-07-01 19:56:00 +00001690 raise HTTPError(url, errcode, errmsg, headers, None)
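# Illustrative sketch: http_error() above dispatches on the method name, so a
# subclass can intercept individual status codes by defining http_error_<code>
# methods (hypothetical subclass shown):
#
# class TolerantOpener(URLopener):
#     def http_error_404(self, url, fp, errcode, errmsg, headers):
#         # return the error page as an ordinary response instead of raising
#         return addinfourl(fp, headers, "http:" + url, errcode)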
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001691
1692 if _have_ssl:
1693 def _https_connection(self, host):
1694 return http.client.HTTPSConnection(host,
1695 key_file=self.key_file,
1696 cert_file=self.cert_file)
1697
1698 def open_https(self, url, data=None):
1699 """Use HTTPS protocol."""
1700 return self._open_generic_http(self._https_connection, url, data)
1701
1702 def open_file(self, url):
1703 """Use local file or FTP depending on form of URL."""
1704 if not isinstance(url, str):
1705 raise URLError('file error', 'proxy support for file protocol currently not implemented')
1706 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
1707 return self.open_ftp(url)
1708 else:
1709 return self.open_local_file(url)
1710
1711 def open_local_file(self, url):
1712 """Use local file."""
1713 import mimetypes, email.utils
1714 from io import StringIO
Georg Brandl13e89462008-07-01 19:56:00 +00001715 host, file = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001716 localname = url2pathname(file)
1717 try:
1718 stats = os.stat(localname)
1719 except OSError as e:
1720 raise URLError(e.strerror, e.filename)
1721 size = stats.st_size
1722 modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
1723 mtype = mimetypes.guess_type(url)[0]
1724 headers = email.message_from_string(
1725 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
1726 (mtype or 'text/plain', size, modified))
1727 if not host:
1728 urlfile = file
1729 if file[:1] == '/':
1730 urlfile = 'file://' + file
Georg Brandl13e89462008-07-01 19:56:00 +00001731 return addinfourl(open(localname, 'rb'), headers, urlfile)
1732 host, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001733 if (not port
Senthil Kumaran88a495d2009-12-27 10:15:45 +00001734 and socket.gethostbyname(host) in ((localhost(),) + thishost())):
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001735 urlfile = file
1736 if file[:1] == '/':
1737 urlfile = 'file://' + file
Georg Brandl13e89462008-07-01 19:56:00 +00001738 return addinfourl(open(localname, 'rb'), headers, urlfile)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001739 raise URLError('local file error', 'not on local host')
1740
1741 def open_ftp(self, url):
1742 """Use FTP protocol."""
1743 if not isinstance(url, str):
1744 raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
1745 import mimetypes
1746 from io import StringIO
Georg Brandl13e89462008-07-01 19:56:00 +00001747 host, path = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001748 if not host: raise URLError('ftp error', 'no host given')
Georg Brandl13e89462008-07-01 19:56:00 +00001749 host, port = splitport(host)
1750 user, host = splituser(host)
1751 if user: user, passwd = splitpasswd(user)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001752 else: passwd = None
Georg Brandl13e89462008-07-01 19:56:00 +00001753 host = unquote(host)
1754 user = unquote(user or '')
1755 passwd = unquote(passwd or '')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001756 host = socket.gethostbyname(host)
1757 if not port:
1758 import ftplib
1759 port = ftplib.FTP_PORT
1760 else:
1761 port = int(port)
Georg Brandl13e89462008-07-01 19:56:00 +00001762 path, attrs = splitattr(path)
1763 path = unquote(path)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001764 dirs = path.split('/')
1765 dirs, file = dirs[:-1], dirs[-1]
1766 if dirs and not dirs[0]: dirs = dirs[1:]
1767 if dirs and not dirs[0]: dirs[0] = '/'
1768 key = user, host, port, '/'.join(dirs)
1769 # XXX thread unsafe!
1770 if len(self.ftpcache) > MAXFTPCACHE:
1771 # Prune the cache, rather arbitrarily
1772 for k in self.ftpcache.keys():
1773 if k != key:
1774 v = self.ftpcache[k]
1775 del self.ftpcache[k]
1776 v.close()
1777 try:
1778 if not key in self.ftpcache:
1779 self.ftpcache[key] = \
1780 ftpwrapper(user, passwd, host, port, dirs)
1781 if not file: type = 'D'
1782 else: type = 'I'
1783 for attr in attrs:
Georg Brandl13e89462008-07-01 19:56:00 +00001784 attr, value = splitvalue(attr)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001785 if attr.lower() == 'type' and \
1786 value in ('a', 'A', 'i', 'I', 'd', 'D'):
1787 type = value.upper()
1788 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
1789 mtype = mimetypes.guess_type("ftp:" + url)[0]
1790 headers = ""
1791 if mtype:
1792 headers += "Content-Type: %s\n" % mtype
1793 if retrlen is not None and retrlen >= 0:
1794 headers += "Content-Length: %d\n" % retrlen
1795 headers = email.message_from_string(headers)
Georg Brandl13e89462008-07-01 19:56:00 +00001796 return addinfourl(fp, headers, "ftp:" + url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001797 except ftperrors() as msg:
1798 raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
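# Illustrative examples (placeholder host): a ";type=" attribute selects the
# transfer mode, e.g. 'ftp://ftp.example.org/pub/notes.txt;type=a' forces an
# ASCII ('TYPE A') retrieval, a plain file URL defaults to binary ('TYPE I'),
# and a URL whose path ends in '/' produces a directory listing ('LIST').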
1799
1800 def open_data(self, url, data=None):
1801 """Use "data" URL."""
1802 if not isinstance(url, str):
1803 raise URLError('data error', 'proxy support for data protocol currently not implemented')
1804 # ignore POSTed data
1805 #
1806 # syntax of data URLs:
1807 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
1808 # mediatype := [ type "/" subtype ] *( ";" parameter )
1809 # data := *urlchar
1810 # parameter := attribute "=" value
1811 try:
1812 [type, data] = url.split(',', 1)
1813 except ValueError:
1814 raise IOError('data error', 'bad data URL')
1815 if not type:
1816 type = 'text/plain;charset=US-ASCII'
1817 semi = type.rfind(';')
1818 if semi >= 0 and '=' not in type[semi:]:
1819 encoding = type[semi+1:]
1820 type = type[:semi]
1821 else:
1822 encoding = ''
1823 msg = []
Senthil Kumaran5a3bc652010-05-01 08:32:23 +00001824 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001825 time.gmtime(time.time())))
1826 msg.append('Content-type: %s' % type)
1827 if encoding == 'base64':
1828 import base64
Georg Brandl706824f2009-06-04 09:42:55 +00001829 # XXX is this encoding/decoding ok?
1830 data = base64.decodebytes(data.encode('ascii')).decode('latin1')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001831 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001832 data = unquote(data)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001833 msg.append('Content-Length: %d' % len(data))
1834 msg.append('')
1835 msg.append(data)
1836 msg = '\n'.join(msg)
Georg Brandl13e89462008-07-01 19:56:00 +00001837 headers = email.message_from_string(msg)
1838 f = io.StringIO(msg)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001839 #f.fileno = None # needed for addinfourl
Georg Brandl13e89462008-07-01 19:56:00 +00001840 return addinfourl(f, headers, url)
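# Illustrative example: a base64 data URL decodes to its payload, e.g.
#
# f = URLopener().open('data:text/plain;base64,aGVsbG8=')
# f.info().get_content_type()   ->  'text/plain'
#
# Note that f.read() returns the synthesized headers followed by 'hello',
# because the StringIO above is built from the whole RFC 822-style message
# rather than just the decoded body.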
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001841
1842
1843class FancyURLopener(URLopener):
1844 """Derived class with handlers for errors we can handle (perhaps)."""
1845
1846 def __init__(self, *args, **kwargs):
1847 URLopener.__init__(self, *args, **kwargs)
1848 self.auth_cache = {}
1849 self.tries = 0
1850 self.maxtries = 10
1851
1852 def http_error_default(self, url, fp, errcode, errmsg, headers):
1853 """Default error handling -- don't raise an exception."""
Georg Brandl13e89462008-07-01 19:56:00 +00001854 return addinfourl(fp, headers, "http:" + url, errcode)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001855
1856 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
1857 """Error 302 -- relocated (temporarily)."""
1858 self.tries += 1
1859 if self.maxtries and self.tries >= self.maxtries:
1860 if hasattr(self, "http_error_500"):
1861 meth = self.http_error_500
1862 else:
1863 meth = self.http_error_default
1864 self.tries = 0
1865 return meth(url, fp, 500,
1866 "Internal Server Error: Redirect Recursion", headers)
1867 result = self.redirect_internal(url, fp, errcode, errmsg, headers,
1868 data)
1869 self.tries = 0
1870 return result
1871
1872 def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
1873 if 'location' in headers:
1874 newurl = headers['location']
1875 elif 'uri' in headers:
1876 newurl = headers['uri']
1877 else:
1878 return
1879 void = fp.read()
1880 fp.close()
guido@google.coma119df92011-03-29 11:41:02 -07001881
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001882 # In case the server sent a relative URL, join with original:
Georg Brandl13e89462008-07-01 19:56:00 +00001883 newurl = urljoin(self.type + ":" + url, newurl)
guido@google.coma119df92011-03-29 11:41:02 -07001884
1885 urlparts = urlparse(newurl)
1886
1887 # For security reasons, we don't allow redirection to anything other
1888 # than http, https and ftp.
1889
1890 # We are using newer HTTPError with older redirect_internal method
1891 # This older method will get deprecated in 3.3
1892
1893 if not urlparts.scheme in ('http', 'https', 'ftp'):
1894 raise HTTPError(newurl, errcode,
1895 errmsg +
1896 " Redirection to url '%s' is not allowed." % newurl,
1897 headers, fp)
1898
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001899 return self.open(newurl)
1900
1901 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
1902 """Error 301 -- also relocated (permanently)."""
1903 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
1904
1905 def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
1906 """Error 303 -- also relocated (essentially identical to 302)."""
1907 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
1908
1909 def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
1910 """Error 307 -- relocated, but turn POST into error."""
1911 if data is None:
1912 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
1913 else:
1914 return self.http_error_default(url, fp, errcode, errmsg, headers)
1915
Senthil Kumaranb4d1c2c2010-06-18 15:12:48 +00001916 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
1917 retry=False):
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001918 """Error 401 -- authentication required.
1919 This function supports Basic authentication only."""
1920 if not 'www-authenticate' in headers:
1921 URLopener.http_error_default(self, url, fp,
1922 errcode, errmsg, headers)
1923 stuff = headers['www-authenticate']
1924 import re
1925 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
1926 if not match:
1927 URLopener.http_error_default(self, url, fp,
1928 errcode, errmsg, headers)
1929 scheme, realm = match.groups()
1930 if scheme.lower() != 'basic':
1931 URLopener.http_error_default(self, url, fp,
1932 errcode, errmsg, headers)
Senthil Kumaranb4d1c2c2010-06-18 15:12:48 +00001933 if not retry:
1934 URLopener.http_error_default(self, url, fp, errcode, errmsg,
1935 headers)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001936 name = 'retry_' + self.type + '_basic_auth'
1937 if data is None:
1938 return getattr(self,name)(url, realm)
1939 else:
1940 return getattr(self,name)(url, realm, data)
1941
Senthil Kumaranb4d1c2c2010-06-18 15:12:48 +00001942 def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
1943 retry=False):
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001944 """Error 407 -- proxy authentication required.
1945 This function supports Basic authentication only."""
1946 if not 'proxy-authenticate' in headers:
1947 URLopener.http_error_default(self, url, fp,
1948 errcode, errmsg, headers)
1949 stuff = headers['proxy-authenticate']
1950 import re
1951 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
1952 if not match:
1953 URLopener.http_error_default(self, url, fp,
1954 errcode, errmsg, headers)
1955 scheme, realm = match.groups()
1956 if scheme.lower() != 'basic':
1957 URLopener.http_error_default(self, url, fp,
1958 errcode, errmsg, headers)
Senthil Kumaranb4d1c2c2010-06-18 15:12:48 +00001959 if not retry:
1960 URLopener.http_error_default(self, url, fp, errcode, errmsg,
1961 headers)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001962 name = 'retry_proxy_' + self.type + '_basic_auth'
1963 if data is None:
1964 return getattr(self,name)(url, realm)
1965 else:
1966 return getattr(self,name)(url, realm, data)
1967
1968 def retry_proxy_http_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00001969 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001970 newurl = 'http://' + host + selector
1971 proxy = self.proxies['http']
Georg Brandl13e89462008-07-01 19:56:00 +00001972 urltype, proxyhost = splittype(proxy)
1973 proxyhost, proxyselector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001974 i = proxyhost.find('@') + 1
1975 proxyhost = proxyhost[i:]
1976 user, passwd = self.get_user_passwd(proxyhost, realm, i)
1977 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00001978 proxyhost = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001979 quote(passwd, safe=''), proxyhost)
1980 self.proxies['http'] = 'http://' + proxyhost + proxyselector
1981 if data is None:
1982 return self.open(newurl)
1983 else:
1984 return self.open(newurl, data)
1985
1986 def retry_proxy_https_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00001987 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001988 newurl = 'https://' + host + selector
1989 proxy = self.proxies['https']
Georg Brandl13e89462008-07-01 19:56:00 +00001990 urltype, proxyhost = splittype(proxy)
1991 proxyhost, proxyselector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001992 i = proxyhost.find('@') + 1
1993 proxyhost = proxyhost[i:]
1994 user, passwd = self.get_user_passwd(proxyhost, realm, i)
1995 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00001996 proxyhost = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001997 quote(passwd, safe=''), proxyhost)
1998 self.proxies['https'] = 'https://' + proxyhost + proxyselector
1999 if data is None:
2000 return self.open(newurl)
2001 else:
2002 return self.open(newurl, data)
2003
2004 def retry_http_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00002005 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002006 i = host.find('@') + 1
2007 host = host[i:]
2008 user, passwd = self.get_user_passwd(host, realm, i)
2009 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00002010 host = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002011 quote(passwd, safe=''), host)
2012 newurl = 'http://' + host + selector
2013 if data is None:
2014 return self.open(newurl)
2015 else:
2016 return self.open(newurl, data)
2017
2018 def retry_https_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00002019 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002020 i = host.find('@') + 1
2021 host = host[i:]
2022 user, passwd = self.get_user_passwd(host, realm, i)
2023 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00002024 host = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002025 quote(passwd, safe=''), host)
2026 newurl = 'https://' + host + selector
2027 if data is None:
2028 return self.open(newurl)
2029 else:
2030 return self.open(newurl, data)
2031
Florent Xicluna37ddbb82010-08-14 21:06:29 +00002032 def get_user_passwd(self, host, realm, clear_cache=0):
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002033 key = realm + '@' + host.lower()
2034 if key in self.auth_cache:
2035 if clear_cache:
2036 del self.auth_cache[key]
2037 else:
2038 return self.auth_cache[key]
2039 user, passwd = self.prompt_user_passwd(host, realm)
2040 if user or passwd: self.auth_cache[key] = (user, passwd)
2041 return user, passwd
2042
2043 def prompt_user_passwd(self, host, realm):
2044 """Override this in a GUI environment!"""
2045 import getpass
2046 try:
2047 user = input("Enter username for %s at %s: " % (realm, host))
2048 passwd = getpass.getpass("Enter password for %s in %s at %s: " %
2049 (user, realm, host))
2050 return user, passwd
2051 except KeyboardInterrupt:
2052 print()
2053 return None, None
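# Illustrative sketch: a non-interactive subclass can supply credentials for
# 401/407 responses instead of prompting on stdin (hypothetical subclass,
# placeholder credentials and URL):
#
# class AutoAuthOpener(FancyURLopener):
#     def prompt_user_passwd(self, host, realm):
#         return 'someuser', 'somepassword'
#
# f = AutoAuthOpener().open('http://www.example.org/protected/')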
2054
2055
2056# Utility functions
2057
2058_localhost = None
2059def localhost():
2060 """Return the IP address of the magic hostname 'localhost'."""
2061 global _localhost
2062 if _localhost is None:
2063 _localhost = socket.gethostbyname('localhost')
2064 return _localhost
2065
2066_thishost = None
2067def thishost():
Senthil Kumaran88a495d2009-12-27 10:15:45 +00002068 """Return the IP addresses of the current host."""
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002069 global _thishost
2070 if _thishost is None:
Senthil Kumaran88a495d2009-12-27 10:15:45 +00002071 _thishost = tuple(socket.gethostbyname_ex(socket.gethostname())[2])
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002072 return _thishost
2073
2074_ftperrors = None
2075def ftperrors():
2076 """Return the set of errors raised by the FTP class."""
2077 global _ftperrors
2078 if _ftperrors is None:
2079 import ftplib
2080 _ftperrors = ftplib.all_errors
2081 return _ftperrors
2082
2083_noheaders = None
2084def noheaders():
Georg Brandl13e89462008-07-01 19:56:00 +00002085 """Return an empty email Message object."""
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002086 global _noheaders
2087 if _noheaders is None:
Georg Brandl13e89462008-07-01 19:56:00 +00002088 _noheaders = email.message_from_string("")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002089 return _noheaders
2090
2091
2092# Utility classes
2093
2094class ftpwrapper:
2095 """Class used by open_ftp() for cache of open FTP connections."""
2096
2097 def __init__(self, user, passwd, host, port, dirs, timeout=None):
2098 self.user = user
2099 self.passwd = passwd
2100 self.host = host
2101 self.port = port
2102 self.dirs = dirs
2103 self.timeout = timeout
2104 self.init()
2105
2106 def init(self):
2107 import ftplib
2108 self.busy = 0
2109 self.ftp = ftplib.FTP()
2110 self.ftp.connect(self.host, self.port, self.timeout)
2111 self.ftp.login(self.user, self.passwd)
2112 for dir in self.dirs:
2113 self.ftp.cwd(dir)
2114
2115 def retrfile(self, file, type):
2116 import ftplib
2117 self.endtransfer()
2118 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
2119 else: cmd = 'TYPE ' + type; isdir = 0
2120 try:
2121 self.ftp.voidcmd(cmd)
2122 except ftplib.all_errors:
2123 self.init()
2124 self.ftp.voidcmd(cmd)
2125 conn = None
2126 if file and not isdir:
2127 # Try to retrieve as a file
2128 try:
2129 cmd = 'RETR ' + file
2130 conn = self.ftp.ntransfercmd(cmd)
2131 except ftplib.error_perm as reason:
2132 if str(reason)[:3] != '550':
Georg Brandl13e89462008-07-01 19:56:00 +00002133 raise URLError('ftp error', reason).with_traceback(
2134 sys.exc_info()[2])
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002135 if not conn:
2136 # Set transfer mode to ASCII!
2137 self.ftp.voidcmd('TYPE A')
2138 # Try a directory listing. Verify that directory exists.
2139 if file:
2140 pwd = self.ftp.pwd()
2141 try:
2142 try:
2143 self.ftp.cwd(file)
2144 except ftplib.error_perm as reason:
Georg Brandl13e89462008-07-01 19:56:00 +00002145 raise URLError('ftp error', reason) from reason
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002146 finally:
2147 self.ftp.cwd(pwd)
2148 cmd = 'LIST ' + file
2149 else:
2150 cmd = 'LIST'
2151 conn = self.ftp.ntransfercmd(cmd)
2152 self.busy = 1
2153 # Pass back both a suitably decorated object and a retrieval length
Georg Brandl13e89462008-07-01 19:56:00 +00002154 return (addclosehook(conn[0].makefile('rb'), self.endtransfer), conn[1])
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002155 def endtransfer(self):
2156 if not self.busy:
2157 return
2158 self.busy = 0
2159 try:
2160 self.ftp.voidresp()
2161 except ftperrors():
2162 pass
2163
2164 def close(self):
2165 self.endtransfer()
2166 try:
2167 self.ftp.close()
2168 except ftperrors():
2169 pass
2170
2171# Proxy handling
2172def getproxies_environment():
2173 """Return a dictionary of scheme -> proxy server URL mappings.
2174
2175 Scan the environment for variables named <scheme>_proxy;
2176 this seems to be the standard convention. If you need a
2177 different way, you can pass a proxies dictionary to the
2178 [Fancy]URLopener constructor.
2179
2180 """
2181 proxies = {}
2182 for name, value in os.environ.items():
2183 name = name.lower()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002184 if value and name[-6:] == '_proxy':
2185 proxies[name[:-6]] = value
2186 return proxies
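# Illustrative example: with the environment containing
#
#   http_proxy="http://proxy.example.org:3128"
#   FTP_PROXY="http://proxy.example.org:3128"
#
# getproxies_environment() returns
#   {'http': 'http://proxy.example.org:3128',
#    'ftp': 'http://proxy.example.org:3128'}
# (upper-case variable names work too because of the lower() call above).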
2187
2188def proxy_bypass_environment(host):
2189 """Test if proxies should not be used for a particular host.
2190
2191 Checks the environment for a variable named no_proxy, which should
2192 be a list of DNS suffixes separated by commas, or '*' for all hosts.
2193 """
2194 no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
2195 # '*' is special case for always bypass
2196 if no_proxy == '*':
2197 return 1
2198 # strip port off host
Georg Brandl13e89462008-07-01 19:56:00 +00002199 hostonly, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002200 # check if the host ends with any of the DNS suffixes
2201 for name in no_proxy.split(','):
2202 if name and (hostonly.endswith(name) or host.endswith(name)):
2203 return 1
2204 # otherwise, don't bypass
2205 return 0
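# Illustrative example: with no_proxy="example.org,localhost,.internal" set,
# proxy_bypass_environment('www.example.org:8080') returns 1 (suffix match on
# 'example.org'), while proxy_bypass_environment('www.python.org') returns 0.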
2206
2207
Ronald Oussorene72e1612011-03-14 18:15:25 -04002208# This code tests an OS X-specific data structure but is testable on all
2209# platforms
2210def _proxy_bypass_macosx_sysconf(host, proxy_settings):
2211 """
2212 Return True iff this host shouldn't be accessed using a proxy
2213
2214 This function uses the MacOSX framework SystemConfiguration
2215 to fetch the proxy information.
2216
2217 proxy_settings comes from _scproxy._get_proxy_settings or is mocked, e.g.:
2218 { 'exclude_simple': bool,
2219 'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
2220 }
2221 """
2222 import re
2223 import socket
2224 from fnmatch import fnmatch
2225
2226 hostonly, port = splitport(host)
2227
2228 def ip2num(ipAddr):
2229 parts = ipAddr.split('.')
2230 parts = list(map(int, parts))
2231 if len(parts) != 4:
2232 parts = (parts + [0, 0, 0, 0])[:4]
2233 return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
2234
2235 # Check for simple host names:
2236 if '.' not in host:
2237 if proxy_settings['exclude_simple']:
2238 return True
2239
2240 hostIP = None
2241
2242 for value in proxy_settings.get('exceptions', ()):
2243 # Items in the list are strings like these: *.local, 169.254/16
2244 if not value: continue
2245
2246 m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
2247 if m is not None:
2248 if hostIP is None:
2249 try:
2250 hostIP = socket.gethostbyname(hostonly)
2251 hostIP = ip2num(hostIP)
2252 except socket.error:
2253 continue
2254
2255 base = ip2num(m.group(1))
2256 mask = m.group(2)
2257 if mask is None:
2258 mask = 8 * (m.group(1).count('.') + 1)
2259 else:
2260 mask = int(mask[1:])
2261 mask = 32 - mask
2262
2263 if (hostIP >> mask) == (base >> mask):
2264 return True
2265
2266 elif fnmatch(host, value):
2267 return True
2268
2269 return False
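# Illustrative example: with proxy_settings of
#   {'exclude_simple': True, 'exceptions': ['*.local', '169.254/16']}
# a bare name such as 'mybox' is bypassed via exclude_simple, 'foo.local'
# matches the '*.local' glob, and any host resolving into 169.254.0.0/16
# matches the CIDR-style entry.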
2270
2271
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002272if sys.platform == 'darwin':
Ronald Oussoren218cc582010-04-18 20:49:34 +00002273 from _scproxy import _get_proxy_settings, _get_proxies
2274
2275 def proxy_bypass_macosx_sysconf(host):
Ronald Oussoren218cc582010-04-18 20:49:34 +00002276 proxy_settings = _get_proxy_settings()
Ronald Oussorene72e1612011-03-14 18:15:25 -04002277 return _proxy_bypass_macosx_sysconf(host, proxy_settings)
Ronald Oussoren218cc582010-04-18 20:49:34 +00002278
2279 def getproxies_macosx_sysconf():
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002280 """Return a dictionary of scheme -> proxy server URL mappings.
2281
Ronald Oussoren218cc582010-04-18 20:49:34 +00002282 This function uses the MacOSX framework SystemConfiguration
2283 to fetch the proxy information.
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002284 """
Ronald Oussoren218cc582010-04-18 20:49:34 +00002285 return _get_proxies()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002286
Ronald Oussoren218cc582010-04-18 20:49:34 +00002287
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002288
2289 def proxy_bypass(host):
2290 if getproxies_environment():
2291 return proxy_bypass_environment(host)
2292 else:
Ronald Oussoren218cc582010-04-18 20:49:34 +00002293 return proxy_bypass_macosx_sysconf(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002294
2295 def getproxies():
Ronald Oussoren218cc582010-04-18 20:49:34 +00002296 return getproxies_environment() or getproxies_macosx_sysconf()
2297
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002298
2299elif os.name == 'nt':
2300 def getproxies_registry():
2301 """Return a dictionary of scheme -> proxy server URL mappings.
2302
2303 Win32 uses the registry to store proxies.
2304
2305 """
2306 proxies = {}
2307 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002308 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002309 except ImportError:
2310 # Std module, so should be around - but you never know!
2311 return proxies
2312 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002313 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002314 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002315 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002316 'ProxyEnable')[0]
2317 if proxyEnable:
2318 # Returned as Unicode but problems if not converted to ASCII
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002319 proxyServer = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002320 'ProxyServer')[0])
2321 if '=' in proxyServer:
2322 # Per-protocol settings
2323 for p in proxyServer.split(';'):
2324 protocol, address = p.split('=', 1)
2325 # See if address has a type:// prefix
2326 import re
2327 if not re.match('^([^/:]+)://', address):
2328 address = '%s://%s' % (protocol, address)
2329 proxies[protocol] = address
2330 else:
2331 # Use one setting for all protocols
2332 if proxyServer[:5] == 'http:':
2333 proxies['http'] = proxyServer
2334 else:
2335 proxies['http'] = 'http://%s' % proxyServer
Senthil Kumaran1ea57a62010-07-14 20:13:28 +00002336 proxies['https'] = 'https://%s' % proxyServer
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002337 proxies['ftp'] = 'ftp://%s' % proxyServer
2338 internetSettings.Close()
2339 except (WindowsError, ValueError, TypeError):
2340 # Either registry key not found etc, or the value in an
2341 # unexpected format.
2342 # proxies already set up to be empty so nothing to do
2343 pass
2344 return proxies
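# Illustrative example: a per-protocol registry value such as
#   ProxyServer = "http=proxy.example.org:3128;ftp=ftpproxy.example.org:2121"
# yields {'http': 'http://proxy.example.org:3128',
#         'ftp': 'ftp://ftpproxy.example.org:2121'},
# while a bare "proxy.example.org:3128" is applied to http, https and ftp alike.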
2345
2346 def getproxies():
2347 """Return a dictionary of scheme -> proxy server URL mappings.
2348
2349 Returns settings gathered from the environment, if specified,
2350 or the registry.
2351
2352 """
2353 return getproxies_environment() or getproxies_registry()
2354
2355 def proxy_bypass_registry(host):
2356 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002357 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002358 import re
2359 except ImportError:
2360 # Std modules, so should be around - but you never know!
2361 return 0
2362 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002363 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002364 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002365 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002366 'ProxyEnable')[0]
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002367 proxyOverride = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002368 'ProxyOverride')[0])
2369 # ^^^^ Returned as Unicode but problems if not converted to ASCII
2370 except WindowsError:
2371 return 0
2372 if not proxyEnable or not proxyOverride:
2373 return 0
2374 # try to make a host list from name and IP address.
Georg Brandl13e89462008-07-01 19:56:00 +00002375 rawHost, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002376 host = [rawHost]
2377 try:
2378 addr = socket.gethostbyname(rawHost)
2379 if addr != rawHost:
2380 host.append(addr)
2381 except socket.error:
2382 pass
2383 try:
2384 fqdn = socket.getfqdn(rawHost)
2385 if fqdn != rawHost:
2386 host.append(fqdn)
2387 except socket.error:
2388 pass
2389 # make a check value list from the registry entry: replace the
2390 # '<local>' string by the localhost entry and the corresponding
2391 # canonical entry.
2392 proxyOverride = proxyOverride.split(';')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002393 # now check if we match one of the registry values.
2394 for test in proxyOverride:
Senthil Kumaran49476062009-05-01 06:00:23 +00002395 if test == '<local>':
2396 if '.' not in rawHost:
2397 return 1
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002398 test = test.replace(".", r"\.") # mask dots
2399 test = test.replace("*", r".*") # change glob sequence
2400 test = test.replace("?", r".") # change glob char
2401 for val in host:
2402 # print "%s <--> %s" %( test, val )
2403 if re.match(test, val, re.I):
2404 return 1
2405 return 0
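# Illustrative example: ProxyOverride = "<local>;*.example.org;192.168.*"
# bypasses the proxy for single-label host names, for anything under
# example.org, and for addresses whose textual form starts with "192.168."
# (a glob match, not a real netmask).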
2406
2407 def proxy_bypass(host):
2408 """Return a dictionary of scheme -> proxy server URL mappings.
2409
2410 Returns settings gathered from the environment, if specified,
2411 or the registry.
2412
2413 """
2414 if getproxies_environment():
2415 return proxy_bypass_environment(host)
2416 else:
2417 return proxy_bypass_registry(host)
2418
2419else:
2420 # By default use environment variables
2421 getproxies = getproxies_environment
2422 proxy_bypass = proxy_bypass_environment