# Issues in merging urllib and urllib2:
# 1. They both define a function named urlopen()

"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work.  Each Handler implements a particular protocol or
option.  The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL.  For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns.  The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as the original
urllib: pass the URL and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, the argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.
objects of interest:
OpenerDirector --

Request -- An object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler --

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib.request

# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)

# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('http://www.python.org/')
"""

# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails, how should the error be signalled?  The client needs to
# know the HTTP error code.  But if the handler knows what the problem
# was, e.g., that it didn't support the hash algorithm requested in
# the challenge, it would be good to pass that information along to
# the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation

# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener

import base64
import email
import hashlib
import http.client
import io
import os
import posixpath
import random
import re
import socket
import sys
import time
import urllib.parse, urllib.error, urllib.response
import bisect

from io import StringIO

# check for SSL
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

# used in User-Agent header sent
__version__ = sys.version[:3]

_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    global _opener
    if _opener is None:
        _opener = build_opener()
    return _opener.open(url, data, timeout)

def install_opener(opener):
    global _opener
    _opener = opener

# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)

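# A usage sketch (URL is illustrative): download a resource to a local file
# while reporting progress.  The hook receives the block count so far, the
# block size, and the total size from Content-Length (-1 if unknown).
#
#     def report(blocknum, bs, size):
#         print(blocknum * bs, "bytes of", size)
#
#     filename, headers = urlretrieve('http://www.python.org/',
#                                     reporthook=report)
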
def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()
    global _opener
    if _opener:
        _opener = None

# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.get_full_url()
    host = urllib.parse.urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = _cut_port_re.sub("", host, 1)
    return host.lower()

class Request:

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = urllib.parse.unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        self.headers = {}
        for key, value in headers.items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                getattr(self, 'get_' + name)()
                return getattr(self, attr)
        raise AttributeError(attr)

    def get_method(self):
        if self.has_data():
            return "POST"
        else:
            return "GET"

    # XXX these helper methods are lame

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.__original

    def get_type(self):
        if self.type is None:
            self.type, self.__r_type = urllib.parse.splittype(self.__original)
            if self.type is None:
                raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        if self.host is None:
            self.host, self.__r_host = urllib.parse.splithost(self.__r_type)
            if self.host:
                self.host = urllib.parse.unquote(self.host)
        return self.host

    def get_selector(self):
        return self.__r_host

    def set_proxy(self, host, type):
        self.host, self.type = host, type
        self.__r_host = self.__original

    def get_origin_req_host(self):
        return self.origin_req_host

    def is_unverifiable(self):
        return self.unverifiable

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())

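# A construction sketch (host and header value are illustrative): a Request
# carries the URL, optional POST data, and any extra headers.
#
#     req = Request('http://www.example.com/',
#                   headers={'User-agent': 'MyClient/1.0'})
#     req.get_method()    # 'GET' (would be 'POST' if data were given)
#     req.get_host()      # 'www.example.com'
#     req.header_items()  # [('User-agent', 'MyClient/1.0')]
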
class OpenerDirector:
    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in a specific order; the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

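    # Handler methods are discovered by name: <protocol>_open,
    # <protocol>_request, <protocol>_response and <protocol>_error_<code>
    # are registered automatically.  A sketch of a custom pre-processor
    # (the header value is illustrative):
    #
    #     class UserAgentHandler(BaseHandler):
    #         def http_request(self, req):
    #             req.add_header('User-agent', 'MyClient/1.0')
    #             return req
    #
    #     opener = build_opener(UserAgentHandler)
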
    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)

            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        req.timeout = timeout
        protocol = req.get_type()

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.get_type()
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http']  # https is no different from http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if isclass(check):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isclass(h):
            h = h()
        opener.add_handler(h)
    return opener

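# A sketch of the displacement rule above: passing a handler instance (or
# subclass) of a default handler replaces that default.  Here a debugging
# HTTPHandler stands in for the stock one.
#
#     opener = build_opener(HTTPHandler(debuglevel=1))
#     install_opener(opener)
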
class BaseHandler:
    handler_order = 500

    def add_parent(self, parent):
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if not hasattr(other, "handler_order"):
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other.handler_order


class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if not (200 <= code < 300):
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response

class HTTPDefaultErrorHandler(BaseHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise urllib.error.HTTPError(req.get_full_url(), code, msg, hdrs, fp)

class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.get_full_url(),
                                         code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # Be lenient with URIs containing a space.
        newurl = newurl.replace(' ', '%20')
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.get_origin_req_host(),
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return
        newurl = urllib.parse.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise urllib.error.HTTPError(req.get_full_url(), code,
                                             self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"

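# A customization sketch: redirect_request is the hook to override.  This
# hypothetical subclass refuses to follow any redirect by raising HTTPError.
#
#     class NoRedirectHandler(HTTPRedirectHandler):
#         def redirect_request(self, req, fp, code, msg, headers, newurl):
#             raise urllib.error.HTTPError(req.get_full_url(), code, msg,
#                                          headers, fp)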

def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, r_scheme = urllib.parse.splittype(proxy)
    if not r_scheme.startswith("/"):
        # authority
        scheme = None
        authority = proxy
    else:
        # URL
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.2
        # and 3.3), path is empty or starts with '/'
        end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]
    userinfo, hostport = urllib.parse.splituser(authority)
    if userinfo is not None:
        user, password = urllib.parse.splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport

class ProxyHandler(BaseHandler):
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            user_pass = '%s:%s' % (urllib.parse.unquote(user),
                                   urllib.parse.unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = urllib.parse.unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req)

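# A usage sketch: an explicit mapping routes each scheme through a proxy
# (the host is illustrative), while an empty mapping disables proxies
# entirely, including any picked up from the environment by getproxies().
#
#     proxy_handler = ProxyHandler({'http': 'http://proxy.example.com:3128'})
#     no_proxies = ProxyHandler({})
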
class HTTPPasswordMgr:

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urllib.parse.urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = urllib.parse.splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False

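# A lookup sketch (realm, URI and credentials are illustrative): passwords
# registered for a URI also match any URI below it in the path tree, with or
# without the scheme's default port.
#
#     mgr = HTTPPasswordMgr()
#     mgr.add_password('Realm', 'http://example.com/private/', 'joe', 'secret')
#     mgr.find_user_password('Realm', 'http://example.com/private/doc.html')
#     # -> ('joe', 'secret')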

class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):

    def find_user_password(self, realm, authuri):
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is not None:
            return user, password
        return HTTPPasswordMgr.find_user_password(self, None, authuri)

class AbstractBasicAuthHandler:

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, quote, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_header(self.auth_header, auth)
            return self.parent.open(req)
        else:
            return None


class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.get_full_url()
        return self.http_error_auth_reqed('www-authenticate',
                                          url, req, headers)

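# A wiring sketch (URL and credentials are illustrative): pair the handler
# with a password manager, then build an opener around it.
#
#     mgr = HTTPPasswordMgrWithDefaultRealm()
#     mgr.add_password(None, 'http://example.com/', 'joe', 'secret')
#     opener = build_opener(HTTPBasicAuthHandler(mgr))
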
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          authority, req, headers)


def randombytes(n):
    """Return n random bytes."""
    return os.urandom(n)

class AbstractDigestAuthHandler:
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX Support for qop="auth-int" is shaky.

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time.  Hm.  Unless the Password Manager is
            # prompting for the information.  Crap.  This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise urllib.error.HTTPError(req.get_full_url(), 401,
                                         "digest auth failed",
                                         headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop == 'auth':
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise urllib.error.URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # unknown algorithm; the caller treats H is None as
            # "unsupported" and gives up on the challenge
            H = None
        # XXX MD5-sess
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None


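# A worked sketch of the RFC 2617 response computation above, for the common
# MD5 + qop=auth case (all values illustrative):
#
#     H  = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
#     KD = lambda s, d: H("%s:%s" % (s, d))
#     A1 = "user:realm:password"
#     A2 = "GET:/index.html"
#     response = KD(H(A1), "nonce:00000001:cnonce:auth:" + H(A2))
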
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        host = urllib.parse.urlparse(req.get_full_url())[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry


class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.get_host()
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry

class AbstractHTTPHandler(BaseHandler):

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        host = request.get_host()
        if not host:
            raise urllib.error.URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        scheme, sel = urllib.parse.splittype(request.get_selector())
        sel_host, sel_path = urllib.parse.splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return the headers as an email.message.Message instance
            - geturl(): return the original request URL
            - code: HTTP status code
        """
        host = req.get_host()
        if not host:
            raise urllib.error.URLError('no host given')

        h = http_class(host, timeout=req.timeout)  # will parse host:port
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict(
            (name.title(), val) for name, val in headers.items())
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error as err:  # XXX what error?
            raise urllib.error.URLError(err)

        resp = urllib.response.addinfourl(r.fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp


class HTTPHandler(AbstractHTTPHandler):

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_

if hasattr(http.client, 'HTTPSConnection'):
    class HTTPSHandler(AbstractHTTPHandler):

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_

class HTTPCookieProcessor(BaseHandler):
    def __init__(self, cookiejar=None):
        import http.cookiejar
        if cookiejar is None:
            cookiejar = http.cookiejar.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response

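# A usage sketch: route all requests through one CookieJar so cookies set by
# a server are replayed on later requests in the same opener.
#
#     import http.cookiejar
#     jar = http.cookiejar.CookieJar()
#     opener = build_opener(HTTPCookieProcessor(jar))
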
class UnknownHandler(BaseHandler):
    def unknown_open(self, req):
        type = req.get_type()
        raise urllib.error.URLError('unknown url type: %s' % type)

def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated."""
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        if v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed

def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    res = []
    part = ''

    escape = quote = False
    for cur in s:
        if escape:
            part += cur
            escape = False
            continue
        if quote:
            if cur == '\\':
                escape = True
                continue
            elif cur == '"':
                quote = False
            part += cur
            continue

        if cur == ',':
            res.append(part)
            part = ''
            continue

        if cur == '"':
            quote = True

        part += cur

    # append last part
    if part:
        res.append(part)

    return [part.strip() for part in res]

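# A parsing sketch (values illustrative): together these two functions split
# a challenge such as a Digest header into a dict.
#
#     items = parse_http_list('realm="x@y", qop="auth,auth-int", stale=FALSE')
#     # ['realm="x@y"', 'qop="auth,auth-int"', 'stale=FALSE']
#     parse_keqv_list(items)
#     # {'realm': 'x@y', 'qop': 'auth,auth-int', 'stale': 'FALSE'}
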
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.get_selector()
        if url[:2] == '//' and url[2:3] != '/':
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                     socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        import email.utils
        import mimetypes
        host = req.get_host()
        file = req.get_selector()
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(file)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = urllib.parse.splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                return urllib.response.addinfourl(open(localfile, 'rb'),
                                                  headers, 'file:'+file)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise urllib.error.URLError(msg)
        raise urllib.error.URLError('file not on local host')

def _safe_gethostbyname(host):
    try:
        return socket.gethostbyname(host)
    except socket.gaierror:
        return None

class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.get_host()
        if not host:
            raise urllib.error.URLError('ftp error: no host given')
        host, port = urllib.parse.splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = urllib.parse.splituser(host)
        if user:
            user, passwd = urllib.parse.splitpasswd(user)
        else:
            passwd = None
        host = urllib.parse.unquote(host)
        user = urllib.parse.unquote(user or '')
        passwd = urllib.parse.unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise urllib.error.URLError(msg)
        path, attrs = urllib.parse.splitattr(req.get_selector())
        dirs = path.split('/')
        dirs = list(map(urllib.parse.unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = urllib.parse.splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return urllib.response.addinfourl(fp, headers, req.get_full_url())
        except ftplib.all_errors as msg:
            exc = urllib.error.URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
        return fw

class CacheFTPHandler(FTPHandler):
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}
        self.timeout = {}
        self.soonest = 0
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            # guard against an empty cache after expiry
            self.soonest = min(self.timeout.values()) if self.timeout else 0

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values()) if self.timeout else 0

# Code moved from the old urllib module

MAXFTPCACHE = 10  # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return urllib.parse.unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return urllib.parse.quote(pathname)

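# A round-trip sketch (POSIX flavor, path is illustrative):
#
#     pathname2url('/tmp/a b.txt')    # '/tmp/a%20b.txt'
#     url2pathname('/tmp/a%20b.txt')  # '/tmp/a b.txt'
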
# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink  # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = urllib.parse.unwrap(urllib.parse.toBytes(fullurl))
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return urllib.response.addinfourl(fp, headers, fullurl)
        urltype, url = urllib.parse.splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = urllib.parse.splittype(proxy)
            host, selector = urllib.parse.splithost(proxyhost)
            url = (host, fullurl)  # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error as msg:
            raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = urllib.parse.splittype(fullurl)
        raise IOError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = urllib.parse.splittype(fullurl)
        raise IOError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = urllib.parse.unwrap(urllib.parse.toBytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = urllib.parse.splittype(url)
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(urllib.parse.splithost(url1)[1]), hdrs
            except IOError as msg:
                pass
        fp = self.open(url, data)
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            garbage, path = urllib.parse.splittype(url)
            garbage, path = urllib.parse.splithost(path or "")
            path, garbage = urllib.parse.splitquery(path or "")
            path, garbage = urllib.parse.splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        result = filename, headers
        if self.tempcache is not None:
            self.tempcache[url] = result
        bs = 1024*8
        size = -1
        read = 0
        blocknum = 0
        if reporthook:
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            reporthook(blocknum, bs, size)
        while 1:
            block = fp.read(bs)
            if not block:
                break
            read += len(block)
            tfp.write(block)
            blocknum += 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise urllib.error.ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result


    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_factory.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the URL to retrieve, or a (host, relative-path) pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd = None
        if isinstance(url, str):
            host, selector = urllib.parse.splithost(url)
            if host:
                user_passwd, host = urllib.parse.splituser(host)
                host = urllib.parse.unquote(host)
            realhost = host
        else:
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = urllib.parse.splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = urllib.parse.splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = urllib.parse.splithost(rest)
                if realhost:
                    user_passwd, realhost = urllib.parse.splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

            #print("proxy via http:", host, selector)
        if not host: raise IOError('http error', 'no host given')

        if proxy_passwd:
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        # XXX We should fix urllib so that it works with HTTP/1.1.
        http_conn._http_vsn = 10
        http_conn._http_vsn_str = "HTTP/1.0"

        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost
        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise urllib.error.URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return urllib.response.addinfourl(response.fp, response.msg,
                                              "http:" + url,
                                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise HTTPError."""
        void = fp.read()
        fp.close()
        raise urllib.error.HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            return http.client.HTTPSConnection(host,
                                               key_file=self.key_file,
                                               cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise urllib.error.URLError('file error: proxy support for '
                                        'file protocol currently not implemented')
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, email.utils
        host, file = urllib.parse.splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise urllib.error.URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return urllib.response.addinfourl(open(localname, 'rb'),
                                              headers, urlfile)
        host, port = urllib.parse.splitport(host)
        if (not port
            and socket.gethostbyname(host) in (localhost(), thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return urllib.response.addinfourl(open(localname, 'rb'),
                                              headers, urlfile)
        raise urllib.error.URLError('local file error: not on local host')

1680 def open_ftp(self, url):
1681 """Use FTP protocol."""
1682 if not isinstance(url, str):
1683 raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
1684 import mimetypes
1685 from io import StringIO
1686 host, path = urllib.parse.splithost(url)
1687 if not host: raise URLError('ftp error', 'no host given')
1688 host, port = urllib.parse.splitport(host)
1689 user, host = urllib.parse.splituser(host)
1690 if user: user, passwd = urllib.parse.splitpasswd(user)
1691 else: passwd = None
1692 host = urllib.parse.unquote(host)
1693 user = urllib.parse.unquote(user or '')
1694 passwd = urllib.parse.unquote(passwd or '')
1695 host = socket.gethostbyname(host)
1696 if not port:
1697 import ftplib
1698 port = ftplib.FTP_PORT
1699 else:
1700 port = int(port)
1701 path, attrs = urllib.parse.splitattr(path)
1702 path = urllib.parse.unquote(path)
1703 dirs = path.split('/')
1704 dirs, file = dirs[:-1], dirs[-1]
1705 if dirs and not dirs[0]: dirs = dirs[1:]
1706 if dirs and not dirs[0]: dirs[0] = '/'
1707 key = user, host, port, '/'.join(dirs)
1708 # XXX thread unsafe!
1709 if len(self.ftpcache) > MAXFTPCACHE:
1710 # Prune the cache, rather arbitrarily
1711 for k in self.ftpcache.keys():
1712 if k != key:
1713 v = self.ftpcache[k]
1714 del self.ftpcache[k]
1715 v.close()
1716 try:
1717 if not key in self.ftpcache:
1718 self.ftpcache[key] = \
1719 ftpwrapper(user, passwd, host, port, dirs)
1720 if not file: type = 'D'
1721 else: type = 'I'
1722 for attr in attrs:
1723 attr, value = urllib.parse.splitvalue(attr)
1724 if attr.lower() == 'type' and \
1725 value in ('a', 'A', 'i', 'I', 'd', 'D'):
1726 type = value.upper()
1727 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
1728 mtype = mimetypes.guess_type("ftp:" + url)[0]
1729 headers = ""
1730 if mtype:
1731 headers += "Content-Type: %s\n" % mtype
1732 if retrlen is not None and retrlen >= 0:
1733 headers += "Content-Length: %d\n" % retrlen
1734 headers = email.message_from_string(headers)
1735 return urllib.response.addinfourl(fp, headers, "ftp:" + url)
1736 except ftperrors() as msg:
1737 raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
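
    # Editorial note: the ';type=' attribute parsed above selects the FTP
    # transfer mode.  For example (hypothetical URL),
    #     ftp://ftp.example.com/pub/notes.txt;type=a
    # forces ASCII mode ('a'/'A'), 'i'/'I' forces image (binary) mode, and
    # 'd'/'D' requests a directory listing; without it, a URL naming a file
    # defaults to binary and a URL ending in '/' to a listing.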

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise urllib.error.URLError('data error', 'proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        from io import StringIO
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise IOError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            import base64
            # decodestring() operates on bytes; round-trip through latin-1
            # so the decoded payload stays a str for the StringIO below
            data = base64.decodestring(data.encode('ascii')).decode('latin-1')
        else:
            data = urllib.parse.unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        headers = email.message_from_string('\n'.join(msg))
        # wrap the decoded payload in a file-like object for addinfourl
        f = StringIO(data)
        return urllib.response.addinfourl(f, headers, url)
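
# Editorial sketch (hypothetical helper, not part of the original module):
# what open_data() does with an RFC 2397 URL, end to end.
def _example_data_url():
    import base64
    url = 'data:text/plain;base64,SGVsbG8sIHdvcmxkIQ=='
    # split the mediatype from the payload, as open_data() does after
    # the scheme has been stripped
    type, data = url[len('data:'):].split(',', 1)
    assert type == 'text/plain;base64'
    return base64.decodestring(data.encode('ascii'))    # b'Hello, world!'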


class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return urllib.response.addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = urllib.parse.urljoin(self.type + ":" + url, newurl)
        return self.open(newurl)
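
    # Editorial sketch of the join above (values are hypothetical): with
    # self.type == 'http', url == '//www.example.com/a/b' and a relative
    # Location header of 'c', urljoin() yields
    #     'http://www.example.com/a/c'
    # while an absolute Location replaces the original URL outright.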

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)
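
    # Editorial sketch: the regular expression above accepts a challenge
    # such as (hypothetical header value)
    #     WWW-Authenticate: Basic realm="secret files"
    # yielding scheme == 'Basic' and realm == 'secret files'; any scheme
    # other than Basic falls back to the default error handling.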

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        host, selector = urllib.parse.splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = urllib.parse.splittype(proxy)
        proxyhost, proxyselector = urllib.parse.splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (urllib.parse.quote(user, safe=''),
                                  urllib.parse.quote(passwd, safe=''),
                                  proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        host, selector = urllib.parse.splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = urllib.parse.splittype(proxy)
        proxyhost, proxyselector = urllib.parse.splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (urllib.parse.quote(user, safe=''),
                                  urllib.parse.quote(passwd, safe=''),
                                  proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = urllib.parse.splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (urllib.parse.quote(user, safe=''),
                             urllib.parse.quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = urllib.parse.splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (urllib.parse.quote(user, safe=''),
                             urllib.parse.quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)
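
    # Editorial sketch of the credential embedding above (hypothetical
    # values): quote(..., safe='') percent-escapes every reserved byte,
    # so user 'joe' with password 'p@ss/word' yields
    #     'http://joe:p%40ss%2Fword@www.example.com/index.html'
    # keeping the '@' and '/' of the password out of the URL structure.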

    def get_user_passwd(self, host, realm, clear_cache=0):
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
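
# Editorial sketch (hypothetical subclass, not part of the original
# module): a non-interactive override of prompt_user_passwd(), answering
# challenges from a fixed table instead of the console.
class _ExampleBatchOpener(FancyURLopener):
    _credentials = {('www.example.com', 'secret files'): ('joe', 'hunter2')}

    def prompt_user_passwd(self, host, realm):
        # get_user_passwd() caches whatever we return under realm@host
        return self._credentials.get((host, realm), (None, None))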


# Utility functions

_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if _localhost is None:
        _localhost = socket.gethostbyname('localhost')
    return _localhost

_thishost = None
def thishost():
    """Return the IP address of the current host."""
    global _thishost
    if _thishost is None:
        _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost

_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors is None:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors

_noheaders = None
def noheaders():
    """Return an empty email.message.Message object."""
    global _noheaders
    if _noheaders is None:
        _noheaders = email.message_from_string("")
    return _noheaders


# Utility classes

class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                if str(reason)[:3] != '550':
                    raise urllib.error.URLError('ftp error', reason).with_traceback(sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise urllib.error.URLError('ftp error', reason) from reason
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (urllib.response.addclosehook(conn[0].makefile('rb'),
                                             self.endtransfer), conn[1])

    def endtransfer(self):
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
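
# Editorial sketch (hypothetical host and path): manual use of ftpwrapper,
# which open_ftp() normally drives through its connection cache.
def _example_ftpwrapper():
    wrapped = ftpwrapper('anonymous', 'user@example.com',
                         'ftp.example.com', 21, ['pub'])
    fp, length = wrapped.retrfile('notes.txt', 'I')   # binary retrieval
    try:
        return fp.read()        # length is the size, if the server sent one
    finally:
        fp.close()              # triggers endtransfer() via the close hook
        wrapped.close()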

# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if name == 'no_proxy':
            # handled in proxy_bypass_environment
            continue
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies
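
# Editorial sketch (hypothetical values): given an environment such as
#     http_proxy=http://proxy.example.com:3128
#     FTP_PROXY=ftp://proxy.example.com:2121
# getproxies_environment() returns
#     {'http': 'http://proxy.example.com:3128',
#      'ftp': 'ftp://proxy.example.com:2121'}
# since variable names are lowercased before the '_proxy' suffix check.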

def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = urllib.parse.splitport(host)
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        if name and (hostonly.endswith(name) or host.endswith(name)):
            return 1
    # otherwise, don't bypass
    return 0
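
# Editorial sketch (hypothetical helper and values): suffix matching
# against no_proxy, with and without a port on the host.
def _example_proxy_bypass_environment():
    os.environ['no_proxy'] = 'localhost,.internal.example.com'
    assert proxy_bypass_environment('localhost:8080')
    assert proxy_bypass_environment('www.internal.example.com')
    assert not proxy_bypass_environment('www.python.org')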


if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the Mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXX To be done.
        # Gopher: XXX To be done.
        return proxies

    def proxy_bypass(host):
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return 0

    def getproxies():
        return getproxies_environment() or getproxies_internetconfig()

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies
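
    # Editorial note (hypothetical registry values): ProxyServer holds
    # either one proxy for every protocol,
    #     'proxy.example.com:8080'
    # or a semicolon-separated per-protocol list,
    #     'http=127.0.0.1:3128;ftp=127.0.0.1:2121'
    # which the branches above normalize into scheme -> URL mappings.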

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        try:
            import winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = urllib.parse.splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # print(proxyOverride)
        # now check if we match one of the registry values.
        for test in proxyOverride:
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print("%s <--> %s" % (test, val))
                if re.match(test, val, re.I):
                    return 1
        return 0
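
    # Editorial sketch (hypothetical override entry): the glob-to-regex
    # rewriting above turns '*.example.com' into r'.*\.example\.com', so
    # re.match() bypasses the proxy for 'www.example.com' but not for
    # 'example.org'; '<local>' expands to the local host's names first.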

    def proxy_bypass(host):
        """Test if the host should bypass the proxy.

        Checks the environment's no_proxy setting first, if one is
        present, and otherwise the registry's ProxyOverride setting.

        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment
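

# Editorial sketch (hypothetical target host): typical use of the
# platform-selected proxy helpers defined above.
def _example_proxy_lookup():
    host = 'www.python.org'
    if proxy_bypass(host):
        return None                     # connect directly
    return getproxies().get('http')     # proxy URL for HTTP, if any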