blob: 89ac22a8f12f95da055879f1ebb274c5257d5d33 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib. pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back. One difference is that you can also pass
20a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers. Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
33OpenerDirector --
34
35Request -- An object that encapsulates the state of a request. The
36state can be as simple as the URL. It can also include extra HTTP
37headers, e.g. a User-Agent.
38
39BaseHandler --
40
41internals:
42BaseHandler and parent
43_call_chain conventions
44
45Example usage:
46
Georg Brandl029986a2008-06-23 11:44:14 +000047import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000048
49# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000050authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000051authinfo.add_password(realm='PDQ Application',
52 uri='https://mahler:8092/site-updates.py',
53 user='klem',
54 passwd='geheim$parole')
55
Georg Brandl029986a2008-06-23 11:44:14 +000056proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000057
58# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000059opener = urllib.request.build_opener(proxy_support, authinfo,
60 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000061
62# install it
Georg Brandl029986a2008-06-23 11:44:14 +000063urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000064
Georg Brandl029986a2008-06-23 11:44:14 +000065f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066"""
67
68# XXX issues:
69# If an authentication error handler that tries to perform
70# authentication for some reason but fails, how should the error be
71# signalled? The client needs to know the HTTP error code. But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algorithm that was requested in the challenge, it would be good to
74# pass that information along to the client, too.
75# ftp errors aren't handled cleanly
76# check digest against correct (i.e. non-apache) implementation
77
78# Possible extensions:
79# complex proxies XXX not sure what exactly was meant by this
80# abstract factory for opener
81
82import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000083import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000084import email
85import hashlib
86import http.client
87import io
88import os
89import posixpath
90import random
91import re
92import socket
93import sys
94import time
Jeremy Hylton1afc1692008-06-18 20:49:58 +000095
Georg Brandl13e89462008-07-01 19:56:00 +000096from urllib.error import URLError, HTTPError, ContentTooShortError
97from urllib.parse import (
98 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
99 splittype, splithost, splitport, splituser, splitpasswd,
Facundo Batistaf24802c2008-08-17 03:36:03 +0000100 splitattr, splitquery, splitvalue, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000101from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000102
103# check for SSL
104try:
105 import ssl
106except:
107 _have_ssl = False
108else:
109 _have_ssl = True
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000110
111# used in User-Agent header sent
112__version__ = sys.version[:3]
113
_opener = None

def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Open *url* (a string or a Request) using the shared default opener.

    The default OpenerDirector is built lazily on first use and cached
    in the module-level ``_opener``; install_opener() replaces it.
    """
    global _opener
    opener = _opener
    if opener is None:
        # first call: create and cache the default opener
        opener = _opener = build_opener()
    return opener.open(url, data, timeout)
120
def install_opener(opener):
    """Make *opener* the module-wide default used by urlopen()."""
    global _opener
    _opener = opener
124
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None

def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* into a local file using a cached FancyURLopener."""
    global _urlopener
    if _urlopener is None:
        # build the legacy opener once and reuse it across calls
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
132
def urlcleanup():
    """Discard cached opener state used by urlretrieve() and urlopen()."""
    global _opener
    if _urlopener:
        # let the legacy opener remove its temporary files
        _urlopener.cleanup()
    if _opener:
        _opener = None
139
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)

def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.
    """
    host = urlparse(request.full_url)[1]
    if host == "":
        # relative URL: fall back to the Host header, if any
        host = request.get_header("Host", "")

    # strip at most one trailing :port before lowercasing
    return _cut_port_re.sub("", host, 1).lower()
157
class Request:
    """Encapsulate the state of one URL request.

    The state can be as simple as the URL; it may also carry POST data,
    extra HTTP headers (e.g. a User-Agent) and cookie-related origin
    information.
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.data = data
        self.headers = {}
        for name, value in headers.items():
            self.add_header(name, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            # derive the RFC 2965 request-host from the URL itself
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        """Split full_url into type (scheme), host and selector."""
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return the HTTP method: POST when data is present, else GET."""
        return "POST" if self.data is not None else "GET"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Redirect the request to a proxy; selector becomes the full URL."""
        self.host, self.type = host, type
        self.selector = self.full_url

    def has_proxy(self):
        # set_proxy() is the only place that makes selector == full_url
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        try:
            return self.headers[header_name]
        except KeyError:
            return self.unredirected_hdrs.get(header_name, default)

    def header_items(self):
        """Return all headers as a list; normal headers shadow unredirected."""
        merged = {**self.unredirected_hdrs, **self.headers}
        return list(merged.items())
248
class OpenerDirector:
    """Manage a chain of handlers and dispatch opens/errors to them.

    Handlers register by method name: ``<protocol>_open``,
    ``<protocol>_request``, ``<protocol>_response`` and
    ``<protocol>_error_<code>`` methods are discovered by add_handler()
    and indexed so open() and error() can walk the right chain in
    handler_order.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        # headers added to every outgoing request
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []
        # each maps a protocol (or error kind) to an ordered handler list
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Index handler's protocol methods and adopt it into the chains."""
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # split "<protocol>_<condition>" at the first underscore
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # e.g. "http_error_404" -> kind 404 under protocol "http"
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    # non-numeric kinds (e.g. "default") stay strings
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            # keep each chain sorted by handler_order (BaseHandler.__lt__)
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in an specific order, the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call meth_name on each handler in chain[kind] until one answers."""
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could. Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open a URL or Request: pre-process, dispatch, post-process."""
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                # explicit data argument overrides the request's own
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        """Try default_open, then <protocol>_open, then unknown_open."""
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered handlers for *proto*.

        For http/https, args are (req, fp, code, msg, hdrs); the chain
        consulted is http_error_<code>, falling back to
        http_error_default if nothing handled it.
        """
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            # NOTE: 'dict' shadows the builtin; kept for compatibility.
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK! args[2] is the HTTP status code
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
387
388# XXX probably also want an abstract factory that knows when it makes
389# sense to skip a superclass in favor of a subclass and when it might
390# make sense to include both
391
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        # hasattr check keeps old-style/odd class objects working
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        # ssl support was compiled in; enable https
        default_classes.append(HTTPSHandler)

    # a default class is skipped when a caller-supplied handler (class or
    # instance) specializes it
    skip = {klass
            for klass in default_classes
            for check in handlers
            if (issubclass(check, klass) if isclass(check)
                else isinstance(check, klass))}
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        opener.add_handler(h() if isclass(h) else h)
    return opener
429
class BaseHandler:
    """Base class for handlers: ordering and a link back to the director."""

    # sort key used by OpenerDirector when inserting handlers into chains
    handler_order = 500

    def add_parent(self, parent):
        """Remember the OpenerDirector this handler belongs to."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        """Order by handler_order; classes unaware of it sort after us."""
        try:
            return self.handler_order < other.handler_order
        except AttributeError:
            # Preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
447
448
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Pass 2xx responses through; hand anything else to parent.error()."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if code < 200 or code >= 300:
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response
465
class HTTPDefaultErrorHandler(BaseHandler):
    """Last resort: turn any unhandled HTTP error response into HTTPError."""
    def http_error_default(self, req, fp, code, msg, hdrs):
        # HTTPError can itself be treated as a valid (file-like) response,
        # so callers may catch it and read the body from it.
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000469
class HTTPRedirectHandler(BaseHandler):
    """Follow 301/302/303/307 redirects, with loop detection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        # only GET/HEAD may follow all four codes; POST only 301/302/303
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # drop body-describing headers: the redirected request is a GET
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Build and re-open the redirected request, guarding against loops."""
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            # no redirect target: let other handlers deal with the error
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        # resolve relative redirects against the original URL
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            # first redirect: start a visit-count dict shared via req/new
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
562
563
def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, r_scheme = splittype(proxy)
    if not r_scheme.startswith("/"):
        # the proxy was given as a bare authority, no scheme
        scheme = None
        authority = proxy
    else:
        # the proxy is a URL
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
        # and 3.3.), path is empty or starts with '/'
        end = r_scheme.find("/", 2)
        authority = r_scheme[2:end if end != -1 else None]
    userinfo, hostport = splituser(authority)
    user, password = (splitpasswd(userinfo) if userinfo is not None
                      else (None, None))
    return scheme, user, password, hostport
635
class ProxyHandler(BaseHandler):
    """Route requests through the proxies in a {scheme: proxy-url} map."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, url in proxies.items():
            # Bind url/scheme as defaults so each generated <scheme>_open
            # method keeps its own values (avoids the late-binding
            # closure pitfall).
            def open_via_proxy(r, proxy=url, type=scheme,
                               meth=self.proxy_open):
                return meth(r, proxy, type)
            setattr(self, '%s_open' % scheme, open_via_proxy)

    def proxy_open(self, req, proxy, type):
        """Rewrite req to go through *proxy*; restart if the scheme changed."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        req.set_proxy(unquote(hostport), proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            return None
        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req)
673
class HTTPPasswordMgr:
    """Map (realm, URI-prefix) pairs to (user, password) credentials."""

    def __init__(self):
        # realm -> {tuple-of-reduced-uris: (user, passwd)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* at one URI or a sequence of URIs."""
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        # store both with and without the default port applied
        for default_port in (True, False):
            reduced_uri = tuple(self.reduce_uri(u, default_port)
                                for u in uri)
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) matching realm/authuri, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in (True, False):
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                if any(self.is_suburi(uri, reduced_authuri) for uri in uris):
                    return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # full URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # bare host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            # normalize by appending the scheme's well-known port
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = posixpath.commonprefix((base[1], test[1]))
        return len(prefix) == len(base[1])
736
737
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the None (catch-all) realm."""

    def find_user_password(self, realm, authuri):
        user, password = HTTPPasswordMgr.find_user_password(
            self, realm, authuri)
        if user is None:
            # nothing registered for this realm: try the default realm
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
746
747
class AbstractBasicAuthHandler:
    """Shared machinery for HTTP Basic authentication (RFC 2617)."""

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """If headers carry a Basic challenge, retry with credentials."""
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        mo = AbstractBasicAuthHandler.rx.search(challenge)
        if not mo:
            return None
        scheme, quote, realm = mo.groups()
        if scheme.lower() != 'basic':
            return None
        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send req with an Authorization header, or give up (None)."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        raw = "%s:%s" % (user, pw)
        auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
        if req.headers.get(self.auth_header, None) == auth:
            # exact same credentials already sent: avoid an auth loop
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
791
792
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Respond to 401 responses by retrying with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        return self.http_error_auth_reqed('www-authenticate',
                                          req.full_url, req, headers)
801
802
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Respond to 407 responses by retrying with proxy Basic credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib.request does
        # not (and should not, RFC 3986 s. 3.2.1) support requests for URLs
        # containing userinfo.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          req.host, req, headers)
815
816
def randombytes(n):
    """Return n random bytes from the OS entropy source."""
    return os.urandom(n)
820
821class AbstractDigestAuthHandler:
822 # Digest authentication is specified in RFC 2617.
823
824 # XXX The client does not inspect the Authentication-Info header
825 # in a successful response.
826
827 # XXX It should be possible to test this implementation against
828 # a mock server that just generates a static set of challenges.
829
830 # XXX qop="auth-int" supports is shaky
831
832 def __init__(self, passwd=None):
833 if passwd is None:
834 passwd = HTTPPasswordMgr()
835 self.passwd = passwd
836 self.add_password = self.passwd.add_password
837 self.retried = 0
838 self.nonce_count = 0
839
    def reset_retry_count(self):
        # Forget previous failures so a new challenge gets fresh retries.
        self.retried = 0
842
843 def http_error_auth_reqed(self, auth_header, host, req, headers):
844 authreq = headers.get(auth_header, None)
845 if self.retried > 5:
846 # Don't fail endlessly - if we failed once, we'll probably
847 # fail a second time. Hm. Unless the Password Manager is
848 # prompting for the information. Crap. This isn't great
849 # but it's better than the current 'repeat until recursion
850 # depth exceeded' approach <wink>
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000851 raise HTTPError(req.full_url, 401, "digest auth failed",
Georg Brandl13e89462008-07-01 19:56:00 +0000852 headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000853 else:
854 self.retried += 1
855 if authreq:
856 scheme = authreq.split()[0]
857 if scheme.lower() == 'digest':
858 return self.retry_http_digest_auth(req, authreq)
859
860 def retry_http_digest_auth(self, req, auth):
861 token, challenge = auth.split(' ', 1)
862 chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
863 auth = self.get_authorization(req, chal)
864 if auth:
865 auth_val = 'Digest %s' % auth
866 if req.headers.get(self.auth_header, None) == auth_val:
867 return None
868 req.add_unredirected_header(self.auth_header, auth_val)
869 resp = self.parent.open(req)
870 return resp
871
872 def get_cnonce(self, nonce):
873 # The cnonce-value is an opaque
874 # quoted string value provided by the client and used by both client
875 # and server to avoid chosen plaintext attacks, to provide mutual
876 # authentication, and to provide some message integrity protection.
877 # This isn't a fabulous effort, but it's probably Good Enough.
878 s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
879 b = s.encode("ascii") + randombytes(8)
880 dig = hashlib.sha1(b).hexdigest()
881 return dig[:16]
882
883 def get_authorization(self, req, chal):
884 try:
885 realm = chal['realm']
886 nonce = chal['nonce']
887 qop = chal.get('qop')
888 algorithm = chal.get('algorithm', 'MD5')
889 # mod_digest doesn't send an opaque, even though it isn't
890 # supposed to be optional
891 opaque = chal.get('opaque', None)
892 except KeyError:
893 return None
894
895 H, KD = self.get_algorithm_impls(algorithm)
896 if H is None:
897 return None
898
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000899 user, pw = self.passwd.find_user_password(realm, req.full_url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000900 if user is None:
901 return None
902
903 # XXX not implemented yet
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000904 if req.data is not None:
905 entdig = self.get_entity_digest(req.data, chal)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000906 else:
907 entdig = None
908
909 A1 = "%s:%s:%s" % (user, realm, pw)
910 A2 = "%s:%s" % (req.get_method(),
911 # XXX selector: what about proxies and full urls
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000912 req.selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000913 if qop == 'auth':
914 self.nonce_count += 1
915 ncvalue = '%08x' % self.nonce_count
916 cnonce = self.get_cnonce(nonce)
917 noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
918 respdig = KD(H(A1), noncebit)
919 elif qop is None:
920 respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
921 else:
922 # XXX handle auth-int.
Georg Brandl13e89462008-07-01 19:56:00 +0000923 raise URLError("qop '%s' is not supported." % qop)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000924
925 # XXX should the partial digests be encoded too?
926
927 base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000928 'response="%s"' % (user, realm, nonce, req.selector,
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000929 respdig)
930 if opaque:
931 base += ', opaque="%s"' % opaque
932 if entdig:
933 base += ', digest="%s"' % entdig
934 base += ', algorithm="%s"' % algorithm
935 if qop:
936 base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
937 return base
938
939 def get_algorithm_impls(self, algorithm):
940 # lambdas assume digest modules are imported at the top level
941 if algorithm == 'MD5':
942 H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
943 elif algorithm == 'SHA':
944 H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
945 # XXX MD5-sess
946 KD = lambda s, d: H("%s:%s" % (s, d))
947 return H, KD
948
949 def get_entity_digest(self, data, chal):
950 # XXX not implemented yet
951 return None
952
953
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The challenge names the origin server; urlparse element 1 of
        # the split result is the netloc of the requested URL.
        authority = urlparse(req.full_url)[1]
        response = self.http_error_auth_reqed('www-authenticate',
                                              authority, req, headers)
        self.reset_retry_count()
        return response
970
971
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication against an intermediate proxy (HTTP 407)."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # For a proxy challenge the relevant authority is the request
        # host itself.
        authority = req.host
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              authority, req, headers)
        self.reset_retry_count()
        return response
983
class AbstractHTTPHandler(BaseHandler):
    """Common machinery for HTTPHandler/HTTPSHandler: request fix-up
    (do_request_) and the actual network exchange (do_open)."""

    def __init__(self, debuglevel=0):
        # Stored for set_http_debuglevel(); note that do_open() below
        # does not currently forward it to the connection object.
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Record the http.client debug level for this handler."""
        self._debuglevel = level

    def do_request_(self, request):
        """Normalize *request* before sending: require a host, add
        Content-type/Content-length for POST bodies, add the Host
        header (naming the origin server when going through a proxy),
        and merge in the opener-wide default headers.  Returns the same
        request object, mutated."""
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None: # POST
            data = request.data
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        # When proxied, the Host header must name the origin server
        # taken from the request selector, not the proxy host.
        sel_host = host
        if request.has_proxy():
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            # Opener-level defaults never override request headers.
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout) # will parse host:port
        # Merge the two header dicts; unredirected headers win on
        # name clashes because they are applied second.
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Normalize header capitalization, e.g. 'content-type' ->
        # 'Content-Type'.
        headers = dict((name.title(), val) for name, val in headers.items())
        try:
            h.request(req.get_method(), req.selector, req.data, headers)
            r = h.getresponse() # an HTTPResponse instance
        except socket.error as err:
            # Translate low-level socket failures into URLError so
            # callers only need to catch urllib exceptions.
            raise URLError(err)

        r.url = req.full_url
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg. It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001058
1059
class HTTPHandler(AbstractHTTPHandler):
    """Opener for plain http:// URLs."""

    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        # Delegate to do_open() with the stock HTTPConnection class.
        return self.do_open(http.client.HTTPConnection, req)
1066
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Opener for https:// URLs; only defined when http.client was
        built with SSL support."""

        https_request = AbstractHTTPHandler.do_request_

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req)
1074
class HTTPCookieProcessor(BaseHandler):
    """Handler that stores cookies from responses and replays them on
    later requests, backed by an http.cookiejar.CookieJar."""

    def __init__(self, cookiejar=None):
        import http.cookiejar
        if cookiejar is None:
            cookiejar = http.cookiejar.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        # Attach any matching stored cookies to the outgoing request.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Harvest Set-Cookie headers from the incoming response.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1092
class UnknownHandler(BaseHandler):
    """Last-resort handler: any scheme nobody else claimed is an error."""

    def unknown_open(self, req):
        scheme = req.type
        raise URLError('unknown url type: %s' % scheme)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001097
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    One pair of surrounding double quotes is stripped from quoted
    values.  Returns a dict mapping keys to (unquoted) values.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # The length guard fixes an IndexError on empty values
        # ("key=") and leaves the degenerate lone-quote value '"'
        # untouched instead of mangling it to ''.
        if len(v) > 1 and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1107
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    pieces = []
    current = ''
    in_quotes = False       # currently inside a double-quoted string
    pending_escape = False  # previous char was a backslash inside quotes

    for ch in s:
        if pending_escape:
            # An escaped character is copied verbatim and can never
            # act as a delimiter.
            current += ch
            pending_escape = False
        elif in_quotes:
            if ch == '\\':
                pending_escape = True
            else:
                if ch == '"':
                    in_quotes = False
                current += ch
        elif ch == ',':
            # A top-level comma terminates the current element.
            pieces.append(current)
            current = ''
        else:
            if ch == '"':
                in_quotes = True
            current += ch

    # Keep whatever is left after the final comma.
    if current:
        pieces.append(current)

    return [piece.strip() for piece in pieces]
1150
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Dispatch a file: request: '//host/...' with a non-empty host
        is re-typed as FTP and re-opened; everything else is treated as
        a local file."""
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/':
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the tuple of IP addresses considered local, computed
        once and cached on the class."""
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                    socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                # Our own hostname does not resolve; fall back to
                # localhost only.
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Open a local file and return an addinfourl wrapping it, with
        synthesized Content-type/Content-length/Last-modified headers.

        Raises URLError when the file cannot be stat'ed or the URL
        names a non-local host.
        """
        import email.utils
        import mimetypes
        host = req.host
        file = req.selector
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(file)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            # NOTE: 'port' is only bound when host was truthy above; the
            # short-circuit on 'not host' keeps this safe.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                return addinfourl(open(localfile, 'rb'), headers, 'file:'+file)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001196
def _safe_gethostbyname(host):
    """Resolve *host* to an IP address, returning None instead of
    raising socket.gaierror when resolution fails."""
    try:
        result = socket.gethostbyname(host)
    except socket.gaierror:
        result = None
    return result
1202
class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        """Retrieve a file or directory listing over FTP and return an
        addinfourl with synthesized headers.

        The URL may carry user:password info and ';type=X' attributes
        (A/I/D) selecting ASCII, image (binary) or directory mode.
        Raises URLError on any FTP-level failure.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        # Drop the empty leading component produced by an absolute path.
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Default transfer type: binary for files, directory listing
            # when no filename is present.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            # Re-raise any ftplib failure as URLError, keeping the
            # original traceback.
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Create a new ftpwrapper connection (overridden by
        CacheFTPHandler to reuse connections)."""
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
        return fw
1260
class CacheFTPHandler(FTPHandler):
    """FTP handler that keeps a small pool of live FTP connections,
    keyed by (user, host, port, path, timeout), each expiring
    ``self.delay`` seconds after last use."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}      # key -> ftpwrapper
        self.timeout = {}    # key -> absolute expiry time
        self.soonest = 0     # earliest expiry among cached entries
        self.delay = 60      # idle lifetime, in seconds
        self.max_conns = 16  # cap on simultaneously cached connections

    def setTimeout(self, t):
        """Set the idle lifetime (seconds) of cached connections."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for this destination, creating
        one (and refreshing its expiry) as needed."""
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce max_conns."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            # Bug fix: min() raises ValueError on an empty dict; when
            # everything has expired, treat it as "nothing expires".
            if self.timeout:
                self.soonest = min(self.timeout.values())
            else:
                self.soonest = 0

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # NOTE(review): the evicted connection is not
                    # close()d here; closing it could break a caller
                    # that was just handed this connection, so the
                    # historical behavior is kept.
                    del self.cache[k]
                    del self.timeout[k]
                    break
            if self.timeout:
                self.soonest = min(self.timeout.values())
            else:
                self.soonest = 0

    def clear_cache(self):
        """Close and discard every cached connection."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
        self.soonest = 0
1307
# Code move from the old urllib module

MAXFTPCACHE = 10 # Trim the ftp cache beyond this size

# Helper for non-unix systems
# NOTE(review): os.name == 'mac' refers to classic MacOS and should be
# dead on Python 3 -- confirm whether the branch can be dropped.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Module-level FTP connection cache, shared by default between
# URLopener instances (see URLopener.__init__).
ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    # Temp files created by retrieve(); re-initialized per instance in
    # __init__.  The class-level None keeps cleanup() safe even if
    # __init__ never ran (cleanup() only acts on a truthy value).
    __tempfiles = None

    # Default User-Agent value; installed in addheaders by __init__.
    version = "Python-urllib/%s" % __version__
1347
1348 # Constructor
    def __init__(self, proxies=None, **x509):
        """Create an opener.

        proxies -- mapping of scheme to proxy URL; defaults to the
            environment-derived table from getproxies().
        x509 -- optional 'key_file'/'cert_file' keywords for HTTPS
            client certificates.
        """
        if proxies is None:
            proxies = getproxies()
        # NOTE(review): assert disappears under -O; an explicit
        # TypeError would be sturdier input validation.
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve(). This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe. Bah.
1371
    def __del__(self):
        # Finalizer: best-effort removal of temp files via close().
        self.close()
1374
    def close(self):
        """Close the opener, deleting any temporary files (see cleanup())."""
        self.cleanup()
1377
1378 def cleanup(self):
1379 # This code sometimes runs when the rest of this module
1380 # has already been deleted, so it can't use any globals
1381 # or import anything.
1382 if self.__tempfiles:
1383 for file in self.__tempfiles:
1384 try:
1385 self.__unlink(file)
1386 except OSError:
1387 pass
1388 del self.__tempfiles[:]
1389 if self.tempcache:
1390 self.tempcache.clear()
1391
    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        # Stored as (name, value) tuples; consumed by _open_generic_http().
        self.addheaders.append(args)
1396
1397 # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        # unwrap()/to_bytes() normalize the URL string (helpers defined
        # elsewhere in this module), then unsafe characters are quoted.
        fullurl = unwrap(to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]")
        if self.tempcache and fullurl in self.tempcache:
            # Serve from the retrieve() cache when it is enabled.
            filename, headers = self.tempcache[fullurl]
            # Builtin open(), not this method.
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            # No scheme: treat the argument as a local file.
            urltype = 'file'
        if urltype in self.proxies:
            # Route through the configured proxy; the scheme handler
            # receives a (proxyhost, full_url) pair instead of a
            # plain selector string.
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        # Dispatch to open_<scheme>(), mapping '-' to '_' in the name.
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error as msg:
            # Present socket errors as IOError, keeping the traceback.
            raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1431
1432 def open_unknown(self, fullurl, data=None):
1433 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001434 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001435 raise IOError('url error', 'unknown url type', type)
1436
1437 def open_unknown_proxy(self, proxy, fullurl, data=None):
1438 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001439 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001440 raise IOError('url error', 'invalid proxy for %s' % type, proxy)
1441
1442 # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object.

        reporthook, when given, is called as reporthook(blocknum,
        blocksize, totalsize) once before the download and once per
        block.  data is an optional POST payload forwarded to open().
        Raises ContentTooShortError when fewer bytes arrive than
        Content-Length promised.
        """
        url = unwrap(to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        # Local files need no copying: return the path directly.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError as msg:
                # Fall through to the generic download path.
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # Build a temp file whose suffix mirrors the URL path's
                # extension, and remember it for cleanup().
                import tempfile
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if reporthook:
                    if "content-length" in headers:
                        size = int(headers["Content-Length"])
                    reporthook(blocknum, bs, size)
                # Copy in bs-sized blocks, reporting progress as we go.
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result
1508
1509 # Each method named open_<type> knows how to open that type of URL
1510
    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_class.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            # Direct request: split credentials out of the host part.
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Proxied request: url is a (proxyhost, full_url) pair
            # produced by open().
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    # Destination is exempt from proxying; talk to it
                    # directly.
                    host = realhost

            #print "proxy via http:", host, selector
        if not host: raise IOError('http error', 'no host given')

        # NOTE(review): base64.b64encode expects bytes; these two calls
        # receive a str split out of the URL and look like they would
        # TypeError on Python 3 -- confirm (later versions encode first).
        if proxy_passwd:
            import base64
            proxy_auth = base64.b64encode(proxy_passwd).strip()
        else:
            proxy_auth = None

        if user_passwd:
            import base64
            auth = base64.b64encode(user_passwd).strip()
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost
        # Opener-wide defaults registered via addheader().
        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)
1597
    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        # Thin wrapper: the shared implementation lives in
        # _open_generic_http().
        return self._open_generic_http(http.client.HTTPConnection, url, data)
1601
1602 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1603 """Handle http errors.
1604
1605 Derived class can override this, or provide specific handlers
1606 named http_error_DDD where DDD is the 3-digit error code."""
1607 # First check if there's a specific handler for this error
1608 name = 'http_error_%d' % errcode
1609 if hasattr(self, name):
1610 method = getattr(self, name)
1611 if data is None:
1612 result = method(url, fp, errcode, errmsg, headers)
1613 else:
1614 result = method(url, fp, errcode, errmsg, headers, data)
1615 if result: return result
1616 return self.http_error_default(url, fp, errcode, errmsg, headers)
1617
1618 def http_error_default(self, url, fp, errcode, errmsg, headers):
1619 """Default error handler: close the connection and raise IOError."""
1620 void = fp.read()
1621 fp.close()
Georg Brandl13e89462008-07-01 19:56:00 +00001622 raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001623
    if _have_ssl:
        def _https_connection(self, host):
            # Connection factory for _open_generic_http(); carries this
            # opener's client-certificate configuration.
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)
1633
1634 def open_file(self, url):
1635 """Use local file or FTP depending on form of URL."""
1636 if not isinstance(url, str):
1637 raise URLError('file error', 'proxy support for file protocol currently not implemented')
1638 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
1639 return self.open_ftp(url)
1640 else:
1641 return self.open_local_file(url)
1642
    def open_local_file(self, url):
        """Use local file.

        Returns an addinfourl with synthesized Content-Type/
        Content-Length/Last-modified headers; raises URLError when the
        file is missing or the URL names a non-local host.
        """
        import mimetypes, email.utils
        from io import StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            # NOTE(review): URLError is typically constructed as
            # URLError(reason[, filename]); three positional args here
            # looks suspect -- confirm against urllib.error.
            raise URLError(e.errno, e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = splitport(host)
        # A URL host naming this machine (and no explicit port) is
        # still considered local.
        if (not port
            and socket.gethostbyname(host) in (localhost(), thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error', 'not on local host')
1672
1673 def open_ftp(self, url):
1674 """Use FTP protocol."""
1675 if not isinstance(url, str):
1676 raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
1677 import mimetypes
1678 from io import StringIO
Georg Brandl13e89462008-07-01 19:56:00 +00001679 host, path = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001680 if not host: raise URLError('ftp error', 'no host given')
Georg Brandl13e89462008-07-01 19:56:00 +00001681 host, port = splitport(host)
1682 user, host = splituser(host)
1683 if user: user, passwd = splitpasswd(user)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001684 else: passwd = None
Georg Brandl13e89462008-07-01 19:56:00 +00001685 host = unquote(host)
1686 user = unquote(user or '')
1687 passwd = unquote(passwd or '')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001688 host = socket.gethostbyname(host)
1689 if not port:
1690 import ftplib
1691 port = ftplib.FTP_PORT
1692 else:
1693 port = int(port)
Georg Brandl13e89462008-07-01 19:56:00 +00001694 path, attrs = splitattr(path)
1695 path = unquote(path)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001696 dirs = path.split('/')
1697 dirs, file = dirs[:-1], dirs[-1]
1698 if dirs and not dirs[0]: dirs = dirs[1:]
1699 if dirs and not dirs[0]: dirs[0] = '/'
1700 key = user, host, port, '/'.join(dirs)
1701 # XXX thread unsafe!
1702 if len(self.ftpcache) > MAXFTPCACHE:
1703 # Prune the cache, rather arbitrarily
1704 for k in self.ftpcache.keys():
1705 if k != key:
1706 v = self.ftpcache[k]
1707 del self.ftpcache[k]
1708 v.close()
1709 try:
1710 if not key in self.ftpcache:
1711 self.ftpcache[key] = \
1712 ftpwrapper(user, passwd, host, port, dirs)
1713 if not file: type = 'D'
1714 else: type = 'I'
1715 for attr in attrs:
Georg Brandl13e89462008-07-01 19:56:00 +00001716 attr, value = splitvalue(attr)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001717 if attr.lower() == 'type' and \
1718 value in ('a', 'A', 'i', 'I', 'd', 'D'):
1719 type = value.upper()
1720 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
1721 mtype = mimetypes.guess_type("ftp:" + url)[0]
1722 headers = ""
1723 if mtype:
1724 headers += "Content-Type: %s\n" % mtype
1725 if retrlen is not None and retrlen >= 0:
1726 headers += "Content-Length: %d\n" % retrlen
1727 headers = email.message_from_string(headers)
Georg Brandl13e89462008-07-01 19:56:00 +00001728 return addinfourl(fp, headers, "ftp:" + url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001729 except ftperrors() as msg:
1730 raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1731
1732 def open_data(self, url, data=None):
1733 """Use "data" URL."""
1734 if not isinstance(url, str):
1735 raise URLError('data error', 'proxy support for data protocol currently not implemented')
1736 # ignore POSTed data
1737 #
1738 # syntax of data URLs:
1739 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
1740 # mediatype := [ type "/" subtype ] *( ";" parameter )
1741 # data := *urlchar
1742 # parameter := attribute "=" value
1743 try:
1744 [type, data] = url.split(',', 1)
1745 except ValueError:
1746 raise IOError('data error', 'bad data URL')
1747 if not type:
1748 type = 'text/plain;charset=US-ASCII'
1749 semi = type.rfind(';')
1750 if semi >= 0 and '=' not in type[semi:]:
1751 encoding = type[semi+1:]
1752 type = type[:semi]
1753 else:
1754 encoding = ''
1755 msg = []
1756 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
1757 time.gmtime(time.time())))
1758 msg.append('Content-type: %s' % type)
1759 if encoding == 'base64':
1760 import base64
1761 data = base64.decodestring(data)
1762 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001763 data = unquote(data)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001764 msg.append('Content-Length: %d' % len(data))
1765 msg.append('')
1766 msg.append(data)
1767 msg = '\n'.join(msg)
Georg Brandl13e89462008-07-01 19:56:00 +00001768 headers = email.message_from_string(msg)
1769 f = io.StringIO(msg)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001770 #f.fileno = None # needed for addinfourl
Georg Brandl13e89462008-07-01 19:56:00 +00001771 return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001772
1773
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # Maps "realm@host" -> (user, passwd) so credentials entered once
        # can be replayed on later 401/407 responses.
        self.auth_cache = {}
        self.tries = 0       # redirects followed so far for this request
        self.maxtries = 10   # redirect limit before reporting recursion

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        # Return the error page itself as a response object instead of
        # raising, so callers can treat it as a valid response.
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        # Guard against redirect loops: after maxtries redirects report a
        # synthetic 500 instead of following any further.
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Follow a redirect: returns the new response, or None if the
        server supplied no target URL."""
        # Prefer the standard Location header; fall back to the legacy
        # URI header.
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        # Drain and close the old response before opening the new URL.
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        # 307 must not change the request method, so a POST (data is not
        # None) cannot be transparently re-sent as a GET; report it.
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # NOTE(review): the URLopener.http_error_default calls below have
        # no 'return' -- this only works if the base-class method raises
        # (its body is not visible in this chunk); confirm before relying
        # on the fall-through lines.
        if not 'www-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # Dispatch to retry_http_basic_auth / retry_https_basic_auth
        # depending on the scheme of the original request.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Same structure as http_error_401, but keyed on the
        # Proxy-Authenticate header and the retry_proxy_* helpers.
        if not 'proxy-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Re-open *url* after embedding fresh credentials in the
        configured HTTP proxy URL."""
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any stale user:passwd@ prefix from the proxy host.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        # Re-embed the fresh (percent-quoted) credentials and retry.
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """HTTPS twin of retry_proxy_http_basic_auth."""
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Re-open *url* with credentials embedded in the URL itself."""
        host, selector = splithost(url)
        # Drop any credentials already present in the host part.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """HTTPS twin of retry_http_basic_auth."""
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for host/realm, consulting the cache.

        A truthy clear_cache evicts any cached entry first (used when a
        previous credential was rejected).
        """
        # Cache key is realm-qualified; host comparison is
        # case-insensitive.
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            # Ctrl-C at the prompt means "no credentials".
            print()
            return None, None
1961
1962
1963# Utility functions
1964
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if _localhost is not None:
        return _localhost
    # Resolve once and memoize in the module-level cache.
    _localhost = socket.gethostbyname('localhost')
    return _localhost
1972
_thishost = None
def thishost():
    """Return the IP address of the current host."""
    global _thishost
    if _thishost is not None:
        return _thishost
    # Resolve the local hostname once and memoize the answer.
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
1980
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    # Import lazily so the module loads even without ftplib use.
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
1989
_noheaders = None
def noheaders():
    """Return an empty email Message object."""
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    # Build the shared empty Message lazily, once.
    _noheaders = email.message_from_string("")
    return _noheaders
1997
1998
1999# Utility classes
2000
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs          # directory components to cwd into after login
        self.timeout = timeout    # passed straight to ftplib's connect()
        self.init()

    def init(self):
        """Connect, log in and change into the target directory."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start retrieving *file*; return (file-like object, length).

        *type* 'd'/'D' requests a directory listing (ASCII mode); any
        other value is sent as the FTP transfer TYPE.  length is None
        when the server did not report one.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # Cached connection may have dropped/timed out: reconnect
            # once and retry the command.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 ("not a plain file") falls through to a directory
                # listing attempt below; any other error is fatal.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    # Always restore the original working directory.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'), self.endtransfer), conn[1])
    def endtransfer(self):
        """Finish a pending transfer by consuming the final FTP reply."""
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            # Best effort: the connection may already be gone.
            pass

    def close(self):
        """End any pending transfer and close the control connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2077
2078# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    # Environment variable names are matched case-insensitively; only
    # variables with a non-empty value count.
    proxies = {}
    for name, value in os.environ.items():
        lowered = name.lower()
        if value and lowered.endswith('_proxy'):
            proxies[lowered[:-6]] = value
    return proxies
2094
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # A lone '*' means: always bypass the proxy.
    if no_proxy == '*':
        return 1
    # Compare both with and without a trailing :port component.
    hostonly, port = splitport(host)
    suffixes = (s for s in no_proxy.split(',') if s)
    if any(hostonly.endswith(s) or host.endswith(s) for s in suffixes):
        return 1
    # No suffix matched: don't bypass.
    return 0
2113
2114
if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            # No Internet Config bindings available: no proxies known.
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXX To be done.
        # Gopher: XXX To be done.
        return proxies

    def proxy_bypass(host):
        # Environment-configured proxies take precedence; without them
        # there is nothing to bypass.
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return 0

    def getproxies():
        return getproxies_environment() or getproxies_internetconfig()

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        """Return 1 if the registry's ProxyOverride list matches *host*."""
        try:
            import winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            # '<local>' bypasses for dot-less (intranet) host names.
            # NOTE(review): there is no 'continue' here, so the literal
            # '<local>' token is also run through the glob match below --
            # confirm whether that is intentional.
            if test == '<local>':
                if '.' not in rawHost:
                    return 1
            # Translate the glob-style override entry into a regex.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return whether *host* should bypass the proxy.

        Consults the environment (no_proxy) when proxy settings come
        from the environment, otherwise the Windows registry's
        ProxyOverride list.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment