blob: bb67267639bf14059cde2dbcbc7945f5ab67cc8a [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib. pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back. One difference is that you can also pass
20a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers. Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
33OpenerDirector --
34
35Request -- An object that encapsulates the state of a request. The
36state can be as simple as the URL. It can also include extra HTTP
37headers, e.g. a User-Agent.
38
39BaseHandler --
40
41internals:
42BaseHandler and parent
43_call_chain conventions
44
45Example usage:
46
Georg Brandl029986a2008-06-23 11:44:14 +000047import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000048
49# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000050authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000051authinfo.add_password(realm='PDQ Application',
52 uri='https://mahler:8092/site-updates.py',
53 user='klem',
54 passwd='geheim$parole')
55
Georg Brandl029986a2008-06-23 11:44:14 +000056proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000057
58# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000059opener = urllib.request.build_opener(proxy_support, authinfo,
60 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000061
62# install it
Georg Brandl029986a2008-06-23 11:44:14 +000063urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000064
Georg Brandl029986a2008-06-23 11:44:14 +000065f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066"""
67
68# XXX issues:
69# If an authentication error handler that tries to perform
70# authentication for some reason but fails, how should the error be
71# signalled? The client needs to know the HTTP error code. But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algorithm requested in the challenge, it would be good to
74# pass that information along to the client, too.
75# ftp errors aren't handled cleanly
76# check digest against correct (i.e. non-apache) implementation
77
78# Possible extensions:
79# complex proxies XXX not sure what exactly was meant by this
80# abstract factory for opener
81
82import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000083import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000084import email
85import hashlib
86import http.client
87import io
88import os
89import posixpath
90import random
91import re
92import socket
93import sys
94import time
Jeremy Hylton1afc1692008-06-18 20:49:58 +000095
Georg Brandl13e89462008-07-01 19:56:00 +000096from urllib.error import URLError, HTTPError, ContentTooShortError
97from urllib.parse import (
98 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
99 splittype, splithost, splitport, splituser, splitpasswd,
Facundo Batistaf24802c2008-08-17 03:36:03 +0000100 splitattr, splitquery, splitvalue, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000101from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000102
103# check for SSL
104try:
105 import ssl
106except:
107 _have_ssl = False
108else:
109 _have_ssl = True
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000110
111# used in User-Agent header sent
112__version__ = sys.version[:3]
113
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Open *url* (a URL string or a Request object) and return a
    file-like response, using the module-wide opener.

    The default opener is built lazily on first use; *data*, when not
    None, is POSTed.
    """
    global _opener
    opener = _opener
    if opener is None:
        _opener = opener = build_opener()
    return opener.open(url, data, timeout)
120
def install_opener(opener):
    """Install *opener* as the module-wide opener used by urlopen()."""
    global _opener
    _opener = opener
124
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* into a local file using the shared FancyURLopener,
    creating the opener on first use."""
    global _urlopener
    opener = _urlopener
    if not opener:
        _urlopener = opener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
132
def urlcleanup():
    """Discard cached state held by the module-level openers."""
    global _opener
    if _urlopener:
        _urlopener.cleanup()
    if _opener:
        _opener = None
139
# copied from cookielib.py
# Matches a trailing ":port" on an authority; re.ASCII keeps \d from
# matching non-ASCII digits.
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.
    """
    host = urlparse(request.full_url)[1]
    if not host:
        host = request.get_header("Host", "")
    # strip any trailing ":port" from the authority
    host = _cut_port_re.sub("", host, 1)
    return host.lower()
157
class Request:
    """Encapsulate the state of one URL request.

    The state can be as simple as the URL; it may also include POST data
    and extra HTTP headers (e.g. a User-Agent).

    Public attributes:
      full_url          -- the unwrapped request URL
      type, host, selector -- scheme, authority and path parsed from it
      data              -- body to POST, or None for a GET
      headers           -- normal headers (keys stored capitalized)
      unredirected_hdrs -- headers dropped when the request is redirected
      origin_req_host   -- request-host of the originating transaction
      unverifiable      -- RFC 2965 "unverifiable" flag
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False):
        # 'headers' historically defaulted to a shared mutable {}; default
        # to None instead to avoid the mutable-default-argument pitfall.
        # Passing a mapping (or nothing) behaves exactly as before.
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.data = data
        self.headers = {}
        if headers:
            for key, value in headers.items():
                self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        """Split full_url into type (scheme), host and selector.

        Raises ValueError if the URL carries no scheme.
        """
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return the HTTP method: POST when data is present, else GET."""
        if self.data is not None:
            return "POST"
        else:
            return "GET"

    # Begin deprecated methods (thin accessors kept for backward
    # compatibility; prefer the attributes they wrap).

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route the request via a proxy: when talking to a proxy the
        selector must be the full URL, not just the path."""
        self.host, self.type = host, type
        self.selector = self.full_url

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Return a header's value, falling back to unredirected headers."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return all headers as (name, value) pairs; normal headers
        shadow unredirected ones of the same name."""
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
248
class OpenerDirector:
    """Manage a chain of handlers and use them to open URLs.

    Handlers are registered with add_handler(); open() then routes each
    request through the registered request pre-processors, protocol
    openers and response post-processors, in handler_order order.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        # Headers added to every request (handlers may add more).
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []          # every registered handler, sorted
        self.handle_open = {}       # scheme -> [handlers with <scheme>_open]
        self.handle_error = {}      # scheme -> {code -> [error handlers]}
        self.process_response = {}  # scheme -> [response post-processors]
        self.process_request = {}   # scheme -> [request pre-processors]

    def add_handler(self, handler):
        """Register *handler*, indexing its methods by name convention.

        Recognized method names: '<proto>_open', '<proto>_request',
        '<proto>_response' and '<proto>_error_<code>'.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # Split "<protocol>_<condition...>" at the first underscore.
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # "<proto>_error_<kind>": kind is an int status code when
                # possible, otherwise the raw suffix string.
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            # Keep each per-kind list sorted by handler_order (__lt__).
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in a specific order, the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could. Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or Request); POST *data* if given."""
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # Try default_open first, then the protocol-specific opener,
        # then unknown_open as the last resort.
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered '<proto>_error' handlers."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        # Nothing handled the specific HTTP code: fall back to the
        # catch-all http_error_default handlers.
        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
387
388# XXX probably also want an abstract factory that knows when it makes
389# sense to skip a superclass in favor of a subclass and when it might
390# make sense to include both
391
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        # True for types and anything class-like (has __bases__).
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # A default class is dropped when the caller supplied a subclass of
    # it, or an instance of (a subclass of) it.
    skip = {klass for klass in default_classes
            for check in handlers
            if (issubclass(check, klass) if isclass(check)
                else isinstance(check, klass))}

    for klass in default_classes:
        if klass not in skip:
            opener.add_handler(klass())

    for h in handlers:
        opener.add_handler(h() if isclass(h) else h)
    return opener
429
class BaseHandler:
    """Base class for protocol handlers registered with OpenerDirector."""

    # Handlers are sorted by this value when chained; lower runs earlier.
    handler_order = 500

    def add_parent(self, parent):
        # Remember the OpenerDirector that owns this handler.
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        # Objects without a handler_order sort after us: preserves the old
        # behavior of placing order-unaware custom classes after defaults.
        try:
            other_order = other.handler_order
        except AttributeError:
            return True
        return self.handler_order < other_order
447
448
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Hand any non-2xx response to the parent's error machinery."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, only a "2xx" code indicates that the
        # client's request was successfully received, understood, and
        # accepted.
        if code < 200 or code >= 300:
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response
465
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: turn any unhandled HTTP error response into
    an HTTPError exception."""
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000469
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 redirects, with loop detection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        # Only redirect automatically for the method/code combinations
        # below; anything else becomes an HTTPError.
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # Body-describing headers no longer apply to the redirected GET.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL (empty path -> "/")
        urlparts = urlparse(newurl)
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        # Resolve a relative Location against the original URL.
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
562
563
564def _parse_proxy(proxy):
565 """Return (scheme, user, password, host/port) given a URL or an authority.
566
567 If a URL is supplied, it must have an authority (host:port) component.
568 According to RFC 3986, having an authority component means the URL must
569 have two slashes after the scheme:
570
571 >>> _parse_proxy('file:/ftp.example.com/')
572 Traceback (most recent call last):
573 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
574
575 The first three items of the returned tuple may be None.
576
577 Examples of authority parsing:
578
579 >>> _parse_proxy('proxy.example.com')
580 (None, None, None, 'proxy.example.com')
581 >>> _parse_proxy('proxy.example.com:3128')
582 (None, None, None, 'proxy.example.com:3128')
583
584 The authority component may optionally include userinfo (assumed to be
585 username:password):
586
587 >>> _parse_proxy('joe:password@proxy.example.com')
588 (None, 'joe', 'password', 'proxy.example.com')
589 >>> _parse_proxy('joe:password@proxy.example.com:3128')
590 (None, 'joe', 'password', 'proxy.example.com:3128')
591
592 Same examples, but with URLs instead:
593
594 >>> _parse_proxy('http://proxy.example.com/')
595 ('http', None, None, 'proxy.example.com')
596 >>> _parse_proxy('http://proxy.example.com:3128/')
597 ('http', None, None, 'proxy.example.com:3128')
598 >>> _parse_proxy('http://joe:password@proxy.example.com/')
599 ('http', 'joe', 'password', 'proxy.example.com')
600 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
601 ('http', 'joe', 'password', 'proxy.example.com:3128')
602
603 Everything after the authority is ignored:
604
605 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
606 ('ftp', 'joe', 'password', 'proxy.example.com')
607
608 Test for no trailing '/' case:
609
610 >>> _parse_proxy('http://joe:password@proxy.example.com')
611 ('http', 'joe', 'password', 'proxy.example.com')
612
613 """
Georg Brandl13e89462008-07-01 19:56:00 +0000614 scheme, r_scheme = splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000615 if not r_scheme.startswith("/"):
616 # authority
617 scheme = None
618 authority = proxy
619 else:
620 # URL
621 if not r_scheme.startswith("//"):
622 raise ValueError("proxy URL with no authority: %r" % proxy)
623 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
624 # and 3.3.), path is empty or starts with '/'
625 end = r_scheme.find("/", 2)
626 if end == -1:
627 end = None
628 authority = r_scheme[2:end]
Georg Brandl13e89462008-07-01 19:56:00 +0000629 userinfo, hostport = splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000630 if userinfo is not None:
Georg Brandl13e89462008-07-01 19:56:00 +0000631 user, password = splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000632 else:
633 user = password = None
634 return scheme, user, password, hostport
635
class ProxyHandler(BaseHandler):
    """Route requests through the proxies in a {scheme: proxy-url} mapping
    (defaulting to the proxies reported by getproxies())."""
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Install a "<scheme>_open" method for each configured scheme.
        # The lambda binds proxy/type/meth as default arguments so each
        # method captures the loop's current values (avoids the classic
        # late-binding-closure pitfall).
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Rewrite *req* to go via *proxy*; add Proxy-authorization when
        the proxy URL carries userinfo credentials."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req)
673
class HTTPPasswordMgr:
    """Map (realm, URI-prefix) pairs to (user, password) credentials.

    Used by the Basic and Digest auth handlers to look up the credentials
    matching a challenge's realm and the URI being requested.
    """

    def __init__(self):
        # {realm: {tuple-of-reduced-uris: (user, password)}}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for one URI or a sequence of URIs."""
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        # Store the URIs both with and without an explicit default port so
        # that lookups match either spelling (host vs host:80).
        for default_port in True, False:
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uri)
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for realm/authuri, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            # Make the scheme's default port explicit so "host" and
            # "host:80" compare equal.
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False
736
737
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to a None ("default") realm."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first; fall back to the default (None) realm."""
        creds = HTTPPasswordMgr.find_user_password(self, realm, authuri)
        if creds[0] is None:
            creds = HTTPPasswordMgr.find_user_password(self, None, authuri)
        return creds
746
747
class AbstractBasicAuthHandler:
    """Shared machinery for retrying requests with HTTP Basic credentials."""

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        self.passwd = HTTPPasswordMgr() if password_mgr is None else password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        mo = AbstractBasicAuthHandler.rx.search(challenge)
        if not mo:
            return None
        scheme, quote_char, realm = mo.groups()
        if scheme.lower() != 'basic':
            return None
        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Resend *req* with Basic credentials for *realm*, if we have any."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        credentials = "%s:%s" % (user, pw)
        auth = "Basic " + base64.b64encode(credentials.encode()).decode("ascii")
        if req.headers.get(self.auth_header, None) == auth:
            # These exact credentials were already sent and refused.
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
791
792
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Retry requests that drew a 401 with HTTP Basic credentials."""

    # Header used to send credentials back to the origin server.
    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.full_url
        return self.http_error_auth_reqed('www-authenticate',
                                          url, req, headers)
801
802
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Retry requests that drew a 407 with proxy Basic credentials."""

    # Header used to send credentials to the proxy (not the origin server).
    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.host
        return self.http_error_auth_reqed('proxy-authenticate',
                                          authority, req, headers)
815
816
def randombytes(n):
    """Return *n* random bytes (thin wrapper around os.urandom)."""
    return os.urandom(n)
820
821class AbstractDigestAuthHandler:
822 # Digest authentication is specified in RFC 2617.
823
824 # XXX The client does not inspect the Authentication-Info header
825 # in a successful response.
826
827 # XXX It should be possible to test this implementation against
828 # a mock server that just generates a static set of challenges.
829
830 # XXX qop="auth-int" supports is shaky
831
832 def __init__(self, passwd=None):
833 if passwd is None:
834 passwd = HTTPPasswordMgr()
835 self.passwd = passwd
836 self.add_password = self.passwd.add_password
837 self.retried = 0
838 self.nonce_count = 0
839
    def reset_retry_count(self):
        # Give http_error_auth_reqed a fresh retry budget.
        self.retried = 0
842
    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry with Digest credentials, or raise after too many attempts."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            # Only act on Digest challenges; other schemes fall through
            # (returning None) so another handler may try.
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
859
    def retry_http_digest_auth(self, req, auth):
        """Re-issue *req* with an Authorization header built from the
        server's Digest challenge; return the new response, or None."""
        token, challenge = auth.split(' ', 1)
        # Parse 'k1="v1", k2="v2", ...' into a dict of challenge fields.
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # Identical credentials were already sent and refused.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp
871
872 def get_cnonce(self, nonce):
873 # The cnonce-value is an opaque
874 # quoted string value provided by the client and used by both client
875 # and server to avoid chosen plaintext attacks, to provide mutual
876 # authentication, and to provide some message integrity protection.
877 # This isn't a fabulous effort, but it's probably Good Enough.
878 s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
879 b = s.encode("ascii") + randombytes(8)
880 dig = hashlib.sha1(b).hexdigest()
881 return dig[:16]
882
883 def get_authorization(self, req, chal):
884 try:
885 realm = chal['realm']
886 nonce = chal['nonce']
887 qop = chal.get('qop')
888 algorithm = chal.get('algorithm', 'MD5')
889 # mod_digest doesn't send an opaque, even though it isn't
890 # supposed to be optional
891 opaque = chal.get('opaque', None)
892 except KeyError:
893 return None
894
895 H, KD = self.get_algorithm_impls(algorithm)
896 if H is None:
897 return None
898
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000899 user, pw = self.passwd.find_user_password(realm, req.full_url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000900 if user is None:
901 return None
902
903 # XXX not implemented yet
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000904 if req.data is not None:
905 entdig = self.get_entity_digest(req.data, chal)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000906 else:
907 entdig = None
908
909 A1 = "%s:%s:%s" % (user, realm, pw)
910 A2 = "%s:%s" % (req.get_method(),
911 # XXX selector: what about proxies and full urls
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000912 req.selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000913 if qop == 'auth':
914 self.nonce_count += 1
915 ncvalue = '%08x' % self.nonce_count
916 cnonce = self.get_cnonce(nonce)
917 noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
918 respdig = KD(H(A1), noncebit)
919 elif qop is None:
920 respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
921 else:
922 # XXX handle auth-int.
Georg Brandl13e89462008-07-01 19:56:00 +0000923 raise URLError("qop '%s' is not supported." % qop)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000924
925 # XXX should the partial digests be encoded too?
926
927 base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000928 'response="%s"' % (user, realm, nonce, req.selector,
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000929 respdig)
930 if opaque:
931 base += ', opaque="%s"' % opaque
932 if entdig:
933 base += ', digest="%s"' % entdig
934 base += ', algorithm="%s"' % algorithm
935 if qop:
936 base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
937 return base
938
939 def get_algorithm_impls(self, algorithm):
940 # lambdas assume digest modules are imported at the top level
941 if algorithm == 'MD5':
942 H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
943 elif algorithm == 'SHA':
944 H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
945 # XXX MD5-sess
946 KD = lambda s, d: H("%s:%s" % (s, d))
947 return H, KD
948
949 def get_entity_digest(self, data, chal):
950 # XXX not implemented yet
951 return None
952
953
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The netloc component (index 1) of the parsed URL is the host
        # we must authenticate against.
        www_host = urlparse(req.full_url)[1]
        response = self.http_error_auth_reqed('www-authenticate',
                                              www_host, req, headers)
        self.reset_retry_count()
        return response
970
971
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication against proxies (HTTP 407 responses)."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # For proxy authentication the request's own host is the
        # authority to present credentials for.
        authority = req.host
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              authority, req, headers)
        self.reset_retry_count()
        return response
983
class AbstractHTTPHandler(BaseHandler):
    """Shared machinery for HTTPHandler/HTTPSHandler: request fix-up
    (do_request_) and the actual network exchange (do_open)."""

    def __init__(self, debuglevel=0):
        # Debug level is stored for use by concrete handlers.
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in missing headers (Content-type/length for POST, Host,
        and the opener's addheaders) as unredirected headers and return
        the request.  Raises URLError if the request has no host."""
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                # NOTE(review): len(data) is the byte count only when data
                # is a bytes object — confirm callers pass bytes.
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        sel_host = host
        if request.has_proxy():
            # When going through a proxy the selector is an absolute URL;
            # the Host header must name the origin server, not the proxy.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            # Opener-wide headers never override per-request ones.
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout) # will parse host:port
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Normalize header capitalization (Foo-Bar) so duplicates collapse.
        headers = dict((name.title(), val) for name, val in headers.items())
        try:
            h.request(req.get_method(), req.selector, req.data, headers)
            r = h.getresponse() # an HTTPResponse instance
        except socket.error as err:
            # Wrap low-level socket failures in the urllib exception type.
            raise URLError(err)

        r.url = req.full_url
        # Replace the .msg attribute of the HTTPResponse (header object)
        # with the reason string, because urllib clients expect the
        # response to have the reason in .msg.  It would be good to mark
        # this attribute as deprecated and get them to use info() or
        # .headers.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001058
1059
class HTTPHandler(AbstractHTTPHandler):
    """Open http:// URLs through http.client.HTTPConnection."""

    def http_open(self, req):
        # All the real work happens in the shared do_open machinery.
        return self.do_open(http.client.HTTPConnection, req)

    # Request preprocessing is identical for http and https.
    http_request = AbstractHTTPHandler.do_request_
1066
# http.client only defines HTTPSConnection when Python was built with
# SSL support, so the https handler is conditional.
if hasattr(http.client, 'HTTPSConnection'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Open https:// URLs through http.client.HTTPSConnection."""

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_
1074
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies from a CookieJar to outgoing requests and harvest
    cookies from incoming responses."""

    def __init__(self, cookiejar=None):
        # Imported lazily so the module is only loaded when cookie
        # processing is actually used.
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        # Add any stored cookies that match this request's URL.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Record cookies the server set on this response.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1092
class UnknownHandler(BaseHandler):
    """Last-resort handler: any scheme nobody else claimed is an error."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.type)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001097
def parse_keqv_list(l):
    """Parse a list of key=value strings into a dict.

    Keys are assumed not to be duplicated.  Values surrounded by double
    quotes have the quotes stripped (used for digest-auth challenges).
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Guard the quote check: the old v[0] indexing raised IndexError
        # for an empty value such as 'key='.
        if len(v) > 1 and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1107
1108def parse_http_list(s):
1109 """Parse lists as described by RFC 2068 Section 2.
1110
1111 In particular, parse comma-separated lists where the elements of
1112 the list may include quoted-strings. A quoted-string could
1113 contain a comma. A non-quoted string could have quotes in the
1114 middle. Neither commas nor quotes count if they are escaped.
1115 Only double-quotes count, not single-quotes.
1116 """
1117 res = []
1118 part = ''
1119
1120 escape = quote = False
1121 for cur in s:
1122 if escape:
1123 part += cur
1124 escape = False
1125 continue
1126 if quote:
1127 if cur == '\\':
1128 escape = True
1129 continue
1130 elif cur == '"':
1131 quote = False
1132 part += cur
1133 continue
1134
1135 if cur == ',':
1136 res.append(part)
1137 part = ''
1138 continue
1139
1140 if cur == '"':
1141 quote = True
1142
1143 part += cur
1144
1145 # append last part
1146 if part:
1147 res.append(part)
1148
1149 return [part.strip() for part in res]
1150
class FileHandler(BaseHandler):
    """Open file: URLs from the local filesystem, delegating
    file://host/... forms with a non-empty host to the FTP handler."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/':
            # file://host/path with a non-empty host: reinterpret as FTP
            # and re-dispatch through the opener.
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    # Cached lazily in get_names(); shared by all instances via the
    # class attribute.
    names = None
    def get_names(self):
        """Return the tuple of IP addresses that mean 'this machine'."""
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                    socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                # Hostname resolution failed; fall back to localhost only.
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Serve a local file with synthesized HTTP-style headers
        (Content-type, Content-length, Last-modified)."""
        import email.utils
        import mimetypes
        host = req.host
        file = req.selector
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(file)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            # Only serve the file when no host was given, or the host
            # resolves to this machine (and no explicit port was given).
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                return addinfourl(open(localfile, 'rb'), headers, 'file:'+file)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001196
1197def _safe_gethostbyname(host):
1198 try:
1199 return socket.gethostbyname(host)
1200 except socket.gaierror:
1201 return None
1202
class FTPHandler(BaseHandler):
    """Open ftp: URLs via ftplib, honoring user:password@host syntax
    and ;type= path attributes."""

    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        # URL components may be percent-encoded; decode before use.
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        # Split off ;type=... attributes, then decode each path segment.
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            # Leading empty segment from the initial '/'.
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Binary ('I') transfer for files, directory listing ('D')
            # when the path ends in '/'.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            # Re-raise as URLError but keep the original traceback.
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # Overridden by CacheFTPHandler to reuse connections.
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
        return fw
1260
class CacheFTPHandler(FTPHandler):
    """FTP handler that caches ftpwrapper connections for reuse.

    Idle connections are kept for ``delay`` seconds and the cache is
    bounded by ``max_conns`` entries.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}       # key -> ftpwrapper connection
        self.timeout = {}     # key -> absolute expiry time
        self.soonest = 0      # earliest expiry among cached entries
        self.delay = 60       # seconds an idle connection is kept
        self.max_conns = 16   # upper bound on cached connections

    def setTimeout(self, t):
        """Set how long (in seconds) idle connections are kept."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for this endpoint, creating one
        and refreshing its expiry time as needed."""
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce the size bound."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            # The original called min() unconditionally, which raised
            # ValueError once every entry had expired.
            if self.timeout:
                self.soonest = min(self.timeout.values())
            else:
                self.soonest = 0

        # then check the size (>= so an overshoot still triggers a trim)
        if len(self.cache) >= self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # Close before dropping, otherwise the FTP connection
                    # leaks.
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            if self.timeout:
                self.soonest = min(self.timeout.values())
            else:
                self.soonest = 0
1307
1308# Code move from the old urllib module
1309
MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
# NOTE(review): os.name == 'mac' refers to classic Mac OS; presumably a
# legacy branch inherited from the old urllib — confirm it is still
# reachable on supported platforms.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001327
1328# This really consists of two pieces:
1329# (1) a class which handles opening of all sorts of URLs
1330# (plus assorted utilities etc.)
1331# (2) a set of functions for parsing URLs
1332# XXX Should these be separated out into different modules?
1333
1334
# Module-wide FTP connection cache shared by URLopener instances by
# default (see URLopener.__init__).  XXX not thread-safe.
ftpcache = {}
1336class URLopener:
1337 """Class to open URLs.
1338 This is a class rather than just a subroutine because we may need
1339 more than one set of global protocol-specific options.
1340 Note -- this is a base class for those who don't want the
1341 automatic handling of errors type 302 (relocated) and 401
1342 (authorization needed)."""
1343
1344 __tempfiles = None
1345
1346 version = "Python-urllib/%s" % __version__
1347
1348 # Constructor
    def __init__(self, proxies=None, **x509):
        """Create an opener.

        proxies maps scheme -> proxy URL; it defaults to the settings
        returned by getproxies().  x509 may supply key_file/cert_file
        for HTTPS client authentication.
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve(). This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe. Bah.
1371
    def __del__(self):
        # Remove temporary files when the opener is garbage collected.
        self.close()
1374
    def close(self):
        # Public hook to release resources (temp files, temp cache).
        self.cleanup()
1377
    def cleanup(self):
        """Delete tracked temporary files and clear the temp cache."""
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    # Best-effort: the file may already be gone.
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()
1391
    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        # args is stored verbatim as a (name, value) tuple.
        self.addheaders.append(args)
1396
1397 # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(to_bytes(fullurl))
        # Percent-quote unsafe characters while leaving reserved URL
        # syntax intact.
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]")
        if self.tempcache and fullurl in self.tempcache:
            # Serve from the (optional) retrieve() cache.
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            # No scheme given: treat as a local file path.
            urltype = 'file'
        if urltype in self.proxies:
            # Route through the configured proxy; open_*() detects the
            # tuple form of url as the proxied case.
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        # Dispatch to the scheme-specific open_<scheme> method.
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error as msg:
            # Present socket failures as IOError, keeping the traceback.
            raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1431
1432 def open_unknown(self, fullurl, data=None):
1433 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001434 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001435 raise IOError('url error', 'unknown url type', type)
1436
1437 def open_unknown_proxy(self, proxy, fullurl, data=None):
1438 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001439 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001440 raise IOError('url error', 'invalid proxy for %s' % type, proxy)
1441
1442 # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        if filename is None and (not type or type == 'file'):
            # Local file and no target name: no copy needed, hand back
            # the filesystem path directly.
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError as msg:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target name: download into a temp file whose suffix
                # matches the URL path's extension; track it for cleanup().
                import tempfile
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if reporthook:
                    if "content-length" in headers:
                        size = int(headers["Content-Length"])
                    # Initial callback before any data arrives.
                    reporthook(blocknum, bs, size)
                # Copy in bs-sized blocks, reporting progress as we go.
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result
1508
1509 # Each method named open_<type> knows how to open that type of URL
1510
1511 def _open_generic_http(self, connection_factory, url, data):
1512 """Make an HTTP connection using connection_class.
1513
1514 This is an internal method that should be called from
1515 open_http() or open_https().
1516
1517 Arguments:
1518 - connection_factory should take a host name and return an
1519 HTTPConnection instance.
1520 - url is the url to retrieval or a host, relative-path pair.
1521 - data is payload for a POST request or None.
1522 """
1523
1524 user_passwd = None
1525 proxy_passwd= None
1526 if isinstance(url, str):
Georg Brandl13e89462008-07-01 19:56:00 +00001527 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001528 if host:
Georg Brandl13e89462008-07-01 19:56:00 +00001529 user_passwd, host = splituser(host)
1530 host = unquote(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001531 realhost = host
1532 else:
1533 host, selector = url
1534 # check whether the proxy contains authorization information
Georg Brandl13e89462008-07-01 19:56:00 +00001535 proxy_passwd, host = splituser(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001536 # now we proceed with the url we want to obtain
Georg Brandl13e89462008-07-01 19:56:00 +00001537 urltype, rest = splittype(selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001538 url = rest
1539 user_passwd = None
1540 if urltype.lower() != 'http':
1541 realhost = None
1542 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001543 realhost, rest = splithost(rest)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001544 if realhost:
Georg Brandl13e89462008-07-01 19:56:00 +00001545 user_passwd, realhost = splituser(realhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001546 if user_passwd:
1547 selector = "%s://%s%s" % (urltype, realhost, rest)
1548 if proxy_bypass(realhost):
1549 host = realhost
1550
1551 #print "proxy via http:", host, selector
1552 if not host: raise IOError('http error', 'no host given')
1553
1554 if proxy_passwd:
1555 import base64
1556 proxy_auth = base64.b64encode(proxy_passwd).strip()
1557 else:
1558 proxy_auth = None
1559
1560 if user_passwd:
1561 import base64
1562 auth = base64.b64encode(user_passwd).strip()
1563 else:
1564 auth = None
1565 http_conn = connection_factory(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001566 headers = {}
1567 if proxy_auth:
1568 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1569 if auth:
1570 headers["Authorization"] = "Basic %s" % auth
1571 if realhost:
1572 headers["Host"] = realhost
1573 for header, value in self.addheaders:
1574 headers[header] = value
1575
1576 if data is not None:
1577 headers["Content-Type"] = "application/x-www-form-urlencoded"
1578 http_conn.request("POST", selector, data, headers)
1579 else:
1580 http_conn.request("GET", selector, headers=headers)
1581
1582 try:
1583 response = http_conn.getresponse()
1584 except http.client.BadStatusLine:
1585 # something went wrong with the HTTP status line
Georg Brandl13e89462008-07-01 19:56:00 +00001586 raise URLError("http protocol error: bad status line")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001587
1588 # According to RFC 2616, "2xx" code indicates that the client's
1589 # request was successfully received, understood, and accepted.
1590 if 200 <= response.status < 300:
Antoine Pitroub353c122009-02-11 00:39:14 +00001591 return addinfourl(response, response.msg, "http:" + url,
Georg Brandl13e89462008-07-01 19:56:00 +00001592 response.status)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001593 else:
1594 return self.http_error(
1595 url, response.fp,
1596 response.status, response.reason, response.msg, data)
1597
    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        # Thin wrapper binding the plain-HTTP connection class.
        return self._open_generic_http(http.client.HTTPConnection, url, data)
1601
1602 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1603 """Handle http errors.
1604
1605 Derived class can override this, or provide specific handlers
1606 named http_error_DDD where DDD is the 3-digit error code."""
1607 # First check if there's a specific handler for this error
1608 name = 'http_error_%d' % errcode
1609 if hasattr(self, name):
1610 method = getattr(self, name)
1611 if data is None:
1612 result = method(url, fp, errcode, errmsg, headers)
1613 else:
1614 result = method(url, fp, errcode, errmsg, headers, data)
1615 if result: return result
1616 return self.http_error_default(url, fp, errcode, errmsg, headers)
1617
1618 def http_error_default(self, url, fp, errcode, errmsg, headers):
1619 """Default error handler: close the connection and raise IOError."""
1620 void = fp.read()
1621 fp.close()
Georg Brandl13e89462008-07-01 19:56:00 +00001622 raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001623
    # HTTPS support exists only when the ssl module is available.
    if _have_ssl:
        def _https_connection(self, host):
            # Connection factory for _open_generic_http, forwarding the
            # client certificate configuration captured in __init__.
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)
1633
1634 def open_file(self, url):
1635 """Use local file or FTP depending on form of URL."""
1636 if not isinstance(url, str):
1637 raise URLError('file error', 'proxy support for file protocol currently not implemented')
1638 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
1639 return self.open_ftp(url)
1640 else:
1641 return self.open_local_file(url)
1642
    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, email.utils
        from io import StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            # NOTE(review): URLError appears to take (reason, filename);
            # passing three positional args may itself raise TypeError —
            # verify against urllib.error.
            raise URLError(e.errno, e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        # Synthesize HTTP-style headers for the local file.
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = splitport(host)
        # A host is acceptable only without an explicit port and when it
        # resolves to this machine.
        if (not port
            and socket.gethostbyname(host) in (localhost(), thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error', 'not on local host')
1672
1673 def open_ftp(self, url):
1674 """Use FTP protocol."""
1675 if not isinstance(url, str):
1676 raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
1677 import mimetypes
1678 from io import StringIO
Georg Brandl13e89462008-07-01 19:56:00 +00001679 host, path = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001680 if not host: raise URLError('ftp error', 'no host given')
Georg Brandl13e89462008-07-01 19:56:00 +00001681 host, port = splitport(host)
1682 user, host = splituser(host)
1683 if user: user, passwd = splitpasswd(user)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001684 else: passwd = None
Georg Brandl13e89462008-07-01 19:56:00 +00001685 host = unquote(host)
1686 user = unquote(user or '')
1687 passwd = unquote(passwd or '')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001688 host = socket.gethostbyname(host)
1689 if not port:
1690 import ftplib
1691 port = ftplib.FTP_PORT
1692 else:
1693 port = int(port)
Georg Brandl13e89462008-07-01 19:56:00 +00001694 path, attrs = splitattr(path)
1695 path = unquote(path)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001696 dirs = path.split('/')
1697 dirs, file = dirs[:-1], dirs[-1]
1698 if dirs and not dirs[0]: dirs = dirs[1:]
1699 if dirs and not dirs[0]: dirs[0] = '/'
1700 key = user, host, port, '/'.join(dirs)
1701 # XXX thread unsafe!
1702 if len(self.ftpcache) > MAXFTPCACHE:
1703 # Prune the cache, rather arbitrarily
1704 for k in self.ftpcache.keys():
1705 if k != key:
1706 v = self.ftpcache[k]
1707 del self.ftpcache[k]
1708 v.close()
1709 try:
1710 if not key in self.ftpcache:
1711 self.ftpcache[key] = \
1712 ftpwrapper(user, passwd, host, port, dirs)
1713 if not file: type = 'D'
1714 else: type = 'I'
1715 for attr in attrs:
Georg Brandl13e89462008-07-01 19:56:00 +00001716 attr, value = splitvalue(attr)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001717 if attr.lower() == 'type' and \
1718 value in ('a', 'A', 'i', 'I', 'd', 'D'):
1719 type = value.upper()
1720 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
1721 mtype = mimetypes.guess_type("ftp:" + url)[0]
1722 headers = ""
1723 if mtype:
1724 headers += "Content-Type: %s\n" % mtype
1725 if retrlen is not None and retrlen >= 0:
1726 headers += "Content-Length: %d\n" % retrlen
1727 headers = email.message_from_string(headers)
Georg Brandl13e89462008-07-01 19:56:00 +00001728 return addinfourl(fp, headers, "ftp:" + url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001729 except ftperrors() as msg:
1730 raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1731
1732 def open_data(self, url, data=None):
1733 """Use "data" URL."""
1734 if not isinstance(url, str):
1735 raise URLError('data error', 'proxy support for data protocol currently not implemented')
1736 # ignore POSTed data
1737 #
1738 # syntax of data URLs:
1739 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
1740 # mediatype := [ type "/" subtype ] *( ";" parameter )
1741 # data := *urlchar
1742 # parameter := attribute "=" value
1743 try:
1744 [type, data] = url.split(',', 1)
1745 except ValueError:
1746 raise IOError('data error', 'bad data URL')
1747 if not type:
1748 type = 'text/plain;charset=US-ASCII'
1749 semi = type.rfind(';')
1750 if semi >= 0 and '=' not in type[semi:]:
1751 encoding = type[semi+1:]
1752 type = type[:semi]
1753 else:
1754 encoding = ''
1755 msg = []
1756 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
1757 time.gmtime(time.time())))
1758 msg.append('Content-type: %s' % type)
1759 if encoding == 'base64':
1760 import base64
Georg Brandl706824f2009-06-04 09:42:55 +00001761 # XXX is this encoding/decoding ok?
1762 data = base64.decodebytes(data.encode('ascii')).decode('latin1')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001763 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001764 data = unquote(data)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001765 msg.append('Content-Length: %d' % len(data))
1766 msg.append('')
1767 msg.append(data)
1768 msg = '\n'.join(msg)
Georg Brandl13e89462008-07-01 19:56:00 +00001769 headers = email.message_from_string(msg)
1770 f = io.StringIO(msg)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001771 #f.fileno = None # needed for addinfourl
Georg Brandl13e89462008-07-01 19:56:00 +00001772 return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001773
1774
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # Cache of (user, passwd) tuples keyed by 'realm@host'.
        self.auth_cache = {}
        # Redirect-recursion guard: tries counts redirects for the
        # current request; maxtries == 0 disables the limit.
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            # Too many consecutive redirects: report as a synthetic
            # 500 rather than looping forever.
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        # Follow a redirect to the URL named by the Location (or,
        # failing that, URI) response header; returns None when the
        # response carries neither.
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        # Drain and close the old response before reopening.
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # NOTE(review): the three URLopener.http_error_default calls
        # below have no 'return'; this relies on that method raising
        # (so control never reaches the next line) -- confirm.
        if not 'www-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # Dispatch on the scheme of the original request, e.g.
        # retry_http_basic_auth or retry_https_basic_auth.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Same structure as http_error_401, but driven by the
        # Proxy-Authenticate header and the retry_proxy_* methods.
        if not 'proxy-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request after embedding user:passwd credentials
        # in the http proxy URL stored in self.proxies.
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any credentials already present in the proxy host.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        # https analogue of retry_proxy_http_basic_auth.
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request with user:passwd credentials embedded
        # directly in the http URL's host part.
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        # https analogue of retry_http_basic_auth.
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        # Return cached credentials for realm@host, prompting the user
        # (via prompt_user_passwd) on a cache miss; clear_cache forces
        # a fresh prompt after a failed attempt.
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            # User aborted the prompt; signal "no credentials".
            print()
            return None, None
1962
1963
1964# Utility functions
1965
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    # Resolve once and memoize in the module-level cache.
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
1973
_thishost = None
def thishost():
    """Return the IP address of the current host."""
    # Resolve once and memoize in the module-level cache.
    global _thishost
    if _thishost is not None:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
1981
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    # Import ftplib lazily and cache its all_errors tuple.
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
1990
_noheaders = None
def noheaders():
    """Return an empty email Message object."""
    # Built once on first use; all callers share the same instance.
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
1998
1999
2000# Utility classes
2001
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        # Connection parameters are stored so init() can reconnect
        # after the server drops an idle cached connection.
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        # (Re)establish the FTP connection, log in, and change into
        # the target directory chain.
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        # Retrieve *file* with transfer type 'A'/'I', or a directory
        # listing for type 'D'/'d'; returns (file-like object, length).
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection went stale; reconnect and retry.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 means "not a plain file": fall through to the
                # directory-listing path below; anything else is fatal.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'), self.endtransfer), conn[1])
    def endtransfer(self):
        # Consume the end-of-transfer response so the control
        # connection is ready for the next command.
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        # Best-effort shutdown of the control connection.
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2078
2079# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention. If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    # Environment variable names are matched case-insensitively;
    # empty values are ignored.
    return {name.lower()[:-6]: value
            for name, value in os.environ.items()
            if value and name.lower().endswith('_proxy')}
2095
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    if no_proxy == '*':
        # '*' is special case for always bypass
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # bypass when the host, with or without its port, ends in one of
    # the listed DNS suffixes
    return int(any(suffix and (hostonly.endswith(suffix) or host.endswith(suffix))
                   for suffix in no_proxy.split(',')))
2114
2115
# Platform-specific getproxies()/proxy_bypass() implementations:
# Internet Config on macOS, the registry on Windows, and plain
# environment variables everywhere else.
if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies. An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXX To be done.
        # Gopher: XXX To be done.
        return proxies

    def proxy_bypass(host):
        # Honor no_proxy when proxies come from the environment;
        # Internet Config has no bypass list, so never bypass otherwise.
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return 0

    def getproxies():
        # Environment variables take precedence over Internet Config.
        return getproxies_environment() or getproxies_internetconfig()

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        # Consult the ProxyOverride registry value: a ';'-separated
        # list of glob patterns, optionally including the magic token
        # '<local>' meaning "hosts without a dot".
        try:
            import winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # Split the override list; the special '<local>' entry is
        # handled inline in the loop below, every other entry is a
        # glob pattern translated to a regular expression.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                # '<local>' matches any host name without a dot.
                if '.' not in rawHost:
                    return 1
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return 1 if *host* should be contacted directly (bypassing
        the proxy), 0 otherwise.

        Uses the no_proxy environment setting when proxies come from
        the environment; otherwise consults the registry.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment