blob: 42e6d17206dd398073438047ffa416b371c597c6 [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
below). It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib. pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back. One difference is that you can also pass
20a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers. Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
33OpenerDirector --
34
35Request -- An object that encapsulates the state of a request. The
36state can be as simple as the URL. It can also include extra HTTP
37headers, e.g. a User-Agent.
38
39BaseHandler --
40
41internals:
42BaseHandler and parent
43_call_chain conventions
44
45Example usage:
46
Georg Brandl029986a2008-06-23 11:44:14 +000047import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000048
49# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000050authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000051authinfo.add_password(realm='PDQ Application',
52 uri='https://mahler:8092/site-updates.py',
53 user='klem',
54 passwd='geheim$parole')
55
Georg Brandl029986a2008-06-23 11:44:14 +000056proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000057
58# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000059opener = urllib.request.build_opener(proxy_support, authinfo,
60 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000061
62# install it
Georg Brandl029986a2008-06-23 11:44:14 +000063urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000064
Georg Brandl029986a2008-06-23 11:44:14 +000065f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066"""
67
68# XXX issues:
69# If an authentication error handler that tries to perform
70# authentication for some reason but fails, how should the error be
71# signalled? The client needs to know the HTTP error code. But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algorithm requested in the challenge, it would be good to
74# pass that information along to the client, too.
75# ftp errors aren't handled cleanly
76# check digest against correct (i.e. non-apache) implementation
77
78# Possible extensions:
79# complex proxies XXX not sure what exactly was meant by this
80# abstract factory for opener
81
82import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000083import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000084import email
85import hashlib
86import http.client
87import io
88import os
89import posixpath
90import random
91import re
92import socket
93import sys
94import time
Jeremy Hylton1afc1692008-06-18 20:49:58 +000095
Georg Brandl13e89462008-07-01 19:56:00 +000096from urllib.error import URLError, HTTPError, ContentTooShortError
97from urllib.parse import (
98 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
99 splittype, splithost, splitport, splituser, splitpasswd,
Facundo Batistaf24802c2008-08-17 03:36:03 +0000100 splitattr, splitquery, splitvalue, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000101from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000102
103# check for SSL
104try:
105 import ssl
106except:
107 _have_ssl = False
108else:
109 _have_ssl = True
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000110
111# used in User-Agent header sent
112__version__ = sys.version[:3]
113
# The single module-wide opener, created lazily on first use.
_opener = None

def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Open *url* (a string or a Request object) via the module-wide
    opener, building a default opener on first use."""
    global _opener
    if _opener is None:
        _opener = build_opener()
    return _opener.open(url, data, timeout)
120
def install_opener(opener):
    """Make *opener* the OpenerDirector used by urlopen()."""
    global _opener
    _opener = opener
124
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None

def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* into a local file via a shared FancyURLopener."""
    global _urlopener
    if _urlopener is None:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
132
def urlcleanup():
    """Discard cached opener state left behind by urlretrieve/urlopen."""
    global _opener
    if _urlopener:
        _urlopener.cleanup()
    if _opener:
        _opener = None
139
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)

def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: the returned value is lowercased, for
    convenient comparison.
    """
    netloc = urlparse(request.full_url)[1]
    if netloc == "":
        # relative URL (no authority): fall back to the Host header
        netloc = request.get_header("Host", "")
    # drop any trailing :port and normalise case
    return _cut_port_re.sub("", netloc, 1).lower()
157
class Request:
    """Encapsulate the state of a single URL request.

    The state can be as simple as the URL itself; it may also carry
    POST data, extra HTTP headers, and the cookie-related origin
    information used for RFC 2965 unverifiability checks.
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.data = data
        self.headers = {}
        # headers defaulted to a shared mutable dict ({}); accept None
        # and normalize here to avoid the mutable-default pitfall.
        for key, value in (headers or {}).items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        """Split full_url into self.type, self.host and self.selector."""
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return the HTTP method: POST when data is present, else GET."""
        if self.data is not None:
            return "POST"
        else:
            return "GET"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route this request through a proxy: host/type now name the
        proxy and the selector becomes the full URL."""
        self.host, self.type = host, type
        self.selector = self.full_url

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return all headers as (name, value) pairs; regular headers
        shadow unredirected ones on conflict."""
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
248
class OpenerDirector:
    """Manage a chain of BaseHandler objects and use them to open URLs.

    Handlers register via add_handler(); open() then routes a request
    through per-protocol request processors, the protocol opener, and
    per-protocol response processors.
    """
    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []            # all handlers, sorted by handler_order
        self.handle_open = {}         # protocol -> [handlers with <proto>_open]
        self.handle_error = {}        # protocol -> {code -> [handlers]}
        self.process_response = {}    # protocol -> [<proto>_response handlers]
        self.process_request = {}     # protocol -> [<proto>_request handlers]

    def add_handler(self, handler):
        """Register *handler*, indexing each of its <proto>_<cond> methods."""
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # Split a name like "http_error_404" into protocol ("http")
            # and condition ("error_404").
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # kind is the trailing error code: an int when numeric,
                # otherwise the literal suffix (e.g. "default").
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            # Keep each per-kind list sorted; BaseHandler.__lt__ orders
            # by handler_order.
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in a specific order, the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request); return the response."""
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # Try default_open first, then the protocol-specific opener,
        # then unknown_open as a last resort.
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered <proto>_error_<code> chain."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!  args[2] is the HTTP status code here.
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        # Nothing handled the specific code; fall back to the catch-all
        # http_error_default handlers.
        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
387
388# XXX probably also want an abstract factory that knows when it makes
389# sense to skip a superclass in favor of a subclass and when it might
390# make sense to include both
391
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        # Treat real classes and old-style class-likes as classes.
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # A default is skipped when the caller supplied a subclass of it
    # (as a class) or an instance of it.
    skipped = {klass
               for klass in default_classes
               for check in handlers
               if (issubclass(check, klass) if isclass(check)
                   else isinstance(check, klass))}

    for klass in default_classes:
        if klass not in skipped:
            opener.add_handler(klass())

    for h in handlers:
        opener.add_handler(h() if isclass(h) else h)
    return opener
429
class BaseHandler:
    """Common base for protocol handlers managed by OpenerDirector."""

    # Position in the sorted handler chain; subclasses lower this to
    # run earlier or raise it to run later.
    handler_order = 500

    def add_parent(self, parent):
        """Remember the OpenerDirector this handler was added to."""
        self.parent = parent

    def close(self):
        """Does nothing; kept only for backwards compatibility."""

    def __lt__(self, other):
        """Order handlers by handler_order for sorted insertion."""
        try:
            other_order = other.handler_order
        except AttributeError:
            # Preserve the old behavior of sorting custom classes that
            # are unaware of handler_order after the default ones.
            return True
        return self.handler_order < other_order
447
448
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Pass 2xx responses through untouched; route anything else
        into the opener's error machinery."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, a "2xx" code indicates that the
        # client's request was successfully received, understood, and
        # accepted -- nothing more to do.
        if 200 <= code < 300:
            return response
        return self.parent.error('http', request, response, code, msg, hdrs)

    https_response = http_response
465
class HTTPDefaultErrorHandler(BaseHandler):
    # Fallback for HTTP error codes no other handler claimed: surface
    # the response to the caller as an HTTPError exception (which can
    # itself be treated as a response object).
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000469
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 redirects, with loop detection."""
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        # Redirect GET/HEAD for all four codes; redirect POST only for
        # 301/302/303 (307 requires the method to be preserved).
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # Drop entity headers: the redirected request carries no body.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow one redirect hop, enforcing per-URL and total limits."""
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL (empty path component)
        urlparts = urlparse(newurl)
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        # The Location value may be relative; resolve it against the
        # URL we just requested.
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
562
563
564def _parse_proxy(proxy):
565 """Return (scheme, user, password, host/port) given a URL or an authority.
566
567 If a URL is supplied, it must have an authority (host:port) component.
568 According to RFC 3986, having an authority component means the URL must
569 have two slashes after the scheme:
570
571 >>> _parse_proxy('file:/ftp.example.com/')
572 Traceback (most recent call last):
573 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
574
575 The first three items of the returned tuple may be None.
576
577 Examples of authority parsing:
578
579 >>> _parse_proxy('proxy.example.com')
580 (None, None, None, 'proxy.example.com')
581 >>> _parse_proxy('proxy.example.com:3128')
582 (None, None, None, 'proxy.example.com:3128')
583
584 The authority component may optionally include userinfo (assumed to be
585 username:password):
586
587 >>> _parse_proxy('joe:password@proxy.example.com')
588 (None, 'joe', 'password', 'proxy.example.com')
589 >>> _parse_proxy('joe:password@proxy.example.com:3128')
590 (None, 'joe', 'password', 'proxy.example.com:3128')
591
592 Same examples, but with URLs instead:
593
594 >>> _parse_proxy('http://proxy.example.com/')
595 ('http', None, None, 'proxy.example.com')
596 >>> _parse_proxy('http://proxy.example.com:3128/')
597 ('http', None, None, 'proxy.example.com:3128')
598 >>> _parse_proxy('http://joe:password@proxy.example.com/')
599 ('http', 'joe', 'password', 'proxy.example.com')
600 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
601 ('http', 'joe', 'password', 'proxy.example.com:3128')
602
603 Everything after the authority is ignored:
604
605 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
606 ('ftp', 'joe', 'password', 'proxy.example.com')
607
608 Test for no trailing '/' case:
609
610 >>> _parse_proxy('http://joe:password@proxy.example.com')
611 ('http', 'joe', 'password', 'proxy.example.com')
612
613 """
Georg Brandl13e89462008-07-01 19:56:00 +0000614 scheme, r_scheme = splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000615 if not r_scheme.startswith("/"):
616 # authority
617 scheme = None
618 authority = proxy
619 else:
620 # URL
621 if not r_scheme.startswith("//"):
622 raise ValueError("proxy URL with no authority: %r" % proxy)
623 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
624 # and 3.3.), path is empty or starts with '/'
625 end = r_scheme.find("/", 2)
626 if end == -1:
627 end = None
628 authority = r_scheme[2:end]
Georg Brandl13e89462008-07-01 19:56:00 +0000629 userinfo, hostport = splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000630 if userinfo is not None:
Georg Brandl13e89462008-07-01 19:56:00 +0000631 user, password = splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000632 else:
633 user = password = None
634 return scheme, user, password, hostport
635
class ProxyHandler(BaseHandler):
    """Redirect requests through the proxies given in a mapping such as
    {'http': 'http://proxy.example.com:3128/'}."""
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        # Default to the environment/platform proxy settings.
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            # Synthesize a <scheme>_open method per proxied scheme.  The
            # lambda's default arguments deliberately freeze the current
            # url/type/bound-method (avoids the late-binding closure bug).
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point *req* at the proxy; add Basic proxy credentials when the
        proxy URL carries userinfo."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req)
673
class HTTPPasswordMgr:
    """Store and look up (user, password) pairs keyed by realm and URI."""

    def __init__(self):
        # Maps realm -> {(reduced_uri, ...): (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* at *uri* (a URI or a sequence
        of URIs)."""
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        # Index the URIs both with and without an explicit default port
        # so a later lookup matches either spelling ("host" / "host:80").
        for default_port in True, False:
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uri)
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for realm/authuri, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Same authority: test's path must extend base's path.
        common = posixpath.commonprefix((base[1], test[1]))
        return len(common) == len(base[1])
736
737
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to a wildcard realm of None."""

    def find_user_password(self, realm, authuri):
        # Prefer an exact realm match; otherwise retry with the
        # catch-all realm None.
        user, password = HTTPPasswordMgr.find_user_password(
            self, realm, authuri)
        if user is None:
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
746
747
class AbstractBasicAuthHandler:
    """Shared machinery for HTTP and proxy Basic authentication."""

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry *req* with Basic credentials if the challenge in header
        *authreq* names the Basic scheme; otherwise return None.

        host may be an authority (without userinfo) or a URL with an
        authority.
        """
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        mo = AbstractBasicAuthHandler.rx.search(challenge)
        if not mo:
            return None
        scheme, quote, realm = mo.groups()
        if scheme.lower() != 'basic':
            return None
        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue *req* with an Authorization header, or return None
        when no credentials are known or they were already tried."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        credentials = "%s:%s" % (user, pw)
        auth = "Basic " + base64.b64encode(credentials.encode()).decode("ascii")
        if req.headers.get(self.auth_header, None) == auth:
            # These exact credentials already failed once; give up.
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
791
792
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer HTTP 401 challenges with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # The server's challenge arrives in WWW-Authenticate; hand the
        # full request URL to the shared Basic-auth machinery.
        return self.http_error_auth_reqed('www-authenticate',
                                          req.full_url, req, headers)
801
802
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer HTTP 407 challenges with Basic credentials for a proxy."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo
        # component in authority.  Assume there isn't one, since
        # urllib.request does not (and should not, RFC 3986 s. 3.2.1)
        # support requests for URLs containing userinfo.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          req.host, req, headers)
815
816
def randombytes(n):
    """Return n random bytes (sourced from os.urandom)."""
    return os.urandom(n)
820
821class AbstractDigestAuthHandler:
822 # Digest authentication is specified in RFC 2617.
823
824 # XXX The client does not inspect the Authentication-Info header
825 # in a successful response.
826
827 # XXX It should be possible to test this implementation against
828 # a mock server that just generates a static set of challenges.
829
830 # XXX qop="auth-int" supports is shaky
831
    def __init__(self, passwd=None):
        # Fall back to a plain HTTPPasswordMgr when no manager is given.
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        # retried counts consecutive failed auth attempts (capped in
        # http_error_auth_reqed); nonce_count feeds the digest nc value.
        self.retried = 0
        self.nonce_count = 0
839
    def reset_retry_count(self):
        # Reset the consecutive-failure counter used by the retry cap
        # in http_error_auth_reqed.
        self.retried = 0
842
    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry the request with Digest credentials, or raise HTTPError
        after more than five consecutive failures."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            # Only the Digest scheme is handled here; any other scheme
            # is left for other handlers (implicitly returns None).
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
859
    def retry_http_digest_auth(self, req, auth):
        """Re-issue *req* with a Digest authorization header built from
        the challenge string *auth*; return the new response, or None
        when no credentials are known or this exact digest was already
        sent."""
        token, challenge = auth.split(' ', 1)
        # parse_http_list splits the comma-separated challenge fields;
        # parse_keqv_list turns the key=value parts into a dict.
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # Identical digest already failed once; don't loop.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp
871
872 def get_cnonce(self, nonce):
873 # The cnonce-value is an opaque
874 # quoted string value provided by the client and used by both client
875 # and server to avoid chosen plaintext attacks, to provide mutual
876 # authentication, and to provide some message integrity protection.
877 # This isn't a fabulous effort, but it's probably Good Enough.
878 s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
879 b = s.encode("ascii") + randombytes(8)
880 dig = hashlib.sha1(b).hexdigest()
881 return dig[:16]
882
883 def get_authorization(self, req, chal):
884 try:
885 realm = chal['realm']
886 nonce = chal['nonce']
887 qop = chal.get('qop')
888 algorithm = chal.get('algorithm', 'MD5')
889 # mod_digest doesn't send an opaque, even though it isn't
890 # supposed to be optional
891 opaque = chal.get('opaque', None)
892 except KeyError:
893 return None
894
895 H, KD = self.get_algorithm_impls(algorithm)
896 if H is None:
897 return None
898
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000899 user, pw = self.passwd.find_user_password(realm, req.full_url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000900 if user is None:
901 return None
902
903 # XXX not implemented yet
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000904 if req.data is not None:
905 entdig = self.get_entity_digest(req.data, chal)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000906 else:
907 entdig = None
908
909 A1 = "%s:%s:%s" % (user, realm, pw)
910 A2 = "%s:%s" % (req.get_method(),
911 # XXX selector: what about proxies and full urls
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000912 req.selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000913 if qop == 'auth':
914 self.nonce_count += 1
915 ncvalue = '%08x' % self.nonce_count
916 cnonce = self.get_cnonce(nonce)
917 noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
918 respdig = KD(H(A1), noncebit)
919 elif qop is None:
920 respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
921 else:
922 # XXX handle auth-int.
Georg Brandl13e89462008-07-01 19:56:00 +0000923 raise URLError("qop '%s' is not supported." % qop)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000924
925 # XXX should the partial digests be encoded too?
926
927 base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +0000928 'response="%s"' % (user, realm, nonce, req.selector,
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000929 respdig)
930 if opaque:
931 base += ', opaque="%s"' % opaque
932 if entdig:
933 base += ', digest="%s"' % entdig
934 base += ', algorithm="%s"' % algorithm
935 if qop:
936 base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
937 return base
938
939 def get_algorithm_impls(self, algorithm):
940 # lambdas assume digest modules are imported at the top level
941 if algorithm == 'MD5':
942 H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
943 elif algorithm == 'SHA':
944 H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
945 # XXX MD5-sess
946 KD = lambda s, d: H("%s:%s" % (s, d))
947 return H, KD
948
949 def get_entity_digest(self, data, chal):
950 # XXX not implemented yet
951 return None
952
953
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The netloc component of the URL names the host to authenticate to.
        auth_host = urlparse(req.full_url)[1]
        response = self.http_error_auth_reqed('www-authenticate',
                                              auth_host, req, headers)
        self.reset_retry_count()
        return response
970
971
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication against a proxy (HTTP 407 responses)."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # For a proxy challenge the request host is the authenticating party.
        proxy_host = req.host
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              proxy_host, req, headers)
        self.reset_retry_count()
        return response
983
class AbstractHTTPHandler(BaseHandler):
    """Shared implementation for HTTPHandler and HTTPSHandler.

    Subclasses pick the concrete http.client connection class and hand
    it to do_open().
    """

    def __init__(self, debuglevel=0):
        # Stored for subclasses; not consulted directly in this class.
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in default headers (Content-type/length for POST data,
        Host, and the director's addheaders) and return the request."""
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None: # POST
            data = request.data
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                # NOTE(review): len(data) assumes data is a sized
                # bytes-like object, not an iterable -- confirm callers.
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        sel_host = host
        if request.has_proxy():
            # When proxied, the Host header must name the origin server
            # taken from the full selector URL, not the proxy itself.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            # Caller-supplied headers always win over director defaults.
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout) # will parse host:port
        # Merge ordinary and unredirected headers; unredirected win.
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Normalize header capitalization (e.g. 'content-type' -> 'Content-Type').
        headers = dict((name.title(), val) for name, val in headers.items())
        try:
            h.request(req.get_method(), req.selector, req.data, headers)
            r = h.getresponse() # an HTTPResponse instance
        except socket.error as err:
            # Present network failures uniformly as URLError.
            raise URLError(err)

        r.url = req.full_url
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001058
1059
class HTTPHandler(AbstractHTTPHandler):
    """Opener-director handler for plain 'http' URLs."""

    def http_open(self, req):
        # All the real work happens in AbstractHTTPHandler.do_open().
        return self.do_open(http.client.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_
1066
if hasattr(http.client, 'HTTPSConnection'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Handler for 'https' URLs; defined only when the http.client
        build has SSL support."""

        def https_open(self, req):
            # Identical machinery to HTTPHandler, over an SSL connection.
            return self.do_open(http.client.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_
1074
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to outgoing requests and harvest them from responses."""

    def __init__(self, cookiejar=None):
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        # Let the jar add any Cookie headers that match this request.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Record any Set-Cookie headers the server sent back.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1092
class UnknownHandler(BaseHandler):
    """Last-resort handler: any scheme nobody else claimed is an error."""

    def unknown_open(self, req):
        scheme = req.type
        raise URLError('unknown url type: %s' % scheme)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001097
def parse_keqv_list(l):
    """Parse a list of key=value strings where keys are not duplicated.

    Values surrounded by double quotes have the quotes stripped.
    Returns a dict; a later duplicate key silently overwrites an
    earlier one.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Guard against an empty value ('key='): the original indexed
        # v[0] unconditionally and raised IndexError on empty strings.
        if v and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1107
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    current = []
    in_quotes = False
    pending_escape = False

    for ch in s:
        if pending_escape:
            # Previous char was a backslash inside quotes: take this
            # char literally (the backslash itself is dropped).
            current.append(ch)
            pending_escape = False
        elif in_quotes:
            if ch == '\\':
                pending_escape = True
            else:
                current.append(ch)
                if ch == '"':
                    in_quotes = False
        elif ch == ',':
            # Element separator outside quotes.
            items.append(''.join(current))
            current = []
        else:
            if ch == '"':
                in_quotes = True
            current.append(ch)

    # Flush the trailing element, if any.
    if current:
        items.append(''.join(current))

    return [item.strip() for item in items]
1150
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Dispatch a file: URL: '//host/...' with a non-empty host is
        re-opened through the FTP machinery; anything else is a local file."""
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/':
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return (and cache on the class) IP addresses that count as
        the local host."""
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                    socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                # gethostname() may not resolve; fall back to localhost only.
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Open a file: URL as a local file and return an addinfourl with
        synthesized Content-type/Content-length/Last-modified headers.

        Raises URLError when the file cannot be stat'ed or the URL names
        a host that is not this machine.
        """
        import email.utils
        import mimetypes
        host = req.host
        file = req.selector
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(file)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            # Serve the file only when no host was given, or the host
            # resolves to this machine and no explicit port was given.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                return addinfourl(open(localfile, 'rb'), headers, 'file:'+file)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001196
1197def _safe_gethostbyname(host):
1198 try:
1199 return socket.gethostbyname(host)
1200 except socket.gaierror:
1201 return None
1202
class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        """Retrieve an ftp: URL and return an addinfourl wrapping the data
        connection; any ftplib error is re-raised as a URLError."""
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        # Undo URL percent-encoding before using the components.
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        # Split off any ;key=value attributes from the path.
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Default transfer type: 'I'mage (binary) for a file,
            # 'D'irectory listing when the path ends in '/'.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                # A ';type=X' URL attribute overrides the default type.
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Create a new ftpwrapper connection; CacheFTPHandler overrides
        this to reuse connections."""
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
        return fw
1260
class CacheFTPHandler(FTPHandler):
    """FTPHandler that caches live connections keyed by endpoint."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        # key -> live ftpwrapper connection
        self.cache = {}
        # key -> absolute expiry time (time.time() + self.delay)
        self.timeout = {}
        # Earliest expiry among cached connections; 0 means none cached.
        self.soonest = 0
        self.delay = 60        # seconds an idle connection stays cached
        self.max_conns = 16    # hard cap on cached connections

    def setTimeout(self, t):
        """Set the idle lifetime (seconds) for cached connections."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached ftpwrapper for this endpoint, creating and
        caching a new one if necessary."""
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            # Reuse: just refresh the expiry time.
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce the max_conns limit.

        Bug fix: the original unconditionally called min() on
        self.timeout.values(), which raises ValueError once the cache
        becomes empty; both call sites now guard against that.
        """
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            self.soonest = min(self.timeout.values()) if self.timeout else 0

        # then check the size
        if len(self.cache) == self.max_conns:
            # Drop the connection that would expire first.
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values()) if self.timeout else 0
1307
# Code move from the old urllib module

MAXFTPCACHE = 10 # Trim the ftp cache beyond this size

# Helper for non-unix systems: pick platform-specific path<->URL
# converters; the generic fallback just (un)quotes.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Default shared FTP connection cache used by URLopener instances.
ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    # Class-level default so cleanup() is safe even if __init__ never
    # ran; __init__ replaces this with a per-instance list.
    __tempfiles = None

    # User-Agent string sent with every request.
    version = "Python-urllib/%s" % __version__
1347
1348 # Constructor
1349 def __init__(self, proxies=None, **x509):
1350 if proxies is None:
1351 proxies = getproxies()
1352 assert hasattr(proxies, 'keys'), "proxies must be a mapping"
1353 self.proxies = proxies
1354 self.key_file = x509.get('key_file')
1355 self.cert_file = x509.get('cert_file')
1356 self.addheaders = [('User-Agent', self.version)]
1357 self.__tempfiles = []
1358 self.__unlink = os.unlink # See cleanup()
1359 self.tempcache = None
1360 # Undocumented feature: if you assign {} to tempcache,
1361 # it is used to cache files retrieved with
1362 # self.retrieve(). This is not enabled by default
1363 # since it does not work for changing documents (and I
1364 # haven't got the logic to check expiration headers
1365 # yet).
1366 self.ftpcache = ftpcache
1367 # Undocumented feature: you can use a different
1368 # ftp cache by assigning to the .ftpcache member;
1369 # in case you want logically independent URL openers
1370 # XXX This is not threadsafe. Bah.
1371
1372 def __del__(self):
1373 self.close()
1374
1375 def close(self):
1376 self.cleanup()
1377
1378 def cleanup(self):
1379 # This code sometimes runs when the rest of this module
1380 # has already been deleted, so it can't use any globals
1381 # or import anything.
1382 if self.__tempfiles:
1383 for file in self.__tempfiles:
1384 try:
1385 self.__unlink(file)
1386 except OSError:
1387 pass
1388 del self.__tempfiles[:]
1389 if self.tempcache:
1390 self.tempcache.clear()
1391
1392 def addheader(self, *args):
1393 """Add a header to be used by the HTTP interface only
1394 e.g. u.addheader('Accept', 'sound/basic')"""
1395 self.addheaders.append(args)
1396
1397 # External interface
1398 def open(self, fullurl, data=None):
1399 """Use URLopener().open(file) instead of open(file, 'r')."""
Georg Brandl13e89462008-07-01 19:56:00 +00001400 fullurl = unwrap(to_bytes(fullurl))
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001401 if self.tempcache and fullurl in self.tempcache:
1402 filename, headers = self.tempcache[fullurl]
1403 fp = open(filename, 'rb')
Georg Brandl13e89462008-07-01 19:56:00 +00001404 return addinfourl(fp, headers, fullurl)
1405 urltype, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001406 if not urltype:
1407 urltype = 'file'
1408 if urltype in self.proxies:
1409 proxy = self.proxies[urltype]
Georg Brandl13e89462008-07-01 19:56:00 +00001410 urltype, proxyhost = splittype(proxy)
1411 host, selector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001412 url = (host, fullurl) # Signal special case to open_*()
1413 else:
1414 proxy = None
1415 name = 'open_' + urltype
1416 self.type = urltype
1417 name = name.replace('-', '_')
1418 if not hasattr(self, name):
1419 if proxy:
1420 return self.open_unknown_proxy(proxy, fullurl, data)
1421 else:
1422 return self.open_unknown(fullurl, data)
1423 try:
1424 if data is None:
1425 return getattr(self, name)(url)
1426 else:
1427 return getattr(self, name)(url, data)
1428 except socket.error as msg:
1429 raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1430
1431 def open_unknown(self, fullurl, data=None):
1432 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001433 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001434 raise IOError('url error', 'unknown url type', type)
1435
1436 def open_unknown_proxy(self, proxy, fullurl, data=None):
1437 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001438 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001439 raise IOError('url error', 'invalid proxy for %s' % type, proxy)
1440
    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object.

        reporthook, if given, is called as reporthook(blocknum,
        blocksize, totalsize) before the first block and after each
        subsequent one.  data, if given, is POST payload.
        """
        url = unwrap(to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        if filename is None and (not type or type == 'file'):
            # Local file and no target name requested: return the file's
            # own path instead of copying it.
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError as msg:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target name: derive a suffix from the URL path and
                # write to a temp file that cleanup() will delete later.
                import tempfile
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if reporthook:
                    if "content-length" in headers:
                        size = int(headers["Content-Length"])
                    reporthook(blocknum, bs, size)
                # Copy in fixed-size blocks, counting bytes as we go.
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result
1507
    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_class.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            # Direct request: split host (and any user:password@) from path.
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Proxied request: url is a (proxyhost, full-url) pair set up
            # by open().
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    # Host is exempt from proxying; talk to it directly.
                    host = realhost

            #print "proxy via http:", host, selector
        if not host: raise IOError('http error', 'no host given')

        if proxy_passwd:
            import base64
            proxy_auth = base64.b64encode(proxy_passwd).strip()
        else:
            proxy_auth = None

        if user_passwd:
            import base64
            auth = base64.b64encode(user_passwd).strip()
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost
        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            # Non-2xx: hand off to the (possibly overridden) error handler.
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)
1596
1597 def open_http(self, url, data=None):
1598 """Use HTTP protocol."""
1599 return self._open_generic_http(http.client.HTTPConnection, url, data)
1600
1601 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1602 """Handle http errors.
1603
1604 Derived class can override this, or provide specific handlers
1605 named http_error_DDD where DDD is the 3-digit error code."""
1606 # First check if there's a specific handler for this error
1607 name = 'http_error_%d' % errcode
1608 if hasattr(self, name):
1609 method = getattr(self, name)
1610 if data is None:
1611 result = method(url, fp, errcode, errmsg, headers)
1612 else:
1613 result = method(url, fp, errcode, errmsg, headers, data)
1614 if result: return result
1615 return self.http_error_default(url, fp, errcode, errmsg, headers)
1616
1617 def http_error_default(self, url, fp, errcode, errmsg, headers):
1618 """Default error handler: close the connection and raise IOError."""
1619 void = fp.read()
1620 fp.close()
Georg Brandl13e89462008-07-01 19:56:00 +00001621 raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001622
1623 if _have_ssl:
1624 def _https_connection(self, host):
1625 return http.client.HTTPSConnection(host,
1626 key_file=self.key_file,
1627 cert_file=self.cert_file)
1628
1629 def open_https(self, url, data=None):
1630 """Use HTTPS protocol."""
1631 return self._open_generic_http(self._https_connection, url, data)
1632
1633 def open_file(self, url):
1634 """Use local file or FTP depending on form of URL."""
1635 if not isinstance(url, str):
1636 raise URLError('file error', 'proxy support for file protocol currently not implemented')
1637 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
1638 return self.open_ftp(url)
1639 else:
1640 return self.open_local_file(url)
1641
    def open_local_file(self, url):
        """Open a file: URL as a local file.

        Returns an addinfourl with synthesized Content-Type/Length/
        Last-modified headers; raises URLError when the file cannot be
        stat'ed or the URL names a non-local host.
        """
        import mimetypes, email.utils
        from io import StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            # Re-present the failure uniformly as URLError.
            raise URLError(e.errno, e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            # No host component: plainly local.
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = splitport(host)
        # A host is acceptable only if it resolves to this machine and no
        # explicit port was given.
        if (not port
            and socket.gethostbyname(host) in (localhost(), thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error', 'not on local host')
1671
1672 def open_ftp(self, url):
1673 """Use FTP protocol."""
1674 if not isinstance(url, str):
1675 raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
1676 import mimetypes
1677 from io import StringIO
Georg Brandl13e89462008-07-01 19:56:00 +00001678 host, path = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001679 if not host: raise URLError('ftp error', 'no host given')
Georg Brandl13e89462008-07-01 19:56:00 +00001680 host, port = splitport(host)
1681 user, host = splituser(host)
1682 if user: user, passwd = splitpasswd(user)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001683 else: passwd = None
Georg Brandl13e89462008-07-01 19:56:00 +00001684 host = unquote(host)
1685 user = unquote(user or '')
1686 passwd = unquote(passwd or '')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001687 host = socket.gethostbyname(host)
1688 if not port:
1689 import ftplib
1690 port = ftplib.FTP_PORT
1691 else:
1692 port = int(port)
Georg Brandl13e89462008-07-01 19:56:00 +00001693 path, attrs = splitattr(path)
1694 path = unquote(path)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001695 dirs = path.split('/')
1696 dirs, file = dirs[:-1], dirs[-1]
1697 if dirs and not dirs[0]: dirs = dirs[1:]
1698 if dirs and not dirs[0]: dirs[0] = '/'
1699 key = user, host, port, '/'.join(dirs)
1700 # XXX thread unsafe!
1701 if len(self.ftpcache) > MAXFTPCACHE:
1702 # Prune the cache, rather arbitrarily
1703 for k in self.ftpcache.keys():
1704 if k != key:
1705 v = self.ftpcache[k]
1706 del self.ftpcache[k]
1707 v.close()
1708 try:
1709 if not key in self.ftpcache:
1710 self.ftpcache[key] = \
1711 ftpwrapper(user, passwd, host, port, dirs)
1712 if not file: type = 'D'
1713 else: type = 'I'
1714 for attr in attrs:
Georg Brandl13e89462008-07-01 19:56:00 +00001715 attr, value = splitvalue(attr)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001716 if attr.lower() == 'type' and \
1717 value in ('a', 'A', 'i', 'I', 'd', 'D'):
1718 type = value.upper()
1719 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
1720 mtype = mimetypes.guess_type("ftp:" + url)[0]
1721 headers = ""
1722 if mtype:
1723 headers += "Content-Type: %s\n" % mtype
1724 if retrlen is not None and retrlen >= 0:
1725 headers += "Content-Length: %d\n" % retrlen
1726 headers = email.message_from_string(headers)
Georg Brandl13e89462008-07-01 19:56:00 +00001727 return addinfourl(fp, headers, "ftp:" + url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001728 except ftperrors() as msg:
1729 raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1730
1731 def open_data(self, url, data=None):
1732 """Use "data" URL."""
1733 if not isinstance(url, str):
1734 raise URLError('data error', 'proxy support for data protocol currently not implemented')
1735 # ignore POSTed data
1736 #
1737 # syntax of data URLs:
1738 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
1739 # mediatype := [ type "/" subtype ] *( ";" parameter )
1740 # data := *urlchar
1741 # parameter := attribute "=" value
1742 try:
1743 [type, data] = url.split(',', 1)
1744 except ValueError:
1745 raise IOError('data error', 'bad data URL')
1746 if not type:
1747 type = 'text/plain;charset=US-ASCII'
1748 semi = type.rfind(';')
1749 if semi >= 0 and '=' not in type[semi:]:
1750 encoding = type[semi+1:]
1751 type = type[:semi]
1752 else:
1753 encoding = ''
1754 msg = []
1755 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
1756 time.gmtime(time.time())))
1757 msg.append('Content-type: %s' % type)
1758 if encoding == 'base64':
1759 import base64
1760 data = base64.decodestring(data)
1761 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001762 data = unquote(data)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001763 msg.append('Content-Length: %d' % len(data))
1764 msg.append('')
1765 msg.append(data)
1766 msg = '\n'.join(msg)
Georg Brandl13e89462008-07-01 19:56:00 +00001767 headers = email.message_from_string(msg)
1768 f = io.StringIO(msg)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001769 #f.fileno = None # needed for addinfourl
Georg Brandl13e89462008-07-01 19:56:00 +00001770 return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001771
1772
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}    # maps realm + '@' + host -> (user, passwd)
        self.tries = 0          # consecutive redirects followed so far
        self.maxtries = 10      # redirect limit; 0 disables the check

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception.

        Return the error page itself as a response object so callers
        can inspect it.
        """
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            # Redirect-loop guard: after maxtries consecutive redirects,
            # report a synthetic 500 instead of recursing forever.
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the redirect target named by the Location/URI header.

        Returns None when the response carries no redirect target.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        # Drain and close the old response before following the redirect.
        fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # NOTE(review): the explicit base-class calls below appear to rely
        # on URLopener.http_error_default raising; if it returned instead,
        # execution would fall through -- confirm against the base class.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # Dispatch on the original request's scheme to
        # retry_http_basic_auth / retry_https_basic_auth.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # NOTE(review): same fall-through caveat as http_error_401.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Re-open *url* after adding credentials to the http proxy URL."""
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any credentials already embedded in the proxy host.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Re-open *url* after adding credentials to the https proxy URL."""
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any credentials already embedded in the proxy host.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Re-open a http URL with user:password prepended to the host."""
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Re-open a https URL with user:password prepended to the host."""
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for realm/host, consulting the cache.

        When clear_cache is true, any cached entry is discarded first so
        the user is prompted again (used when embedded credentials were
        already tried and rejected).
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            # Treat ^C at the prompt as "no credentials supplied".
            print()
            return None, None
1960
1961
1962# Utility functions
1963
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    # Resolve once, then serve every later call from the module cache.
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
1971
_thishost = None
def thishost():
    """Return the IP address of the current host."""
    # Resolve once, then serve every later call from the module cache.
    global _thishost
    if _thishost is not None:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
1979
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    # Import ftplib lazily and memoize its error tuple.
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
1988
_noheaders = None
def noheaders():
    """Return an empty email Message object."""
    # A single shared instance is built on first use and reused after.
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
1996
1997
1998# Utility classes
1999
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        # Connection parameters are kept so init() can reconnect later
        # (retrfile() re-runs init() after a dropped connection).
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        """Connect, log in, and change into the target directory chain."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start retrieving *file* ('D' type means a directory listing).

        Returns (file-like object, length or None).  The object closes
        the transfer (via endtransfer) when it is closed.
        """
        import ftplib
        self.endtransfer()
        # 'd'/'D' requests a listing (ASCII); anything else is used as
        # the literal FTP transfer type (e.g. 'I' for binary).
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection may have timed out; reconnect once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 means "not a plain file" -- fall through and try a
                # directory listing instead; anything else is fatal.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    # Always restore the previous working directory.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'), self.endtransfer), conn[1])
    def endtransfer(self):
        """Finish the pending transfer, if any, by reading its response."""
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            # Best-effort: the connection may already be gone.
            pass

    def close(self):
        """End any pending transfer and close the FTP connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            # Best-effort: ignore errors while shutting down.
            pass
2076
2077# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    # Variable names are matched case-insensitively; empty values are
    # ignored.  'http_proxy' yields the key 'http', and so on.
    return {name.lower()[:-6]: value
            for name, value in os.environ.items()
            if value and name.lower().endswith('_proxy')}
2093
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    bypass_list = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is the special "always bypass" marker.
    if bypass_list == '*':
        return 1
    # Compare both with and without the port suffix.
    hostonly, port = splitport(host)
    candidates = (hostonly, host)
    for suffix in bypass_list.split(','):
        if suffix and any(c.endswith(suffix) for c in candidates):
            return 1
    # No suffix matched: do not bypass the proxy.
    return 0
2112
2113
# Select platform-appropriate implementations of getproxies() and
# proxy_bypass(); the default (non-mac, non-Windows) case uses the
# environment-variable versions defined above.
if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies. An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            # No Internet Config bindings available: report no proxies.
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXX To be done.
        # Gopher: XXX To be done.
        return proxies

    def proxy_bypass(host):
        """Test if the proxy should be bypassed for *host*.

        Environment settings, when present, take precedence over
        Internet Config; without them nothing is bypassed.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return 0

    def getproxies():
        # Environment variables win over Internet Config settings.
        return getproxies_environment() or getproxies_internetconfig()

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        """Test via the Windows registry whether to bypass the proxy
        for *host*; returns 1 to bypass, 0 otherwise."""
        try:
            import winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            # '<local>' bypasses for any unqualified (dot-free) hostname.
            if test == '<local>':
                if '.' not in rawHost:
                    return 1
            # Translate the registry's glob pattern into a regex.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Test whether the proxy should be bypassed for *host*.

        Consults the environment, if no_proxy-style variables are set,
        and otherwise the Windows registry override list.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment