blob: 3776536d387e1a5350e9090ac8f8f177878d8d4d [file] [log] [blame]
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
5below). It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
urllib. Pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back. One difference is that you can also pass
20a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers. Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate. If one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
33OpenerDirector --
34
35Request -- An object that encapsulates the state of a request. The
36state can be as simple as the URL. It can also include extra HTTP
37headers, e.g. a User-Agent.
38
39BaseHandler --
40
41internals:
42BaseHandler and parent
43_call_chain conventions
44
45Example usage:
46
Georg Brandl029986a2008-06-23 11:44:14 +000047import urllib.request
Jeremy Hylton1afc1692008-06-18 20:49:58 +000048
49# set up authentication info
Georg Brandl029986a2008-06-23 11:44:14 +000050authinfo = urllib.request.HTTPBasicAuthHandler()
Jeremy Hylton1afc1692008-06-18 20:49:58 +000051authinfo.add_password(realm='PDQ Application',
52 uri='https://mahler:8092/site-updates.py',
53 user='klem',
54 passwd='geheim$parole')
55
Georg Brandl029986a2008-06-23 11:44:14 +000056proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
Jeremy Hylton1afc1692008-06-18 20:49:58 +000057
58# build a new opener that adds authentication and caching FTP handlers
Georg Brandl029986a2008-06-23 11:44:14 +000059opener = urllib.request.build_opener(proxy_support, authinfo,
60 urllib.request.CacheFTPHandler)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000061
62# install it
Georg Brandl029986a2008-06-23 11:44:14 +000063urllib.request.install_opener(opener)
Jeremy Hylton1afc1692008-06-18 20:49:58 +000064
Georg Brandl029986a2008-06-23 11:44:14 +000065f = urllib.request.urlopen('http://www.python.org/')
Jeremy Hylton1afc1692008-06-18 20:49:58 +000066"""
67
68# XXX issues:
69# If an authentication error handler that tries to perform
70# authentication for some reason but fails, how should the error be
71# signalled? The client needs to know the HTTP error code. But if
72# the handler knows that the problem was, e.g., that it didn't know
# the hash algorithm requested in the challenge, it would be good to
74# pass that information along to the client, too.
75# ftp errors aren't handled cleanly
76# check digest against correct (i.e. non-apache) implementation
77
78# Possible extensions:
79# complex proxies XXX not sure what exactly was meant by this
80# abstract factory for opener
81
82import base64
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +000083import bisect
Jeremy Hylton1afc1692008-06-18 20:49:58 +000084import email
85import hashlib
86import http.client
87import io
88import os
89import posixpath
90import random
91import re
92import socket
93import sys
94import time
Jeremy Hylton1afc1692008-06-18 20:49:58 +000095
Georg Brandl13e89462008-07-01 19:56:00 +000096from urllib.error import URLError, HTTPError, ContentTooShortError
97from urllib.parse import (
98 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
99 splittype, splithost, splitport, splituser, splitpasswd,
Facundo Batistaf24802c2008-08-17 03:36:03 +0000100 splitattr, splitquery, splitvalue, to_bytes, urlunparse)
Georg Brandl13e89462008-07-01 19:56:00 +0000101from urllib.response import addinfourl, addclosehook
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000102
103# check for SSL
104try:
105 import ssl
106except:
107 _have_ssl = False
108else:
109 _have_ssl = True
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000110
111# used in User-Agent header sent
112__version__ = sys.version[:3]
113
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Open *url* (a URL string or a Request object) and return the response.

    Uses the module-wide opener, creating a default one with
    build_opener() on first use unless install_opener() has already
    installed a custom opener.
    """
    global _opener
    opener = _opener
    if opener is None:
        # Lazily build and cache the shared default opener.
        opener = _opener = build_opener()
    return opener.open(url, data, timeout)
120
def install_opener(opener):
    """Install *opener* as the module-wide opener used by urlopen()."""
    global _opener
    _opener = opener
124
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* to a local file via the legacy FancyURLopener.

    Delegates to FancyURLopener.retrieve; see that method for the
    meaning of *filename*, *reporthook*, *data* and the return value.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        # Create and cache the legacy opener on first use.
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
132
def urlcleanup():
    """Clean up module-global opener state created by urlretrieve()/urlopen()."""
    if _urlopener:
        # Let the legacy opener remove any temporary files it created.
        _urlopener.cleanup()
    global _opener
    if _opener:
        # Drop the cached default opener; urlopen() will rebuild it.
        _opener = None
139
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.
    """
    host = urlparse(request.full_url)[1]
    if not host:
        # URL had no authority component; fall back to the Host header.
        host = request.get_header("Host", "")
    # remove port, if present
    return _cut_port_re.sub("", host, 1).lower()
157
class Request:
    """Encapsulate the state of a single URL request.

    Holds the URL, optional request body (*data*), normal and
    unredirected headers, and RFC 2965 bookkeeping
    (*origin_req_host*, *unverifiable*) used by cookie handling.
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.data = data
        self.headers = {}
        # *headers* defaults to None instead of a shared mutable dict
        # (mutable-default anti-pattern); keys are normalized through
        # add_header so lookups are consistent.
        for key, value in (headers or {}).items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        # Split the URL into scheme (self.type), host and selector
        # (path plus query); raise if no scheme is present.
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return the HTTP method: POST when a body is present, else GET."""
        return "POST" if self.data is not None else "GET"

    # Begin deprecated methods

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Direct this request through a proxy: the selector becomes the
        full URL, which is how proxy requests are formed."""
        self.host, self.type = host, type
        self.selector = self.full_url

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication; keys are
        # capitalized so repeated adds overwrite rather than duplicate
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be copied onto a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        # normal headers take precedence over unredirected ones
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return all headers as a list of (name, value) pairs."""
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
248
class OpenerDirector:
    """Manage a chain of handlers and use them to open URLs.

    Handlers are registered with add_handler(); open() routes each
    request through the matching request pre-processors, exactly one
    protocol handler, and the matching response post-processors.
    Error responses are dispatched through error().
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        # Default headers attached to every request.
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []            # all added handlers, sorted by handler_order
        self.handle_open = {}         # scheme -> [handlers with <scheme>_open]
        self.handle_error = {}        # scheme -> {code or 'default': [handlers]}
        self.process_response = {}    # scheme -> [response post-processors]
        self.process_request = {}     # scheme -> [request pre-processors]

    def add_handler(self, handler):
        """Register *handler*, indexing it by its protocol methods.

        Method names of the form <protocol>_open, <protocol>_request,
        <protocol>_response and <protocol>_error_<code> determine which
        lookup tables the handler is entered into.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # Split e.g. "http_open" into protocol "http", condition "open".
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # "http_error_404" -> kind 404; non-numeric kinds such
                # as "default" stay strings.
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                # Keep each chain sorted (BaseHandler.__lt__ compares
                # handler_order).
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in a specific order, the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (URL string or Request) and return the response.

        Runs <protocol>_request pre-processors, dispatches via _open(),
        then runs <protocol>_response post-processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # default_open handlers get first crack, then the protocol's
        # own handlers, then unknown_open as a last resort.
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error response to the <proto> error chain.

        For HTTP(S), *args* is (request, response, code, msg, headers)
        and the chain for the specific status code is tried first, with
        http_error_default as the fallback.
        """
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!  args[2] is the HTTP status code.
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # No handler for the specific status code; fall back to the
            # catch-all chain.
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
387
388# XXX probably also want an abstract factory that knows when it makes
389# sense to skip a superclass in favor of a subclass and when it might
390# make sense to include both
391
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # A default class is skipped when the caller supplied a subclass of
    # it, or an instance of such a subclass.
    skip = {klass for klass in default_classes
            for check in handlers
            if (issubclass(check, klass) if isclass(check)
                else isinstance(check, klass))}
    remaining = [klass for klass in default_classes if klass not in skip]

    for klass in remaining:
        opener.add_handler(klass())
    for handler in handlers:
        # Instantiate classes; instances are added as-is.
        opener.add_handler(handler() if isclass(handler) else handler)
    return opener
429
class BaseHandler:
    """Common base class for all handlers managed by an OpenerDirector."""

    # Sort key within a handler chain; lower values run earlier.
    handler_order = 500

    def add_parent(self, parent):
        """Record the OpenerDirector that owns this handler."""
        self.parent = parent

    def close(self):
        """Compatibility no-op; handlers hold no closable resources."""
        pass

    def __lt__(self, other):
        """Order handlers by handler_order for bisect.insort."""
        if hasattr(other, "handler_order"):
            return self.handler_order < other.handler_order
        # Try to preserve the old behavior of having custom classes
        # inserted after default ones (works only for custom user
        # classes which are not aware of handler_order).
        return True
447
448
449class HTTPErrorProcessor(BaseHandler):
450 """Process HTTP error responses."""
451 handler_order = 1000 # after all other processing
452
453 def http_response(self, request, response):
454 code, msg, hdrs = response.code, response.msg, response.info()
455
456 # According to RFC 2616, "2xx" code indicates that the client's
457 # request was successfully received, understood, and accepted.
458 if not (200 <= code < 300):
459 response = self.parent.error(
460 'http', request, response, code, msg, hdrs)
461
462 return response
463
464 https_response = http_response
465
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: surface unhandled HTTP errors as HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        # No more specific handler dealt with this status code.
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000469
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 redirects, with loop protection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        # Only redirect GET/HEAD for any of the 30x codes, and POST for
        # 301/302/303; anything else is refused as an HTTPError.
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # The redirected request carries no body, so drop the headers
        # that describe one.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Handle 302 (also 301/303/307) by opening the new location."""
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL (empty path component)
        urlparts = urlparse(newurl)
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        # Resolve a relative redirect target against the original URL.
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
562
563
564def _parse_proxy(proxy):
565 """Return (scheme, user, password, host/port) given a URL or an authority.
566
567 If a URL is supplied, it must have an authority (host:port) component.
568 According to RFC 3986, having an authority component means the URL must
569 have two slashes after the scheme:
570
571 >>> _parse_proxy('file:/ftp.example.com/')
572 Traceback (most recent call last):
573 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
574
575 The first three items of the returned tuple may be None.
576
577 Examples of authority parsing:
578
579 >>> _parse_proxy('proxy.example.com')
580 (None, None, None, 'proxy.example.com')
581 >>> _parse_proxy('proxy.example.com:3128')
582 (None, None, None, 'proxy.example.com:3128')
583
584 The authority component may optionally include userinfo (assumed to be
585 username:password):
586
587 >>> _parse_proxy('joe:password@proxy.example.com')
588 (None, 'joe', 'password', 'proxy.example.com')
589 >>> _parse_proxy('joe:password@proxy.example.com:3128')
590 (None, 'joe', 'password', 'proxy.example.com:3128')
591
592 Same examples, but with URLs instead:
593
594 >>> _parse_proxy('http://proxy.example.com/')
595 ('http', None, None, 'proxy.example.com')
596 >>> _parse_proxy('http://proxy.example.com:3128/')
597 ('http', None, None, 'proxy.example.com:3128')
598 >>> _parse_proxy('http://joe:password@proxy.example.com/')
599 ('http', 'joe', 'password', 'proxy.example.com')
600 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
601 ('http', 'joe', 'password', 'proxy.example.com:3128')
602
603 Everything after the authority is ignored:
604
605 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
606 ('ftp', 'joe', 'password', 'proxy.example.com')
607
608 Test for no trailing '/' case:
609
610 >>> _parse_proxy('http://joe:password@proxy.example.com')
611 ('http', 'joe', 'password', 'proxy.example.com')
612
613 """
Georg Brandl13e89462008-07-01 19:56:00 +0000614 scheme, r_scheme = splittype(proxy)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000615 if not r_scheme.startswith("/"):
616 # authority
617 scheme = None
618 authority = proxy
619 else:
620 # URL
621 if not r_scheme.startswith("//"):
622 raise ValueError("proxy URL with no authority: %r" % proxy)
623 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
624 # and 3.3.), path is empty or starts with '/'
625 end = r_scheme.find("/", 2)
626 if end == -1:
627 end = None
628 authority = r_scheme[2:end]
Georg Brandl13e89462008-07-01 19:56:00 +0000629 userinfo, hostport = splituser(authority)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000630 if userinfo is not None:
Georg Brandl13e89462008-07-01 19:56:00 +0000631 user, password = splitpasswd(userinfo)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000632 else:
633 user = password = None
634 return scheme, user, password, hostport
635
class ProxyHandler(BaseHandler):
    """Rewrite requests so they are sent through the configured proxies."""

    # Proxies must be in front of every other handler.
    handler_order = 100

    def __init__(self, proxies=None):
        """Accept a mapping {scheme: proxy URL}; default to getproxies()."""
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Install one <scheme>_open method per configured proxy.  The
        # default arguments freeze the per-scheme values at definition
        # time (avoids the late-binding closure pitfall).
        for scheme, proxy_url in proxies.items():
            def proxy_opener(r, proxy=proxy_url, type=scheme,
                             meth=self.proxy_open):
                return meth(r, proxy, type)
            setattr(self, '%s_open' % scheme, proxy_opener)

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*; restart dispatch if the scheme changed."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        req.set_proxy(unquote(hostport), proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            return None
        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req)
673
class HTTPPasswordMgr:
    """Store (user, password) credentials keyed by realm and URI prefix."""

    def __init__(self):
        # realm -> {(reduced URI, ...): (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register *user*/*passwd* for *realm* at *uri* (a str or sequence)."""
        # uri could be a single URI or a sequence of them
        if isinstance(uri, str):
            uri = [uri]
        domains = self.passwd.setdefault(realm, {})
        # Index under both the port-normalized and the raw authority so
        # lookups match whether or not the default port is spelled out.
        for default_port in (True, False):
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uri)
            domains[reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for *authuri* in *realm*, else (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in (True, False):
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            # Normalize by appending the scheme's well-known port.
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced (authority, path) form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # NOTE: commonprefix is character-wise, not path-segment-wise;
        # kept as-is for backward compatibility.
        common = posixpath.commonprefix((base[1], test[1]))
        return len(common) == len(base[1])
736
737
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to a wildcard (None) realm."""

    def find_user_password(self, realm, authuri):
        """Look up realm-specific credentials, then the default realm."""
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is None:
            # Nothing registered for this realm; try the catch-all entry.
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
746
747
class AbstractBasicAuthHandler:
    """Shared machinery for Basic auth against servers (401) and proxies (407)."""

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        """Use *password_mgr* for credentials, or a fresh HTTPPasswordMgr."""
        self.passwd = HTTPPasswordMgr() if password_mgr is None else password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Parse the challenge header named *authreq* and retry with Basic auth.

        host may be an authority (without userinfo) or a URL with an
        authority.
        """
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        mo = AbstractBasicAuthHandler.rx.search(challenge)
        if not mo:
            return None
        scheme, quote, realm = mo.groups()
        if scheme.lower() != 'basic':
            return None
        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send *req* with an Authorization header, or give up (None)."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        credentials = "%s:%s" % (user, pw)
        auth = "Basic " + base64.b64encode(credentials.encode()).decode("ascii")
        if req.headers.get(self.auth_header, None) == auth:
            # These exact credentials were already sent and rejected.
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
791
792
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 401 responses using HTTP Basic authentication."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # Retry against the full request URL with any stored credentials.
        return self.http_error_auth_reqed('www-authenticate',
                                          req.full_url, req, headers)
801
802
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 407 responses from proxies using Basic authentication."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib.request does
        # not (and should not, RFC 3986 s. 3.2.1) support requests for URLs
        # containing userinfo.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          req.host, req, headers)
815
816
def randombytes(n):
    """Return n random bytes drawn from the OS entropy source."""
    return os.urandom(n)
820
class AbstractDigestAuthHandler:
    """Shared machinery for HTTP Digest access authentication.

    Digest authentication is specified in RFC 2617.  Subclasses provide
    ``auth_header`` (the request header to fill in) and hook
    http_error_auth_reqed() up to the relevant HTTP error code.
    """

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        # passwd: an HTTPPasswordMgr-compatible credential store.
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0       # attempts made for the current challenge
        self.nonce_count = 0   # "nc" counter, incremented per request (RFC 2617)

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry the request with Digest credentials, if the server
        issued a Digest challenge; give up after several failures."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        # auth is the full challenge header value, e.g. 'Digest realm=...'.
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # The very same credentials already failed; don't loop.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization header value from challenge
        dict *chal*, or return None if it cannot be answered."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            # Unrecognized algorithm: decline the challenge.
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop == 'auth':
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) digest helpers for *algorithm*, or (None, None)
        when the algorithm is not recognized.

        Bug fix: the original left H unbound for unknown algorithms, so
        building KD raised UnboundLocalError instead of letting
        get_authorization() decline via its ``if H is None`` check.
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
952
953
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069.

    Digest authentication improves on basic authentication because the
    password is never transmitted in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        netloc = urlparse(req.full_url)[1]
        answer = self.http_error_auth_reqed('www-authenticate',
                                            netloc, req, headers)
        self.reset_retry_count()
        return answer
970
971
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication against an HTTP proxy (407 responses)."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        proxy_host = req.host
        answer = self.http_error_auth_reqed('proxy-authenticate',
                                            proxy_host, req, headers)
        self.reset_retry_count()
        return answer
983
class AbstractHTTPHandler(BaseHandler):
    """Shared machinery for HTTPHandler/HTTPSHandler: request fixup and I/O."""

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        # Normalize the request before sending: default Content-type /
        # Content-length for POST bodies, the Host header, and any
        # opener-wide headers the caller did not set explicitly.
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        sel_host = host
        if request.has_proxy():
            # When proxied, the selector is a full URL; Host must name
            # the origin server, not the proxy.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout) # will parse host:port
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Title-case header names so duplicate spellings collapse.
        headers = dict((name.title(), val) for name, val in headers.items())
        try:
            h.request(req.get_method(), req.selector, req.data, headers)
            r = h.getresponse() # an HTTPResponse instance
        except socket.error as err:
            raise URLError(err)

        r.url = req.full_url
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001058
1059
class HTTPHandler(AbstractHTTPHandler):
    # Plain-HTTP handler; all real work happens in AbstractHTTPHandler.

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_
1066
if hasattr(http.client, 'HTTPSConnection'):
    # HTTPSConnection exists only when the interpreter was built with
    # SSL support, so the https handler is defined conditionally.
    class HTTPSHandler(AbstractHTTPHandler):

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_
1074
class HTTPCookieProcessor(BaseHandler):
    """Handler that stores cookies from responses and attaches the
    matching ones to later requests, via an http.cookiejar.CookieJar."""

    def __init__(self, cookiejar=None):
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        # Attach any stored cookies that apply to this request.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Harvest Set-Cookie headers from the response.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1092
class UnknownHandler(BaseHandler):
    """Catch-all handler: any scheme no other handler claims is an error."""

    def unknown_open(self, req):
        scheme = req.type
        raise URLError('unknown url type: %s' % scheme)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001097
def parse_keqv_list(l):
    """Parse a list of key=value strings where keys are not duplicated.

    Surrounding double quotes are stripped from values.  Robustness fix:
    the original indexed v[0]/v[-1] unconditionally and raised IndexError
    on an empty value (e.g. 'key='); a value shorter than two characters
    is now kept as-is.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Strip quotes only when the value is a complete quoted string.
        if len(v) > 1 and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1107
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    Splits a comma-separated list whose elements may be quoted-strings;
    a quoted-string may itself contain commas, and backslash escapes are
    honoured inside quotes.  Only double quotes count, and they are left
    in place in the result.  Each element is returned stripped.
    """
    items = []
    buf = ''
    pending_escape = False
    in_quotes = False

    for ch in s:
        if pending_escape:
            # Previous char was a backslash inside quotes: take ch literally.
            buf += ch
            pending_escape = False
        elif in_quotes:
            if ch == '\\':
                pending_escape = True
            else:
                if ch == '"':
                    in_quotes = False
                buf += ch
        elif ch == ',':
            # Separator at top level: close out the current element.
            items.append(buf)
            buf = ''
        else:
            if ch == '"':
                in_quotes = True
            buf += ch

    # append last part
    if buf:
        items.append(buf)

    return [item.strip() for item in items]
1150
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/':
            # file://host/... with an authority component: retry as FTP.
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        # Cached on the class: IP addresses that count as "this machine".
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                    socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        import email.utils
        import mimetypes
        host = req.host
        file = req.selector
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(file)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            # Serve the file only when the URL names no host, or names
            # this machine without an explicit port.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                return addinfourl(open(localfile, 'rb'), headers, 'file:'+file)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001196
1197def _safe_gethostbyname(host):
1198 try:
1199 return socket.gethostbyname(host)
1200 except socket.gaierror:
1201 return None
1202
class FTPHandler(BaseHandler):
    # Handler for ftp:// URLs; opens a fresh connection per request
    # (CacheFTPHandler below overrides connect_ftp to reuse them).
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        # Drop the empty component left by the leading '/' of the path.
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Transfer mode: Image (binary) for a file, Directory listing
            # otherwise, unless an explicit ;type= attribute overrides it.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            # Re-raise any ftplib failure as a URLError, keeping the
            # original traceback for debugging.
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # Overridden by CacheFTPHandler to reuse connections.
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
        return fw
1260
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps a bounded, time-limited cache of connections."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}       # key -> ftpwrapper
        self.timeout = {}     # key -> absolute expiry time
        self.soonest = 0      # earliest expiry among cached connections
        self.delay = 60       # lifetime (seconds) granted per use
        self.max_conns = 16   # cache size bound

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            # Cache hit: just refresh the expiry time.
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce the size bound.

        Fixes vs. the original: min() was called on the timeout dict even
        after eviction emptied it (ValueError), and connections evicted
        by the size check were deleted without being closed (socket leak).
        """
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            self.soonest = min(self.timeout.values()) if self.timeout else 0

        # then check the size (>= so a lowered max_conns still trims)
        if len(self.cache) >= self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values()) if self.timeout else 0
1307
# Code move from the old urllib module

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Module-wide FTP connection cache shared by URLopener instances by
# default (see URLopener.__init__); not thread-safe.
ftpcache = {}
1336class URLopener:
1337 """Class to open URLs.
1338 This is a class rather than just a subroutine because we may need
1339 more than one set of global protocol-specific options.
1340 Note -- this is a base class for those who don't want the
1341 automatic handling of errors type 302 (relocated) and 401
1342 (authorization needed)."""
1343
1344 __tempfiles = None
1345
1346 version = "Python-urllib/%s" % __version__
1347
1348 # Constructor
    def __init__(self, proxies=None, **x509):
        # proxies: mapping of scheme -> proxy URL; defaults to the
        # environment's proxy settings.  x509 may carry key_file /
        # cert_file for client-side SSL certificates.
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve(). This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe. Bah.
1371
    def __del__(self):
        # Best-effort cleanup when the opener is garbage collected.
        self.close()
1374
    def close(self):
        # Public counterpart of __del__: remove temp files, clear caches.
        self.cleanup()
1377
    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.  (os.unlink was bound to self.__unlink
        # in __init__ for exactly this reason.)
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()
1391
    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        # Stored as a (name, value) tuple and sent with every HTTP(S)
        # request made through this opener.
        self.addheaders.append(args)
1396
1397 # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(to_bytes(fullurl))
        # Serve from the (optional, opt-in) retrieve() cache when enabled.
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        # Dispatch to the scheme-specific open_<scheme>() method;
        # '-' is not valid in identifiers, so map it to '_'.
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error as msg:
            raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
1430
1431 def open_unknown(self, fullurl, data=None):
1432 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001433 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001434 raise IOError('url error', 'unknown url type', type)
1435
1436 def open_unknown_proxy(self, proxy, fullurl, data=None):
1437 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001438 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001439 raise IOError('url error', 'invalid proxy for %s' % type, proxy)
1440
1441 # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        # A local file needs no copy: hand back its own path.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError as msg:
                # Fall through and fetch it like a remote object.
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target given: download into a temp file whose suffix
                # is taken from the URL's path component.
                import tempfile
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if reporthook:
                    if "content-length" in headers:
                        size = int(headers["Content-Length"])
                    # Initial callback before the first block is read.
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result
1507
1508 # Each method named open_<type> knows how to open that type of URL
1509
1510 def _open_generic_http(self, connection_factory, url, data):
1511 """Make an HTTP connection using connection_class.
1512
1513 This is an internal method that should be called from
1514 open_http() or open_https().
1515
1516 Arguments:
1517 - connection_factory should take a host name and return an
1518 HTTPConnection instance.
1519 - url is the url to retrieval or a host, relative-path pair.
1520 - data is payload for a POST request or None.
1521 """
1522
1523 user_passwd = None
1524 proxy_passwd= None
1525 if isinstance(url, str):
Georg Brandl13e89462008-07-01 19:56:00 +00001526 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001527 if host:
Georg Brandl13e89462008-07-01 19:56:00 +00001528 user_passwd, host = splituser(host)
1529 host = unquote(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001530 realhost = host
1531 else:
1532 host, selector = url
1533 # check whether the proxy contains authorization information
Georg Brandl13e89462008-07-01 19:56:00 +00001534 proxy_passwd, host = splituser(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001535 # now we proceed with the url we want to obtain
Georg Brandl13e89462008-07-01 19:56:00 +00001536 urltype, rest = splittype(selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001537 url = rest
1538 user_passwd = None
1539 if urltype.lower() != 'http':
1540 realhost = None
1541 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001542 realhost, rest = splithost(rest)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001543 if realhost:
Georg Brandl13e89462008-07-01 19:56:00 +00001544 user_passwd, realhost = splituser(realhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001545 if user_passwd:
1546 selector = "%s://%s%s" % (urltype, realhost, rest)
1547 if proxy_bypass(realhost):
1548 host = realhost
1549
1550 #print "proxy via http:", host, selector
1551 if not host: raise IOError('http error', 'no host given')
1552
1553 if proxy_passwd:
1554 import base64
1555 proxy_auth = base64.b64encode(proxy_passwd).strip()
1556 else:
1557 proxy_auth = None
1558
1559 if user_passwd:
1560 import base64
1561 auth = base64.b64encode(user_passwd).strip()
1562 else:
1563 auth = None
1564 http_conn = connection_factory(host)
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +00001565## # XXX We should fix urllib so that it works with HTTP/1.1.
1566## http_conn._http_vsn = 10
1567## http_conn._http_vsn_str = "HTTP/1.0"
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001568
1569 headers = {}
1570 if proxy_auth:
1571 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1572 if auth:
1573 headers["Authorization"] = "Basic %s" % auth
1574 if realhost:
1575 headers["Host"] = realhost
1576 for header, value in self.addheaders:
1577 headers[header] = value
1578
1579 if data is not None:
1580 headers["Content-Type"] = "application/x-www-form-urlencoded"
1581 http_conn.request("POST", selector, data, headers)
1582 else:
1583 http_conn.request("GET", selector, headers=headers)
1584
1585 try:
1586 response = http_conn.getresponse()
1587 except http.client.BadStatusLine:
1588 # something went wrong with the HTTP status line
Georg Brandl13e89462008-07-01 19:56:00 +00001589 raise URLError("http protocol error: bad status line")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001590
1591 # According to RFC 2616, "2xx" code indicates that the client's
1592 # request was successfully received, understood, and accepted.
1593 if 200 <= response.status < 300:
Antoine Pitroub353c122009-02-11 00:39:14 +00001594 return addinfourl(response, response.msg, "http:" + url,
Georg Brandl13e89462008-07-01 19:56:00 +00001595 response.status)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001596 else:
1597 return self.http_error(
1598 url, response.fp,
1599 response.status, response.reason, response.msg, data)
1600
    def open_http(self, url, data=None):
        """Use HTTP protocol.

        url is either a URL string or a host/relative-path pair (the
        proxy case); data, if not None, is POSTed.  See
        _open_generic_http for details.
        """
        return self._open_generic_http(http.client.HTTPConnection, url, data)
1604
1605 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1606 """Handle http errors.
1607
1608 Derived class can override this, or provide specific handlers
1609 named http_error_DDD where DDD is the 3-digit error code."""
1610 # First check if there's a specific handler for this error
1611 name = 'http_error_%d' % errcode
1612 if hasattr(self, name):
1613 method = getattr(self, name)
1614 if data is None:
1615 result = method(url, fp, errcode, errmsg, headers)
1616 else:
1617 result = method(url, fp, errcode, errmsg, headers, data)
1618 if result: return result
1619 return self.http_error_default(url, fp, errcode, errmsg, headers)
1620
1621 def http_error_default(self, url, fp, errcode, errmsg, headers):
1622 """Default error handler: close the connection and raise IOError."""
1623 void = fp.read()
1624 fp.close()
Georg Brandl13e89462008-07-01 19:56:00 +00001625 raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001626
    if _have_ssl:
        # These methods exist only when the interpreter has SSL support.
        def _https_connection(self, host):
            # Connection factory that threads the opener's client
            # certificate settings into the HTTPSConnection.
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)
1636
1637 def open_file(self, url):
1638 """Use local file or FTP depending on form of URL."""
1639 if not isinstance(url, str):
1640 raise URLError('file error', 'proxy support for file protocol currently not implemented')
1641 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
1642 return self.open_ftp(url)
1643 else:
1644 return self.open_local_file(url)
1645
    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, email.utils
        from io import StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.errno, e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        # Hostless URL: serve directly.
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = splitport(host)
        # A host is acceptable only if it names this machine and carries
        # no explicit port.
        if (not port
            and socket.gethostbyname(host) in (localhost(), thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error', 'not on local host')
1675
1676 def open_ftp(self, url):
1677 """Use FTP protocol."""
1678 if not isinstance(url, str):
1679 raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
1680 import mimetypes
1681 from io import StringIO
Georg Brandl13e89462008-07-01 19:56:00 +00001682 host, path = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001683 if not host: raise URLError('ftp error', 'no host given')
Georg Brandl13e89462008-07-01 19:56:00 +00001684 host, port = splitport(host)
1685 user, host = splituser(host)
1686 if user: user, passwd = splitpasswd(user)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001687 else: passwd = None
Georg Brandl13e89462008-07-01 19:56:00 +00001688 host = unquote(host)
1689 user = unquote(user or '')
1690 passwd = unquote(passwd or '')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001691 host = socket.gethostbyname(host)
1692 if not port:
1693 import ftplib
1694 port = ftplib.FTP_PORT
1695 else:
1696 port = int(port)
Georg Brandl13e89462008-07-01 19:56:00 +00001697 path, attrs = splitattr(path)
1698 path = unquote(path)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001699 dirs = path.split('/')
1700 dirs, file = dirs[:-1], dirs[-1]
1701 if dirs and not dirs[0]: dirs = dirs[1:]
1702 if dirs and not dirs[0]: dirs[0] = '/'
1703 key = user, host, port, '/'.join(dirs)
1704 # XXX thread unsafe!
1705 if len(self.ftpcache) > MAXFTPCACHE:
1706 # Prune the cache, rather arbitrarily
1707 for k in self.ftpcache.keys():
1708 if k != key:
1709 v = self.ftpcache[k]
1710 del self.ftpcache[k]
1711 v.close()
1712 try:
1713 if not key in self.ftpcache:
1714 self.ftpcache[key] = \
1715 ftpwrapper(user, passwd, host, port, dirs)
1716 if not file: type = 'D'
1717 else: type = 'I'
1718 for attr in attrs:
Georg Brandl13e89462008-07-01 19:56:00 +00001719 attr, value = splitvalue(attr)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001720 if attr.lower() == 'type' and \
1721 value in ('a', 'A', 'i', 'I', 'd', 'D'):
1722 type = value.upper()
1723 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
1724 mtype = mimetypes.guess_type("ftp:" + url)[0]
1725 headers = ""
1726 if mtype:
1727 headers += "Content-Type: %s\n" % mtype
1728 if retrlen is not None and retrlen >= 0:
1729 headers += "Content-Length: %d\n" % retrlen
1730 headers = email.message_from_string(headers)
Georg Brandl13e89462008-07-01 19:56:00 +00001731 return addinfourl(fp, headers, "ftp:" + url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001732 except ftperrors() as msg:
1733 raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1734
1735 def open_data(self, url, data=None):
1736 """Use "data" URL."""
1737 if not isinstance(url, str):
1738 raise URLError('data error', 'proxy support for data protocol currently not implemented')
1739 # ignore POSTed data
1740 #
1741 # syntax of data URLs:
1742 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
1743 # mediatype := [ type "/" subtype ] *( ";" parameter )
1744 # data := *urlchar
1745 # parameter := attribute "=" value
1746 try:
1747 [type, data] = url.split(',', 1)
1748 except ValueError:
1749 raise IOError('data error', 'bad data URL')
1750 if not type:
1751 type = 'text/plain;charset=US-ASCII'
1752 semi = type.rfind(';')
1753 if semi >= 0 and '=' not in type[semi:]:
1754 encoding = type[semi+1:]
1755 type = type[:semi]
1756 else:
1757 encoding = ''
1758 msg = []
1759 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
1760 time.gmtime(time.time())))
1761 msg.append('Content-type: %s' % type)
1762 if encoding == 'base64':
1763 import base64
1764 data = base64.decodestring(data)
1765 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001766 data = unquote(data)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001767 msg.append('Content-Length: %d' % len(data))
1768 msg.append('')
1769 msg.append(data)
1770 msg = '\n'.join(msg)
Georg Brandl13e89462008-07-01 19:56:00 +00001771 headers = email.message_from_string(msg)
1772 f = io.StringIO(msg)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001773 #f.fileno = None # needed for addinfourl
Georg Brandl13e89462008-07-01 19:56:00 +00001774 return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001775
1776
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps).

    Adds automatic following of HTTP redirects (301, 302, 303, 307) and
    interactive Basic authentication for 401/407 responses on top of
    URLopener.
    """

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}    # maps realm + '@' + host -> (user, passwd)
        self.tries = 0          # redirects followed for the current request
        self.maxtries = 10      # redirect limit before reporting an error

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            # Redirect loop: surface it as a 500 instead of recursing
            # forever.
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Follow a redirect to the URL given in the response headers.

        Returns None when no Location/URI header is present.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        # Drain and close the redirected response before reopening.
        fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        if not 'www-authenticate' in headers:
            # No challenge to answer.  Must *return* here: falling
            # through would index the missing header and raise KeyError.
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            # Only Basic auth is supported; anything else gets the
            # default treatment.
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        if not 'proxy-authenticate' in headers:
            # No challenge to answer; see http_error_401 for why the
            # return is required.
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            return URLopener.http_error_default(self, url, fp,
                                                errcode, errmsg, headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Embed fresh credentials in the http proxy URL and retry."""
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any user:passwd@ prefix already embedded in the proxy.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Embed fresh credentials in the https proxy URL and retry."""
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Embed fresh credentials in the http URL itself and retry."""
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Embed fresh credentials in the https URL itself and retry."""
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return cached or freshly prompted (user, passwd) for host/realm."""
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
1964
1965
1966# Utility functions
1967
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The first lookup is cached in the module-level _localhost global.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
1975
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The first lookup is cached in the module-level _thishost global.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
1983
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported lazily and the result cached in _ftperrors.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
1992
_noheaders = None
def noheaders():
    """Return an empty email Message object (a shared, cached instance)."""
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2000
2001
2002# Utility classes
2003
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        # dirs is the sequence of path components to cwd through after
        # login.  timeout is handed straight to ftplib.FTP.connect;
        # NOTE(review): None presumably means the socket default -- confirm.
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        """(Re)connect, log in and cwd through self.dirs; clears busy flag."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start retrieving *file* (or a directory listing) over FTP.

        type 'd'/'D' forces a listing; any other value is sent as the
        FTP TYPE (e.g. 'I' binary, 'A' ASCII).  Returns a tuple of
        (file-like object that ends the transfer when closed, length or
        None).  Raises URLError on non-550 permission errors.
        """
        import ftplib
        # Only one transfer may be active per connection; finish any
        # previous one first.
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection may have gone stale; reconnect once
            # and retry the TYPE command.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 means "not a plain file"; fall through to try a
                # directory listing instead.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error', reason) from reason
                finally:
                    # Always restore the original working directory.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'), self.endtransfer), conn[1])
    def endtransfer(self):
        """Finish the current transfer, if any, by reading the final reply."""
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            # Best effort: the connection may already be dead.
            pass

    def close(self):
        """End any active transfer and close the FTP connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2080
2081# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    suffix = '_proxy'
    proxies = {}
    for var, value in os.environ.items():
        var = var.lower()
        if value and var.endswith(suffix):
            proxies[var[:-len(suffix)]] = value
    return proxies
2097
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    if no_proxy == '*':
        # '*' is special-cased: always bypass the proxy.
        return 1
    hostonly, _ = splitport(host)   # strip any :port suffix
    # Bypass when the host ends with any of the listed DNS suffixes.
    for suffix in no_proxy.split(','):
        if suffix and (hostonly.endswith(suffix) or host.endswith(suffix)):
            return 1
    # otherwise, don't bypass
    return 0
2116
2117
if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            # Internet Config bindings unavailable; no proxies.
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXX To be done.
        # Gopher: XXX To be done.
        return proxies

    def proxy_bypass(host):
        """Bypass the proxy for *host* per the environment's no_proxy list."""
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return 0

    def getproxies():
        """Environment settings take precedence over Internet Config."""
        return getproxies_environment() or getproxies_internetconfig()

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            # The module is named winreg on Python 3 (the old Python 2
            # name _winreg no longer exists).
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        """Check *host* against the registry's ProxyOverride patterns."""
        try:
            import winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # now check if we match one of the registry values.
        for test in proxyOverride:
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Decide whether to bypass the proxy for *host*.

        Consults the environment's no_proxy setting when any
        <scheme>_proxy variables are present, and the registry's
        ProxyOverride list otherwise.  (The previous docstring was a
        copy-paste of getproxies' and described the wrong contract.)
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment