"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work.  Each Handler implements a particular protocol or
option.  The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL.  For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns.  The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as original
urllib.  Pass the url and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
handler, the argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.

objects of interest:

OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
the Handler classes, while dealing with requests and responses.

Request -- An object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler --

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib.request

# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)

# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('http://www.python.org/')
"""

# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails, how should the error be signalled?  The client needs to
# know the HTTP error code.  But if the handler knows what the problem
# was, e.g., that it didn't know the hash algorithm requested in the
# challenge, it would be good to pass that information along to the
# client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation

# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener

import base64
import bisect
import email
import hashlib
import http.client
import io
import os
import posixpath
import random
import re
import socket
import sys
import time

from urllib.error import URLError, HTTPError, ContentTooShortError
from urllib.parse import (
    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
    splittype, splithost, splitport, splituser, splitpasswd,
    splitattr, splitquery, splitvalue, to_bytes, urlunparse)
from urllib.response import addinfourl, addclosehook

# check for SSL
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

# used in User-Agent header sent
__version__ = sys.version[:3]

_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    global _opener
    if _opener is None:
        _opener = build_opener()
    return _opener.open(url, data, timeout)

def install_opener(opener):
    global _opener
    _opener = opener

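# The following is an illustrative sketch, not part of the module: it shows
# the behaviour described in the module docstring, where an HTTPError raised
# for a non-2xx status can itself be used as a response object.  The URL is
# a placeholder.
#
#   import urllib.request
#   from urllib.error import HTTPError, URLError
#   try:
#       f = urllib.request.urlopen('http://www.example.com/missing')
#   except HTTPError as e:       # non-2xx status; still carries headers/body
#       print(e.code, e.msg)
#       body = e.read()
#   except URLError as e:        # the server could not be reached at all
#       print('failed to reach server:', e.reason)
#   else:
#       body = f.read()
#       f.close()
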
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)

def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()
    global _opener
    if _opener:
        _opener = None

# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.full_url
    host = urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = _cut_port_re.sub("", host, 1)
    return host.lower()

class Request:

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        for key, value in headers.items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        if self.data is not None:
            return "POST"
        else:
            return "GET"

    # Begin deprecated methods

    def add_data(self, data):
        if self.has_data():
            raise TypeError("Request Obj already contains data: %s" %
                            self.data)
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())

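# Illustrative sketch (placeholder URL, headers and data): building a Request
# with extra headers and POST data, as described in the module docstring.
#
#   req = Request('http://www.example.com/form',
#                 data=b'spam=1&eggs=2',
#                 headers={'User-Agent': 'example-client/0.1'})
#   req.add_unredirected_header('X-Example', 'not propagated on redirects')
#   assert req.get_method() == 'POST'   # get_method() is POST when data is set
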
class OpenerDirector:
    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in a specific order, the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and, when applicable, HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if isclass(check):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isclass(h):
            h = h()
        opener.add_handler(h)
    return opener

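# Illustrative sketch: a handler class passed to build_opener() replaces the
# default it subclasses, per the docstring above.  VerboseHTTPHandler and the
# debug level are made-up examples.
#
#   class VerboseHTTPHandler(HTTPHandler):
#       def __init__(self):
#           HTTPHandler.__init__(self, debuglevel=1)
#
#   opener = build_opener(VerboseHTTPHandler)  # default HTTPHandler is skipped
#   install_opener(opener)
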
class BaseHandler:
    handler_order = 500

    def add_parent(self, parent):
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if not hasattr(other, "handler_order"):
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other.handler_order


class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if not (200 <= code < 300):
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response

class HTTPDefaultErrorHandler(BaseHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.full_url, code, msg, hdrs, fp)

class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be lenient with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"


def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, r_scheme = splittype(proxy)
    if not r_scheme.startswith("/"):
        # authority
        scheme = None
        authority = proxy
    else:
        # URL
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
        # and 3.3.), path is empty or starts with '/'
        end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]
    userinfo, hostport = splituser(authority)
    if userinfo is not None:
        user, password = splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport

class ProxyHandler(BaseHandler):
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)

class HTTPPasswordMgr:

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if not realm in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False


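# Illustrative sketch (placeholder realm, URL and credentials): passwords are
# stored per (realm, URI prefix) and matched with reduce_uri()/is_suburi(), so
# a credential added for a directory also applies to URIs below it.
#
#   mgr = HTTPPasswordMgr()
#   mgr.add_password('Some Realm', 'http://www.example.com/private/',
#                    'alice', 'secret')
#   mgr.find_user_password('Some Realm',
#                          'http://www.example.com/private/report.html')
#   # -> ('alice', 'secret')
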
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):

    def find_user_password(self, realm, authuri):
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is not None:
            return user, password
        return HTTPPasswordMgr.find_user_password(self, None, authuri)


class AbstractBasicAuthHandler:

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, quote, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None


class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.full_url
        return self.http_error_auth_reqed('www-authenticate',
                                          url, req, headers)


class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.host
        return self.http_error_auth_reqed('proxy-authenticate',
                                          authority, req, headers)


def randombytes(n):
    """Return n random bytes."""
    return os.urandom(n)

class AbstractDigestAuthHandler:
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" support is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop == 'auth':
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None


class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        host = urlparse(req.full_url)[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry


class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.host
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry

class AbstractHTTPHandler(BaseHandler):

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        sel_host = host
        if request.has_proxy():
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout)  # will parse host:port
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            h.request(req.get_method(), req.selector, req.data, headers)
            r = h.getresponse()  # an HTTPResponse instance
        except socket.error as err:
            raise URLError(err)

        r.url = req.full_url
        # In http.client the .msg attribute of the HTTPResponse holds the
        # headers, but urllib clients expect the response to have the reason
        # phrase in .msg, so replace it here.  It would be good to mark this
        # attribute as deprecated and encourage clients to use info() or
        # .headers instead.
        r.msg = r.reason
        return r


class HTTPHandler(AbstractHTTPHandler):

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_

if hasattr(http.client, 'HTTPSConnection'):
    class HTTPSHandler(AbstractHTTPHandler):

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_

class HTTPCookieProcessor(BaseHandler):
    def __init__(self, cookiejar=None):
        import http.cookiejar
        if cookiejar is None:
            cookiejar = http.cookiejar.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response

class UnknownHandler(BaseHandler):
    def unknown_open(self, req):
        type = req.type
        raise URLError('unknown url type: %s' % type)

def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated."""
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        if v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed

def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    res = []
    part = ''

    escape = quote = False
    for cur in s:
        if escape:
            part += cur
            escape = False
            continue
        if quote:
            if cur == '\\':
                escape = True
                continue
            elif cur == '"':
                quote = False
            part += cur
            continue

        if cur == ',':
            res.append(part)
            part = ''
            continue

        if cur == '"':
            quote = True

        part += cur

    # append last part
    if part:
        res.append(part)

    return [part.strip() for part in res]

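# Illustrative sketch of the two parsers above on a made-up Digest-style
# challenge; note that the quoted comma survives parse_http_list().
#
#   challenge = 'realm="example", nonce="abc, def", qop=auth'
#   parse_http_list(challenge)
#   # -> ['realm="example"', 'nonce="abc, def"', 'qop=auth']
#   parse_keqv_list(parse_http_list(challenge))
#   # -> {'realm': 'example', 'nonce': 'abc, def', 'qop': 'auth'}
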
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/':
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        import email.utils
        import mimetypes
        host = req.host
        file = req.selector
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(file)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                return addinfourl(open(localfile, 'rb'), headers, 'file:'+file)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')

def _safe_gethostbyname(host):
    try:
        return socket.gethostbyname(host)
    except socket.gaierror:
        return None

class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
        return fw

class CacheFTPHandler(FTPHandler):
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}
        self.timeout = {}
        self.soonest = 0
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            self.soonest = min(list(self.timeout.values()))

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(list(self.timeout.values()))

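# Illustrative sketch: CacheFTPHandler keeps FTP connections alive between
# requests; the timeout and connection limit below are arbitrary example
# values.
#
#   cache_ftp = CacheFTPHandler()
#   cache_ftp.setTimeout(30)           # evict cached connections after 30 seconds
#   cache_ftp.setMaxConns(4)
#   opener = build_opener(cache_ftp)   # replaces the default FTPHandler
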
# Code moved from the old urllib module

MAXFTPCACHE = 10    # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink  # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl)  # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error as msg:
            raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError('url error', 'invalid proxy for %s' % type, proxy)

1473 # External interface
1474 def retrieve(self, url, filename=None, reporthook=None, data=None):
1475 """retrieve(url) returns (filename, headers) for a local object
1476 or (tempfilename, headers) for a remote object."""
Georg Brandl13e89462008-07-01 19:56:00 +00001477 url = unwrap(to_bytes(url))
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001478 if self.tempcache and url in self.tempcache:
1479 return self.tempcache[url]
Georg Brandl13e89462008-07-01 19:56:00 +00001480 type, url1 = splittype(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001481 if filename is None and (not type or type == 'file'):
1482 try:
1483 fp = self.open_local_file(url1)
1484 hdrs = fp.info()
Philip Jenveycb134d72009-12-03 02:45:01 +00001485 fp.close()
Georg Brandl13e89462008-07-01 19:56:00 +00001486 return url2pathname(splithost(url1)[1]), hdrs
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001487 except IOError as msg:
1488 pass
1489 fp = self.open(url, data)
Benjamin Peterson5f28b7b2009-03-26 21:49:58 +00001490 try:
1491 headers = fp.info()
1492 if filename:
1493 tfp = open(filename, 'wb')
1494 else:
1495 import tempfile
1496 garbage, path = splittype(url)
1497 garbage, path = splithost(path or "")
1498 path, garbage = splitquery(path or "")
1499 path, garbage = splitattr(path or "")
1500 suffix = os.path.splitext(path)[1]
1501 (fd, filename) = tempfile.mkstemp(suffix)
1502 self.__tempfiles.append(filename)
1503 tfp = os.fdopen(fd, 'wb')
1504 try:
1505 result = filename, headers
1506 if self.tempcache is not None:
1507 self.tempcache[url] = result
1508 bs = 1024*8
1509 size = -1
1510 read = 0
1511 blocknum = 0
1512 if reporthook:
1513 if "content-length" in headers:
1514 size = int(headers["Content-Length"])
1515 reporthook(blocknum, bs, size)
1516 while 1:
1517 block = fp.read(bs)
1518 if not block:
1519 break
1520 read += len(block)
1521 tfp.write(block)
1522 blocknum += 1
1523 if reporthook:
1524 reporthook(blocknum, bs, size)
1525 finally:
1526 tfp.close()
1527 finally:
1528 fp.close()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001529
1530 # raise exception if actual size does not match content-length header
1531 if size >= 0 and read < size:
Georg Brandl13e89462008-07-01 19:56:00 +00001532 raise ContentTooShortError(
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001533 "retrieval incomplete: got only %i out of %i bytes"
1534 % (read, size), result)
1535
1536 return result
1537
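    # Sketch of the reporthook contract used by retrieve() above: the hook
    # is called with the block number, block size and total size (-1 when
    # the server sends no Content-Length).  Names here are hypothetical.
    #
    #   def progress(blocknum, bs, size):
    #       if size > 0:
    #           done = min(blocknum * bs, size)
    #           print("%d of %d bytes" % (done, size))
    #
    #   filename, headers = URLopener().retrieve('http://www.example.com/',
    #                                            reporthook=progress)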
1538 # Each method named open_<type> knows how to open that type of URL
1539
1540 def _open_generic_http(self, connection_factory, url, data):
1541 """Make an HTTP connection using connection_factory.
1542
1543 This is an internal method that should be called from
1544 open_http() or open_https().
1545
1546 Arguments:
1547 - connection_factory should take a host name and return an
1548 HTTPConnection instance.
1549 - url is the URL to retrieve, or a (host, relative-path) pair.
1550 - data is the payload for a POST request, or None.
1551 """
1552
1553 user_passwd = None
1554 proxy_passwd= None
1555 if isinstance(url, str):
Georg Brandl13e89462008-07-01 19:56:00 +00001556 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001557 if host:
Georg Brandl13e89462008-07-01 19:56:00 +00001558 user_passwd, host = splituser(host)
1559 host = unquote(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001560 realhost = host
1561 else:
1562 host, selector = url
1563 # check whether the proxy contains authorization information
Georg Brandl13e89462008-07-01 19:56:00 +00001564 proxy_passwd, host = splituser(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001565 # now we proceed with the url we want to obtain
Georg Brandl13e89462008-07-01 19:56:00 +00001566 urltype, rest = splittype(selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001567 url = rest
1568 user_passwd = None
1569 if urltype.lower() != 'http':
1570 realhost = None
1571 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001572 realhost, rest = splithost(rest)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001573 if realhost:
Georg Brandl13e89462008-07-01 19:56:00 +00001574 user_passwd, realhost = splituser(realhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001575 if user_passwd:
1576 selector = "%s://%s%s" % (urltype, realhost, rest)
1577 if proxy_bypass(realhost):
1578 host = realhost
1579
1580 #print "proxy via http:", host, selector
1581 if not host: raise IOError('http error', 'no host given')
1582
1583 if proxy_passwd:
1584 import base64
1585 proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
1586 else:
1587 proxy_auth = None
1588
1589 if user_passwd:
1590 import base64
1591 auth = base64.b64encode(user_passwd.encode()).decode('ascii')
1592 else:
1593 auth = None
1594 http_conn = connection_factory(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001595 headers = {}
1596 if proxy_auth:
1597 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1598 if auth:
1599 headers["Authorization"] = "Basic %s" % auth
1600 if realhost:
1601 headers["Host"] = realhost
1602 for header, value in self.addheaders:
1603 headers[header] = value
1604
1605 if data is not None:
1606 headers["Content-Type"] = "application/x-www-form-urlencoded"
1607 http_conn.request("POST", selector, data, headers)
1608 else:
1609 http_conn.request("GET", selector, headers=headers)
1610
1611 try:
1612 response = http_conn.getresponse()
1613 except http.client.BadStatusLine:
1614 # something went wrong with the HTTP status line
Georg Brandl13e89462008-07-01 19:56:00 +00001615 raise URLError("http protocol error: bad status line")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001616
1617 # According to RFC 2616, "2xx" code indicates that the client's
1618 # request was successfully received, understood, and accepted.
1619 if 200 <= response.status < 300:
Antoine Pitroub353c122009-02-11 00:39:14 +00001620 return addinfourl(response, response.msg, "http:" + url,
Georg Brandl13e89462008-07-01 19:56:00 +00001621 response.status)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001622 else:
1623 return self.http_error(
1624 url, response.fp,
1625 response.status, response.reason, response.msg, data)
1626
1627 def open_http(self, url, data=None):
1628 """Use HTTP protocol."""
1629 return self._open_generic_http(http.client.HTTPConnection, url, data)
1630
1631 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1632 """Handle http errors.
1633
1634 Derived class can override this, or provide specific handlers
1635 named http_error_DDD where DDD is the 3-digit error code."""
1636 # First check if there's a specific handler for this error
1637 name = 'http_error_%d' % errcode
1638 if hasattr(self, name):
1639 method = getattr(self, name)
1640 if data is None:
1641 result = method(url, fp, errcode, errmsg, headers)
1642 else:
1643 result = method(url, fp, errcode, errmsg, headers, data)
1644 if result: return result
1645 return self.http_error_default(url, fp, errcode, errmsg, headers)
1646
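    # The lookup above means a subclass can handle a specific status code
    # by defining http_error_<code>().  A hypothetical sketch that treats
    # 404 responses as ordinary (non-error) responses:
    #
    #   class MyOpener(URLopener):
    #       def http_error_404(self, url, fp, errcode, errmsg, headers):
    #           return addinfourl(fp, headers, "http:" + url, errcode)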
1647 def http_error_default(self, url, fp, errcode, errmsg, headers):
1648 """Default error handler: close the connection and raise IOError."""
1649 void = fp.read()
1650 fp.close()
Georg Brandl13e89462008-07-01 19:56:00 +00001651 raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001652
1653 if _have_ssl:
1654 def _https_connection(self, host):
1655 return http.client.HTTPSConnection(host,
1656 key_file=self.key_file,
1657 cert_file=self.cert_file)
1658
1659 def open_https(self, url, data=None):
1660 """Use HTTPS protocol."""
1661 return self._open_generic_http(self._https_connection, url, data)
1662
1663 def open_file(self, url):
1664 """Use local file or FTP depending on form of URL."""
1665 if not isinstance(url, str):
1666 raise URLError('file error', 'proxy support for file protocol currently not implemented')
1667 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
1668 return self.open_ftp(url)
1669 else:
1670 return self.open_local_file(url)
1671
1672 def open_local_file(self, url):
1673 """Use local file."""
1674 import mimetypes, email.utils
1675 from io import StringIO
Georg Brandl13e89462008-07-01 19:56:00 +00001676 host, file = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001677 localname = url2pathname(file)
1678 try:
1679 stats = os.stat(localname)
1680 except OSError as e:
1681 raise URLError(e.strerror, e.filename)
1682 size = stats.st_size
1683 modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
1684 mtype = mimetypes.guess_type(url)[0]
1685 headers = email.message_from_string(
1686 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
1687 (mtype or 'text/plain', size, modified))
1688 if not host:
1689 urlfile = file
1690 if file[:1] == '/':
1691 urlfile = 'file://' + file
Georg Brandl13e89462008-07-01 19:56:00 +00001692 return addinfourl(open(localname, 'rb'), headers, urlfile)
1693 host, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001694 if (not port
Senthil Kumaran99b2c8f2009-12-27 10:13:39 +00001695 and socket.gethostbyname(host) in ((localhost(),) + thishost())):
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001696 urlfile = file
1697 if file[:1] == '/':
1698 urlfile = 'file://' + file
Georg Brandl13e89462008-07-01 19:56:00 +00001699 return addinfourl(open(localname, 'rb'), headers, urlfile)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001700 raise URLError('local file error', 'not on local host')
1701
1702 def open_ftp(self, url):
1703 """Use FTP protocol."""
1704 if not isinstance(url, str):
1705 raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
1706 import mimetypes
1707 from io import StringIO
Georg Brandl13e89462008-07-01 19:56:00 +00001708 host, path = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001709 if not host: raise URLError('ftp error', 'no host given')
Georg Brandl13e89462008-07-01 19:56:00 +00001710 host, port = splitport(host)
1711 user, host = splituser(host)
1712 if user: user, passwd = splitpasswd(user)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001713 else: passwd = None
Georg Brandl13e89462008-07-01 19:56:00 +00001714 host = unquote(host)
1715 user = unquote(user or '')
1716 passwd = unquote(passwd or '')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001717 host = socket.gethostbyname(host)
1718 if not port:
1719 import ftplib
1720 port = ftplib.FTP_PORT
1721 else:
1722 port = int(port)
Georg Brandl13e89462008-07-01 19:56:00 +00001723 path, attrs = splitattr(path)
1724 path = unquote(path)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001725 dirs = path.split('/')
1726 dirs, file = dirs[:-1], dirs[-1]
1727 if dirs and not dirs[0]: dirs = dirs[1:]
1728 if dirs and not dirs[0]: dirs[0] = '/'
1729 key = user, host, port, '/'.join(dirs)
1730 # XXX thread unsafe!
1731 if len(self.ftpcache) > MAXFTPCACHE:
1732 # Prune the cache, rather arbitrarily
1733 for k in self.ftpcache.keys():
1734 if k != key:
1735 v = self.ftpcache[k]
1736 del self.ftpcache[k]
1737 v.close()
1738 try:
1739 if key not in self.ftpcache:
1740 self.ftpcache[key] = \
1741 ftpwrapper(user, passwd, host, port, dirs)
1742 if not file: type = 'D'
1743 else: type = 'I'
1744 for attr in attrs:
Georg Brandl13e89462008-07-01 19:56:00 +00001745 attr, value = splitvalue(attr)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001746 if attr.lower() == 'type' and \
1747 value in ('a', 'A', 'i', 'I', 'd', 'D'):
1748 type = value.upper()
1749 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
1750 mtype = mimetypes.guess_type("ftp:" + url)[0]
1751 headers = ""
1752 if mtype:
1753 headers += "Content-Type: %s\n" % mtype
1754 if retrlen is not None and retrlen >= 0:
1755 headers += "Content-Length: %d\n" % retrlen
1756 headers = email.message_from_string(headers)
Georg Brandl13e89462008-07-01 19:56:00 +00001757 return addinfourl(fp, headers, "ftp:" + url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001758 except ftperrors() as msg:
1759 raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
1760
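    # The ';type=' attribute parsed above selects the FTP transfer mode,
    # as in RFC 1738.  A sketch with hypothetical URLs:
    #
    #   opener.open('ftp://ftp.example.com/README;type=a')       # ASCII
    #   opener.open('ftp://ftp.example.com/file.tar.gz;type=i')  # binary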
1761 def open_data(self, url, data=None):
1762 """Use "data" URL."""
1763 if not isinstance(url, str):
1764 raise URLError('data error', 'proxy support for data protocol currently not implemented')
1765 # ignore POSTed data
1766 #
1767 # syntax of data URLs:
1768 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
1769 # mediatype := [ type "/" subtype ] *( ";" parameter )
1770 # data := *urlchar
1771 # parameter := attribute "=" value
1772 try:
1773 [type, data] = url.split(',', 1)
1774 except ValueError:
1775 raise IOError('data error', 'bad data URL')
1776 if not type:
1777 type = 'text/plain;charset=US-ASCII'
1778 semi = type.rfind(';')
1779 if semi >= 0 and '=' not in type[semi:]:
1780 encoding = type[semi+1:]
1781 type = type[:semi]
1782 else:
1783 encoding = ''
1784 msg = []
1785 msg.append('Date: %s' % time.strftime('%a, %d %b %Y %H:%M:%S GMT',
1786 time.gmtime(time.time())))
1787 msg.append('Content-type: %s' % type)
1788 if encoding == 'base64':
1789 import base64
Georg Brandl706824f2009-06-04 09:42:55 +00001790 # XXX is this encoding/decoding ok?
1791 data = base64.decodebytes(data.encode('ascii')).decode('latin1')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001792 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001793 data = unquote(data)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001794 msg.append('Content-Length: %d' % len(data))
1795 msg.append('')
1796 msg.append(data)
1797 msg = '\n'.join(msg)
Georg Brandl13e89462008-07-01 19:56:00 +00001798 headers = email.message_from_string(msg)
1799 f = io.StringIO(msg)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001800 #f.fileno = None # needed for addinfourl
Georg Brandl13e89462008-07-01 19:56:00 +00001801 return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001802
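# A sketch of the data URL syntax handled by open_data() above (example
# URL and values are illustrative only):
#
#   opener = URLopener()
#   f = opener.open('data:text/plain;base64,SGVsbG8sIHdvcmxkIQ==')
#   f.info()['Content-Type']      # -> 'text/plain'
#   f.info()['Content-Length']    # -> '13'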
1803
1804class FancyURLopener(URLopener):
1805 """Derived class with handlers for errors we can handle (perhaps)."""
1806
1807 def __init__(self, *args, **kwargs):
1808 URLopener.__init__(self, *args, **kwargs)
1809 self.auth_cache = {}
1810 self.tries = 0
1811 self.maxtries = 10
1812
1813 def http_error_default(self, url, fp, errcode, errmsg, headers):
1814 """Default error handling -- don't raise an exception."""
Georg Brandl13e89462008-07-01 19:56:00 +00001815 return addinfourl(fp, headers, "http:" + url, errcode)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001816
1817 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
1818 """Error 302 -- relocated (temporarily)."""
1819 self.tries += 1
1820 if self.maxtries and self.tries >= self.maxtries:
1821 if hasattr(self, "http_error_500"):
1822 meth = self.http_error_500
1823 else:
1824 meth = self.http_error_default
1825 self.tries = 0
1826 return meth(url, fp, 500,
1827 "Internal Server Error: Redirect Recursion", headers)
1828 result = self.redirect_internal(url, fp, errcode, errmsg, headers,
1829 data)
1830 self.tries = 0
1831 return result
1832
1833 def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
1834 if 'location' in headers:
1835 newurl = headers['location']
1836 elif 'uri' in headers:
1837 newurl = headers['uri']
1838 else:
1839 return
1840 void = fp.read()
1841 fp.close()
1842 # In case the server sent a relative URL, join with original:
Georg Brandl13e89462008-07-01 19:56:00 +00001843 newurl = urljoin(self.type + ":" + url, newurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001844 return self.open(newurl)
1845
1846 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
1847 """Error 301 -- also relocated (permanently)."""
1848 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
1849
1850 def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
1851 """Error 303 -- also relocated (essentially identical to 302)."""
1852 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
1853
1854 def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
1855 """Error 307 -- relocated, but turn POST into error."""
1856 if data is None:
1857 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
1858 else:
1859 return self.http_error_default(url, fp, errcode, errmsg, headers)
1860
1861 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
1862 """Error 401 -- authentication required.
1863 This function supports Basic authentication only."""
1864 if 'www-authenticate' not in headers:
1865 URLopener.http_error_default(self, url, fp,
1866 errcode, errmsg, headers)
1867 stuff = headers['www-authenticate']
1868 import re
1869 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
1870 if not match:
1871 URLopener.http_error_default(self, url, fp,
1872 errcode, errmsg, headers)
1873 scheme, realm = match.groups()
1874 if scheme.lower() != 'basic':
1875 URLopener.http_error_default(self, url, fp,
1876 errcode, errmsg, headers)
1877 name = 'retry_' + self.type + '_basic_auth'
1878 if data is None:
1879 return getattr(self,name)(url, realm)
1880 else:
1881 return getattr(self,name)(url, realm, data)
1882
1883 def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
1884 """Error 407 -- proxy authentication required.
1885 This function supports Basic authentication only."""
1886 if 'proxy-authenticate' not in headers:
1887 URLopener.http_error_default(self, url, fp,
1888 errcode, errmsg, headers)
1889 stuff = headers['proxy-authenticate']
1890 import re
1891 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
1892 if not match:
1893 URLopener.http_error_default(self, url, fp,
1894 errcode, errmsg, headers)
1895 scheme, realm = match.groups()
1896 if scheme.lower() != 'basic':
1897 URLopener.http_error_default(self, url, fp,
1898 errcode, errmsg, headers)
1899 name = 'retry_proxy_' + self.type + '_basic_auth'
1900 if data is None:
1901 return getattr(self,name)(url, realm)
1902 else:
1903 return getattr(self,name)(url, realm, data)
1904
1905 def retry_proxy_http_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00001906 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001907 newurl = 'http://' + host + selector
1908 proxy = self.proxies['http']
Georg Brandl13e89462008-07-01 19:56:00 +00001909 urltype, proxyhost = splittype(proxy)
1910 proxyhost, proxyselector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001911 i = proxyhost.find('@') + 1
1912 proxyhost = proxyhost[i:]
1913 user, passwd = self.get_user_passwd(proxyhost, realm, i)
1914 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00001915 proxyhost = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001916 quote(passwd, safe=''), proxyhost)
1917 self.proxies['http'] = 'http://' + proxyhost + proxyselector
1918 if data is None:
1919 return self.open(newurl)
1920 else:
1921 return self.open(newurl, data)
1922
1923 def retry_proxy_https_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00001924 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001925 newurl = 'https://' + host + selector
1926 proxy = self.proxies['https']
Georg Brandl13e89462008-07-01 19:56:00 +00001927 urltype, proxyhost = splittype(proxy)
1928 proxyhost, proxyselector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001929 i = proxyhost.find('@') + 1
1930 proxyhost = proxyhost[i:]
1931 user, passwd = self.get_user_passwd(proxyhost, realm, i)
1932 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00001933 proxyhost = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001934 quote(passwd, safe=''), proxyhost)
1935 self.proxies['https'] = 'https://' + proxyhost + proxyselector
1936 if data is None:
1937 return self.open(newurl)
1938 else:
1939 return self.open(newurl, data)
1940
1941 def retry_http_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00001942 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001943 i = host.find('@') + 1
1944 host = host[i:]
1945 user, passwd = self.get_user_passwd(host, realm, i)
1946 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00001947 host = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001948 quote(passwd, safe=''), host)
1949 newurl = 'http://' + host + selector
1950 if data is None:
1951 return self.open(newurl)
1952 else:
1953 return self.open(newurl, data)
1954
1955 def retry_https_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00001956 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001957 i = host.find('@') + 1
1958 host = host[i:]
1959 user, passwd = self.get_user_passwd(host, realm, i)
1960 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00001961 host = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001962 quote(passwd, safe=''), host)
1963 newurl = 'https://' + host + selector
1964 if data is None:
1965 return self.open(newurl)
1966 else:
1967 return self.open(newurl, data)
1968
1969 def get_user_passwd(self, host, realm, clear_cache = 0):
1970 key = realm + '@' + host.lower()
1971 if key in self.auth_cache:
1972 if clear_cache:
1973 del self.auth_cache[key]
1974 else:
1975 return self.auth_cache[key]
1976 user, passwd = self.prompt_user_passwd(host, realm)
1977 if user or passwd: self.auth_cache[key] = (user, passwd)
1978 return user, passwd
1979
1980 def prompt_user_passwd(self, host, realm):
1981 """Override this in a GUI environment!"""
1982 import getpass
1983 try:
1984 user = input("Enter username for %s at %s: " % (realm, host))
1985 passwd = getpass.getpass("Enter password for %s in %s at %s: " %
1986 (user, realm, host))
1987 return user, passwd
1988 except KeyboardInterrupt:
1989 print()
1990 return None, None
1991
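# A hypothetical sketch of the override suggested in prompt_user_passwd()
# above, e.g. for a non-interactive program (credentials shown are fake):
#
#   class AutoAuthOpener(FancyURLopener):
#       def prompt_user_passwd(self, host, realm):
#           return 'user', 'secret'
#
#   f = AutoAuthOpener().open('http://www.example.com/protected/')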
1992
1993# Utility functions
1994
1995_localhost = None
1996def localhost():
1997 """Return the IP address of the magic hostname 'localhost'."""
1998 global _localhost
1999 if _localhost is None:
2000 _localhost = socket.gethostbyname('localhost')
2001 return _localhost
2002
2003_thishost = None
2004def thishost():
Senthil Kumaran99b2c8f2009-12-27 10:13:39 +00002005 """Return the IP addresses of the current host."""
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002006 global _thishost
2007 if _thishost is None:
Senthil Kumaran99b2c8f2009-12-27 10:13:39 +00002008 _thishost = tuple(socket.gethostbyname_ex(socket.gethostname())[2])
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002009 return _thishost
2010
2011_ftperrors = None
2012def ftperrors():
2013 """Return the set of errors raised by the FTP class."""
2014 global _ftperrors
2015 if _ftperrors is None:
2016 import ftplib
2017 _ftperrors = ftplib.all_errors
2018 return _ftperrors
2019
2020_noheaders = None
2021def noheaders():
Georg Brandl13e89462008-07-01 19:56:00 +00002022 """Return an empty email Message object."""
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002023 global _noheaders
2024 if _noheaders is None:
Georg Brandl13e89462008-07-01 19:56:00 +00002025 _noheaders = email.message_from_string("")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002026 return _noheaders
2027
2028
2029# Utility classes
2030
2031class ftpwrapper:
2032 """Class used by open_ftp() for cache of open FTP connections."""
2033
2034 def __init__(self, user, passwd, host, port, dirs, timeout=None):
2035 self.user = user
2036 self.passwd = passwd
2037 self.host = host
2038 self.port = port
2039 self.dirs = dirs
2040 self.timeout = timeout
2041 self.init()
2042
2043 def init(self):
2044 import ftplib
2045 self.busy = 0
2046 self.ftp = ftplib.FTP()
2047 self.ftp.connect(self.host, self.port, self.timeout)
2048 self.ftp.login(self.user, self.passwd)
2049 for dir in self.dirs:
2050 self.ftp.cwd(dir)
2051
2052 def retrfile(self, file, type):
2053 import ftplib
2054 self.endtransfer()
2055 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
2056 else: cmd = 'TYPE ' + type; isdir = 0
2057 try:
2058 self.ftp.voidcmd(cmd)
2059 except ftplib.all_errors:
2060 self.init()
2061 self.ftp.voidcmd(cmd)
2062 conn = None
2063 if file and not isdir:
2064 # Try to retrieve as a file
2065 try:
2066 cmd = 'RETR ' + file
2067 conn = self.ftp.ntransfercmd(cmd)
2068 except ftplib.error_perm as reason:
2069 if str(reason)[:3] != '550':
Georg Brandl13e89462008-07-01 19:56:00 +00002070 raise URLError('ftp error', reason).with_traceback(
2071 sys.exc_info()[2])
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002072 if not conn:
2073 # Set transfer mode to ASCII!
2074 self.ftp.voidcmd('TYPE A')
2075 # Try a directory listing. Verify that directory exists.
2076 if file:
2077 pwd = self.ftp.pwd()
2078 try:
2079 try:
2080 self.ftp.cwd(file)
2081 except ftplib.error_perm as reason:
Georg Brandl13e89462008-07-01 19:56:00 +00002082 raise URLError('ftp error', reason) from reason
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002083 finally:
2084 self.ftp.cwd(pwd)
2085 cmd = 'LIST ' + file
2086 else:
2087 cmd = 'LIST'
2088 conn = self.ftp.ntransfercmd(cmd)
2089 self.busy = 1
2090 # Pass back both a suitably decorated object and a retrieval length
Georg Brandl13e89462008-07-01 19:56:00 +00002091 return (addclosehook(conn[0].makefile('rb'), self.endtransfer), conn[1])
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002092 def endtransfer(self):
2093 if not self.busy:
2094 return
2095 self.busy = 0
2096 try:
2097 self.ftp.voidresp()
2098 except ftperrors():
2099 pass
2100
2101 def close(self):
2102 self.endtransfer()
2103 try:
2104 self.ftp.close()
2105 except ftperrors():
2106 pass
2107
2108# Proxy handling
2109def getproxies_environment():
2110 """Return a dictionary of scheme -> proxy server URL mappings.
2111
2112 Scan the environment for variables named <scheme>_proxy;
2113 this seems to be the standard convention. If you need a
2114 different way, you can pass a proxies dictionary to the
2115 [Fancy]URLopener constructor.
2116
2117 """
2118 proxies = {}
2119 for name, value in os.environ.items():
2120 name = name.lower()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002121 if value and name[-6:] == '_proxy':
2122 proxies[name[:-6]] = value
2123 return proxies
2124
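# For illustration, with a hypothetical environment set up like this,
# getproxies_environment() would return
# {'http': 'http://proxy.example.com:3128'}:
#
#   os.environ['http_proxy'] = 'http://proxy.example.com:3128'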
2125def proxy_bypass_environment(host):
2126 """Test if proxies should not be used for a particular host.
2127
2128 Checks the environment for a variable named no_proxy, which should
2129 be a list of DNS suffixes separated by commas, or '*' for all hosts.
2130 """
2131 no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
2132 # '*' is special case for always bypass
2133 if no_proxy == '*':
2134 return 1
2135 # strip port off host
Georg Brandl13e89462008-07-01 19:56:00 +00002136 hostonly, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002137 # check if the host ends with any of the DNS suffixes
2138 for name in no_proxy.split(','):
2139 if name and (hostonly.endswith(name) or host.endswith(name)):
2140 return 1
2141 # otherwise, don't bypass
2142 return 0
2143
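# A sketch of the no_proxy convention checked above (hypothetical values):
#
#   os.environ['no_proxy'] = 'localhost,.example.com'
#   proxy_bypass_environment('www.example.com')   # -> 1 (suffix match)
#   proxy_bypass_environment('www.python.org')    # -> 0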
2144
2145if sys.platform == 'darwin':
2146 def getproxies_internetconfig():
2147 """Return a dictionary of scheme -> proxy server URL mappings.
2148
2149 By convention the mac uses Internet Config to store
2150 By convention the Mac uses Internet Config to store
2151 the HttpProxy key.
2152
2153 """
2154 try:
2155 import ic
2156 except ImportError:
2157 return {}
2158
2159 try:
2160 config = ic.IC()
2161 except ic.error:
2162 return {}
2163 proxies = {}
2164 # HTTP:
2165 if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
2166 try:
2167 value = config['HTTPProxyHost']
2168 except ic.error:
2169 pass
2170 else:
2171 proxies['http'] = 'http://%s' % value
2172 # FTP: XXX To be done.
2173 # Gopher: XXX To be done.
2174 return proxies
2175
2176 def proxy_bypass(host):
2177 if getproxies_environment():
2178 return proxy_bypass_environment(host)
2179 else:
2180 return 0
2181
2182 def getproxies():
2183 return getproxies_environment() or getproxies_internetconfig()
2184
2185elif os.name == 'nt':
2186 def getproxies_registry():
2187 """Return a dictionary of scheme -> proxy server URL mappings.
2188
2189 Win32 uses the registry to store proxies.
2190
2191 """
2192 proxies = {}
2193 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002194 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002195 except ImportError:
2196 # Std module, so should be around - but you never know!
2197 return proxies
2198 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002199 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002200 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002201 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002202 'ProxyEnable')[0]
2203 if proxyEnable:
2204 # Returned as Unicode but problems if not converted to ASCII
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002205 proxyServer = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002206 'ProxyServer')[0])
2207 if '=' in proxyServer:
2208 # Per-protocol settings
2209 for p in proxyServer.split(';'):
2210 protocol, address = p.split('=', 1)
2211 # See if address has a type:// prefix
2212 import re
2213 if not re.match('^([^/:]+)://', address):
2214 address = '%s://%s' % (protocol, address)
2215 proxies[protocol] = address
2216 else:
2217 # Use one setting for all protocols
2218 if proxyServer[:5] == 'http:':
2219 proxies['http'] = proxyServer
2220 else:
2221 proxies['http'] = 'http://%s' % proxyServer
2222 proxies['ftp'] = 'ftp://%s' % proxyServer
2223 internetSettings.Close()
2224 except (WindowsError, ValueError, TypeError):
2225 # Either registry key not found etc, or the value in an
2226 # unexpected format.
2227 # proxies already set up to be empty so nothing to do
2228 pass
2229 return proxies
2230
2231 def getproxies():
2232 """Return a dictionary of scheme -> proxy server URL mappings.
2233
2234 Returns settings gathered from the environment, if specified,
2235 or the registry.
2236
2237 """
2238 return getproxies_environment() or getproxies_registry()
2239
2240 def proxy_bypass_registry(host):
2241 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002242 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002243 import re
2244 except ImportError:
2245 # Std modules, so should be around - but you never know!
2246 return 0
2247 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002248 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002249 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002250 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002251 'ProxyEnable')[0]
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002252 proxyOverride = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002253 'ProxyOverride')[0])
2254 # ^^^^ Returned as Unicode but problems if not converted to ASCII
2255 except WindowsError:
2256 return 0
2257 if not proxyEnable or not proxyOverride:
2258 return 0
2259 # try to make a host list from name and IP address.
Georg Brandl13e89462008-07-01 19:56:00 +00002260 rawHost, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002261 host = [rawHost]
2262 try:
2263 addr = socket.gethostbyname(rawHost)
2264 if addr != rawHost:
2265 host.append(addr)
2266 except socket.error:
2267 pass
2268 try:
2269 fqdn = socket.getfqdn(rawHost)
2270 if fqdn != rawHost:
2271 host.append(fqdn)
2272 except socket.error:
2273 pass
2274 # make a check value list from the registry entry; the special
2275 # '<local>' entry matches any host name that does not contain a dot
2276 # (see the check below).
2277 proxyOverride = proxyOverride.split(';')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002278 # now check if we match one of the registry values.
2279 for test in proxyOverride:
Senthil Kumaran49476062009-05-01 06:00:23 +00002280 if test == '<local>':
2281 if '.' not in rawHost:
2282 return 1
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002283 test = test.replace(".", r"\.") # mask dots
2284 test = test.replace("*", r".*") # change glob sequence
2285 test = test.replace("?", r".") # change glob char
2286 for val in host:
2287 # print "%s <--> %s" %( test, val )
2288 if re.match(test, val, re.I):
2289 return 1
2290 return 0
2291
2292 def proxy_bypass(host):
2293 """Return a dictionary of scheme -> proxy server URL mappings.
2294
2295 Returns settings gathered from the environment, if specified,
2296 or the registry.
2297
2298 """
2299 if getproxies_environment():
2300 return proxy_bypass_environment(host)
2301 else:
2302 return proxy_bypass_registry(host)
2303
2304else:
2305 # By default use environment variables
2306 getproxies = getproxies_environment
2307 proxy_bypass = proxy_bypass_environment