"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the result as a file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work.  Each Handler implements a particular protocol or
option.  The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL.  For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns.  The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as the original
urllib.  Pass the URL and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, that argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.

Objects of interest:

OpenerDirector -- Sets up the User-Agent as the Python-urllib client and
manages the Handler classes, while dealing with requests and responses.

Request -- An object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler -- The base class from which all Handler classes derive.

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib.request

# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)

# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('http://www.python.org/')
"""
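
# The docstring above notes that an HTTPError raised for an HTTP error
# status can also be treated as a (non-successful) response.  A minimal,
# hedged sketch of that pattern -- the host below is only a placeholder,
# not something this module defines:
#
#   import urllib.request, urllib.error
#   try:
#       f = urllib.request.urlopen('http://www.example.com/missing')
#   except urllib.error.HTTPError as err:
#       # err carries the status code and reason and, when the server sent
#       # a body, can be read like the object urlopen() would have returned.
#       print(err.code, err.msg)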

# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails, how should the error be signalled?  The client needs to
# know the HTTP error code.  But if the handler knows that the problem
# was, e.g., that it didn't know the hash algorithm requested in the
# challenge, it would be good to pass that information along to the
# client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation

# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener

import base64
import bisect
import email
import hashlib
import http.client
import io
import os
import posixpath
import random
import re
import socket
import sys
import time

from urllib.error import URLError, HTTPError, ContentTooShortError
from urllib.parse import (
    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
    splittype, splithost, splitport, splituser, splitpasswd,
    splitattr, splitquery, splitvalue, to_bytes, urlunparse)
from urllib.response import addinfourl, addclosehook

# check for SSL
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

# used in User-Agent header sent
__version__ = sys.version[:3]

_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    global _opener
    if _opener is None:
        _opener = build_opener()
    return _opener.open(url, data, timeout)

def install_opener(opener):
    global _opener
    _opener = opener

# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
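
# A hedged usage sketch of urlretrieve() (URL and target path are
# placeholders): the resource is copied to a local file and a
# (filename, headers) pair is returned.
#
#   filename, headers = urlretrieve('http://www.example.com/',
#                                   '/tmp/index.html')
#   print(filename, headers.get('Content-Type'))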

def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()
    global _opener
    if _opener:
        _opener = None

# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.full_url
    host = urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = _cut_port_re.sub("", host, 1)
    return host.lower()
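
# For illustration (hedged; the host below is a placeholder): request_host()
# strips any port and lowercases the result.
#
#   request_host(Request('http://www.EXAMPLE.com:8080/index.html'))
#   # -> 'www.example.com'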

class Request:

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        for key, value in headers.items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self._parse()

    def _parse(self):
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        if self.data is not None:
            return "POST"
        else:
            return "GET"

    # Begin deprecated methods

    def add_data(self, data):
        if self.has_data():
            raise TypeError("Request Obj already contains data: %s" %
                            self.data)
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.full_url

    def get_type(self):
        return self.type

    def get_host(self):
        return self.host

    def get_selector(self):
        return self.selector

    def is_unverifiable(self):
        return self.unverifiable

    def get_origin_req_host(self):
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
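
# A hedged illustration of the Request API above (the URL is a placeholder).
# Header names are normalized with str.capitalize(), so 'User-Agent' is
# stored and looked up as 'User-agent':
#
#   req = Request('http://www.example.com/cgi-bin/query', data=b'q=python')
#   req.add_header('User-Agent', 'example-client/1.0')
#   req.get_method()              # -> 'POST', because data is present
#   req.get_header('User-agent')  # -> 'example-client/1.0'
#   req.host, req.selector        # -> ('www.example.com', '/cgi-bin/query')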

class OpenerDirector:
    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in a specific order; the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http']  # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if isclass(check):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isclass(h):
            h = h()
        opener.add_handler(h)
    return opener
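
# A hedged sketch (not part of the module) of the method-name convention
# that OpenerDirector.add_handler() relies on: a handler method named
# <protocol>_error_<code> is discovered automatically.  ExampleErrorHandler
# and its use here are illustrative only.
#
#   class ExampleErrorHandler(BaseHandler):
#       def http_error_404(self, req, fp, code, msg, hdrs):
#           # Returning a non-None value stops the error chain; returning
#           # None would let the remaining handlers (and finally
#           # HTTPDefaultErrorHandler) have a go.
#           return fp
#
#   opener = build_opener(ExampleErrorHandler)   # classes are instantiated
#   install_opener(opener)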

class BaseHandler:
    handler_order = 500

    def add_parent(self, parent):
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if not hasattr(other, "handler_order"):
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other.handler_order


class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if not (200 <= code < 300):
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response

class HTTPDefaultErrorHandler(BaseHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.full_url, code, msg, hdrs, fp)

class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be lenient with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"


def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, r_scheme = splittype(proxy)
    if not r_scheme.startswith("/"):
        # authority
        scheme = None
        authority = proxy
    else:
        # URL
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.2
        # and 3.3), path is empty or starts with '/'
        end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]
    userinfo, hostport = splituser(authority)
    if userinfo is not None:
        user, password = splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport

class ProxyHandler(BaseHandler):
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)

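# A hedged usage sketch (proxy host and credentials are placeholders):
# _parse_proxy() above yields ('http', 'joe', 'password',
# 'proxy.example.com:3128'), and proxy_open() adds a Proxy-authorization
# header before pointing the request at the proxy.
#
#   proxy = ProxyHandler({'http': 'http://joe:password@proxy.example.com:3128'})
#   opener = build_opener(proxy)
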
class HTTPPasswordMgr:

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False
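
# An illustrative sketch (hedged; realm, URL and credentials are
# placeholders) of how add_password() and find_user_password() cooperate
# through reduce_uri() and is_suburi(): a stored URI matches any request
# URI at or below its path.
#
#   mgr = HTTPPasswordMgr()
#   mgr.add_password('Example Realm', 'http://www.example.com/private/',
#                    'joe', 'secret')
#   mgr.find_user_password('Example Realm',
#                          'http://www.example.com/private/page.html')
#   # -> ('joe', 'secret')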


class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):

    def find_user_password(self, realm, authuri):
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is not None:
            return user, password
        return HTTPPasswordMgr.find_user_password(self, None, authuri)


class AbstractBasicAuthHandler:

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, quote, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None


class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.full_url
        return self.http_error_auth_reqed('www-authenticate',
                                          url, req, headers)


class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.host
        return self.http_error_auth_reqed('proxy-authenticate',
                                          authority, req, headers)


def randombytes(n):
    """Return n random bytes."""
    return os.urandom(n)

class AbstractDigestAuthHandler:
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" support is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop == 'auth':
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None


class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        host = urlparse(req.full_url)[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry


class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.host
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry

class AbstractHTTPHandler(BaseHandler):

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        sel_host = host
        if request.has_proxy():
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request
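
    # A hedged illustration (not part of the module) of the preprocessing
    # do_request_() performs; the opener and URL are placeholders.  For a
    # request carrying data it fills in Content-type and Content-length,
    # and it always adds a Host header plus the opener's default headers.
    #
    #   handler = AbstractHTTPHandler()
    #   handler.add_parent(build_opener())
    #   req = handler.do_request_(Request('http://www.example.com/',
    #                                     data=b'a=1&b=2'))
    #   req.get_header('Content-length')   # -> '7'
    #   req.get_header('Host')             # -> 'www.example.com'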
1038
1039 def do_open(self, http_class, req):
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +00001040 """Return an HTTPResponse object for the request, using http_class.
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001041
1042 http_class must implement the HTTPConnection API from http.client.
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001043 """
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +00001044 host = req.host
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001045 if not host:
Georg Brandl13e89462008-07-01 19:56:00 +00001046 raise URLError('no host given')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001047
1048 h = http_class(host, timeout=req.timeout) # will parse host:port
1049 headers = dict(req.headers)
1050 headers.update(req.unredirected_hdrs)
1051
1052 # TODO(jhylton): Should this be redesigned to handle
1053 # persistent connections?
1054
1055 # We want to make an HTTP/1.1 request, but the addinfourl
1056 # class isn't prepared to deal with a persistent connection.
1057 # It will try to read all remaining data from the socket,
1058 # which will block while the server waits for the next request.
1059 # So make sure the connection gets closed after the (only)
1060 # request.
1061 headers["Connection"] = "close"
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +00001062 headers = dict((name.title(), val) for name, val in headers.items())
Senthil Kumaran0ac1f832009-07-26 12:39:47 +00001063
1064 if req._tunnel_host:
Senthil Kumaran4b9fbeb2009-12-20 07:18:22 +00001065 tunnel_headers = {}
1066 proxy_auth_hdr = "Proxy-Authorization"
1067 if proxy_auth_hdr in headers:
1068 tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
1069 # Proxy-Authorization should not be sent to origin
1070 # server.
1071 del headers[proxy_auth_hdr]
1072 h._set_tunnel(req._tunnel_host, headers=tunnel_headers)
Senthil Kumaran0ac1f832009-07-26 12:39:47 +00001073
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001074 try:
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +00001075 h.request(req.get_method(), req.selector, req.data, headers)
1076 r = h.getresponse() # an HTTPResponse instance
1077 except socket.error as err:
Georg Brandl13e89462008-07-01 19:56:00 +00001078 raise URLError(err)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001079
Jeremy Hylton6c5e28c2009-03-31 14:35:53 +00001080 r.url = req.full_url
1081 # This line replaces the .msg attribute of the HTTPResponse
1082 # with .headers, because urllib clients expect the response to
1083 # have the reason in .msg. It would be good to mark this
1084 # attribute is deprecated and get then to use info() or
1085 # .headers.
1086 r.msg = r.reason
1087 return r
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001088
1089
1090class HTTPHandler(AbstractHTTPHandler):
1091
1092 def http_open(self, req):
1093 return self.do_open(http.client.HTTPConnection, req)
1094
1095 http_request = AbstractHTTPHandler.do_request_
1096
1097if hasattr(http.client, 'HTTPSConnection'):
1098 class HTTPSHandler(AbstractHTTPHandler):
1099
1100 def https_open(self, req):
1101 return self.do_open(http.client.HTTPSConnection, req)
1102
1103 https_request = AbstractHTTPHandler.do_request_
1104
1105class HTTPCookieProcessor(BaseHandler):
1106 def __init__(self, cookiejar=None):
1107 import http.cookiejar
1108 if cookiejar is None:
1109 cookiejar = http.cookiejar.CookieJar()
1110 self.cookiejar = cookiejar
1111
1112 def http_request(self, request):
1113 self.cookiejar.add_cookie_header(request)
1114 return request
1115
1116 def http_response(self, request, response):
1117 self.cookiejar.extract_cookies(response, request)
1118 return response
1119
1120 https_request = http_request
1121 https_response = http_response
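
# A hedged usage sketch: share one CookieJar across requests so cookies set
# by earlier responses are sent back on later ones.
#
#   import http.cookiejar
#   cj = http.cookiejar.CookieJar()
#   opener = build_opener(HTTPCookieProcessor(cj))
#   install_opener(opener)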

class UnknownHandler(BaseHandler):
    def unknown_open(self, req):
        type = req.type
        raise URLError('unknown url type: %s' % type)

def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated."""
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        if v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
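
# For illustration (hedged): surrounding double quotes are stripped from
# values, unquoted values are kept as-is.
#
#   parse_keqv_list(['realm="example"', 'qop=auth', 'nonce="abc123"'])
#   # -> {'realm': 'example', 'qop': 'auth', 'nonce': 'abc123'}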

def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    res = []
    part = ''

    escape = quote = False
    for cur in s:
        if escape:
            part += cur
            escape = False
            continue
        if quote:
            if cur == '\\':
                escape = True
                continue
            elif cur == '"':
                quote = False
            part += cur
            continue

        if cur == ',':
            res.append(part)
            part = ''
            continue

        if cur == '"':
            quote = True

        part += cur

    # append last part
    if part:
        res.append(part)

    return [part.strip() for part in res]
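
# For illustration (hedged): quoted elements may contain commas and are
# returned with their quotes intact.
#
#   parse_http_list('a, b, "c, d", e')
#   # -> ['a', 'b', '"c, d"', 'e']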

class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/':
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        import email.utils
        import mimetypes
        host = req.host
        file = req.selector
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(file)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                return addinfourl(open(localfile, 'rb'), headers, 'file:'+file)
        except OSError as msg:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')

def _safe_gethostbyname(host):
    try:
        return socket.gethostbyname(host)
    except socket.gaierror:
        return None

class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as msg:
            exc = URLError('ftp error: %s' % msg)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
        return fw

class CacheFTPHandler(FTPHandler):
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}
        self.timeout = {}
        self.soonest = 0
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            self.soonest = min(list(self.timeout.values()))

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(list(self.timeout.values()))

# Code moved from the old urllib module

MAXFTPCACHE = 10  # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors of type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink  # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl)  # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error as msg:
            raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])

1463 def open_unknown(self, fullurl, data=None):
1464 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001465 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001466 raise IOError('url error', 'unknown url type', type)
1467
1468 def open_unknown_proxy(self, proxy, fullurl, data=None):
1469 """Overridable interface to open unknown URL type."""
Georg Brandl13e89462008-07-01 19:56:00 +00001470 type, url = splittype(fullurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001471 raise IOError('url error', 'invalid proxy for %s' % type, proxy)
1472
1473 # External interface
1474 def retrieve(self, url, filename=None, reporthook=None, data=None):
1475 """retrieve(url) returns (filename, headers) for a local object
1476 or (tempfilename, headers) for a remote object."""
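# Usage sketch (hypothetical URL): reporthook, if given, is called as
# reporthook(blocknum, blocksize, totalsize) before the first block and
# after each block read; totalsize is -1 when Content-Length is unknown:
#
#   def progress(blocknum, blocksize, totalsize):
#       print(blocknum * blocksize, "of", totalsize, "bytes")
#   filename, headers = URLopener().retrieve('http://www.example.com/',
#                                            reporthook=progress)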
Georg Brandl13e89462008-07-01 19:56:00 +00001477 url = unwrap(to_bytes(url))
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001478 if self.tempcache and url in self.tempcache:
1479 return self.tempcache[url]
Georg Brandl13e89462008-07-01 19:56:00 +00001480 type, url1 = splittype(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001481 if filename is None and (not type or type == 'file'):
1482 try:
1483 fp = self.open_local_file(url1)
1484 hdrs = fp.info()
1485 del fp
Georg Brandl13e89462008-07-01 19:56:00 +00001486 return url2pathname(splithost(url1)[1]), hdrs
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001487 except IOError as msg:
1488 pass
1489 fp = self.open(url, data)
Benjamin Peterson5f28b7b2009-03-26 21:49:58 +00001490 try:
1491 headers = fp.info()
1492 if filename:
1493 tfp = open(filename, 'wb')
1494 else:
1495 import tempfile
1496 garbage, path = splittype(url)
1497 garbage, path = splithost(path or "")
1498 path, garbage = splitquery(path or "")
1499 path, garbage = splitattr(path or "")
1500 suffix = os.path.splitext(path)[1]
1501 (fd, filename) = tempfile.mkstemp(suffix)
1502 self.__tempfiles.append(filename)
1503 tfp = os.fdopen(fd, 'wb')
1504 try:
1505 result = filename, headers
1506 if self.tempcache is not None:
1507 self.tempcache[url] = result
1508 bs = 1024*8
1509 size = -1
1510 read = 0
1511 blocknum = 0
1512 if reporthook:
1513 if "content-length" in headers:
1514 size = int(headers["Content-Length"])
1515 reporthook(blocknum, bs, size)
1516 while 1:
1517 block = fp.read(bs)
1518 if not block:
1519 break
1520 read += len(block)
1521 tfp.write(block)
1522 blocknum += 1
1523 if reporthook:
1524 reporthook(blocknum, bs, size)
1525 finally:
1526 tfp.close()
1527 finally:
1528 fp.close()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001529 del fp
1530 del tfp
1531
1532 # raise exception if actual size does not match content-length header
1533 if size >= 0 and read < size:
Georg Brandl13e89462008-07-01 19:56:00 +00001534 raise ContentTooShortError(
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001535 "retrieval incomplete: got only %i out of %i bytes"
1536 % (read, size), result)
1537
1538 return result
1539
1540 # Each method named open_<type> knows how to open that type of URL
1541
1542 def _open_generic_http(self, connection_factory, url, data):
1543 """Make an HTTP connection using connection_class.
1544
1545 This is an internal method that should be called from
1546 open_http() or open_https().
1547
1548 Arguments:
1549 - connection_factory should take a host name and return an
1550 HTTPConnection instance.
1551 - url is the URL to retrieve or a (host, relative-path) pair.
1552 - data is payload for a POST request or None.
1553 """
1554
1555 user_passwd = None
1556 proxy_passwd = None
1557 if isinstance(url, str):
Georg Brandl13e89462008-07-01 19:56:00 +00001558 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001559 if host:
Georg Brandl13e89462008-07-01 19:56:00 +00001560 user_passwd, host = splituser(host)
1561 host = unquote(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001562 realhost = host
1563 else:
1564 host, selector = url
1565 # check whether the proxy contains authorization information
Georg Brandl13e89462008-07-01 19:56:00 +00001566 proxy_passwd, host = splituser(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001567 # now we proceed with the url we want to obtain
Georg Brandl13e89462008-07-01 19:56:00 +00001568 urltype, rest = splittype(selector)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001569 url = rest
1570 user_passwd = None
1571 if urltype.lower() != 'http':
1572 realhost = None
1573 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001574 realhost, rest = splithost(rest)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001575 if realhost:
Georg Brandl13e89462008-07-01 19:56:00 +00001576 user_passwd, realhost = splituser(realhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001577 if user_passwd:
1578 selector = "%s://%s%s" % (urltype, realhost, rest)
1579 if proxy_bypass(realhost):
1580 host = realhost
1581
1582 #print "proxy via http:", host, selector
1583 if not host: raise IOError('http error', 'no host given')
1584
1585 if proxy_passwd:
1586 import base64
1587 proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
1588 else:
1589 proxy_auth = None
1590
1591 if user_passwd:
1592 import base64
1593 auth = base64.b64encode(user_passwd.encode()).decode('ascii')
1594 else:
1595 auth = None
1596 http_conn = connection_factory(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001597 headers = {}
1598 if proxy_auth:
1599 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1600 if auth:
1601 headers["Authorization"] = "Basic %s" % auth
1602 if realhost:
1603 headers["Host"] = realhost
1604 for header, value in self.addheaders:
1605 headers[header] = value
1606
1607 if data is not None:
1608 headers["Content-Type"] = "application/x-www-form-urlencoded"
1609 http_conn.request("POST", selector, data, headers)
1610 else:
1611 http_conn.request("GET", selector, headers=headers)
1612
1613 try:
1614 response = http_conn.getresponse()
1615 except http.client.BadStatusLine:
1616 # something went wrong with the HTTP status line
Georg Brandl13e89462008-07-01 19:56:00 +00001617 raise URLError("http protocol error: bad status line")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001618
1619 # According to RFC 2616, "2xx" code indicates that the client's
1620 # request was successfully received, understood, and accepted.
1621 if 200 <= response.status < 300:
Antoine Pitroub353c122009-02-11 00:39:14 +00001622 return addinfourl(response, response.msg, "http:" + url,
Georg Brandl13e89462008-07-01 19:56:00 +00001623 response.status)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001624 else:
1625 return self.http_error(
1626 url, response.fp,
1627 response.status, response.reason, response.msg, data)
1628
1629 def open_http(self, url, data=None):
1630 """Use HTTP protocol."""
1631 return self._open_generic_http(http.client.HTTPConnection, url, data)
1632
1633 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1634 """Handle http errors.
1635
1636 Derived class can override this, or provide specific handlers
1637 named http_error_DDD where DDD is the 3-digit error code."""
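# For example (sketch), a subclass that treats 404 responses as ordinary
# results rather than errors could define:
#
#   class TolerantOpener(URLopener):
#       def http_error_404(self, url, fp, errcode, errmsg, headers):
#           return addinfourl(fp, headers, "http:" + url, errcode)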
1638 # First check if there's a specific handler for this error
1639 name = 'http_error_%d' % errcode
1640 if hasattr(self, name):
1641 method = getattr(self, name)
1642 if data is None:
1643 result = method(url, fp, errcode, errmsg, headers)
1644 else:
1645 result = method(url, fp, errcode, errmsg, headers, data)
1646 if result: return result
1647 return self.http_error_default(url, fp, errcode, errmsg, headers)
1648
1649 def http_error_default(self, url, fp, errcode, errmsg, headers):
1650 """Default error handler: close the connection and raise IOError."""
1651 void = fp.read()
1652 fp.close()
Georg Brandl13e89462008-07-01 19:56:00 +00001653 raise HTTPError(url, errcode, errmsg, headers, None)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001654
1655 if _have_ssl:
1656 def _https_connection(self, host):
1657 return http.client.HTTPSConnection(host,
1658 key_file=self.key_file,
1659 cert_file=self.cert_file)
1660
1661 def open_https(self, url, data=None):
1662 """Use HTTPS protocol."""
1663 return self._open_generic_http(self._https_connection, url, data)
1664
1665 def open_file(self, url):
1666 """Use local file or FTP depending on form of URL."""
1667 if not isinstance(url, str):
1668 raise URLError('file error', 'proxy support for file protocol currently not implemented')
1669 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
1670 return self.open_ftp(url)
1671 else:
1672 return self.open_local_file(url)
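# For example (hypothetical paths and hosts): 'file:///etc/motd' and
# 'file://localhost/etc/motd' are opened as local files, while
# 'file://ftp.example.com/pub/README' is handed off to open_ftp().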
1673
1674 def open_local_file(self, url):
1675 """Use local file."""
1676 import mimetypes, email.utils
1677 from io import StringIO
Georg Brandl13e89462008-07-01 19:56:00 +00001678 host, file = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001679 localname = url2pathname(file)
1680 try:
1681 stats = os.stat(localname)
1682 except OSError as e:
1683 raise URLError(e.strerror, e.filename)
1684 size = stats.st_size
1685 modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
1686 mtype = mimetypes.guess_type(url)[0]
1687 headers = email.message_from_string(
1688 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
1689 (mtype or 'text/plain', size, modified))
1690 if not host:
1691 urlfile = file
1692 if file[:1] == '/':
1693 urlfile = 'file://' + file
Georg Brandl13e89462008-07-01 19:56:00 +00001694 return addinfourl(open(localname, 'rb'), headers, urlfile)
1695 host, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001696 if (not port
Senthil Kumaran88a495d2009-12-27 10:15:45 +00001697 and socket.gethostbyname(host) in ((localhost(),) + thishost())):
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001698 urlfile = file
1699 if file[:1] == '/':
1700 urlfile = 'file://' + file
Georg Brandl13e89462008-07-01 19:56:00 +00001701 return addinfourl(open(localname, 'rb'), headers, urlfile)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001702 raise URLError('local file error', 'not on local host')
1703
1704 def open_ftp(self, url):
1705 """Use FTP protocol."""
1706 if not isinstance(url, str):
1707 raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented')
1708 import mimetypes
1709 from io import StringIO
Georg Brandl13e89462008-07-01 19:56:00 +00001710 host, path = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001711 if not host: raise URLError('ftp error', 'no host given')
Georg Brandl13e89462008-07-01 19:56:00 +00001712 host, port = splitport(host)
1713 user, host = splituser(host)
1714 if user: user, passwd = splitpasswd(user)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001715 else: passwd = None
Georg Brandl13e89462008-07-01 19:56:00 +00001716 host = unquote(host)
1717 user = unquote(user or '')
1718 passwd = unquote(passwd or '')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001719 host = socket.gethostbyname(host)
1720 if not port:
1721 import ftplib
1722 port = ftplib.FTP_PORT
1723 else:
1724 port = int(port)
Georg Brandl13e89462008-07-01 19:56:00 +00001725 path, attrs = splitattr(path)
1726 path = unquote(path)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001727 dirs = path.split('/')
1728 dirs, file = dirs[:-1], dirs[-1]
1729 if dirs and not dirs[0]: dirs = dirs[1:]
1730 if dirs and not dirs[0]: dirs[0] = '/'
1731 key = user, host, port, '/'.join(dirs)
1732 # XXX thread unsafe!
1733 if len(self.ftpcache) > MAXFTPCACHE:
1734 # Prune the cache, rather arbitrarily
1735 for k in list(self.ftpcache):  # copy keys; entries are deleted below
1736 if k != key:
1737 v = self.ftpcache[k]
1738 del self.ftpcache[k]
1739 v.close()
1740 try:
1741 if key not in self.ftpcache:
1742 self.ftpcache[key] = \
1743 ftpwrapper(user, passwd, host, port, dirs)
1744 if not file: type = 'D'
1745 else: type = 'I'
1746 for attr in attrs:
Georg Brandl13e89462008-07-01 19:56:00 +00001747 attr, value = splitvalue(attr)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001748 if attr.lower() == 'type' and \
1749 value in ('a', 'A', 'i', 'I', 'd', 'D'):
1750 type = value.upper()
1751 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
1752 mtype = mimetypes.guess_type("ftp:" + url)[0]
1753 headers = ""
1754 if mtype:
1755 headers += "Content-Type: %s\n" % mtype
1756 if retrlen is not None and retrlen >= 0:
1757 headers += "Content-Length: %d\n" % retrlen
1758 headers = email.message_from_string(headers)
Georg Brandl13e89462008-07-01 19:56:00 +00001759 return addinfourl(fp, headers, "ftp:" + url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001760 except ftperrors() as msg:
1761 raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2])
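# For example (hypothetical host), an FTP URL can request a transfer type
# via the RFC 1738 ';type=' attribute, which open_ftp() honours:
#
#   ftp://ftp.example.com/pub/README;type=a    (ASCII transfer)
#   ftp://ftp.example.com/pub/logo.gif;type=i  (binary transfer)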
1762
1763 def open_data(self, url, data=None):
1764 """Use "data" URL."""
1765 if not isinstance(url, str):
1766 raise URLError('data error', 'proxy support for data protocol currently not implemented')
1767 # ignore POSTed data
1768 #
1769 # syntax of data URLs:
1770 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
1771 # mediatype := [ type "/" subtype ] *( ";" parameter )
1772 # data := *urlchar
1773 # parameter := attribute "=" value
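# For example, both of these are valid data URLs carrying "Hello, World!":
#
#   data:,Hello%2C%20World!
#   data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==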
1774 try:
1775 [type, data] = url.split(',', 1)
1776 except ValueError:
1777 raise IOError('data error', 'bad data URL')
1778 if not type:
1779 type = 'text/plain;charset=US-ASCII'
1780 semi = type.rfind(';')
1781 if semi >= 0 and '=' not in type[semi:]:
1782 encoding = type[semi+1:]
1783 type = type[:semi]
1784 else:
1785 encoding = ''
1786 msg = []
1787 msg.append('Date: %s' % time.strftime('%a, %d %b %Y %H:%M:%S GMT',
1788 time.gmtime(time.time())))
1789 msg.append('Content-type: %s' % type)
1790 if encoding == 'base64':
1791 import base64
Georg Brandl706824f2009-06-04 09:42:55 +00001792 # XXX is this encoding/decoding ok?
1793 data = base64.decodebytes(data.encode('ascii')).decode('latin1')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001794 else:
Georg Brandl13e89462008-07-01 19:56:00 +00001795 data = unquote(data)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001796 msg.append('Content-Length: %d' % len(data))
1797 msg.append('')
1798 msg.append(data)
1799 msg = '\n'.join(msg)
Georg Brandl13e89462008-07-01 19:56:00 +00001800 headers = email.message_from_string(msg)
1801 f = io.StringIO(msg)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001802 #f.fileno = None # needed for addinfourl
Georg Brandl13e89462008-07-01 19:56:00 +00001803 return addinfourl(f, headers, url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001804
1805
1806class FancyURLopener(URLopener):
1807 """Derived class with handlers for errors we can handle (perhaps)."""
1808
1809 def __init__(self, *args, **kwargs):
1810 URLopener.__init__(self, *args, **kwargs)
1811 self.auth_cache = {}
1812 self.tries = 0
1813 self.maxtries = 10
1814
1815 def http_error_default(self, url, fp, errcode, errmsg, headers):
1816 """Default error handling -- don't raise an exception."""
Georg Brandl13e89462008-07-01 19:56:00 +00001817 return addinfourl(fp, headers, "http:" + url, errcode)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001818
1819 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
1820 """Error 302 -- relocated (temporarily)."""
1821 self.tries += 1
1822 if self.maxtries and self.tries >= self.maxtries:
1823 if hasattr(self, "http_error_500"):
1824 meth = self.http_error_500
1825 else:
1826 meth = self.http_error_default
1827 self.tries = 0
1828 return meth(url, fp, 500,
1829 "Internal Server Error: Redirect Recursion", headers)
1830 result = self.redirect_internal(url, fp, errcode, errmsg, headers,
1831 data)
1832 self.tries = 0
1833 return result
1834
1835 def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
1836 if 'location' in headers:
1837 newurl = headers['location']
1838 elif 'uri' in headers:
1839 newurl = headers['uri']
1840 else:
1841 return
1842 void = fp.read()
1843 fp.close()
1844 # In case the server sent a relative URL, join with original:
Georg Brandl13e89462008-07-01 19:56:00 +00001845 newurl = urljoin(self.type + ":" + url, newurl)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001846 return self.open(newurl)
1847
1848 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
1849 """Error 301 -- also relocated (permanently)."""
1850 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
1851
1852 def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
1853 """Error 303 -- also relocated (essentially identical to 302)."""
1854 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
1855
1856 def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
1857 """Error 307 -- relocated, but turn POST into error."""
1858 if data is None:
1859 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
1860 else:
1861 return self.http_error_default(url, fp, errcode, errmsg, headers)
1862
1863 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
1864 """Error 401 -- authentication required.
1865 This function supports Basic authentication only."""
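# The only challenge recognized here is of the form (for example):
#
#   WWW-Authenticate: Basic realm="WallyWorld"
#
# Digest and other schemes fall through to the default error handler.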
1866 if 'www-authenticate' not in headers:
1867 URLopener.http_error_default(self, url, fp,
1868 errcode, errmsg, headers)
1869 stuff = headers['www-authenticate']
1870 import re
1871 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
1872 if not match:
1873 URLopener.http_error_default(self, url, fp,
1874 errcode, errmsg, headers)
1875 scheme, realm = match.groups()
1876 if scheme.lower() != 'basic':
1877 URLopener.http_error_default(self, url, fp,
1878 errcode, errmsg, headers)
1879 name = 'retry_' + self.type + '_basic_auth'
1880 if data is None:
1881 return getattr(self,name)(url, realm)
1882 else:
1883 return getattr(self,name)(url, realm, data)
1884
1885 def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
1886 """Error 407 -- proxy authentication required.
1887 This function supports Basic authentication only."""
1888 if 'proxy-authenticate' not in headers:
1889 URLopener.http_error_default(self, url, fp,
1890 errcode, errmsg, headers)
1891 stuff = headers['proxy-authenticate']
1892 import re
1893 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
1894 if not match:
1895 URLopener.http_error_default(self, url, fp,
1896 errcode, errmsg, headers)
1897 scheme, realm = match.groups()
1898 if scheme.lower() != 'basic':
1899 URLopener.http_error_default(self, url, fp,
1900 errcode, errmsg, headers)
1901 name = 'retry_proxy_' + self.type + '_basic_auth'
1902 if data is None:
1903 return getattr(self,name)(url, realm)
1904 else:
1905 return getattr(self,name)(url, realm, data)
1906
1907 def retry_proxy_http_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00001908 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001909 newurl = 'http://' + host + selector
1910 proxy = self.proxies['http']
Georg Brandl13e89462008-07-01 19:56:00 +00001911 urltype, proxyhost = splittype(proxy)
1912 proxyhost, proxyselector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001913 i = proxyhost.find('@') + 1
1914 proxyhost = proxyhost[i:]
1915 user, passwd = self.get_user_passwd(proxyhost, realm, i)
1916 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00001917 proxyhost = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001918 quote(passwd, safe=''), proxyhost)
1919 self.proxies['http'] = 'http://' + proxyhost + proxyselector
1920 if data is None:
1921 return self.open(newurl)
1922 else:
1923 return self.open(newurl, data)
1924
1925 def retry_proxy_https_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00001926 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001927 newurl = 'https://' + host + selector
1928 proxy = self.proxies['https']
Georg Brandl13e89462008-07-01 19:56:00 +00001929 urltype, proxyhost = splittype(proxy)
1930 proxyhost, proxyselector = splithost(proxyhost)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001931 i = proxyhost.find('@') + 1
1932 proxyhost = proxyhost[i:]
1933 user, passwd = self.get_user_passwd(proxyhost, realm, i)
1934 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00001935 proxyhost = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001936 quote(passwd, safe=''), proxyhost)
1937 self.proxies['https'] = 'https://' + proxyhost + proxyselector
1938 if data is None:
1939 return self.open(newurl)
1940 else:
1941 return self.open(newurl, data)
1942
1943 def retry_http_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00001944 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001945 i = host.find('@') + 1
1946 host = host[i:]
1947 user, passwd = self.get_user_passwd(host, realm, i)
1948 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00001949 host = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001950 quote(passwd, safe=''), host)
1951 newurl = 'http://' + host + selector
1952 if data is None:
1953 return self.open(newurl)
1954 else:
1955 return self.open(newurl, data)
1956
1957 def retry_https_basic_auth(self, url, realm, data=None):
Georg Brandl13e89462008-07-01 19:56:00 +00001958 host, selector = splithost(url)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001959 i = host.find('@') + 1
1960 host = host[i:]
1961 user, passwd = self.get_user_passwd(host, realm, i)
1962 if not (user or passwd): return None
Georg Brandl13e89462008-07-01 19:56:00 +00001963 host = "%s:%s@%s" % (quote(user, safe=''),
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001964 quote(passwd, safe=''), host)
1965 newurl = 'https://' + host + selector
1966 if data is None:
1967 return self.open(newurl)
1968 else:
1969 return self.open(newurl, data)
1970
1971 def get_user_passwd(self, host, realm, clear_cache = 0):
1972 key = realm + '@' + host.lower()
1973 if key in self.auth_cache:
1974 if clear_cache:
1975 del self.auth_cache[key]
1976 else:
1977 return self.auth_cache[key]
1978 user, passwd = self.prompt_user_passwd(host, realm)
1979 if user or passwd: self.auth_cache[key] = (user, passwd)
1980 return user, passwd
1981
1982 def prompt_user_passwd(self, host, realm):
1983 """Override this in a GUI environment!"""
1984 import getpass
1985 try:
1986 user = input("Enter username for %s at %s: " % (realm, host))
1987 passwd = getpass.getpass("Enter password for %s in %s at %s: " %
1988 (user, realm, host))
1989 return user, passwd
1990 except KeyboardInterrupt:
1991 print()
1992 return None, None
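# A non-interactive subclass might (sketch; names are hypothetical) supply
# credentials from its own store instead of prompting:
#
#   class NonInteractiveOpener(FancyURLopener):
#       def __init__(self, credentials, *args, **kwargs):
#           FancyURLopener.__init__(self, *args, **kwargs)
#           self._credentials = credentials  # {(host, realm): (user, passwd)}
#       def prompt_user_passwd(self, host, realm):
#           return self._credentials.get((host, realm), (None, None))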
1993
1994
1995# Utility functions
1996
1997_localhost = None
1998def localhost():
1999 """Return the IP address of the magic hostname 'localhost'."""
2000 global _localhost
2001 if _localhost is None:
2002 _localhost = socket.gethostbyname('localhost')
2003 return _localhost
2004
2005_thishost = None
2006def thishost():
Senthil Kumaran88a495d2009-12-27 10:15:45 +00002007 """Return the IP addresses of the current host."""
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002008 global _thishost
2009 if _thishost is None:
Senthil Kumaran88a495d2009-12-27 10:15:45 +00002010 _thishost = tuple(socket.gethostbyname_ex(socket.gethostname())[2])
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002011 return _thishost
2012
2013_ftperrors = None
2014def ftperrors():
2015 """Return the set of errors raised by the FTP class."""
2016 global _ftperrors
2017 if _ftperrors is None:
2018 import ftplib
2019 _ftperrors = ftplib.all_errors
2020 return _ftperrors
2021
2022_noheaders = None
2023def noheaders():
Georg Brandl13e89462008-07-01 19:56:00 +00002024 """Return an empty email Message object."""
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002025 global _noheaders
2026 if _noheaders is None:
Georg Brandl13e89462008-07-01 19:56:00 +00002027 _noheaders = email.message_from_string("")
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002028 return _noheaders
2029
2030
2031# Utility classes
2032
2033class ftpwrapper:
2034 """Class used by open_ftp() for cache of open FTP connections."""
2035
2036 def __init__(self, user, passwd, host, port, dirs, timeout=None):
2037 self.user = user
2038 self.passwd = passwd
2039 self.host = host
2040 self.port = port
2041 self.dirs = dirs
2042 self.timeout = timeout
2043 self.init()
2044
2045 def init(self):
2046 import ftplib
2047 self.busy = 0
2048 self.ftp = ftplib.FTP()
2049 self.ftp.connect(self.host, self.port, self.timeout)
2050 self.ftp.login(self.user, self.passwd)
2051 for dir in self.dirs:
2052 self.ftp.cwd(dir)
2053
2054 def retrfile(self, file, type):
2055 import ftplib
2056 self.endtransfer()
2057 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
2058 else: cmd = 'TYPE ' + type; isdir = 0
2059 try:
2060 self.ftp.voidcmd(cmd)
2061 except ftplib.all_errors:
2062 self.init()
2063 self.ftp.voidcmd(cmd)
2064 conn = None
2065 if file and not isdir:
2066 # Try to retrieve as a file
2067 try:
2068 cmd = 'RETR ' + file
2069 conn = self.ftp.ntransfercmd(cmd)
2070 except ftplib.error_perm as reason:
2071 if str(reason)[:3] != '550':
Georg Brandl13e89462008-07-01 19:56:00 +00002072 raise URLError('ftp error', reason).with_traceback(
2073 sys.exc_info()[2])
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002074 if not conn:
2075 # Set transfer mode to ASCII!
2076 self.ftp.voidcmd('TYPE A')
2077 # Try a directory listing. Verify that directory exists.
2078 if file:
2079 pwd = self.ftp.pwd()
2080 try:
2081 try:
2082 self.ftp.cwd(file)
2083 except ftplib.error_perm as reason:
Georg Brandl13e89462008-07-01 19:56:00 +00002084 raise URLError('ftp error', reason) from reason
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002085 finally:
2086 self.ftp.cwd(pwd)
2087 cmd = 'LIST ' + file
2088 else:
2089 cmd = 'LIST'
2090 conn = self.ftp.ntransfercmd(cmd)
2091 self.busy = 1
2092 # Pass back both a suitably decorated object and a retrieval length
Georg Brandl13e89462008-07-01 19:56:00 +00002093 return (addclosehook(conn[0].makefile('rb'), self.endtransfer), conn[1])
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002094 def endtransfer(self):
2095 if not self.busy:
2096 return
2097 self.busy = 0
2098 try:
2099 self.ftp.voidresp()
2100 except ftperrors():
2101 pass
2102
2103 def close(self):
2104 self.endtransfer()
2105 try:
2106 self.ftp.close()
2107 except ftperrors():
2108 pass
2109
2110# Proxy handling
2111def getproxies_environment():
2112 """Return a dictionary of scheme -> proxy server URL mappings.
2113
2114 Scan the environment for variables named <scheme>_proxy;
2115 this seems to be the standard convention. If you need a
2116 different way, you can pass a proxies dictionary to the
2117 [Fancy]URLopener constructor.
2118
2119 """
2120 proxies = {}
2121 for name, value in os.environ.items():
2122 name = name.lower()
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002123 if value and name[-6:] == '_proxy':
2124 proxies[name[:-6]] = value
2125 return proxies
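# For example (sketch), with http_proxy=http://proxy.example.com:3128 in the
# environment this returns {'http': 'http://proxy.example.com:3128'}; the same
# mapping can also be passed explicitly:
#
#   opener = FancyURLopener({'http': 'http://proxy.example.com:3128'})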
2126
2127def proxy_bypass_environment(host):
2128 """Test if proxies should not be used for a particular host.
2129
2130 Checks the environment for a variable named no_proxy, which should
2131 be a list of DNS suffixes separated by commas, or '*' for all hosts.
2132 """
2133 no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
2134 # '*' is special case for always bypass
2135 if no_proxy == '*':
2136 return 1
2137 # strip port off host
Georg Brandl13e89462008-07-01 19:56:00 +00002138 hostonly, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002139 # check if the host ends with any of the DNS suffixes
2140 for name in no_proxy.split(','):
2141 if name and (hostonly.endswith(name) or host.endswith(name)):
2142 return 1
2143 # otherwise, don't bypass
2144 return 0
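# For example (sketch), with no_proxy=localhost,.example.com set:
#
#   proxy_bypass_environment('localhost')        ->  1
#   proxy_bypass_environment('www.example.com')  ->  1
#   proxy_bypass_environment('www.python.org')   ->  0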
2145
2146
2147if sys.platform == 'darwin':
2148 def getproxies_internetconfig():
2149 """Return a dictionary of scheme -> proxy server URL mappings.
2150
2151 By convention the mac uses Internet Config to store
2152 proxies. An HTTP proxy, for instance, is stored under
2153 the HttpProxy key.
2154
2155 """
2156 try:
2157 import ic
2158 except ImportError:
2159 return {}
2160
2161 try:
2162 config = ic.IC()
2163 except ic.error:
2164 return {}
2165 proxies = {}
2166 # HTTP:
2167 if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
2168 try:
2169 value = config['HTTPProxyHost']
2170 except ic.error:
2171 pass
2172 else:
2173 proxies['http'] = 'http://%s' % value
2174 # FTP: XXX To be done.
2175 # Gopher: XXX To be done.
2176 return proxies
2177
2178 def proxy_bypass(host):
2179 if getproxies_environment():
2180 return proxy_bypass_environment(host)
2181 else:
2182 return 0
2183
2184 def getproxies():
2185 return getproxies_environment() or getproxies_internetconfig()
2186
2187elif os.name == 'nt':
2188 def getproxies_registry():
2189 """Return a dictionary of scheme -> proxy server URL mappings.
2190
2191 Win32 uses the registry to store proxies.
2192
2193 """
2194 proxies = {}
2195 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002196 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002197 except ImportError:
2198 # Std module, so should be around - but you never know!
2199 return proxies
2200 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002201 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002202 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002203 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002204 'ProxyEnable')[0]
2205 if proxyEnable:
2206 # Returned as Unicode but problems if not converted to ASCII
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002207 proxyServer = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002208 'ProxyServer')[0])
2209 if '=' in proxyServer:
2210 # Per-protocol settings
2211 for p in proxyServer.split(';'):
2212 protocol, address = p.split('=', 1)
2213 # See if address has a type:// prefix
2214 import re
2215 if not re.match('^([^/:]+)://', address):
2216 address = '%s://%s' % (protocol, address)
2217 proxies[protocol] = address
2218 else:
2219 # Use one setting for all protocols
2220 if proxyServer[:5] == 'http:':
2221 proxies['http'] = proxyServer
2222 else:
2223 proxies['http'] = 'http://%s' % proxyServer
2224 proxies['ftp'] = 'ftp://%s' % proxyServer
2225 internetSettings.Close()
2226 except (WindowsError, ValueError, TypeError):
2227 # Either registry key not found etc, or the value in an
2228 # unexpected format.
2229 # proxies already set up to be empty so nothing to do
2230 pass
2231 return proxies
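# Typical ProxyServer registry values handled above look like (examples):
#
#   proxy.example.com:8080                                    (all protocols)
#   http=proxy.example.com:8080;ftp=ftpproxy.example.com:2121 (per protocol)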
2232
2233 def getproxies():
2234 """Return a dictionary of scheme -> proxy server URL mappings.
2235
2236 Returns settings gathered from the environment, if specified,
2237 or the registry.
2238
2239 """
2240 return getproxies_environment() or getproxies_registry()
2241
2242 def proxy_bypass_registry(host):
2243 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002244 import winreg
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002245 import re
2246 except ImportError:
2247 # Std modules, so should be around - but you never know!
2248 return 0
2249 try:
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002250 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002251 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002252 proxyEnable = winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002253 'ProxyEnable')[0]
Georg Brandl4ed72ac2009-04-01 04:28:33 +00002254 proxyOverride = str(winreg.QueryValueEx(internetSettings,
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002255 'ProxyOverride')[0])
2256 # ^^^^ Returned as Unicode but problems if not converted to ASCII
2257 except WindowsError:
2258 return 0
2259 if not proxyEnable or not proxyOverride:
2260 return 0
2261 # try to make a host list from name and IP address.
Georg Brandl13e89462008-07-01 19:56:00 +00002262 rawHost, port = splitport(host)
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002263 host = [rawHost]
2264 try:
2265 addr = socket.gethostbyname(rawHost)
2266 if addr != rawHost:
2267 host.append(addr)
2268 except socket.error:
2269 pass
2270 try:
2271 fqdn = socket.getfqdn(rawHost)
2272 if fqdn != rawHost:
2273 host.append(fqdn)
2274 except socket.error:
2275 pass
2276 # ProxyOverride is a ';'-separated list of patterns; '<local>' matches
2277 # bare host names (no dot), and other entries are shell-style patterns
2278 # in which '*' and '?' act as wildcards.
2279 proxyOverride = proxyOverride.split(';')
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002280 # now check if we match one of the registry values.
2281 for test in proxyOverride:
Senthil Kumaran49476062009-05-01 06:00:23 +00002282 if test == '<local>':
2283 if '.' not in rawHost:
2284 return 1
Jeremy Hylton1afc1692008-06-18 20:49:58 +00002285 test = test.replace(".", r"\.") # mask dots
2286 test = test.replace("*", r".*") # change glob sequence
2287 test = test.replace("?", r".") # change glob char
2288 for val in host:
2289 # print "%s <--> %s" %( test, val )
2290 if re.match(test, val, re.I):
2291 return 1
2292 return 0
2293
2294 def proxy_bypass(host):
2295 """Return a dictionary of scheme -> proxy server URL mappings.
2296
2297 Returns settings gathered from the environment, if specified,
2298 or the registry.
2299
2300 """
2301 if getproxies_environment():
2302 return proxy_bypass_environment(host)
2303 else:
2304 return proxy_bypass_registry(host)
2305
2306else:
2307 # By default use environment variables
2308 getproxies = getproxies_environment
2309 proxy_bypass = proxy_bypass_environment