blob: 9b0ee8039cfc402fd9211366f4ffa457a7dd52aa [file] [log] [blame]
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001"""HTTP cookie handling for web clients.
2
3This module has (now fairly distant) origins in Gisle Aas' Perl module
4HTTP::Cookies, from the libwww-perl library.
5
6Docstrings, comments and debug strings in this code refer to the
7attributes of the HTTP cookie system as cookie-attributes, to distinguish
8them clearly from Python attributes.
9
Thomas Wouters477c8d52006-05-27 19:21:47 +000010Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
11distributed with the Python standard library, but are available from
12http://wwwsearch.sf.net/):
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000013
14 CookieJar____
15 / \ \
16 FileCookieJar \ \
17 / | \ \ \
18 MozillaCookieJar | LWPCookieJar \ \
19 | | \
20 | ---MSIEBase | \
21 | / | | \
22 | / MSIEDBCookieJar BSDDBCookieJar
23 |/
24 MSIECookieJar
25
26"""
27
Thomas Wouters477c8d52006-05-27 19:21:47 +000028__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
29 'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
30
Jeremy Hylton1afc1692008-06-18 20:49:58 +000031import copy
32import re
33import time
34import urllib.parse, urllib.request
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000035try:
36 import threading as _threading
37except ImportError:
38 import dummy_threading as _threading
Georg Brandl24420152008-05-26 16:32:26 +000039import http.client # only for the default HTTP port
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000040from calendar import timegm
41
# Flip to True to send this module's diagnostics through the logging module.
debug = False
# Created lazily by _debug() the first time something is actually logged.
logger = None

def _debug(*args):
    """Pass *args* to the module logger's debug() when debugging is enabled.

    While the module-level ``debug`` flag is false this does nothing and
    returns None.  Otherwise the "http.cookiejar" logger is fetched on first
    use, cached in the module-level ``logger``, and its debug() result is
    returned.
    """
    global logger
    if debug:
        if not logger:
            import logging
            logger = logging.getLogger("http.cookiejar")
        return logger.debug(*args)
    return None
53
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000054
# The standard HTTP port rendered as text, so it can be used wherever a
# port is handled as a string.
DEFAULT_HTTP_PORT = "%d" % http.client.HTTP_PORT
# Error text for operations that need a filename when none is available.
MISSING_FILENAME_TEXT = (
    "a filename was not supplied "
    "(nor was the CookieJar instance initialised with one)"
)
58
def _warn_unhandled_exception():
    """Emit a warning containing the traceback of the current exception.

    Meant to be called from the module's catch-all ``except`` clauses, which
    exist to survive input that is bad in unexpected ways; the warning makes
    sure such swallowed exceptions are still visible.
    """
    import io, warnings, traceback
    report = traceback.format_exc()
    # stacklevel=2 attributes the warning to whoever called this helper.
    warnings.warn("http.cookiejar bug!\n%s" % report, stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000068
69
70# Date/time conversion
71# -----------------------------------------------------------------------------
72
EPOCH_YEAR = 1970
def _timegm(tt):
    """Return seconds since the epoch for UTC time tuple *tt*, or None.

    Only the first six fields (year, month, day, hour, minute, second) are
    range-checked; any field outside the accepted range, or a year before
    EPOCH_YEAR, gives None instead of a timestamp.
    """
    year, month, mday, hour, minute, second = tt[:6]
    if year < EPOCH_YEAR:
        return None
    if not (1 <= month <= 12 and 1 <= mday <= 31):
        return None
    # Hour 24 and seconds up to 61 are deliberately tolerated.
    if not (0 <= hour <= 24 and 0 <= minute <= 59 and 0 <= second <= 61):
        return None
    return timegm(tt)
81
# English abbreviations used in HTTP/Netscape dates.  DAYS is indexed by
# time.struct_time.tm_wday (Monday == 0), MONTHS by month number - 1.
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copy of MONTHS for case-insensitive month-name lookup.
MONTHS_LOWER = [name.lower() for name in MONTHS]

def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec = time.gmtime(t)[:6]
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        year, mon, mday, hour, minute, sec)

def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec, wday = time.gmtime(t)[:7]
    # The comma after the weekday is part of the documented format; it was
    # previously missing from the format string.
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[wday], mday, MONTHS[mon-1], year, hour, minute, sec)
120
121
# Zone names treated as UTC; only the keys are consulted.
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric zone: optional sign, one or two hour digits, optional minutes
# (with or without a colon), e.g. "-0800", "+01:00", "5".
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)
def offset_from_tz_string(tz):
    """Return the UTC offset in seconds named by *tz*, or None if unknown.

    Names listed in UTC_ZONES give 0.  Anything else must be a numeric
    offset matching TIMEZONE_RE; a leading '-' makes the result negative.
    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if not match:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds += 60 * int(minutes)
    if sign == '-':
        seconds = -seconds
    return seconds
138
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date fields to seconds since the epoch, or None.

    *mon* may be an English month name (any case) or a month number; the
    clock fields may be None, meaning zero; *tz* may be None, meaning UTC.
    None is returned for an unknown month, a time that _timegm rejects, or
    a timezone string that offset_from_tz_string does not understand.
    """
    # Month: try the name first, then fall back to a number in 1..12.
    try:
        mon = MONTHS_LOWER.index(mon.lower()) + 1
    except ValueError:
        try:
            month_number = int(mon)
        except ValueError:
            return None
        if not 1 <= month_number <= 12:
            return None
        mon = month_number

    # Missing clock elements count as zero.
    hr = 0 if hr is None else hr
    min = 0 if min is None else min
    sec = 0 if sec is None else sec

    yr, day, hr, min, sec = int(yr), int(day), int(hr), int(min), int(sec)

    if yr < 1000:
        # Abbreviated year: place it in the current century, then shift by
        # a century if that lands more than 50 years from the current year.
        this_year = time.localtime(time.time())[0]
        year_in_century = this_year % 100
        gap = year_in_century - yr
        yr += this_year - year_in_century
        if gap > 50:
            yr += 100
        elif gap < -50:
            yr -= 100

    # Seconds since the epoch as if the fields were UTC (no zone applied yet).
    t = _timegm((yr, mon, day, hr, min, sec, tz))
    if t is None:
        return None

    # Apply the zone offset to obtain absolute time since the epoch.
    zone = "UTC" if tz is None else tz
    offset = offset_from_tz_string(zone.upper())
    if offset is None:
        return None
    return t - offset
191
# Exactly the RFC 1123 form, e.g. "Wed, 09 Feb 1994 22:23:32 GMT".
# Both fragments are raw strings so "\d" is never read as a string escape.
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
# Leading weekday name (abbreviated or spelled out) plus optional comma.
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X | re.ASCII)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # The strict pattern only checks the *shape* of the month
            # ("Jxx" passes); an unknown name is an unrecognized date, not
            # an error for the caller to handle.
            return None
        # Seconds are converted with int() so that this path returns an
        # integer, as documented and as the loose path below does.
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, minute, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, minute, sec, tz)
269
# Raw string so that "\d", "\s", "\/" and "\." reach the regex engine
# unchanged instead of being treated as (invalid) string escapes.
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X | re.ASCII)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    Fractional seconds are accepted and truncated to whole seconds.

    """
    # clean up
    text = text.lstrip()

    # loose regexp parse
    m = ISO_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    yr, mon, day, hr, minute, sec, tz, _ = m.groups()

    if sec is not None:
        # The pattern admits a fractional part ("29.5"), but _str2time
        # converts seconds with int(), which would raise ValueError on it.
        # Keep only the whole seconds.
        sec = sec.partition(".")[0]

    return _str2time(day, mon, yr, hr, minute, sec, tz)
314
315
316# Header parsing
317# -----------------------------------------------------------------------------
318
def unmatched(match):
    """Return unmatched part of re.Match object."""
    start, end = match.span(0)
    return match.string[:start]+match.string[end:]

HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
# Leading separators that carry no information ("=", whitespace, ";").
# Compiled once from a raw string rather than rebuilt from a non-raw
# literal each time junk is skipped.
HEADER_JUNK_RE = re.compile(r"^[=\s;]*")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    # A bare string would be iterated character by character; callers must
    # pass a sequence of header value strings.
    assert not isinstance(header_values, str)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    # undo backslash escapes inside the quoted string
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk = HEADER_JUNK_RE.sub("", text, 1)
                # Every pass must consume something or the loop would never
                # end.  Compare lengths: a substitution count is 1 even when
                # the pattern matched the empty string.
                assert len(non_junk) < len(text), (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result
412
# Characters that need a backslash in front inside a quoted string.
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Build one header value from a list of lists of (key, value) pairs.

    This is (almost) the inverse of split_header_words.  Pairs within an
    inner list are joined with "; " and the inner lists with ", ".  A pair
    whose value is None contributes just its key; any value that is not a
    plain run of word characters is backslash-escaped and double-quoted.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    joined = []
    for pairs in lists:
        words = []
        for key, value in pairs:
            if value is None:
                words.append(key)
                continue
            if re.search(r"^\w+$", value) is None:
                # escape " and \ before wrapping the value in quotes
                value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
            words.append("%s=%s" % (key, value))
        if words:
            joined.append("; ".join(words))
    return ", ".join(joined)
438
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    Each header yields one list of (name, value) pairs.  The first pair is
    the cookie's own name/value and is left untouched; later names that are
    known cookie-attributes are lowercased.  A parameter without "=" gets the
    value None.  The value of "expires" is converted to seconds since the
    epoch (None if the date is invalid or absent).  A ("version", "0") pair
    is appended when no "version" attribute was seen.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, maxsplit=1)
                k = k.lstrip()
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    version_set = True
                if k == "expires" and v is not None:
                    # convert expires date to seconds since epoch; a bare
                    # "expires" with no value stays None instead of
                    # crashing on None.startswith()
                    if v.startswith('"'): v = v[1:]
                    if v.endswith('"'): v = v[:-1]
                    v = http2time(v)  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
490
491
# A string ending in ".<digits>" is taken to be an IPv4 address.
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        return False
    if text == "":
        return False
    if text[0] == "." or text[-1] == ".":
        return False
    return True

def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # "A has the form NB" means B is a *suffix* of A.  Merely finding B
    # somewhere inside A is not enough: "foo.y.com.bar" must not match
    # ".y.com".  N is non-empty here because A != B was established above.
    if not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
546
def liberal_is_HDN(text):
    """Return True unless text looks like an IPv4 address.

    A deliberately loose host-domain-name test, used when accepting or
    blocking domains: anything IPV4_RE does not match counts as a name.

    """
    return not IPV4_RE.search(text)
556
def user_domain_match(A, B):
    """Return True if host A matches the user-supplied domain pattern B.

    Used for blocking/accepting domains.  A and B may be host domain names
    or IP addresses and are compared case-insensitively.  If either looks
    like an IP address, only exact equality matches.  Otherwise a pattern
    with a leading dot matches any A ending with it, and a pattern without
    one matches only an identical A.

    """
    host, pattern = A.lower(), B.lower()
    if not liberal_is_HDN(host) or not liberal_is_HDN(pattern):
        # at least one side is an IP address: exact match only
        return host == pattern
    if pattern.startswith("."):
        return host.endswith(pattern)
    return host == pattern
576
# Trailing ":<digits>" port suffix of a host string.
cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the network-location part of the request's full
    URL, or from its "Host" header when the URL has none.  Any port suffix
    is removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urllib.parse.urlparse(request.get_full_url())[1]
    if netloc == "":
        netloc = request.get_header("Host", "")
    # remove port, if present
    return cut_port_re.sub("", netloc, 1).lower()
593
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective name
    is the request-host itself, with ".local" appended when the host has no
    dot in it and does not look like an IPv4 address.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
604
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965.

    The path of the request's full URL is passed through escape_path, and a
    leading "/" is supplied when it is missing.
    """
    escaped = escape_path(urllib.parse.urlsplit(request.get_full_url()).path)
    if escaped.startswith("/"):
        return escaped
    # fix bad RFC 2396 absoluteURI
    return "/" + escaped
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000614
def request_port(request):
    """Return the port of the request's host, as a string.

    The port is whatever follows the first ":" in request.get_host().  If
    there is no ":", DEFAULT_HTTP_PORT is returned.  If the text after the
    colon is not an integer, the problem is logged and None is returned.
    """
    _, colon, port = request.get_host().partition(':')
    if not colon:
        return DEFAULT_HTTP_PORT
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
628
# Characters, beyond A-Z, a-z, 0-9, '_', '.' and '-', that may appear
# unescaped in a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# A %-escape: a percent sign followed by two hex digits.
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the %-escape matched by ESCAPED_CHAR_RE with upper-case hex."""
    return "%" + match.group(1).upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # The character encoding behind existing %-escapes in a URL is
    # unknowable, but one has to be chosen for escaping the invalid
    # characters; UTF-8 is used, as recommended by the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = urllib.parse.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
648
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split at the first dot: everything after it is B (A is what precedes).
    _, dot, b = h.partition(".")
    if dot and is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
683
def is_third_party(request):
    """Return True if the request goes to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(request_host(request), origin_reach)
699
700
class Cookie:
    """HTTP Cookie.

    One class covers both Netscape and RFC 2965 cookies.

    It is intentionally little more than a holder of attributes, and nothing
    stops you building instances that violate the cookie standards.
    CookieJar.make_cookies is the factory for Cookie objects: it parses
    cookies, fills in defaults and normalises values to the representation
    used here.  CookiePolicy decides whether a cookie should be accepted
    from, and returned to, the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie's attributes.

        version and expires are converted with int() unless None; domain is
        lowercased; rest (the non-standard cookie-attributes) is
        shallow-copied.  Raises ValueError if port is None while
        port_specified is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port_specified is True and port is None:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value

        self.port = port
        self.port_specified = port_specified

        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Whether the domain cookie-attribute began with a dot has to be
        # remembered in order to follow RFC 2965 (as clarified in draft
        # errata); it is needed for the returned $Domain value.
        self.domain_initial_dot = domain_initial_dot

        self.path = path
        self.path_specified = path_specified

        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        # private copy, so later changes do not touch the caller's mapping
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return True if the non-standard cookie-attribute name is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard cookie-attribute name, or default."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the non-standard cookie-attribute name to value."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        now defaults to the current time; a cookie whose expires is None
        never counts as expired.
        """
        if now is None:
            now = time.time()
        return (self.expires is not None) and (self.expires <= now)

    def __str__(self):
        port_part = "" if self.port is None else ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        names = ("version", "name", "value",
                 "port", "port_specified",
                 "domain", "domain_specified", "domain_initial_dot",
                 "path", "path_specified",
                 "secure", "expires", "discard", "comment", "comment_url",
                 )
        args = ["%s=%r" % (name, getattr(self, name)) for name in names]
        args.append("rest=%r" % (self._rest,))
        args.append("rfc2109=%r" % (self.rfc2109,))
        return "Cookie(%s)" % ", ".join(args)
797
798
class CookiePolicy:
    """Interface deciding which cookies are accepted from and sent to servers.

    A policy may also modify cookies, though this is probably a bad idea.

    The standard rules for Netscape and RFC 2965 cookies live in the
    subclass DefaultCookiePolicy; override that one if you want a
    customised policy.

    """
    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Pre-expired cookies currently never reach this method: the CookieJar
        class deletes them itself.

        Subclasses must override this; the base version raises
        NotImplementedError.

        """
        raise NotImplementedError

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        Subclasses must override this; the base version raises
        NotImplementedError.
        """
        raise NotImplementedError

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        The base implementation always returns True.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        The base implementation always returns True.
        """
        return True
830
831
832class DefaultCookiePolicy(CookiePolicy):
833 """Implements the standard rules for accepting and returning cookies."""
834
835 DomainStrictNoDots = 1
836 DomainStrictNonDomain = 2
837 DomainRFC2965Match = 4
838
839 DomainLiberal = 0
840 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
841
def __init__(self,
             blocked_domains=None, allowed_domains=None,
             netscape=True, rfc2965=False,
             rfc2109_as_netscape=None,
             hide_cookie2=False,
             strict_domain=False,
             strict_rfc2965_unverifiable=True,
             strict_ns_unverifiable=False,
             strict_ns_domain=DomainLiberal,
             strict_ns_set_initial_dollar=False,
             strict_ns_set_path=False,
             ):
    """Constructor arguments should be passed as keyword arguments only.

    Every policy switch is stored on the instance under its own name.
    blocked_domains is kept as a tuple (empty when None is given);
    allowed_domains is kept as a tuple, or as None when None is given.
    """
    # which protocols are enabled, and how they are presented
    self.netscape = netscape
    self.rfc2965 = rfc2965
    self.rfc2109_as_netscape = rfc2109_as_netscape
    self.hide_cookie2 = hide_cookie2

    # strictness switches
    self.strict_domain = strict_domain
    self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
    self.strict_ns_unverifiable = strict_ns_unverifiable
    self.strict_ns_domain = strict_ns_domain
    self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
    self.strict_ns_set_path = strict_ns_set_path

    # domain lists are frozen into tuples
    if blocked_domains is None:
        self._blocked_domains = ()
    else:
        self._blocked_domains = tuple(blocked_domains)

    if allowed_domains is None:
        self._allowed_domains = None
    else:
        self._allowed_domains = tuple(allowed_domains)
874
def blocked_domains(self):
    """Return the blocked domains currently stored, as a tuple."""
    domains = self._blocked_domains
    return domains
def set_blocked_domains(self, blocked_domains):
    """Replace the blocked domains with a tuple built from blocked_domains."""
    frozen = tuple(blocked_domains)
    self._blocked_domains = frozen
881
def is_blocked(self, domain):
    """Return True if domain matches any of the blocked domains.

    Matching is done with user_domain_match.
    """
    return any(user_domain_match(domain, blocked)
               for blocked in self._blocked_domains)
887
def allowed_domains(self):
    """Return the allowed domains as a tuple, or None if none were set."""
    domains = self._allowed_domains
    return domains
def set_allowed_domains(self, allowed_domains):
    """Set the sequence of allowed domains, or None.

    A sequence is stored as a tuple; None is stored as None.
    """
    if allowed_domains is None:
        self._allowed_domains = None
    else:
        self._allowed_domains = tuple(allowed_domains)
896
def is_not_allowed(self, domain):
    """Return True if an allow-list is set and domain matches none of it.

    With no allow-list (None) every domain is allowed, so the result is
    False.  Matching is done with user_domain_match.
    """
    if self._allowed_domains is None:
        return False
    return not any(user_domain_match(domain, allowed)
                   for allowed in self._allowed_domains)
904
def set_ok(self, cookie, request):
    """Return True if every set_ok_* check accepts the cookie.

    The checks run in the order version, verifiability, name, path, domain,
    port, stopping at the first one that rejects.

    If you override .set_ok(), be sure to call this method.  If it returns
    false, so should your subclass (assuming your subclass wants to be more
    strict about which cookies to accept).

    """
    _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    assert cookie.name is not None

    checks = ("version", "verifiability", "name", "path", "domain", "port")
    return all(getattr(self, "set_ok_" + check)(cookie, request)
               for check in checks)
923
def set_ok_version(self, cookie, request):
    """Return True if the cookie's protocol version is enabled.

    A cookie without a version is rejected; versions above 0 require
    self.rfc2965 and version 0 requires self.netscape.
    """
    version = cookie.version
    if version is None:
        # Version is always set to 0 by parse_ns_headers if it's a Netscape
        # cookie, so this must be an invalid RFC 2965 cookie.
        _debug(" Set-Cookie2 without version attribute (%s=%s)",
               cookie.name, cookie.value)
        return False
    if version > 0 and not self.rfc2965:
        _debug(" RFC 2965 cookies are switched off")
        return False
    if version == 0 and not self.netscape:
        _debug(" Netscape cookies are switched off")
        return False
    return True
938
def set_ok_verifiability(self, cookie, request):
    """Return False for third-party cookies the strictness flags forbid.

    Only applies when the request is unverifiable and is_third_party()
    says it is a third-party request; otherwise the cookie passes.
    """
    if not (request.is_unverifiable() and is_third_party(request)):
        return True
    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        _debug(" third-party RFC 2965 cookie during "
               "unverifiable transaction")
        return False
    if cookie.version == 0 and self.strict_ns_unverifiable:
        _debug(" third-party Netscape cookie during "
               "unverifiable transaction")
        return False
    return True
950
def set_ok_name(self, cookie, request):
    """Return False for a version-0 cookie whose name starts with '$'.

    The check is only active when self.strict_ns_set_initial_dollar is
    true.
    """
    # Try and stop servers setting V0 cookies designed to hack other
    # servers that know both V0 and V1 protocols.
    suspicious = (cookie.version == 0 and
                  self.strict_ns_set_initial_dollar and
                  cookie.name.startswith("$"))
    if suspicious:
        _debug(" illegal name (starts with '$'): '%s'", cookie.name)
        return False
    return True
959
def set_ok_path(self, cookie, request):
    """Return False if an explicit cookie path is not a prefix of the request path.

    The prefix rule is enforced for cookies with version > 0, and for
    version-0 cookies only when self.strict_ns_set_path is true.  Cookies
    without an explicitly specified path always pass.
    """
    if not cookie.path_specified:
        return True
    req_path = request_path(request)
    enforce = (cookie.version > 0 or
               (cookie.version == 0 and self.strict_ns_set_path))
    if enforce and not req_path.startswith(cookie.path):
        _debug(" path attribute %s is not a prefix of request "
               "path %s", cookie.path, req_path)
        return False
    return True
970
def set_ok_domain(self, cookie, request):
    """Return True if the cookie's domain is acceptable for *request*.

    The domain is first checked against the user block-list and allow-list.
    If the cookie carried an explicit domain attribute, further checks are
    applied in order:

    * with self.strict_domain, reject two-component domains that look like
      country-code second-level domains (e.g. ".co.uk");
    * reject domains with no embedded dot other than ".local";
    * for version-0 cookies, the effective request-host must end with the
      domain (also trying with a dot prepended to the host);
    * for version > 0, or when DomainRFC2965Match is set in
      self.strict_ns_domain, the host must domain_match() the domain;
    * for version > 0, or when DomainStrictNoDots is set, the part of the
      request host before the domain must not contain a dot (unless the
      host matches IPV4_RE).
    """
    if self.is_blocked(cookie.domain):
        _debug(" domain %s is in user block-list", cookie.domain)
        return False
    if self.is_not_allowed(cookie.domain):
        _debug(" domain %s is not in user allow-list", cookie.domain)
        return False
    if cookie.domain_specified:
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain
        if self.strict_domain and (domain.count(".") >= 2):
            # XXX This should probably be compared with the Konqueror
            # (kcookiejar.cpp) and Mozilla implementations, but it's a
            # losing battle.
            i = domain.rfind(".")
            j = domain.rfind(".", 0, i)
            if j == 0:  # domain like .foo.bar
                tld = domain[i+1:]
                sld = domain[j+1:i]
                if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                   "gov", "mil", "int", "aero", "biz", "cat", "coop",
                   "info", "jobs", "mobi", "museum", "name", "pro",
                   "travel", "eu") and len(tld) == 2:
                    # domain like .co.uk
                    _debug(" country-code second level domain %s", domain)
                    return False
        if domain.startswith("."):
            undotted_domain = domain[1:]
        else:
            undotted_domain = domain
        embedded_dots = (undotted_domain.find(".") >= 0)
        if not embedded_dots and domain != ".local":
            _debug(" non-local domain %s contains no embedded dot",
                   domain)
            return False
        if cookie.version == 0:
            if (not erhn.endswith(domain) and
                (not erhn.startswith(".") and
                 not ("."+erhn).endswith(domain))):
                # Message previously read "does not end end with".
                _debug(" effective request-host %s (even with added "
                       "initial dot) does not end with %s",
                       erhn, domain)
                return False
        if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainRFC2965Match)):
            if not domain_match(erhn, domain):
                _debug(" effective request-host %s does not domain-match "
                       "%s", erhn, domain)
                return False
        if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainStrictNoDots)):
            # Whatever precedes the domain in the request host.
            host_prefix = req_host[:-len(domain)]
            if (host_prefix.find(".") >= 0 and
                not IPV4_RE.search(req_host)):
                _debug(" host prefix %s for domain %s contains a dot",
                       host_prefix, domain)
                return False
    return True
1029
def set_ok_port(self, cookie, request):
    """Return True if the request port is listed in an explicit cookie port.

    Cookies without an explicitly specified port always pass.  Otherwise
    each comma-separated entry of cookie.port must be numeric, and one of
    them must equal the request port ("80" when request_port() gives None).
    """
    if not cookie.port_specified:
        return True
    req_port = request_port(request)
    req_port = "80" if req_port is None else str(req_port)
    for candidate in cookie.port.split(","):
        try:
            int(candidate)
        except ValueError:
            _debug(" bad port %s (not numeric)", candidate)
            return False
        if candidate == req_port:
            return True
    _debug(" request port (%s) not found in %s",
           req_port, cookie.port)
    return False
1050
def return_ok(self, cookie, request):
    """Return True if *cookie* may be sent back with *request*.

    Runs the return_ok_version, return_ok_verifiability, return_ok_secure,
    return_ok_expires, return_ok_port and return_ok_domain checks in that
    order and stops at the first one that fails.

    If you override .return_ok(), be sure to call this method.  If it
    returns false, so should your subclass (assuming your subclass wants to
    be more strict about which cookies to return).

    """
    # Path has already been checked by .path_return_ok(), and domain
    # blocking done by .domain_return_ok().
    _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    checks = ("version", "verifiability", "secure", "expires", "port",
              "domain")
    for suffix in checks:
        # Looked up via getattr so subclasses can override single checks.
        check = getattr(self, "return_ok_" + suffix)
        if not check(cookie, request):
            return False
    return True
1068
def return_ok_version(self, cookie, request):
    """Return True if the cookie's protocol version is enabled.

    Versions above 0 require self.rfc2965; version 0 requires
    self.netscape.
    """
    version = cookie.version
    if version > 0 and not self.rfc2965:
        _debug(" RFC 2965 cookies are switched off")
        return False
    if version == 0 and not self.netscape:
        _debug(" Netscape cookies are switched off")
        return False
    return True
1077
def return_ok_verifiability(self, cookie, request):
    """Return False for third-party cookies the strictness flags forbid.

    Only applies when the request is unverifiable and is_third_party()
    says it is a third-party request; otherwise the cookie passes.
    """
    if not (request.is_unverifiable() and is_third_party(request)):
        return True
    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        _debug(" third-party RFC 2965 cookie during unverifiable "
               "transaction")
        return False
    if cookie.version == 0 and self.strict_ns_unverifiable:
        _debug(" third-party Netscape cookie during unverifiable "
               "transaction")
        return False
    return True
1089
def return_ok_secure(self, cookie, request):
    """Return False if a secure cookie would go out on a non-https request."""
    if not cookie.secure:
        return True
    if request.get_type() == "https":
        return True
    _debug(" secure cookie with non-secure request")
    return False
1095
def return_ok_expires(self, cookie, request):
    """Return False if the cookie has expired as of self._now."""
    expired = cookie.is_expired(self._now)
    if not expired:
        return True
    _debug(" cookie expired")
    return False
1101
def return_ok_port(self, cookie, request):
    """Return True if the request port is among the cookie's ports.

    A cookie with no port set always passes.  The request port falls back
    to "80" when request_port() gives None.
    """
    if not cookie.port:
        return True
    req_port = request_port(request)
    if req_port is None:
        req_port = "80"
    if req_port in cookie.port.split(","):
        return True
    _debug(" request port %s does not match cookie port %s",
           req_port, cookie.port)
    return False
1115
def return_ok_domain(self, cookie, request):
    """Return True if the cookie's domain matches the request host.

    Checks, in order:

    * for version-0 cookies without an explicit domain, when
      DomainStrictNonDomain is set in self.strict_ns_domain, the cookie
      domain must string-compare equal to the effective request-host;
    * cookies with version > 0 must domain_match() the effective
      request-host;
    * version-0 cookies must be a suffix of the (dot-prefixed) effective
      request-host.

    The suffix comparison for version-0 cookies is done against a
    dot-prefixed copy of the cookie domain, so that a cookie stored for
    "example.com" is not returned to "evilexample.com".
    """
    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    # Force the suffix test below to match on a label boundary.
    if domain and not domain.startswith("."):
        dotdomain = "." + domain
    else:
        dotdomain = domain

    # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
    if (cookie.version == 0 and
        (self.strict_ns_domain & self.DomainStrictNonDomain) and
        not cookie.domain_specified and domain != erhn):
        _debug(" cookie with unspecified domain does not string-compare "
               "equal to request domain")
        return False

    if cookie.version > 0 and not domain_match(erhn, domain):
        _debug(" effective request-host name %s does not domain-match "
               "RFC 2965 cookie domain %s", erhn, domain)
        return False
    if cookie.version == 0 and not ("."+erhn).endswith(dotdomain):
        _debug(" request-host %s does not match Netscape cookie domain "
               "%s", req_host, domain)
        return False
    return True
1137
def domain_return_ok(self, domain, request):
    """Return False if no cookie stored under *domain* should be returned.

    This is a liberal pre-check, here as an optimization to avoid having
    to load lots of MSIE cookie files unless necessary: the request host
    (or effective request host) must end with *domain*, and *domain* must
    pass the user block-list and allow-list.

    The suffix comparison is done with a leading dot on both sides, so it
    can only succeed on a label boundary: cookies stored for
    "example.com" are not offered to "evilexample.com".
    """
    req_host, erhn = eff_request_host(request)
    if not req_host.startswith("."):
        req_host = "."+req_host
    if not erhn.startswith("."):
        erhn = "."+erhn
    # Host-only cookies are stored without a leading dot; add one for the
    # comparison so that a bare suffix cannot match mid-label.
    if domain and not domain.startswith("."):
        dotdomain = "." + domain
    else:
        dotdomain = domain
    if not (req_host.endswith(dotdomain) or erhn.endswith(dotdomain)):
        #_debug(" request domain %s does not match cookie domain %s",
        #       req_host, domain)
        return False

    if self.is_blocked(domain):
        _debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        _debug(" domain %s is not in user allow-list", domain)
        return False

    return True
1159
def path_return_ok(self, path, request):
    """Return True if cookies stored under *path* apply to *request*.

    The request path must either equal *path*, or start with *path* at a
    "/" boundary (*path* itself ends with "/", or the next character of
    the request path is "/").  A bare string-prefix test is not enough:
    it would let cookies for "/foo" be sent to "/foobar".
    """
    _debug("- checking cookie path=%s", path)
    req_path = request_path(request)
    pathlen = len(path)
    if req_path == path:
        return True
    if (req_path.startswith(path) and
            (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")):
        return True

    _debug(" %s does not path-match %s", req_path, path)
    return False
1167
1168
def vals_sorted_by_key(adict):
    """Return an iterator over the values of *adict*, ordered by sorted key.

    The keys are sorted immediately; each value is looked up (with
    adict.get) only when the iterator reaches it.
    """
    return map(adict.get, sorted(adict.keys()))
1172
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    Any value that has an ``items`` attribute is treated as a nested mapping
    and descended into; every other value is yielded as a leaf.
    """
    for key in sorted(mapping.keys()):
        # Looked up lazily, one key at a time, as the iteration advances.
        node = mapping.get(key)
        if hasattr(node, "items"):
            for leaf in deepvalues(node):
                yield leaf
        else:
            yield node
1188
1189
# Sentinel passed as the second argument to dict.get(), so that a missing
# key can be told apart from a key whose value is None.
class Absent:
    pass
1193
1194class CookieJar:
1195 """Collection of HTTP cookies.
1196
1197 You may not need to know about this class: try
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001198 urllib.request.build_opener(HTTPCookieProcessor).open(url).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001199 """
1200
1201 non_word_re = re.compile(r"\W")
1202 quote_re = re.compile(r"([\"\\])")
1203 strict_domain_re = re.compile(r"\.?[^.]*")
1204 domain_re = re.compile(r"[^.]*")
1205 dots_re = re.compile(r"^\.+")
1206
Antoine Pitroufd036452008-08-19 17:56:33 +00001207 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001208
1209 def __init__(self, policy=None):
1210 if policy is None:
1211 policy = DefaultCookiePolicy()
1212 self._policy = policy
1213
1214 self._cookies_lock = _threading.RLock()
1215 self._cookies = {}
1216
1217 def set_policy(self, policy):
1218 self._policy = policy
1219
1220 def _cookies_for_domain(self, domain, request):
1221 cookies = []
1222 if not self._policy.domain_return_ok(domain, request):
1223 return []
Thomas Wouters477c8d52006-05-27 19:21:47 +00001224 _debug("Checking %s for cookies to return", domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001225 cookies_by_path = self._cookies[domain]
1226 for path in cookies_by_path.keys():
1227 if not self._policy.path_return_ok(path, request):
1228 continue
1229 cookies_by_name = cookies_by_path[path]
1230 for cookie in cookies_by_name.values():
1231 if not self._policy.return_ok(cookie, request):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001232 _debug(" not returning cookie")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001233 continue
Thomas Wouters477c8d52006-05-27 19:21:47 +00001234 _debug(" it's a match")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001235 cookies.append(cookie)
1236 return cookies
1237
1238 def _cookies_for_request(self, request):
1239 """Return a list of cookies to be returned to server."""
1240 cookies = []
1241 for domain in self._cookies.keys():
1242 cookies.extend(self._cookies_for_domain(domain, request))
1243 return cookies
1244
1245 def _cookie_attrs(self, cookies):
1246 """Return a list of cookie-attributes to be returned to server.
1247
1248 like ['foo="bar"; $Path="/"', ...]
1249
1250 The $Version attribute is also added when appropriate (currently only
1251 once per request).
1252
1253 """
1254 # add cookies in order of most specific (ie. longest) path first
Raymond Hettinger70b64fc2008-01-30 20:15:17 +00001255 cookies.sort(key=lambda a: len(a.path), reverse=True)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001256
1257 version_set = False
1258
1259 attrs = []
1260 for cookie in cookies:
1261 # set version of Cookie header
1262 # XXX
1263 # What should it be if multiple matching Set-Cookie headers have
1264 # different versions themselves?
1265 # Answer: there is no answer; was supposed to be settled by
1266 # RFC 2965 errata, but that may never appear...
1267 version = cookie.version
1268 if not version_set:
1269 version_set = True
1270 if version > 0:
1271 attrs.append("$Version=%s" % version)
1272
1273 # quote cookie value if necessary
1274 # (not for Netscape protocol, which already has any quotes
1275 # intact, due to the poorly-specified Netscape Cookie: syntax)
1276 if ((cookie.value is not None) and
1277 self.non_word_re.search(cookie.value) and version > 0):
1278 value = self.quote_re.sub(r"\\\1", cookie.value)
1279 else:
1280 value = cookie.value
1281
1282 # add cookie-attributes to be returned in Cookie header
1283 if cookie.value is None:
1284 attrs.append(cookie.name)
1285 else:
1286 attrs.append("%s=%s" % (cookie.name, value))
1287 if version > 0:
1288 if cookie.path_specified:
1289 attrs.append('$Path="%s"' % cookie.path)
1290 if cookie.domain.startswith("."):
1291 domain = cookie.domain
1292 if (not cookie.domain_initial_dot and
1293 domain.startswith(".")):
1294 domain = domain[1:]
1295 attrs.append('$Domain="%s"' % domain)
1296 if cookie.port is not None:
1297 p = "$Port"
1298 if cookie.port_specified:
1299 p = p + ('="%s"' % cookie.port)
1300 attrs.append(p)
1301
1302 return attrs
1303
1304 def add_cookie_header(self, request):
Georg Brandl029986a2008-06-23 11:44:14 +00001305 """Add correct Cookie: header to request (urllib.request.Request object).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001306
1307 The Cookie2 header is also added unless policy.hide_cookie2 is true.
1308
1309 """
Thomas Wouters477c8d52006-05-27 19:21:47 +00001310 _debug("add_cookie_header")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001311 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001312 try:
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001313
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001314 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001315
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001316 cookies = self._cookies_for_request(request)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001317
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001318 attrs = self._cookie_attrs(cookies)
1319 if attrs:
1320 if not request.has_header("Cookie"):
1321 request.add_unredirected_header(
1322 "Cookie", "; ".join(attrs))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001323
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001324 # if necessary, advertise that we know RFC 2965
1325 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1326 not request.has_header("Cookie2")):
1327 for cookie in cookies:
1328 if cookie.version != 1:
1329 request.add_unredirected_header("Cookie2", '$Version="1"')
1330 break
1331
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001332 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001333 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001334
1335 self.clear_expired_cookies()
1336
1337 def _normalized_cookie_tuples(self, attrs_set):
1338 """Return list of tuples containing normalised cookie information.
1339
1340 attrs_set is the list of lists of key,value pairs extracted from
1341 the Set-Cookie or Set-Cookie2 headers.
1342
1343 Tuples are name, value, standard, rest, where name and value are the
1344 cookie name and value, standard is a dictionary containing the standard
1345 cookie-attributes (discard, secure, version, expires or max-age,
1346 domain, path and port) and rest is a dictionary containing the rest of
1347 the cookie-attributes.
1348
1349 """
1350 cookie_tuples = []
1351
1352 boolean_attrs = "discard", "secure"
1353 value_attrs = ("version",
1354 "expires", "max-age",
1355 "domain", "path", "port",
1356 "comment", "commenturl")
1357
1358 for cookie_attrs in attrs_set:
1359 name, value = cookie_attrs[0]
1360
1361 # Build dictionary of standard cookie-attributes (standard) and
1362 # dictionary of other cookie-attributes (rest).
1363
1364 # Note: expiry time is normalised to seconds since epoch. V0
1365 # cookies should have the Expires cookie-attribute, and V1 cookies
1366 # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1367 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1368 # accept either (but prefer Max-Age).
1369 max_age_set = False
1370
1371 bad_cookie = False
1372
1373 standard = {}
1374 rest = {}
1375 for k, v in cookie_attrs[1:]:
1376 lc = k.lower()
1377 # don't lose case distinction for unknown fields
1378 if lc in value_attrs or lc in boolean_attrs:
1379 k = lc
1380 if k in boolean_attrs and v is None:
1381 # boolean cookie-attribute is present, but has no value
1382 # (like "discard", rather than "port=80")
1383 v = True
1384 if k in standard:
1385 # only first value is significant
1386 continue
1387 if k == "domain":
1388 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001389 _debug(" missing value for domain attribute")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001390 bad_cookie = True
1391 break
1392 # RFC 2965 section 3.3.3
1393 v = v.lower()
1394 if k == "expires":
1395 if max_age_set:
1396 # Prefer max-age to expires (like Mozilla)
1397 continue
1398 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001399 _debug(" missing or invalid value for expires "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001400 "attribute: treating as session cookie")
1401 continue
1402 if k == "max-age":
1403 max_age_set = True
1404 try:
1405 v = int(v)
1406 except ValueError:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001407 _debug(" missing or invalid (non-numeric) value for "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001408 "max-age attribute")
1409 bad_cookie = True
1410 break
1411 # convert RFC 2965 Max-Age to seconds since epoch
1412 # XXX Strictly you're supposed to follow RFC 2616
1413 # age-calculation rules. Remember that zero Max-Age is a
1414 # is a request to discard (old and new) cookie, though.
1415 k = "expires"
1416 v = self._now + v
1417 if (k in value_attrs) or (k in boolean_attrs):
1418 if (v is None and
Raymond Hettingerdbecd932005-02-06 06:57:08 +00001419 k not in ("port", "comment", "commenturl")):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001420 _debug(" missing value for %s attribute" % k)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001421 bad_cookie = True
1422 break
1423 standard[k] = v
1424 else:
1425 rest[k] = v
1426
1427 if bad_cookie:
1428 continue
1429
1430 cookie_tuples.append((name, value, standard, rest))
1431
1432 return cookie_tuples
1433
1434 def _cookie_from_cookie_tuple(self, tup, request):
1435 # standard is dict of standard cookie-attributes, rest is dict of the
1436 # rest of them
1437 name, value, standard, rest = tup
1438
1439 domain = standard.get("domain", Absent)
1440 path = standard.get("path", Absent)
1441 port = standard.get("port", Absent)
1442 expires = standard.get("expires", Absent)
1443
1444 # set the easy defaults
1445 version = standard.get("version", None)
1446 if version is not None: version = int(version)
1447 secure = standard.get("secure", False)
1448 # (discard is also set if expires is Absent)
1449 discard = standard.get("discard", False)
1450 comment = standard.get("comment", None)
1451 comment_url = standard.get("commenturl", None)
1452
1453 # set default path
1454 if path is not Absent and path != "":
1455 path_specified = True
1456 path = escape_path(path)
1457 else:
1458 path_specified = False
1459 path = request_path(request)
1460 i = path.rfind("/")
1461 if i != -1:
1462 if version == 0:
1463 # Netscape spec parts company from reality here
1464 path = path[:i]
1465 else:
1466 path = path[:i+1]
1467 if len(path) == 0: path = "/"
1468
1469 # set default domain
1470 domain_specified = domain is not Absent
1471 # but first we have to remember whether it starts with a dot
1472 domain_initial_dot = False
1473 if domain_specified:
1474 domain_initial_dot = bool(domain.startswith("."))
1475 if domain is Absent:
1476 req_host, erhn = eff_request_host(request)
1477 domain = erhn
1478 elif not domain.startswith("."):
1479 domain = "."+domain
1480
1481 # set default port
1482 port_specified = False
1483 if port is not Absent:
1484 if port is None:
1485 # Port attr present, but has no value: default to request port.
1486 # Cookie should then only be sent back on that port.
1487 port = request_port(request)
1488 else:
1489 port_specified = True
1490 port = re.sub(r"\s+", "", port)
1491 else:
1492 # No port attr present. Cookie can be sent back on any port.
1493 port = None
1494
1495 # set default expires and discard
1496 if expires is Absent:
1497 expires = None
1498 discard = True
1499 elif expires <= self._now:
1500 # Expiry date in past is request to delete cookie. This can't be
1501 # in DefaultCookiePolicy, because can't delete cookies there.
1502 try:
1503 self.clear(domain, path, name)
1504 except KeyError:
1505 pass
Thomas Wouters477c8d52006-05-27 19:21:47 +00001506 _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1507 domain, path, name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001508 return None
1509
1510 return Cookie(version,
1511 name, value,
1512 port, port_specified,
1513 domain, domain_specified, domain_initial_dot,
1514 path, path_specified,
1515 secure,
1516 expires,
1517 discard,
1518 comment,
1519 comment_url,
1520 rest)
1521
1522 def _cookies_from_attrs_set(self, attrs_set, request):
1523 cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1524
1525 cookies = []
1526 for tup in cookie_tuples:
1527 cookie = self._cookie_from_cookie_tuple(tup, request)
1528 if cookie: cookies.append(cookie)
1529 return cookies
1530
Neal Norwitz71dad722005-12-23 21:43:48 +00001531 def _process_rfc2109_cookies(self, cookies):
1532 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1533 if rfc2109_as_ns is None:
1534 rfc2109_as_ns = not self._policy.rfc2965
1535 for cookie in cookies:
1536 if cookie.version == 1:
1537 cookie.rfc2109 = True
Tim Peters536cf992005-12-25 23:18:31 +00001538 if rfc2109_as_ns:
Neal Norwitz71dad722005-12-23 21:43:48 +00001539 # treat 2109 cookies as Netscape cookies rather than
1540 # as RFC2965 cookies
1541 cookie.version = 0
1542
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001543 def make_cookies(self, response, request):
1544 """Return sequence of Cookie objects extracted from response object."""
1545 # get cookie-attributes for RFC 2965 and Netscape protocols
1546 headers = response.info()
Barry Warsaw820c1202008-06-12 04:06:45 +00001547 rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
1548 ns_hdrs = headers.get_all("Set-Cookie", [])
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001549
1550 rfc2965 = self._policy.rfc2965
1551 netscape = self._policy.netscape
1552
1553 if ((not rfc2965_hdrs and not ns_hdrs) or
1554 (not ns_hdrs and not rfc2965) or
1555 (not rfc2965_hdrs and not netscape) or
1556 (not netscape and not rfc2965)):
1557 return [] # no relevant cookie headers: quick exit
1558
1559 try:
1560 cookies = self._cookies_from_attrs_set(
1561 split_header_words(rfc2965_hdrs), request)
Thomas Wouters477c8d52006-05-27 19:21:47 +00001562 except Exception:
1563 _warn_unhandled_exception()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001564 cookies = []
1565
1566 if ns_hdrs and netscape:
1567 try:
Neal Norwitz71dad722005-12-23 21:43:48 +00001568 # RFC 2109 and Netscape cookies
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001569 ns_cookies = self._cookies_from_attrs_set(
1570 parse_ns_headers(ns_hdrs), request)
Thomas Wouters477c8d52006-05-27 19:21:47 +00001571 except Exception:
1572 _warn_unhandled_exception()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001573 ns_cookies = []
Neal Norwitz71dad722005-12-23 21:43:48 +00001574 self._process_rfc2109_cookies(ns_cookies)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001575
1576 # Look for Netscape cookies (from Set-Cookie headers) that match
1577 # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1578 # For each match, keep the RFC 2965 cookie and ignore the Netscape
1579 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are
1580 # bundled in with the Netscape cookies for this purpose, which is
1581 # reasonable behaviour.
1582 if rfc2965:
1583 lookup = {}
1584 for cookie in cookies:
1585 lookup[(cookie.domain, cookie.path, cookie.name)] = None
1586
1587 def no_matching_rfc2965(ns_cookie, lookup=lookup):
1588 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1589 return key not in lookup
1590 ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1591
1592 if ns_cookies:
1593 cookies.extend(ns_cookies)
1594
1595 return cookies
1596
1597 def set_cookie_if_ok(self, cookie, request):
1598 """Set a cookie if policy says it's OK to do so."""
1599 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001600 try:
1601 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001602
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001603 if self._policy.set_ok(cookie, request):
1604 self.set_cookie(cookie)
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001605
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001606
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001607 finally:
1608 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001609
1610 def set_cookie(self, cookie):
1611 """Set a cookie, without checking whether or not it should be set."""
1612 c = self._cookies
1613 self._cookies_lock.acquire()
1614 try:
1615 if cookie.domain not in c: c[cookie.domain] = {}
1616 c2 = c[cookie.domain]
1617 if cookie.path not in c2: c2[cookie.path] = {}
1618 c3 = c2[cookie.path]
1619 c3[cookie.name] = cookie
1620 finally:
1621 self._cookies_lock.release()
1622
1623 def extract_cookies(self, response, request):
1624 """Extract cookies from response, where allowable given the request."""
Thomas Wouters477c8d52006-05-27 19:21:47 +00001625 _debug("extract_cookies: %s", response.info())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001626 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001627 try:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001628 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001629
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001630 for cookie in self.make_cookies(response, request):
1631 if self._policy.set_ok(cookie, request):
1632 _debug(" setting cookie: %s", cookie)
1633 self.set_cookie(cookie)
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001634 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001635 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001636
1637 def clear(self, domain=None, path=None, name=None):
1638 """Clear some cookies.
1639
1640 Invoking this method without arguments will clear all cookies. If
1641 given a single argument, only cookies belonging to that domain will be
1642 removed. If given two arguments, cookies belonging to the specified
1643 path within that domain are removed. If given three arguments, then
1644 the cookie with the specified name, path and domain is removed.
1645
1646 Raises KeyError if no matching cookie exists.
1647
1648 """
1649 if name is not None:
1650 if (domain is None) or (path is None):
1651 raise ValueError(
1652 "domain and path must be given to remove a cookie by name")
1653 del self._cookies[domain][path][name]
1654 elif path is not None:
1655 if domain is None:
1656 raise ValueError(
1657 "domain must be given to remove cookies by path")
1658 del self._cookies[domain][path]
1659 elif domain is not None:
1660 del self._cookies[domain]
1661 else:
1662 self._cookies = {}
1663
1664 def clear_session_cookies(self):
1665 """Discard all session cookies.
1666
1667 Note that the .save() method won't save session cookies anyway, unless
1668 you ask otherwise by passing a true ignore_discard argument.
1669
1670 """
1671 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001672 try:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001673 for cookie in self:
1674 if cookie.discard:
1675 self.clear(cookie.domain, cookie.path, cookie.name)
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001676 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001677 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001678
def clear_expired_cookies(self):
    """Discard all expired cookies.

    A single timestamp is taken up front and every cookie is tested
    against it with cookie.is_expired(now); expired ones are removed via
    clear(domain, path, name).

    You probably don't need to call this method: expired cookies are never
    sent back to the server (provided you're using DefaultCookiePolicy),
    this method is called by CookieJar itself every so often, and the
    .save() method won't save expired cookies anyway (unless you ask
    otherwise by passing a true ignore_expires argument).

    """
    # The lock is used as a context manager so it is released on every
    # exit path, including an exception raised while clearing.
    with self._cookies_lock:
        now = time.time()
        for cookie in self:
            if cookie.is_expired(now):
                self.clear(cookie.domain, cookie.path, cookie.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001697
def __iter__(self):
    """Return an iterator over the jar's contents.

    Iteration is delegated to deepvalues() applied to the internal
    cookie store (presumably yielding the individual Cookie objects from
    the nested domain/path/name mapping -- confirm against deepvalues).

    """
    cookie_store = self._cookies
    return deepvalues(cookie_store)
1700
def __len__(self):
    """Return number of contained cookies."""
    # Count by iterating; the jar keeps no separate running total here.
    return sum(1 for _cookie in self)
1706
def __repr__(self):
    """Return "<ClassName[repr(cookie), ...]>" for the contained cookies.

    The bare class name is used rather than the class object itself;
    formatting the class object would embed "<class '...'>" inside the
    outer angle brackets.

    """
    cookie_reprs = [repr(cookie) for cookie in self]
    return "<%s[%s]>" % (self.__class__.__name__, ", ".join(cookie_reprs))
1711
def __str__(self):
    """Return "<ClassName[str(cookie), ...]>" for the contained cookies.

    The bare class name is used rather than the class object itself;
    formatting the class object would embed "<class '...'>" inside the
    outer angle brackets.

    """
    cookie_strs = [str(cookie) for cookie in self]
    return "<%s[%s]>" % (self.__class__.__name__, ", ".join(cookie_strs))
1716
1717
# derives from IOError for backwards-compatibility with Python 2.4.0
class LoadError(IOError):
    """Raised when a cookie file does not have the expected format."""
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001720
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename: default path used by load()/save()/revert(); must be
            string-like (it is checked by concatenating with "") or None.
        delayload: stored as a bool on self.delayload; not otherwise used
            by this class.
        policy: passed through to CookieJar.__init__.

        Raises ValueError if filename is given and is not string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                filename+""
            except Exception:
                # Not a bare except: KeyboardInterrupt and SystemExit must
                # not be turned into a ValueError.
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented here; subclasses provide the file format.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  The opened file is handed to the subclass's _really_load()
        together with the ignore_discard / ignore_expires flags.

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        # The file is closed on every exit path by the with-statement.
        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with self._cookies_lock:
            # Keep a snapshot so a failed load can be rolled back.
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                self._cookies = old_state
                raise
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001780
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001781
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    The cookie is flattened into a list of (key, value) pairs -- the
    name/value pair first, then path and domain, then the optional and
    flag attributes, then any extra attributes from cookie._rest in sorted
    key order, and finally the version -- and that list is rendered with
    join_header_words().  Flag attributes are emitted with a value of None.

    Actually, the format is extended a bit -- see module docstring.

    """
    pairs = [
        (cookie.name, cookie.value),
        ("path", cookie.path),
        ("domain", cookie.domain),
    ]
    if cookie.port is not None:
        pairs.append(("port", cookie.port))
    if cookie.path_specified:
        pairs.append(("path_spec", None))
    if cookie.port_specified:
        pairs.append(("port_spec", None))
    if cookie.domain_initial_dot:
        pairs.append(("domain_dot", None))
    if cookie.secure:
        pairs.append(("secure", None))
    if cookie.expires:
        expiry_text = time2isoz(float(cookie.expires))
        pairs.append(("expires", expiry_text))
    if cookie.discard:
        pairs.append(("discard", None))
    if cookie.comment:
        pairs.append(("comment", cookie.comment))
    if cookie.comment_url:
        pairs.append(("commenturl", cookie.comment_url))

    # Non-standard attributes, in a stable (sorted) order.
    for extra_key in sorted(cookie._rest.keys()):
        pairs.append((extra_key, str(cookie._rest[extra_key])))

    pairs.append(("version", str(cookie.version)))

    return join_header_words([pairs])
1809
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        Cookies with a true `discard` attribute are skipped unless
        ignore_discard is true; expired cookies are skipped unless
        ignore_expires is true.  The result ends with a trailing newline
        when at least one cookie is emitted, and is "" otherwise.

        ignore_discard and ignore_expires: see docstring for FileCookieJar.save

        """
        now = time.time()
        r = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        return "\n".join(r+[""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the cookies to `filename` (default: self.filename).

        Raises ValueError if no filename is available.

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        # The file is closed on every exit path by the with-statement.
        with open(filename, "w") as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse "Set-Cookie3:" lines from open file `f` into this jar.

        The first line must match self.magic_re, otherwise LoadError is
        raised.  Lines not starting with "Set-Cookie3:" are ignored.
        IOError propagates unchanged; any other exception while parsing is
        reported via _warn_unhandled_exception() and re-raised as LoadError.

        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound before the try block so the error handler below can always
        # format it, even if the very first readline() in the loop fails.
        line = ""
        try:
            while True:
                line = f.readline()
                if line == "":
                    break
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    standard = {}
                    rest = {}
                    for k in boolean_attrs:
                        standard[k] = False
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            # a bare flag (no value) means "true"
                            if v is None:
                                v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    # no (usable) expiry date: treat as a session cookie
                    if expires is None:
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)

        except IOError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1932
1933
class MozillaCookieJar(FileCookieJar):
    """

    WARNING: you may want to backup your browser's cookies file if you use
    this class to save cookies.  I *think* it works, but there have been
    bugs in the past!

    This class differs from CookieJar only in the format it uses to save and
    load cookies to and from a file.  This class uses the Mozilla/Netscape
    `cookies.txt' format.  lynx uses this file format, too.

    Don't expect cookies saved while the browser is running to be noticed by
    the browser (in fact, Mozilla on unix will overwrite your saved cookies if
    you change them on disk while it's running; on Windows, you probably can't
    save at all while the browser is running).

    Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
    Netscape cookies on saving.

    In particular, the cookie version and port number information is lost,
    together with information about whether or not Path, Port and Discard were
    specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
    domain as set in the HTTP header started with a dot (yes, I'm aware some
    domains in Netscape files start with a dot and some don't -- trust me, you
    really don't want to know any more about this).

    Note that though Mozilla and Netscape use the same format, they use
    slightly different headers.  The class saves cookies using the Netscape
    header by default (Mozilla can cope with that).

    """
    # Pattern the first line of a file must match to be accepted by load.
    magic_re = re.compile("#( Netscape)? HTTP Cookie File")
    # Text written at the top of every saved file.
    header = """\
# Netscape HTTP Cookie File
# http://www.netscape.com/newsref/std/cookie_spec.html
# This is a generated file! Do not edit.

"""

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse tab-separated cookie lines from open file `f` into this jar.

        The first line must match self.magic_re, otherwise `f` is closed and
        LoadError is raised.  Blank lines and lines starting with "#" or "$"
        are skipped.  IOError propagates unchanged; any other exception
        while parsing is reported via _warn_unhandled_exception() and
        re-raised as LoadError.

        """
        now = time.time()

        magic = f.readline()
        if not self.magic_re.search(magic):
            f.close()
            raise LoadError(
                "%r does not look like a Netscape format cookies file" %
                filename)

        # Bound before the try block so the error handler below can always
        # format it, even if the very first readline() in the loop fails.
        line = ""
        try:
            while True:
                line = f.readline()
                if line == "":
                    break

                # last field may be absent, so keep any trailing tab
                if line.endswith("\n"):
                    line = line[:-1]

                # skip comments and blank lines XXX what is $ for?
                if (line.strip().startswith(("#", "$")) or
                    line.strip() == ""):
                    continue

                domain, domain_specified, path, secure, expires, name, value = \
                        line.split("\t")
                secure = (secure == "TRUE")
                domain_specified = (domain_specified == "TRUE")
                if name == "":
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = value
                    value = None

                initial_dot = domain.startswith(".")
                # Explicit check rather than `assert`: asserts are stripped
                # under -O, which would let inconsistent lines through.  The
                # error is converted to LoadError by the handler below.
                if domain_specified != initial_dot:
                    raise ValueError(
                        "domain_specified flag does not match leading dot "
                        "of domain")

                discard = False
                if expires == "":
                    expires = None
                    discard = True

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                if not ignore_discard and c.discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                self.set_cookie(c)

        except IOError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Netscape format cookies file %r: %r" %
                            (filename, line))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the cookies to `filename` (default: self.filename).

        One tab-separated line is written per cookie after self.header.
        Cookies with a true `discard` attribute are skipped unless
        ignore_discard is true; expired cookies are skipped unless
        ignore_expires is true.

        Raises ValueError if no filename is available.

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        # The file is closed on every exit path by the with-statement.
        with open(filename, "w") as f:
            f.write(self.header)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = "TRUE"
                else:
                    secure = "FALSE"
                if cookie.domain.startswith("."):
                    initial_dot = "TRUE"
                else:
                    initial_dot = "FALSE"
                if cookie.expires is not None:
                    expires = str(cookie.expires)
                else:
                    expires = ""
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ""
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    "\t".join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value])+
                    "\n")