blob: 57332c6d7c89bb06adf3336ebc54db38e26a0e7e [file] [log] [blame]
r"""HTTP cookie handling for web clients.

This module has (now fairly distant) origins in Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
distributed with the Python standard library, but are available from
http://wwwsearch.sf.net/):

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

"""
27
# Public names exported by ``from <this module> import *``.
__all__ = [
    'Cookie',
    'CookieJar',
    'CookiePolicy',
    'DefaultCookiePolicy',
    'FileCookieJar',
    'LWPCookieJar',
    'LoadError',
    'MozillaCookieJar',
]
30
import copy
import datetime
import re
import time
import urllib.parse
import urllib.request
try:
    import threading as _threading
except ImportError:
    import dummy_threading as _threading
import http.client  # only for the default HTTP port
from calendar import timegm
41
Thomas Wouters477c8d52006-05-27 19:21:47 +000042debug = False # set to True to enable debugging via the logging module
43logger = None
44
45def _debug(*args):
46 if not debug:
47 return
48 global logger
49 if not logger:
50 import logging
Georg Brandl24420152008-05-26 16:32:26 +000051 logger = logging.getLogger("http.cookiejar")
Thomas Wouters477c8d52006-05-27 19:21:47 +000052 return logger.debug(*args)
53
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000054
# The standard HTTP port, kept as a string.
DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT)

# Error text for operations that need a filename when none is available.
MISSING_FILENAME_TEXT = (
    "a filename was not supplied "
    "(nor was the CookieJar instance initialised with one)"
)
58
Thomas Wouters477c8d52006-05-27 19:21:47 +000059def _warn_unhandled_exception():
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000060 # There are a few catch-all except: statements in this module, for
Thomas Wouters477c8d52006-05-27 19:21:47 +000061 # catching input that's bad in unexpected ways. Warn if any
62 # exceptions are caught there.
Jeremy Hylton7ecf3dc2008-05-10 20:38:40 +000063 import io, warnings, traceback
Guido van Rossum34d19282007-08-09 01:03:29 +000064 f = io.StringIO()
Andrew M. Kuchlingae40c2f2004-07-10 18:32:12 +000065 traceback.print_exc(None, f)
66 msg = f.getvalue()
Georg Brandl24420152008-05-26 16:32:26 +000067 warnings.warn("http.cookiejar bug!\n%s" % msg, stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000068
69
# Date/time conversion
# -----------------------------------------------------------------------------
72
73EPOCH_YEAR = 1970
74def _timegm(tt):
75 year, month, mday, hour, min, sec = tt[:6]
76 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
77 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
78 return timegm(tt)
79 else:
80 return None
81
# Abbreviated weekday names, Monday first.
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# Abbreviated month names; list index + 1 is the calendar month number.
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copy of MONTHS, used for case-insensitive month lookups.
# Built with a comprehension so no loop variable leaks into the module
# namespace.
MONTHS_LOWER = [name.lower() for name in MONTHS]
87
def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    # The first six struct_time fields are year, month, day, hour, min, sec.
    utc_fields = tuple(time.gmtime(t)[:6])
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % utc_fields
104
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec, wday = time.gmtime(t)[:7]
    # The weekday must be followed by a comma, as shown in the format above.
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[wday], mday, MONTHS[mon - 1], year, hour, minute, sec)
120
121
# Timezone names that are equivalent to UTC (only the keys are used).
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric timezone: optional sign, one or two hour digits, optional minutes.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)


def offset_from_tz_string(tz):
    """Return the offset from UTC, in seconds, described by the string tz.

    tz is either one of the names in UTC_ZONES (offset 0) or a numeric zone
    such as "+0100", "-08:00" or "5".  None is returned for anything else.
    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if match is None:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds += 60 * int(minutes)
    if sign == '-':
        seconds = -seconds
    return seconds
138
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date/time strings to seconds since the epoch, or None.

    day, yr, hr, min and sec are digit strings (hr, min and sec may be None,
    meaning zero).  mon is a month name abbreviation (any case) or a month
    number.  tz is a timezone string understood by offset_from_tz_string,
    or None for UTC.

    None is returned when the month or timezone is not recognised, when the
    year is larger than datetime.MAXYEAR, or when _timegm rejects a field
    as out of range.
    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower()) + 1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if not 1 <= imon <= 12:
            return None
        mon = imon

    # make sure clock elements are defined
    if hr is None:
        hr = 0
    if min is None:
        min = 0
    if sec is None:
        sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    sec = int(sec)

    # calendar.timegm() raises ValueError for years the datetime module
    # cannot represent; report such dates as unparseable instead.
    if yr > datetime.MAXYEAR:
        return None

    if yr < 1000:
        # find "obvious" year: pick the century that puts the date closest
        # to the current year
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0:
                yr = yr + 100
            else:
                yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
191
# Exactly the RFC 1123 form, e.g. "Wed, 09 Feb 1994 22:23:32 GMT".
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
# Leading weekday name (abbreviated or full), optional comma and whitespace.
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X | re.ASCII)


def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # Three letters that look like a month but are not one,
            # e.g. "Jxx": unrecognized format, not a crash.
            return None
        # Seconds are converted with int() so that the result is an integer,
        # as documented (and as the loose path below already returns).
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, count=1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, minute, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, minute, sec, tz)
269
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X | re.ASCII)


def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    Fractional seconds, if present, are discarded.

    """
    # clean up
    text = text.lstrip()

    # loose regexp parse
    m = ISO_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    yr, mon, day, hr, minute, sec, tz, _ = m.groups()

    if sec is not None:
        # The pattern accepts "SS.fff", but the downstream conversion uses
        # int(); keep only the whole seconds so such input does not raise.
        sec = sec.partition(".")[0]

    return _str2time(day, mon, yr, hr, minute, sec, tz)
314
315
# Header parsing
# -----------------------------------------------------------------------------
318
def unmatched(match):
    """Return unmatched part of re.Match object.

    That is, the searched string with the span of the whole match cut out.
    """
    begin, finish = match.span(0)
    subject = match.string
    return subject[:begin] + subject[finish:]
323
HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")


def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    # A bare string would be iterated character by character; callers must
    # pass a sequence of header value strings.
    assert not isinstance(header_values, str)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    # undo backslash escapes inside the quoted string
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs:
                    result.append(pairs)
                pairs = []
            else:
                # skip junk (raw string: "\s" is a regex escape, not a
                # string escape)
                non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs:
            result.append(pairs)
    return result
412
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")


def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    A value is left unquoted only when it consists entirely of word
    characters; a key whose value is None is emitted on its own.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        attr = []
        for k, v in pairs:
            if v is not None:
                # \Z rather than $: "$" also matches just before a trailing
                # newline, which would let "value\n" through unquoted.
                if not re.search(r"^\w+\Z", v):
                    v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v)  # escape " and \
                    v = '"%s"' % v
                k = "%s=%s" % (k, v)
            attr.append(k)
        if attr:
            headers.append("; ".join(attr))
    return ", ".join(headers)
438
def strip_quotes(text):
    """Return text without one leading and one trailing double quote.

    Each end is handled independently, so an unbalanced quote is removed too.
    """
    skip = 1 if text.startswith('"') else 0
    text = text[skip:]
    return text[:-1] if text.endswith('"') else text
445
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    Returns one list of (key, value) pairs per non-empty header.  Known
    cookie-attribute names are lowercased, an "expires" value is converted
    to seconds since the epoch (None if it cannot be parsed), and a
    ("version", "0") pair is appended when no version attribute was seen.
    An attribute given without "=" has the value None.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "":
                continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, maxsplit=1)
                k = k.lstrip()
            if ii != 0:
                # The first pair is the cookie's own name=value; only the
                # following pairs are cookie-attributes.
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    # A bare "version" (no "=") has no value to unquote.
                    if v is not None:
                        v = strip_quotes(v)
                    version_set = True
                if k == "expires" and v is not None:
                    # convert expires date to seconds since epoch
                    v = http2time(strip_quotes(v))  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
496
497
# Matches text ending in a dot followed by digits, i.e. something that looks
# like a dotted IP address rather than a host name.
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)


def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text) or text == "":
        return False
    # a host domain name neither starts nor ends with a dot
    return not (text.startswith(".") or text.endswith("."))
513
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # "A has the form NB" means B must be a suffix of A.  Merely containing
    # B somewhere (e.g. "x.acme.com.evil.org" vs ".acme.com") is not a match.
    # N is necessarily non-empty here because A != B.
    if not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
552
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  Anything that does not look like a
    dotted IP address (per IPV4_RE) qualifies.

    """
    return not IPV4_RE.search(text)
562
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  If B starts with a dot, A matches when it ends with
    B; otherwise (and whenever either looks like an IP address) the two must
    be equal.

    """
    A = A.lower()
    B = B.lower()
    if liberal_is_HDN(A) and liberal_is_HDN(B):
        if B.startswith("."):
            return A.endswith(B)
        return A == B
    # at least one is an IP address: only equal IP addresses match
    return A == B
582
# A trailing ":<digits>" port suffix on a host string.
cut_port_re = re.compile(r":\d+$", re.ASCII)


def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    The host comes from the request's full URL, falling back to its "Host"
    header when the URL carries no network location.

    """
    netloc = urllib.parse.urlparse(request.get_full_url()).netloc
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    without_port = cut_port_re.sub("", netloc, 1)
    return without_port.lower()
599
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  A request-host
    without any dot gets ".local" appended to form the effective name.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
610
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965.

    The path is passed through escape_path and always starts with "/".
    """
    raw_path = urllib.parse.urlsplit(request.get_full_url()).path
    path = escape_path(raw_path)
    # fix bad RFC 2396 absoluteURI
    return path if path.startswith("/") else "/" + path
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000620
def request_port(request):
    """Return the port of the request's host, as a string.

    DEFAULT_HTTP_PORT is returned when the host names no port, and None
    (after a debug message) when the port is not numeric.
    """
    host = request.get_host()
    _, colon, port = host.partition(':')
    if not colon:
        return DEFAULT_HTTP_PORT
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
634
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# A %-escape: percent sign followed by two hex digits (captured).
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")


def uppercase_escaped_char(match):
    """Return the %-escape matched by ESCAPED_CHAR_RE with upper-case digits."""
    hex_digits = match.group(1).upper()
    return "%%%s" % hex_digits
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = urllib.parse.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
654
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split at the first dot: everything before it is A, the rest is B.
    _a, dot, b = h.partition(".")
    if dot and is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
689
def is_third_party(request):
    """Return True if the request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(request_host(request), origin_reach)
705
706
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie attributes.

        version and expires are converted with int() unless they are None;
        domain is lowercased; rest (the non-standard cookie-attributes) is
        shallow-copied.  Raises ValueError if port is None while
        port_specified is True.
        """
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = None if version is None else int(version)
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = None if expires is None else int(expires)
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return whether the non-standard cookie-attribute name is present."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard cookie-attribute name, or default."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the non-standard cookie-attribute name to value."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        now defaults to the current time; a cookie whose expires is None
        never counts as expired.
        """
        if now is None:
            now = time.time()
        return self.expires is not None and self.expires <= now

    def __str__(self):
        port_part = "" if self.port is None else ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        attr_names = (
            "version", "name", "value",
            "port", "port_specified",
            "domain", "domain_specified", "domain_initial_dot",
            "path", "path_specified",
            "secure", "expires", "discard", "comment", "comment_url",
        )
        args = ["%s=%s" % (attr, repr(getattr(self, attr)))
                for attr in attr_names]
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "Cookie(%s)" % ", ".join(args)
803
804
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation always returns True.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation always returns True.
        """
        return True
836
837
838class DefaultCookiePolicy(CookiePolicy):
839 """Implements the standard rules for accepting and returning cookies."""
840
841 DomainStrictNoDots = 1
842 DomainStrictNonDomain = 2
843 DomainRFC2965Match = 4
844
845 DomainLiberal = 0
846 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
847
848 def __init__(self,
849 blocked_domains=None, allowed_domains=None,
850 netscape=True, rfc2965=False,
Neal Norwitz71dad722005-12-23 21:43:48 +0000851 rfc2109_as_netscape=None,
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000852 hide_cookie2=False,
853 strict_domain=False,
854 strict_rfc2965_unverifiable=True,
855 strict_ns_unverifiable=False,
856 strict_ns_domain=DomainLiberal,
857 strict_ns_set_initial_dollar=False,
858 strict_ns_set_path=False,
859 ):
860 """Constructor arguments should be passed as keyword arguments only."""
861 self.netscape = netscape
862 self.rfc2965 = rfc2965
Neal Norwitz71dad722005-12-23 21:43:48 +0000863 self.rfc2109_as_netscape = rfc2109_as_netscape
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000864 self.hide_cookie2 = hide_cookie2
865 self.strict_domain = strict_domain
866 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
867 self.strict_ns_unverifiable = strict_ns_unverifiable
868 self.strict_ns_domain = strict_ns_domain
869 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
870 self.strict_ns_set_path = strict_ns_set_path
871
872 if blocked_domains is not None:
873 self._blocked_domains = tuple(blocked_domains)
874 else:
875 self._blocked_domains = ()
876
877 if allowed_domains is not None:
878 allowed_domains = tuple(allowed_domains)
879 self._allowed_domains = allowed_domains
880
881 def blocked_domains(self):
882 """Return the sequence of blocked domains (as a tuple)."""
883 return self._blocked_domains
884 def set_blocked_domains(self, blocked_domains):
885 """Set the sequence of blocked domains."""
886 self._blocked_domains = tuple(blocked_domains)
887
888 def is_blocked(self, domain):
889 for blocked_domain in self._blocked_domains:
890 if user_domain_match(domain, blocked_domain):
891 return True
892 return False
893
894 def allowed_domains(self):
895 """Return None, or the sequence of allowed domains (as a tuple)."""
896 return self._allowed_domains
897 def set_allowed_domains(self, allowed_domains):
898 """Set the sequence of allowed domains, or None."""
899 if allowed_domains is not None:
900 allowed_domains = tuple(allowed_domains)
901 self._allowed_domains = allowed_domains
902
903 def is_not_allowed(self, domain):
904 if self._allowed_domains is None:
905 return False
906 for allowed_domain in self._allowed_domains:
907 if user_domain_match(domain, allowed_domain):
908 return False
909 return True
910
911 def set_ok(self, cookie, request):
912 """
913 If you override .set_ok(), be sure to call this method. If it returns
914 false, so should your subclass (assuming your subclass wants to be more
915 strict about which cookies to accept).
916
917 """
Thomas Wouters477c8d52006-05-27 19:21:47 +0000918 _debug(" - checking cookie %s=%s", cookie.name, cookie.value)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000919
920 assert cookie.name is not None
921
922 for n in "version", "verifiability", "name", "path", "domain", "port":
923 fn_name = "set_ok_"+n
924 fn = getattr(self, fn_name)
925 if not fn(cookie, request):
926 return False
927
928 return True
929
930 def set_ok_version(self, cookie, request):
931 if cookie.version is None:
932 # Version is always set to 0 by parse_ns_headers if it's a Netscape
933 # cookie, so this must be an invalid RFC 2965 cookie.
Thomas Wouters477c8d52006-05-27 19:21:47 +0000934 _debug(" Set-Cookie2 without version attribute (%s=%s)",
935 cookie.name, cookie.value)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000936 return False
937 if cookie.version > 0 and not self.rfc2965:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000938 _debug(" RFC 2965 cookies are switched off")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000939 return False
940 elif cookie.version == 0 and not self.netscape:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000941 _debug(" Netscape cookies are switched off")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000942 return False
943 return True
944
945 def set_ok_verifiability(self, cookie, request):
946 if request.is_unverifiable() and is_third_party(request):
947 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000948 _debug(" third-party RFC 2965 cookie during "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000949 "unverifiable transaction")
950 return False
951 elif cookie.version == 0 and self.strict_ns_unverifiable:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000952 _debug(" third-party Netscape cookie during "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000953 "unverifiable transaction")
954 return False
955 return True
956
957 def set_ok_name(self, cookie, request):
958 # Try and stop servers setting V0 cookies designed to hack other
959 # servers that know both V0 and V1 protocols.
960 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
961 cookie.name.startswith("$")):
Thomas Wouters477c8d52006-05-27 19:21:47 +0000962 _debug(" illegal name (starts with '$'): '%s'", cookie.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000963 return False
964 return True
965
966 def set_ok_path(self, cookie, request):
967 if cookie.path_specified:
968 req_path = request_path(request)
969 if ((cookie.version > 0 or
970 (cookie.version == 0 and self.strict_ns_set_path)) and
971 not req_path.startswith(cookie.path)):
Thomas Wouters477c8d52006-05-27 19:21:47 +0000972 _debug(" path attribute %s is not a prefix of request "
973 "path %s", cookie.path, req_path)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000974 return False
975 return True
976
    def set_ok_domain(self, cookie, request):
        """Return True if the cookie's domain is acceptable for this request.

        The user block-list and allow-list are consulted first.  The remaining
        checks only run when the cookie carried an explicit domain
        (cookie.domain_specified); each failing check logs a debug message and
        returns False.
        """
        if self.is_blocked(cookie.domain):
            _debug(" domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug(" domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                # XXX This should probably be compared with the Konqueror
                # (kcookiejar.cpp) and Mozilla implementations, but it's a
                # losing battle.
                # i is the position of the last dot, j of the one before it.
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    # A well-known second-level label followed by a
                    # two-letter top-level label is refused.
                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
                       "info", "jobs", "mobi", "museum", "name", "pro",
                       "travel", "eu") and len(tld) == 2:
                        # domain like .co.uk
                        _debug(" country-code second level domain %s", domain)
                        return False
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            # Apart from the literal ".local", the domain must contain a dot
            # somewhere after its optional leading dot.
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                _debug(" non-local domain %s contains no embedded dot",
                       domain)
                return False
            if cookie.version == 0:
                # Version 0: plain suffix test on the effective request-host,
                # retried with a dot prepended when it has no leading dot.
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    _debug(" effective request-host %s (even with added "
                           "initial dot) does not end end with %s",
                           erhn, domain)
                    return False
            # strict_ns_domain is tested as a bit mask against the
            # Domain* flags defined on the class.
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    _debug(" effective request-host %s does not domain-match "
                           "%s", erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                # What remains of req_host once the trailing len(domain)
                # characters are removed must not contain a dot, unless
                # req_host matches IPV4_RE.
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    _debug(" host prefix %s for domain %s contains a dot",
                           host_prefix, domain)
                    return False
        return True
1035
1036 def set_ok_port(self, cookie, request):
1037 if cookie.port_specified:
1038 req_port = request_port(request)
1039 if req_port is None:
1040 req_port = "80"
1041 else:
1042 req_port = str(req_port)
1043 for p in cookie.port.split(","):
1044 try:
1045 int(p)
1046 except ValueError:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001047 _debug(" bad port %s (not numeric)", p)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001048 return False
1049 if p == req_port:
1050 break
1051 else:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001052 _debug(" request port (%s) not found in %s",
1053 req_port, cookie.port)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001054 return False
1055 return True
1056
1057 def return_ok(self, cookie, request):
1058 """
1059 If you override .return_ok(), be sure to call this method. If it
1060 returns false, so should your subclass (assuming your subclass wants to
1061 be more strict about which cookies to return).
1062
1063 """
1064 # Path has already been checked by .path_return_ok(), and domain
1065 # blocking done by .domain_return_ok().
Thomas Wouters477c8d52006-05-27 19:21:47 +00001066 _debug(" - checking cookie %s=%s", cookie.name, cookie.value)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001067
1068 for n in "version", "verifiability", "secure", "expires", "port", "domain":
1069 fn_name = "return_ok_"+n
1070 fn = getattr(self, fn_name)
1071 if not fn(cookie, request):
1072 return False
1073 return True
1074
1075 def return_ok_version(self, cookie, request):
1076 if cookie.version > 0 and not self.rfc2965:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001077 _debug(" RFC 2965 cookies are switched off")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001078 return False
1079 elif cookie.version == 0 and not self.netscape:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001080 _debug(" Netscape cookies are switched off")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001081 return False
1082 return True
1083
1084 def return_ok_verifiability(self, cookie, request):
1085 if request.is_unverifiable() and is_third_party(request):
1086 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001087 _debug(" third-party RFC 2965 cookie during unverifiable "
1088 "transaction")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001089 return False
1090 elif cookie.version == 0 and self.strict_ns_unverifiable:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001091 _debug(" third-party Netscape cookie during unverifiable "
1092 "transaction")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001093 return False
1094 return True
1095
1096 def return_ok_secure(self, cookie, request):
1097 if cookie.secure and request.get_type() != "https":
Thomas Wouters477c8d52006-05-27 19:21:47 +00001098 _debug(" secure cookie with non-secure request")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001099 return False
1100 return True
1101
1102 def return_ok_expires(self, cookie, request):
1103 if cookie.is_expired(self._now):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001104 _debug(" cookie expired")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001105 return False
1106 return True
1107
1108 def return_ok_port(self, cookie, request):
1109 if cookie.port:
1110 req_port = request_port(request)
1111 if req_port is None:
1112 req_port = "80"
1113 for p in cookie.port.split(","):
1114 if p == req_port:
1115 break
1116 else:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001117 _debug(" request port %s does not match cookie port %s",
1118 req_port, cookie.port)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001119 return False
1120 return True
1121
1122 def return_ok_domain(self, cookie, request):
1123 req_host, erhn = eff_request_host(request)
1124 domain = cookie.domain
1125
1126 # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
1127 if (cookie.version == 0 and
1128 (self.strict_ns_domain & self.DomainStrictNonDomain) and
1129 not cookie.domain_specified and domain != erhn):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001130 _debug(" cookie with unspecified domain does not string-compare "
1131 "equal to request domain")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001132 return False
1133
1134 if cookie.version > 0 and not domain_match(erhn, domain):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001135 _debug(" effective request-host name %s does not domain-match "
1136 "RFC 2965 cookie domain %s", erhn, domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001137 return False
1138 if cookie.version == 0 and not ("."+erhn).endswith(domain):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001139 _debug(" request-host %s does not match Netscape cookie domain "
1140 "%s", req_host, domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001141 return False
1142 return True
1143
1144 def domain_return_ok(self, domain, request):
1145 # Liberal check of. This is here as an optimization to avoid
1146 # having to load lots of MSIE cookie files unless necessary.
1147 req_host, erhn = eff_request_host(request)
1148 if not req_host.startswith("."):
Raymond Hettingerbab41432005-02-05 01:31:19 +00001149 req_host = "."+req_host
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001150 if not erhn.startswith("."):
Raymond Hettingerbab41432005-02-05 01:31:19 +00001151 erhn = "."+erhn
1152 if not (req_host.endswith(domain) or erhn.endswith(domain)):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001153 #_debug(" request domain %s does not match cookie domain %s",
1154 # req_host, domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001155 return False
1156
1157 if self.is_blocked(domain):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001158 _debug(" domain %s is in user block-list", domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001159 return False
1160 if self.is_not_allowed(domain):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001161 _debug(" domain %s is not in user allow-list", domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001162 return False
1163
1164 return True
1165
1166 def path_return_ok(self, path, request):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001167 _debug("- checking cookie path=%s", path)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001168 req_path = request_path(request)
1169 if not req_path.startswith(path):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001170 _debug(" %s does not path-match %s", req_path, path)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001171 return False
1172 return True
1173
1174
def vals_sorted_by_key(adict):
    """Return a lazy iterator over adict's values, ordered by sorted key."""
    return map(adict.get, sorted(adict.keys()))
1178
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    Any value that has an ``items`` attribute is treated as a nested mapping
    and descended into; every other value is yielded as a leaf.
    """
    for value in vals_sorted_by_key(mapping):
        if hasattr(value, "items"):
            for leaf in deepvalues(value):
                yield leaf
        else:
            yield value
1194
1195
# Used as second parameter to dict.get() method, to distinguish absent
# dict key from one with a None value.
class Absent:
    """Sentinel default for dict.get(): marks a key that is not present."""
1199
1200class CookieJar:
1201 """Collection of HTTP cookies.
1202
1203 You may not need to know about this class: try
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001204 urllib.request.build_opener(HTTPCookieProcessor).open(url).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001205 """
1206
1207 non_word_re = re.compile(r"\W")
1208 quote_re = re.compile(r"([\"\\])")
1209 strict_domain_re = re.compile(r"\.?[^.]*")
1210 domain_re = re.compile(r"[^.]*")
1211 dots_re = re.compile(r"^\.+")
1212
Antoine Pitroufd036452008-08-19 17:56:33 +00001213 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001214
1215 def __init__(self, policy=None):
1216 if policy is None:
1217 policy = DefaultCookiePolicy()
1218 self._policy = policy
1219
1220 self._cookies_lock = _threading.RLock()
1221 self._cookies = {}
1222
1223 def set_policy(self, policy):
1224 self._policy = policy
1225
1226 def _cookies_for_domain(self, domain, request):
1227 cookies = []
1228 if not self._policy.domain_return_ok(domain, request):
1229 return []
Thomas Wouters477c8d52006-05-27 19:21:47 +00001230 _debug("Checking %s for cookies to return", domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001231 cookies_by_path = self._cookies[domain]
1232 for path in cookies_by_path.keys():
1233 if not self._policy.path_return_ok(path, request):
1234 continue
1235 cookies_by_name = cookies_by_path[path]
1236 for cookie in cookies_by_name.values():
1237 if not self._policy.return_ok(cookie, request):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001238 _debug(" not returning cookie")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001239 continue
Thomas Wouters477c8d52006-05-27 19:21:47 +00001240 _debug(" it's a match")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001241 cookies.append(cookie)
1242 return cookies
1243
1244 def _cookies_for_request(self, request):
1245 """Return a list of cookies to be returned to server."""
1246 cookies = []
1247 for domain in self._cookies.keys():
1248 cookies.extend(self._cookies_for_domain(domain, request))
1249 return cookies
1250
1251 def _cookie_attrs(self, cookies):
1252 """Return a list of cookie-attributes to be returned to server.
1253
1254 like ['foo="bar"; $Path="/"', ...]
1255
1256 The $Version attribute is also added when appropriate (currently only
1257 once per request).
1258
1259 """
1260 # add cookies in order of most specific (ie. longest) path first
Raymond Hettinger70b64fc2008-01-30 20:15:17 +00001261 cookies.sort(key=lambda a: len(a.path), reverse=True)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001262
1263 version_set = False
1264
1265 attrs = []
1266 for cookie in cookies:
1267 # set version of Cookie header
1268 # XXX
1269 # What should it be if multiple matching Set-Cookie headers have
1270 # different versions themselves?
1271 # Answer: there is no answer; was supposed to be settled by
1272 # RFC 2965 errata, but that may never appear...
1273 version = cookie.version
1274 if not version_set:
1275 version_set = True
1276 if version > 0:
1277 attrs.append("$Version=%s" % version)
1278
1279 # quote cookie value if necessary
1280 # (not for Netscape protocol, which already has any quotes
1281 # intact, due to the poorly-specified Netscape Cookie: syntax)
1282 if ((cookie.value is not None) and
1283 self.non_word_re.search(cookie.value) and version > 0):
1284 value = self.quote_re.sub(r"\\\1", cookie.value)
1285 else:
1286 value = cookie.value
1287
1288 # add cookie-attributes to be returned in Cookie header
1289 if cookie.value is None:
1290 attrs.append(cookie.name)
1291 else:
1292 attrs.append("%s=%s" % (cookie.name, value))
1293 if version > 0:
1294 if cookie.path_specified:
1295 attrs.append('$Path="%s"' % cookie.path)
1296 if cookie.domain.startswith("."):
1297 domain = cookie.domain
1298 if (not cookie.domain_initial_dot and
1299 domain.startswith(".")):
1300 domain = domain[1:]
1301 attrs.append('$Domain="%s"' % domain)
1302 if cookie.port is not None:
1303 p = "$Port"
1304 if cookie.port_specified:
1305 p = p + ('="%s"' % cookie.port)
1306 attrs.append(p)
1307
1308 return attrs
1309
1310 def add_cookie_header(self, request):
Georg Brandl029986a2008-06-23 11:44:14 +00001311 """Add correct Cookie: header to request (urllib.request.Request object).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001312
1313 The Cookie2 header is also added unless policy.hide_cookie2 is true.
1314
1315 """
Thomas Wouters477c8d52006-05-27 19:21:47 +00001316 _debug("add_cookie_header")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001317 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001318 try:
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001319
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001320 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001321
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001322 cookies = self._cookies_for_request(request)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001323
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001324 attrs = self._cookie_attrs(cookies)
1325 if attrs:
1326 if not request.has_header("Cookie"):
1327 request.add_unredirected_header(
1328 "Cookie", "; ".join(attrs))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001329
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001330 # if necessary, advertise that we know RFC 2965
1331 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1332 not request.has_header("Cookie2")):
1333 for cookie in cookies:
1334 if cookie.version != 1:
1335 request.add_unredirected_header("Cookie2", '$Version="1"')
1336 break
1337
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001338 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001339 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001340
1341 self.clear_expired_cookies()
1342
1343 def _normalized_cookie_tuples(self, attrs_set):
1344 """Return list of tuples containing normalised cookie information.
1345
1346 attrs_set is the list of lists of key,value pairs extracted from
1347 the Set-Cookie or Set-Cookie2 headers.
1348
1349 Tuples are name, value, standard, rest, where name and value are the
1350 cookie name and value, standard is a dictionary containing the standard
1351 cookie-attributes (discard, secure, version, expires or max-age,
1352 domain, path and port) and rest is a dictionary containing the rest of
1353 the cookie-attributes.
1354
1355 """
1356 cookie_tuples = []
1357
1358 boolean_attrs = "discard", "secure"
1359 value_attrs = ("version",
1360 "expires", "max-age",
1361 "domain", "path", "port",
1362 "comment", "commenturl")
1363
1364 for cookie_attrs in attrs_set:
1365 name, value = cookie_attrs[0]
1366
1367 # Build dictionary of standard cookie-attributes (standard) and
1368 # dictionary of other cookie-attributes (rest).
1369
1370 # Note: expiry time is normalised to seconds since epoch. V0
1371 # cookies should have the Expires cookie-attribute, and V1 cookies
1372 # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1373 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1374 # accept either (but prefer Max-Age).
1375 max_age_set = False
1376
1377 bad_cookie = False
1378
1379 standard = {}
1380 rest = {}
1381 for k, v in cookie_attrs[1:]:
1382 lc = k.lower()
1383 # don't lose case distinction for unknown fields
1384 if lc in value_attrs or lc in boolean_attrs:
1385 k = lc
1386 if k in boolean_attrs and v is None:
1387 # boolean cookie-attribute is present, but has no value
1388 # (like "discard", rather than "port=80")
1389 v = True
1390 if k in standard:
1391 # only first value is significant
1392 continue
1393 if k == "domain":
1394 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001395 _debug(" missing value for domain attribute")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001396 bad_cookie = True
1397 break
1398 # RFC 2965 section 3.3.3
1399 v = v.lower()
1400 if k == "expires":
1401 if max_age_set:
1402 # Prefer max-age to expires (like Mozilla)
1403 continue
1404 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001405 _debug(" missing or invalid value for expires "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001406 "attribute: treating as session cookie")
1407 continue
1408 if k == "max-age":
1409 max_age_set = True
1410 try:
1411 v = int(v)
1412 except ValueError:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001413 _debug(" missing or invalid (non-numeric) value for "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001414 "max-age attribute")
1415 bad_cookie = True
1416 break
1417 # convert RFC 2965 Max-Age to seconds since epoch
1418 # XXX Strictly you're supposed to follow RFC 2616
1419 # age-calculation rules. Remember that zero Max-Age is a
1420 # is a request to discard (old and new) cookie, though.
1421 k = "expires"
1422 v = self._now + v
1423 if (k in value_attrs) or (k in boolean_attrs):
1424 if (v is None and
Raymond Hettingerdbecd932005-02-06 06:57:08 +00001425 k not in ("port", "comment", "commenturl")):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001426 _debug(" missing value for %s attribute" % k)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001427 bad_cookie = True
1428 break
1429 standard[k] = v
1430 else:
1431 rest[k] = v
1432
1433 if bad_cookie:
1434 continue
1435
1436 cookie_tuples.append((name, value, standard, rest))
1437
1438 return cookie_tuples
1439
1440 def _cookie_from_cookie_tuple(self, tup, request):
1441 # standard is dict of standard cookie-attributes, rest is dict of the
1442 # rest of them
1443 name, value, standard, rest = tup
1444
1445 domain = standard.get("domain", Absent)
1446 path = standard.get("path", Absent)
1447 port = standard.get("port", Absent)
1448 expires = standard.get("expires", Absent)
1449
1450 # set the easy defaults
1451 version = standard.get("version", None)
Benjamin Peterson3e5cd1d2010-06-27 21:45:24 +00001452 if version is not None:
1453 try:
1454 version = int(version)
1455 except ValueError:
1456 return None # invalid version, ignore cookie
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001457 secure = standard.get("secure", False)
1458 # (discard is also set if expires is Absent)
1459 discard = standard.get("discard", False)
1460 comment = standard.get("comment", None)
1461 comment_url = standard.get("commenturl", None)
1462
1463 # set default path
1464 if path is not Absent and path != "":
1465 path_specified = True
1466 path = escape_path(path)
1467 else:
1468 path_specified = False
1469 path = request_path(request)
1470 i = path.rfind("/")
1471 if i != -1:
1472 if version == 0:
1473 # Netscape spec parts company from reality here
1474 path = path[:i]
1475 else:
1476 path = path[:i+1]
1477 if len(path) == 0: path = "/"
1478
1479 # set default domain
1480 domain_specified = domain is not Absent
1481 # but first we have to remember whether it starts with a dot
1482 domain_initial_dot = False
1483 if domain_specified:
1484 domain_initial_dot = bool(domain.startswith("."))
1485 if domain is Absent:
1486 req_host, erhn = eff_request_host(request)
1487 domain = erhn
1488 elif not domain.startswith("."):
1489 domain = "."+domain
1490
1491 # set default port
1492 port_specified = False
1493 if port is not Absent:
1494 if port is None:
1495 # Port attr present, but has no value: default to request port.
1496 # Cookie should then only be sent back on that port.
1497 port = request_port(request)
1498 else:
1499 port_specified = True
1500 port = re.sub(r"\s+", "", port)
1501 else:
1502 # No port attr present. Cookie can be sent back on any port.
1503 port = None
1504
1505 # set default expires and discard
1506 if expires is Absent:
1507 expires = None
1508 discard = True
1509 elif expires <= self._now:
1510 # Expiry date in past is request to delete cookie. This can't be
1511 # in DefaultCookiePolicy, because can't delete cookies there.
1512 try:
1513 self.clear(domain, path, name)
1514 except KeyError:
1515 pass
Thomas Wouters477c8d52006-05-27 19:21:47 +00001516 _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1517 domain, path, name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001518 return None
1519
1520 return Cookie(version,
1521 name, value,
1522 port, port_specified,
1523 domain, domain_specified, domain_initial_dot,
1524 path, path_specified,
1525 secure,
1526 expires,
1527 discard,
1528 comment,
1529 comment_url,
1530 rest)
1531
1532 def _cookies_from_attrs_set(self, attrs_set, request):
1533 cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1534
1535 cookies = []
1536 for tup in cookie_tuples:
1537 cookie = self._cookie_from_cookie_tuple(tup, request)
1538 if cookie: cookies.append(cookie)
1539 return cookies
1540
Neal Norwitz71dad722005-12-23 21:43:48 +00001541 def _process_rfc2109_cookies(self, cookies):
1542 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1543 if rfc2109_as_ns is None:
1544 rfc2109_as_ns = not self._policy.rfc2965
1545 for cookie in cookies:
1546 if cookie.version == 1:
1547 cookie.rfc2109 = True
Tim Peters536cf992005-12-25 23:18:31 +00001548 if rfc2109_as_ns:
Neal Norwitz71dad722005-12-23 21:43:48 +00001549 # treat 2109 cookies as Netscape cookies rather than
1550 # as RFC2965 cookies
1551 cookie.version = 0
1552
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001553 def make_cookies(self, response, request):
1554 """Return sequence of Cookie objects extracted from response object."""
1555 # get cookie-attributes for RFC 2965 and Netscape protocols
1556 headers = response.info()
Barry Warsaw820c1202008-06-12 04:06:45 +00001557 rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
1558 ns_hdrs = headers.get_all("Set-Cookie", [])
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001559
1560 rfc2965 = self._policy.rfc2965
1561 netscape = self._policy.netscape
1562
1563 if ((not rfc2965_hdrs and not ns_hdrs) or
1564 (not ns_hdrs and not rfc2965) or
1565 (not rfc2965_hdrs and not netscape) or
1566 (not netscape and not rfc2965)):
1567 return [] # no relevant cookie headers: quick exit
1568
1569 try:
1570 cookies = self._cookies_from_attrs_set(
1571 split_header_words(rfc2965_hdrs), request)
Thomas Wouters477c8d52006-05-27 19:21:47 +00001572 except Exception:
1573 _warn_unhandled_exception()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001574 cookies = []
1575
1576 if ns_hdrs and netscape:
1577 try:
Neal Norwitz71dad722005-12-23 21:43:48 +00001578 # RFC 2109 and Netscape cookies
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001579 ns_cookies = self._cookies_from_attrs_set(
1580 parse_ns_headers(ns_hdrs), request)
Thomas Wouters477c8d52006-05-27 19:21:47 +00001581 except Exception:
1582 _warn_unhandled_exception()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001583 ns_cookies = []
Neal Norwitz71dad722005-12-23 21:43:48 +00001584 self._process_rfc2109_cookies(ns_cookies)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001585
1586 # Look for Netscape cookies (from Set-Cookie headers) that match
1587 # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1588 # For each match, keep the RFC 2965 cookie and ignore the Netscape
1589 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are
1590 # bundled in with the Netscape cookies for this purpose, which is
1591 # reasonable behaviour.
1592 if rfc2965:
1593 lookup = {}
1594 for cookie in cookies:
1595 lookup[(cookie.domain, cookie.path, cookie.name)] = None
1596
1597 def no_matching_rfc2965(ns_cookie, lookup=lookup):
1598 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1599 return key not in lookup
1600 ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1601
1602 if ns_cookies:
1603 cookies.extend(ns_cookies)
1604
1605 return cookies
1606
1607 def set_cookie_if_ok(self, cookie, request):
1608 """Set a cookie if policy says it's OK to do so."""
1609 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001610 try:
1611 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001612
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001613 if self._policy.set_ok(cookie, request):
1614 self.set_cookie(cookie)
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001615
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001616
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001617 finally:
1618 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001619
1620 def set_cookie(self, cookie):
1621 """Set a cookie, without checking whether or not it should be set."""
1622 c = self._cookies
1623 self._cookies_lock.acquire()
1624 try:
1625 if cookie.domain not in c: c[cookie.domain] = {}
1626 c2 = c[cookie.domain]
1627 if cookie.path not in c2: c2[cookie.path] = {}
1628 c3 = c2[cookie.path]
1629 c3[cookie.name] = cookie
1630 finally:
1631 self._cookies_lock.release()
1632
1633 def extract_cookies(self, response, request):
1634 """Extract cookies from response, where allowable given the request."""
Thomas Wouters477c8d52006-05-27 19:21:47 +00001635 _debug("extract_cookies: %s", response.info())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001636 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001637 try:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001638 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001639
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001640 for cookie in self.make_cookies(response, request):
1641 if self._policy.set_ok(cookie, request):
1642 _debug(" setting cookie: %s", cookie)
1643 self.set_cookie(cookie)
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001644 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001645 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001646
1647 def clear(self, domain=None, path=None, name=None):
1648 """Clear some cookies.
1649
1650 Invoking this method without arguments will clear all cookies. If
1651 given a single argument, only cookies belonging to that domain will be
1652 removed. If given two arguments, cookies belonging to the specified
1653 path within that domain are removed. If given three arguments, then
1654 the cookie with the specified name, path and domain is removed.
1655
1656 Raises KeyError if no matching cookie exists.
1657
1658 """
1659 if name is not None:
1660 if (domain is None) or (path is None):
1661 raise ValueError(
1662 "domain and path must be given to remove a cookie by name")
1663 del self._cookies[domain][path][name]
1664 elif path is not None:
1665 if domain is None:
1666 raise ValueError(
1667 "domain must be given to remove cookies by path")
1668 del self._cookies[domain][path]
1669 elif domain is not None:
1670 del self._cookies[domain]
1671 else:
1672 self._cookies = {}
1673
def clear_session_cookies(self):
    """Remove every cookie whose discard flag is set.

    Note that the .save() method won't save session cookies anyway, unless
    you ask otherwise by passing a true ignore_discard argument.

    """
    # Hold the jar lock for the whole sweep; it is released on any exit.
    with self._cookies_lock:
        for cookie in self:
            if not cookie.discard:
                continue
            self.clear(cookie.domain, cookie.path, cookie.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001688
def clear_expired_cookies(self):
    """Remove every cookie that reports itself expired as of now.

    You probably don't need to call this method: expired cookies are never
    sent back to the server (provided you're using DefaultCookiePolicy),
    this method is called by CookieJar itself every so often, and the
    .save() method won't save expired cookies anyway (unless you ask
    otherwise by passing a true ignore_expires argument).

    """
    with self._cookies_lock:
        # One timestamp for the whole sweep so all cookies are judged
        # against the same instant.
        cutoff = time.time()
        for cookie in self:
            if not cookie.is_expired(cutoff):
                continue
            self.clear(cookie.domain, cookie.path, cookie.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001707
def __iter__(self):
    """Return the iterator deepvalues() yields over the nested cookie store."""
    nested = self._cookies
    return deepvalues(nested)
1710
def __len__(self):
    """Return number of contained cookies."""
    # Count by walking the jar's own iterator; nothing is materialised.
    return sum(1 for _ in self)
1716
def __repr__(self):
    """Return "<class[...]>" listing the repr() of each contained cookie."""
    parts = [repr(cookie) for cookie in self]
    return "<%s[%s]>" % (self.__class__, ", ".join(parts))
1721
def __str__(self):
    """Return "<class[...]>" listing the str() of each contained cookie."""
    parts = [str(cookie) for cookie in self]
    return "<%s[%s]>" % (self.__class__, ", ".join(parts))
1726
1727
# Kept as an IOError subclass for backwards-compatibility with Python 2.4.0.
class LoadError(IOError):
    """Raised when a cookies file cannot be parsed by a FileCookieJar loader."""
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001730
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename, if given, must support string concatenation; otherwise
        ValueError is raised.  delayload is stored as a bool.  policy is
        passed through to CookieJar.__init__.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                # Duck-type check: anything that can be concatenated with a
                # str is accepted as a filename.
                filename + ""
            except Exception:
                # Not a bare except: KeyboardInterrupt / SystemExit must not
                # be turned into a ValueError.
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Abstract here: always raises NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        If filename is None, self.filename is used; ValueError is raised
        when neither is set.  Parsing is delegated to self._really_load(),
        which is not defined in this class.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        # The context manager closes the file even if parsing raises.
        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with self._cookies_lock:
            # Snapshot first so a failed load can be rolled back.
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                self._cookies = old_state
                raise
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001790
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001791
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    Actually, the format is extended a bit -- see module docstring.

    """
    # The name/value pair comes first, followed by the attributes in a
    # fixed order; valueless (flag) attributes are emitted as (key, None).
    pairs = [
        (cookie.name, cookie.value),
        ("path", cookie.path),
        ("domain", cookie.domain),
    ]
    if cookie.port is not None:
        pairs.append(("port", cookie.port))
    if cookie.path_specified:
        pairs.append(("path_spec", None))
    if cookie.port_specified:
        pairs.append(("port_spec", None))
    if cookie.domain_initial_dot:
        pairs.append(("domain_dot", None))
    if cookie.secure:
        pairs.append(("secure", None))
    if cookie.expires:
        stamp = time2isoz(float(cookie.expires))
        pairs.append(("expires", stamp))
    if cookie.discard:
        pairs.append(("discard", None))
    if cookie.comment:
        pairs.append(("comment", cookie.comment))
    if cookie.comment_url:
        pairs.append(("commenturl", cookie.comment_url))

    # Non-standard attributes follow in sorted key order, stringified.
    for key in sorted(cookie._rest.keys()):
        pairs.append((key, str(cookie._rest[key])))

    pairs.append(("version", str(cookie.version)))

    return join_header_words([pairs])
1819
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of newline-separated "Set-Cookie3" headers.

        ignore_discard and ignore_expires: see docstring for FileCookieJar.save

        """
        now = time.time()
        r = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        # The trailing empty element makes the result end with a newline
        # whenever at least one cookie was written.
        return "\n".join(r+[""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename (or self.filename) in Set-Cookie3 format.

        Raises ValueError if no filename is available.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with open(filename, "w") as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse Set-Cookie3 lines from the open file f into this jar.

        The first line must match self.magic_re, otherwise LoadError is
        raised.  Lines not starting with "Set-Cookie3:" are skipped.
        IOError propagates unchanged; any other exception raised while
        parsing is reported via _warn_unhandled_exception() and re-raised
        as LoadError.

        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound before the try so the error handler can always report it,
        # even if the very first readline() below is what fails.
        line = None
        try:
            while True:
                line = f.readline()
                if line == "": break
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    # Flag attributes default to False when absent.
                    standard = dict.fromkeys(boolean_attrs, False)
                    rest = {}
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            # A flag present without a value means "on".
                            if v is None: v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    if expires is None:
                        # No (usable) expiry: treat as a session cookie.
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)

        except IOError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1942
1943
class MozillaCookieJar(FileCookieJar):
    """

    WARNING: you may want to backup your browser's cookies file if you use
    this class to save cookies.  I *think* it works, but there have been
    bugs in the past!

    This class differs from CookieJar only in the format it uses to save and
    load cookies to and from a file.  This class uses the Mozilla/Netscape
    `cookies.txt' format.  lynx uses this file format, too.

    Don't expect cookies saved while the browser is running to be noticed by
    the browser (in fact, Mozilla on unix will overwrite your saved cookies if
    you change them on disk while it's running; on Windows, you probably can't
    save at all while the browser is running).

    Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
    Netscape cookies on saving.

    In particular, the cookie version and port number information is lost,
    together with information about whether or not Path, Port and Discard were
    specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
    domain as set in the HTTP header started with a dot (yes, I'm aware some
    domains in Netscape files start with a dot and some don't -- trust me, you
    really don't want to know any more about this).

    Note that though Mozilla and Netscape use the same format, they use
    slightly different headers.  The class saves cookies using the Netscape
    header by default (Mozilla can cope with that).

    """
    magic_re = re.compile("#( Netscape)? HTTP Cookie File")
    # The header lines must start in column 0 of the written file, so the
    # literal is deliberately not indented to match the class body.
    header = """\
# Netscape HTTP Cookie File
# http://www.netscape.com/newsref/std/cookie_spec.html
# This is a generated file! Do not edit.

"""

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse tab-separated cookies.txt lines from f into this jar.

        The first line must match self.magic_re, otherwise f is closed and
        LoadError is raised.  IOError propagates unchanged; any other
        exception raised while parsing is reported via
        _warn_unhandled_exception() and re-raised as LoadError.

        """
        now = time.time()

        magic = f.readline()
        if not self.magic_re.search(magic):
            f.close()
            raise LoadError(
                "%r does not look like a Netscape format cookies file" %
                filename)

        # Bound before the try so the error handler can always report it,
        # even if the very first readline() below is what fails.
        line = None
        try:
            while True:
                line = f.readline()
                if line == "": break

                # last field may be absent, so keep any trailing tab
                if line.endswith("\n"): line = line[:-1]

                # skip comments and blank lines XXX what is $ for?
                if (line.strip().startswith(("#", "$")) or
                    line.strip() == ""):
                    continue

                # Exactly seven tab-separated fields are required; any
                # other count raises ValueError, reported as LoadError below.
                domain, domain_specified, path, secure, expires, name, value = \
                        line.split("\t")
                secure = (secure == "TRUE")
                domain_specified = (domain_specified == "TRUE")
                if name == "":
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = value
                    value = None

                initial_dot = domain.startswith(".")
                # NOTE(review): this consistency check is an assert, so it
                # is skipped under -O; otherwise a mismatch surfaces as
                # LoadError through the handler below.
                assert domain_specified == initial_dot

                discard = False
                if expires == "":
                    expires = None
                    discard = True

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                if not ignore_discard and c.discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                self.set_cookie(c)

        except IOError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Netscape format cookies file %r: %r" %
                            (filename, line))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename (or self.filename) in cookies.txt format.

        Raises ValueError if no filename is available.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with open(filename, "w") as f:
            f.write(self.header)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = "TRUE" if cookie.secure else "FALSE"
                initial_dot = "TRUE" if cookie.domain.startswith(".") else "FALSE"
                if cookie.expires is not None:
                    expires = str(cookie.expires)
                else:
                    expires = ""
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ""
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    "\t".join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value])+
                    "\n")