blob: 657faa1ca12b281d54f357c7c7ed2fe6be4a6973 [file] [log] [blame]
"""HTTP cookie handling for web clients.

This module has (now fairly distant) origins in Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
distributed with the Python standard library, but are available from
http://wwwsearch.sf.net/):

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

"""
27
# Names exported by a star-import of this module.
__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
           'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
30
Jeremy Hylton1afc1692008-06-18 20:49:58 +000031import copy
32import re
33import time
34import urllib.parse, urllib.request
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000035try:
36 import threading as _threading
37except ImportError:
38 import dummy_threading as _threading
Georg Brandl24420152008-05-26 16:32:26 +000039import http.client # only for the default HTTP port
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000040from calendar import timegm
41
Thomas Wouters477c8d52006-05-27 19:21:47 +000042debug = False # set to True to enable debugging via the logging module
43logger = None
44
45def _debug(*args):
46 if not debug:
47 return
48 global logger
49 if not logger:
50 import logging
Georg Brandl24420152008-05-26 16:32:26 +000051 logger = logging.getLogger("http.cookiejar")
Thomas Wouters477c8d52006-05-27 19:21:47 +000052 return logger.debug(*args)
53
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000054
# The standard HTTP port, kept as a string: request_port() returns it in
# place of a port string sliced out of the request's host.
DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT)
# Error text for a missing filename.  NOTE(review): presumably used by the
# file-based cookie jars, which are not visible in this section -- confirm.
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
58
Thomas Wouters477c8d52006-05-27 19:21:47 +000059def _warn_unhandled_exception():
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000060 # There are a few catch-all except: statements in this module, for
Thomas Wouters477c8d52006-05-27 19:21:47 +000061 # catching input that's bad in unexpected ways. Warn if any
62 # exceptions are caught there.
Jeremy Hylton7ecf3dc2008-05-10 20:38:40 +000063 import io, warnings, traceback
Guido van Rossum34d19282007-08-09 01:03:29 +000064 f = io.StringIO()
Andrew M. Kuchlingae40c2f2004-07-10 18:32:12 +000065 traceback.print_exc(None, f)
66 msg = f.getvalue()
Georg Brandl24420152008-05-26 16:32:26 +000067 warnings.warn("http.cookiejar bug!\n%s" % msg, stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000068
69
70# Date/time conversion
71# -----------------------------------------------------------------------------
72
73EPOCH_YEAR = 1970
74def _timegm(tt):
75 year, month, mday, hour, min, sec = tt[:6]
76 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
77 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
78 return timegm(tt)
79 else:
80 return None
81
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased month names, for case-insensitive lookups.
MONTHS_LOWER = [month_name.lower() for month_name in MONTHS]


def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec = time.gmtime(t)[:6]
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        year, mon, mday, hour, minute, sec)


def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec, wday = time.gmtime(t)[:7]
    # The weekday is followed by a comma, as documented above (the comma
    # used to be missing from the format string).
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[wday], mday, MONTHS[mon-1], year, hour, minute, sec)
120
121
# Timezone names known to be equivalent to UTC (only the keys matter).
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric offset: optional sign, 1-2 hour digits, optional ":" and minutes.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)


def offset_from_tz_string(tz):
    """Return the UTC offset, in seconds, described by timezone string *tz*.

    *tz* is either one of the names in UTC_ZONES (offset 0) or a numeric
    offset such as "-0800", "+01:30" or "5".  None is returned when the
    string is neither.
    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if match is None:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds += 60 * int(minutes)
    if sign == '-':
        return -seconds
    return seconds
138
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert date fields captured by the date regexps to epoch seconds.

    *day* and *yr* are digit strings; *mon* is a month name or number;
    *hr*, *min*, *sec* and *tz* may each be None when the corresponding
    part was absent from the parsed text.

    Returns None if the month is not recognised, if the fields are out of
    the range accepted by _timegm(), or if the timezone string is not
    understood by offset_from_tz_string().
    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if not 1 <= imon <= 12:
            return None
        mon = imon

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    # ISO_DATE_RE lets the seconds carry a fractional part ("29.5"), which
    # int() alone rejects with ValueError; go through float() and drop the
    # fraction, since the result is a whole number of seconds.
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year: pick the century that puts the abbreviated
        # year within 50 years of the current one
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
191
# Strictly conforming HTTP date, e.g. "Wed, 09 Feb 1994 22:23:32 GMT".
# Both fragments are raw strings so that "\d" reaches the regex engine
# instead of being an invalid string escape.
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
# Leading weekday name (abbreviated or spelled out) with optional comma.
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
# Forgiving day/month/year [clock] [timezone] pattern.
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X | re.ASCII)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        day, mon_name, yr, hr, minute, sec = m.groups()
        try:
            mon = MONTHS_LOWER.index(mon_name.lower()) + 1
        except ValueError:
            # The strict pattern only checks the *shape* of the month
            # ("Foo" passes); an unknown name is an unrecognized date,
            # not an error to propagate.
            return None
        # All fields are plain digit runs, so keep the result an integer
        # as documented.
        return _timegm((int(yr), mon, int(day),
                        int(hr), int(minute), int(sec)))

    # No, we need some messy parsing...

    # clean up: drop leading whitespace and the (useless) weekday
    text = WEEKDAY_RE.sub("", text.lstrip(), 1)

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, minute, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, minute, sec, tz)
269
# Forgiving ISO 8601 date[time][zone] pattern.  Written as a raw string so
# that "\d", "\s", "\/" and "\." are regex escapes rather than invalid
# string escapes.
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X | re.ASCII)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # leading whitespace is not significant
    match = ISO_DATE_RE.search(text.lstrip())
    if match is None:
        return None  # bad format

    # XXX the final group is an extra bit of the timezone that is ignored
    # here: is this the right thing to do?
    yr, mon, day, hr, minute, sec, tz, _ = match.groups()
    return _str2time(day, mon, yr, hr, minute, sec, tz)
314
315
316# Header parsing
317# -----------------------------------------------------------------------------
318
def unmatched(match):
    """Return the subject string of *match* with the matched span cut out."""
    subject = match.string
    begin, finish = match.span(0)
    return subject[:begin] + subject[finish:]
323
HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
# Leading run of "=", ";" and whitespace that cannot start a token.
_HEADER_JUNK_RE = re.compile(r"^[=\s;]*")


def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    # A bare string would be iterated character by character.
    assert not isinstance(header_values, str)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                # Every HEADER_* pattern is anchored at the start of the
                # text, so consuming a match means slicing from its end.
                text = text[m.end():]
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = text[m.end():]
                    value = HEADER_ESCAPE_RE.sub(r"\1", m.group(1))
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = text[m.end():]
                        value = m.group(1).rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = _HEADER_JUNK_RE.subn("", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result
412
# Characters that must be backslash-escaped inside a quoted value.
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")


def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        attrs = []
        for key, value in pairs:
            if value is None:
                # lone token: emitted without "=value"
                attrs.append(key)
                continue
            if not re.search(r"^\w+$", value):
                # not a plain word: escape " and \ and wrap in quotes
                value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
            attrs.append("%s=%s" % (key, value))
        if attrs:
            headers.append("; ".join(attrs))
    return ", ".join(headers)
438
def strip_quotes(text):
    """Return *text* without one leading and one trailing double quote.

    Each end is handled independently, so an unbalanced quote is removed
    too; the trailing check runs on what is left after the leading quote
    has been dropped.
    """
    if text[:1] == '"':
        text = text[1:]
    if text[-1:] == '"':
        text = text[:-1]
    return text
445
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    Returns one list of (key, value) pairs per non-empty header.  The first
    pair is the cookie's own name/value; the names of known
    cookie-attributes that follow are lowercased.  A value is None when the
    parameter had no "=".  An "expires" value is converted with http2time()
    (None if invalid), and ("version", "0") is appended when the header
    carried no version attribute.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, maxsplit=1)
                k = k.lstrip()
            if ii != 0:
                # only parameters after the name=value pair are attributes
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    # A bare "version" (no "=") has no value to unquote.
                    if v is not None:
                        v = strip_quotes(v)
                    version_set = True
                if k == "expires":
                    # convert expires date to seconds since epoch; a bare
                    # "expires" (no "=") stays None instead of crashing
                    if v is not None:
                        v = http2time(strip_quotes(v))  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
496
497
# Anything ending in ".<digits>" is treated as an IPv4 address.
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)


def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        return False
    if text == "":
        return False
    if text[0] == "." or text[-1] == ".":
        return False
    return True


def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # "A has the form NB": B must be a proper *suffix* of A.  (A != B here,
    # so a suffix match implies N is non-empty.)  Merely containing B is
    # not enough: www.acme.com.evil.org must not match .acme.com.
    if not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
552
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  The only thing ruled out is text
    that looks like an IPv4 address (per IPV4_RE).

    """
    return not IPV4_RE.search(text)
562
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  When both look like domain names and B starts with
    a dot, A matches if it ends with B; in every other case the two must
    be equal.

    """
    A = A.lower()
    B = B.lower()
    both_names = liberal_is_HDN(A) and liberal_is_HDN(B)
    if both_names and B.startswith("."):
        # ".example.com" covers any host ending in that suffix
        return A.endswith(B)
    # IP addresses, and names without a leading dot, need an exact match
    return A == B
582
# Trailing ":<digits>" port suffix of a host string.
cut_port_re = re.compile(r":\d+$", re.ASCII)


def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    The host is taken from the request's full URL; if the URL has no
    network location, the request's "Host" header is used instead.  Any
    trailing port number is removed.

    """
    netloc = urllib.parse.urlparse(request.get_full_url())[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", netloc, 1).lower()
599
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective
    name is the request-host with ".local" appended when the host contains
    no dot and does not look like an IPv4 address; otherwise the two are
    the same.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
610
def request_path(request):
    """request-URI, as defined by RFC 2965.

    Built from the path, parameters, query and fragment of the request's
    full URL, with the path (including any ";parameters") passed through
    escape_path().  The result always starts with "/".
    """
    parsed = urllib.parse.urlparse(request.get_full_url())
    path, parameters, query, frag = parsed[2:]
    if parameters:
        path = "%s;%s" % (path, parameters)
    req_path = urllib.parse.urlunparse(
        ("", "", escape_path(path), "", query, frag))
    if req_path.startswith("/"):
        return req_path
    # fix bad RFC 2396 absoluteURI
    return "/" + req_path
623
def request_port(request):
    """Return the port of *request* as a string.

    The port is whatever follows the first ":" in request.get_host().
    DEFAULT_HTTP_PORT is returned when the host has no ":", and None (with
    a debug message) when the text after it is not a number.
    """
    host = request.get_host()
    colon = host.find(':')
    if colon < 0:
        return DEFAULT_HTTP_PORT
    port = host[colon+1:]
    try:
        int(port)  # validation only; the string form is what is returned
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
637
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# A %-escape: percent sign followed by two hex digits (captured).
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")


def uppercase_escaped_char(match):
    """Return the %-escape matched by ESCAPED_CHAR_RE with upper-case hex."""
    return "%" + match.group(1).upper()


def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = urllib.parse.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
657
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split at the first dot: "a" is the leading label, "b" everything after.
    _a, first_dot, b = h.partition(".")
    if first_dot and is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
692
def is_third_party(request):
    """Return True if *request* is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(req_host, origin_reach)
708
709
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the given cookie-attributes on the instance.

        version and expires are converted with int() unless they are None,
        domain is lowercased, and rest (the mapping of non-standard
        cookie-attributes) is shallow-copied.

        Raises ValueError if port is None while port_specified is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        # private copy, so later changes don't leak back to the caller's dict
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return True if a non-standard cookie-attribute *name* is stored."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return non-standard cookie-attribute *name*, or *default*."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Store *value* as non-standard cookie-attribute *name*."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time at or before *now*.

        *now* defaults to the current time.  A cookie whose expires is None
        never counts as expired.
        """
        if now is None:
            now = time.time()
        if self.expires is None:
            return False
        if self.expires <= now:
            return True
        return False

    def __str__(self):
        port_part = "" if self.port is None else ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        field_names = (
            "version", "name", "value",
            "port", "port_specified",
            "domain", "domain_specified", "domain_initial_dot",
            "path", "path_specified",
            "secure", "expires", "discard", "comment", "comment_url",
        )
        parts = ["%s=%r" % (field, getattr(self, field))
                 for field in field_names]
        parts.append("rest=%r" % (self._rest,))
        parts.append("rfc2109=%r" % (self.rfc2109,))
        return "Cookie(%s)" % ", ".join(parts)
806
807
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        Not implemented here: always raises NotImplementedError.

        """
        raise NotImplementedError

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        Not implemented here: always raises NotImplementedError.
        """
        raise NotImplementedError

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation accepts every domain.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation accepts every path.
        """
        return True
839
840
841class DefaultCookiePolicy(CookiePolicy):
842 """Implements the standard rules for accepting and returning cookies."""
843
844 DomainStrictNoDots = 1
845 DomainStrictNonDomain = 2
846 DomainRFC2965Match = 4
847
848 DomainLiberal = 0
849 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
850
851 def __init__(self,
852 blocked_domains=None, allowed_domains=None,
853 netscape=True, rfc2965=False,
Neal Norwitz71dad722005-12-23 21:43:48 +0000854 rfc2109_as_netscape=None,
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000855 hide_cookie2=False,
856 strict_domain=False,
857 strict_rfc2965_unverifiable=True,
858 strict_ns_unverifiable=False,
859 strict_ns_domain=DomainLiberal,
860 strict_ns_set_initial_dollar=False,
861 strict_ns_set_path=False,
862 ):
863 """Constructor arguments should be passed as keyword arguments only."""
864 self.netscape = netscape
865 self.rfc2965 = rfc2965
Neal Norwitz71dad722005-12-23 21:43:48 +0000866 self.rfc2109_as_netscape = rfc2109_as_netscape
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000867 self.hide_cookie2 = hide_cookie2
868 self.strict_domain = strict_domain
869 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
870 self.strict_ns_unverifiable = strict_ns_unverifiable
871 self.strict_ns_domain = strict_ns_domain
872 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
873 self.strict_ns_set_path = strict_ns_set_path
874
875 if blocked_domains is not None:
876 self._blocked_domains = tuple(blocked_domains)
877 else:
878 self._blocked_domains = ()
879
880 if allowed_domains is not None:
881 allowed_domains = tuple(allowed_domains)
882 self._allowed_domains = allowed_domains
883
884 def blocked_domains(self):
885 """Return the sequence of blocked domains (as a tuple)."""
886 return self._blocked_domains
887 def set_blocked_domains(self, blocked_domains):
888 """Set the sequence of blocked domains."""
889 self._blocked_domains = tuple(blocked_domains)
890
891 def is_blocked(self, domain):
892 for blocked_domain in self._blocked_domains:
893 if user_domain_match(domain, blocked_domain):
894 return True
895 return False
896
897 def allowed_domains(self):
898 """Return None, or the sequence of allowed domains (as a tuple)."""
899 return self._allowed_domains
900 def set_allowed_domains(self, allowed_domains):
901 """Set the sequence of allowed domains, or None."""
902 if allowed_domains is not None:
903 allowed_domains = tuple(allowed_domains)
904 self._allowed_domains = allowed_domains
905
906 def is_not_allowed(self, domain):
907 if self._allowed_domains is None:
908 return False
909 for allowed_domain in self._allowed_domains:
910 if user_domain_match(domain, allowed_domain):
911 return False
912 return True
913
def set_ok(self, cookie, request):
    """
    Return True if every set_ok_* check accepts cookie for request.

    If you override .set_ok(), be sure to call this method.  If it returns
    false, so should your subclass (assuming your subclass wants to be more
    strict about which cookies to accept).

    """
    _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    assert cookie.name is not None

    # The checks run in this fixed order; all() stops at the first rejection.
    checks = ("version", "verifiability", "name", "path", "domain", "port")
    return all(getattr(self, "set_ok_" + check)(cookie, request)
               for check in checks)
932
def set_ok_version(self, cookie, request):
    """Return True if the cookie's version is enabled by this policy.

    A cookie without a version is rejected; version > 0 requires
    self.rfc2965 and version 0 requires self.netscape.
    """
    version = cookie.version
    if version is None:
        # Version is always set to 0 by parse_ns_headers if it's a Netscape
        # cookie, so this must be an invalid RFC 2965 cookie.
        _debug(" Set-Cookie2 without version attribute (%s=%s)",
               cookie.name, cookie.value)
        return False
    if version > 0:
        if not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
    elif version == 0 and not self.netscape:
        _debug(" Netscape cookies are switched off")
        return False
    return True
947
def set_ok_verifiability(self, cookie, request):
    """Return False for third-party cookies on unverifiable requests
    when the strict flag matching the cookie's version is set.

    The check only applies when request.is_unverifiable() is true and
    is_third_party(request) agrees; otherwise the cookie is accepted.
    """
    if not (request.is_unverifiable() and is_third_party(request)):
        return True
    version = cookie.version
    if version > 0 and self.strict_rfc2965_unverifiable:
        _debug(" third-party RFC 2965 cookie during "
               "unverifiable transaction")
        return False
    if version == 0 and self.strict_ns_unverifiable:
        _debug(" third-party Netscape cookie during "
               "unverifiable transaction")
        return False
    return True
959
def set_ok_name(self, cookie, request):
    """Reject version-0 cookies whose name starts with '$' when
    self.strict_ns_set_initial_dollar is set; accept everything else.
    """
    # Try and stop servers setting V0 cookies designed to hack other
    # servers that know both V0 and V1 protocols.
    dollar_hack = (cookie.version == 0 and
                   self.strict_ns_set_initial_dollar and
                   cookie.name.startswith("$"))
    if not dollar_hack:
        return True
    _debug(" illegal name (starts with '$'): '%s'", cookie.name)
    return False
968
def set_ok_path(self, cookie, request):
    """Return False if an explicitly given cookie path is not a prefix of
    the request path, for cookies where that rule is enforced.

    The rule applies to version > 0 cookies always, and to version 0
    cookies only when self.strict_ns_set_path is set.  Cookies without an
    explicit path are always accepted.
    """
    if not cookie.path_specified:
        return True
    req_path = request_path(request)
    enforced = (cookie.version > 0 or
                (cookie.version == 0 and self.strict_ns_set_path))
    if enforced and not req_path.startswith(cookie.path):
        _debug(" path attribute %s is not a prefix of request "
               "path %s", cookie.path, req_path)
        return False
    return True
979
def set_ok_domain(self, cookie, request):
    """Return True if the cookie's domain is acceptable for this request.

    The cookie is rejected when its domain is on the block-list or missing
    from a configured allow-list.  When the server explicitly specified
    the domain, these further checks are applied in order:

    * with self.strict_domain, domains of the form ".sld.cc" (a well-known
      second-level label followed by a two-letter label) are refused;
    * the domain must contain an embedded dot, unless it is ".local";
    * version 0: the effective request-host must end with the domain,
      with or without an added initial dot;
    * version > 0 or DomainRFC2965Match: the effective request-host must
      domain_match() the domain;
    * version > 0 or DomainStrictNoDots: the part of the request-host in
      front of the domain must not contain a dot (IPv4 hosts excepted).
    """
    if self.is_blocked(cookie.domain):
        _debug(" domain %s is in user block-list", cookie.domain)
        return False
    if self.is_not_allowed(cookie.domain):
        _debug(" domain %s is not in user allow-list", cookie.domain)
        return False
    if cookie.domain_specified:
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain
        if self.strict_domain and (domain.count(".") >= 2):
            # XXX This should probably be compared with the Konqueror
            # (kcookiejar.cpp) and Mozilla implementations, but it's a
            # losing battle.
            i = domain.rfind(".")
            j = domain.rfind(".", 0, i)
            if j == 0:  # domain like .foo.bar
                tld = domain[i+1:]
                sld = domain[j+1:i]
                if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                   "gov", "mil", "int", "aero", "biz", "cat", "coop",
                   "info", "jobs", "mobi", "museum", "name", "pro",
                   "travel", "eu") and len(tld) == 2:
                    # domain like .co.uk
                    _debug(" country-code second level domain %s", domain)
                    return False
        if domain.startswith("."):
            undotted_domain = domain[1:]
        else:
            undotted_domain = domain
        embedded_dots = (undotted_domain.find(".") >= 0)
        if not embedded_dots and domain != ".local":
            _debug(" non-local domain %s contains no embedded dot",
                   domain)
            return False
        if cookie.version == 0:
            if (not erhn.endswith(domain) and
                (not erhn.startswith(".") and
                 not ("."+erhn).endswith(domain))):
                # Message fixed: it used to read "does not end end with".
                _debug(" effective request-host %s (even with added "
                       "initial dot) does not end with %s",
                       erhn, domain)
                return False
        if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainRFC2965Match)):
            if not domain_match(erhn, domain):
                _debug(" effective request-host %s does not domain-match "
                       "%s", erhn, domain)
                return False
        if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainStrictNoDots)):
            # Whatever precedes the cookie domain in the request host.
            host_prefix = req_host[:-len(domain)]
            if (host_prefix.find(".") >= 0 and
                not IPV4_RE.search(req_host)):
                _debug(" host prefix %s for domain %s contains a dot",
                       host_prefix, domain)
                return False
    return True
1038
def set_ok_port(self, cookie, request):
    """Return True unless an explicit port list excludes the request port.

    Only applies when the cookie carries an explicit port list.  The list
    is scanned left to right: a non-numeric entry rejects the cookie, and
    the first entry equal to the request port (default "80") accepts it.
    """
    if not cookie.port_specified:
        return True
    req_port = request_port(request)
    req_port = "80" if req_port is None else str(req_port)
    for candidate in cookie.port.split(","):
        try:
            int(candidate)
        except ValueError:
            _debug(" bad port %s (not numeric)", candidate)
            return False
        if candidate == req_port:
            return True
    _debug(" request port (%s) not found in %s",
           req_port, cookie.port)
    return False
1059
def return_ok(self, cookie, request):
    """
    Return True if every return_ok_* check allows sending cookie back.

    If you override .return_ok(), be sure to call this method.  If it
    returns false, so should your subclass (assuming your subclass wants to
    be more strict about which cookies to return).

    """
    # Path has already been checked by .path_return_ok(), and domain
    # blocking done by .domain_return_ok().
    _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    # The checks run in this fixed order; all() stops at the first rejection.
    checks = ("version", "verifiability", "secure", "expires", "port",
              "domain")
    return all(getattr(self, "return_ok_" + check)(cookie, request)
               for check in checks)
1077
def return_ok_version(self, cookie, request):
    """Return True if the cookie's version is enabled by this policy.

    Version > 0 requires self.rfc2965; version 0 requires self.netscape.
    """
    version = cookie.version
    if version > 0:
        if not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
    elif version == 0 and not self.netscape:
        _debug(" Netscape cookies are switched off")
        return False
    return True
1086
def return_ok_verifiability(self, cookie, request):
    """Return False for third-party cookies on unverifiable requests
    when the strict flag matching the cookie's version is set.

    The check only applies when request.is_unverifiable() is true and
    is_third_party(request) agrees; otherwise the cookie may be returned.
    """
    if not (request.is_unverifiable() and is_third_party(request)):
        return True
    version = cookie.version
    if version > 0 and self.strict_rfc2965_unverifiable:
        _debug(" third-party RFC 2965 cookie during unverifiable "
               "transaction")
        return False
    if version == 0 and self.strict_ns_unverifiable:
        _debug(" third-party Netscape cookie during unverifiable "
               "transaction")
        return False
    return True
1098
def return_ok_secure(self, cookie, request):
    """Return False only for a secure cookie on a request whose type is
    not "https".
    """
    if not cookie.secure:
        return True
    if request.get_type() == "https":
        return True
    _debug(" secure cookie with non-secure request")
    return False
1104
def return_ok_expires(self, cookie, request):
    """Return False if cookie.is_expired(self._now) reports expiry."""
    expired = cookie.is_expired(self._now)
    if not expired:
        return True
    _debug(" cookie expired")
    return False
1110
def return_ok_port(self, cookie, request):
    """Return True unless the cookie's port list excludes the request port.

    Cookies with no (or an empty) port are always returned.  Otherwise the
    request port (default "80") must equal one of the comma-separated
    entries of cookie.port.
    """
    if not cookie.port:
        return True
    req_port = request_port(request)
    if req_port is None:
        req_port = "80"
    if req_port in cookie.port.split(","):
        return True
    _debug(" request port %s does not match cookie port %s",
           req_port, cookie.port)
    return False
1124
def return_ok_domain(self, cookie, request):
    """Return True if the cookie's domain permits returning it for request.

    Checks, in order:

    * version 0 with DomainStrictNonDomain set: a cookie whose domain was
      not explicitly specified must equal the effective request-host;
    * version > 0: the effective request-host must domain_match() the
      cookie domain;
    * version 0: the dotted effective request-host must end with the
      dotted cookie domain.
    """
    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    # Anchor the suffix comparison on a label boundary.  Without the
    # leading dot, a cookie stored for "example.com" would also be
    # returned to "notexample.com".
    if domain and not domain.startswith("."):
        dotdomain = "." + domain
    else:
        dotdomain = domain

    # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
    if (cookie.version == 0 and
        (self.strict_ns_domain & self.DomainStrictNonDomain) and
        not cookie.domain_specified and domain != erhn):
        _debug(" cookie with unspecified domain does not string-compare "
               "equal to request domain")
        return False

    if cookie.version > 0 and not domain_match(erhn, domain):
        _debug(" effective request-host name %s does not domain-match "
               "RFC 2965 cookie domain %s", erhn, domain)
        return False
    if cookie.version == 0 and not ("."+erhn).endswith(dotdomain):
        _debug(" request-host %s does not match Netscape cookie domain "
               "%s", req_host, domain)
        return False
    return True
1146
def domain_return_ok(self, domain, request):
    """Return False if no cookie stored under domain can go to request.

    This is a cheap pre-filter: the request host (or effective request
    host) must end with domain on a label boundary, and domain must pass
    the user block-list and allow-list.
    """
    # Liberal check of domain.  This is here as an optimization to avoid
    # having to load lots of MSIE cookie files unless necessary.
    req_host, erhn = eff_request_host(request)
    if not req_host.startswith("."):
        req_host = "."+req_host
    if not erhn.startswith("."):
        erhn = "."+erhn
    # Anchor the suffix comparison on a label boundary.  Without the
    # leading dot, cookies stored for "example.com" would also be offered
    # to "notexample.com".
    if domain and not domain.startswith("."):
        dotdomain = "." + domain
    else:
        dotdomain = domain
    if not (req_host.endswith(dotdomain) or erhn.endswith(dotdomain)):
        #_debug(" request domain %s does not match cookie domain %s",
        #       req_host, domain)
        return False

    if self.is_blocked(domain):
        _debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        _debug(" domain %s is not in user allow-list", domain)
        return False

    return True
1168
def path_return_ok(self, path, request):
    """Return True if the request path path-matches the cookie path.

    A match requires the two paths to be equal, or the cookie path to be
    a prefix of the request path that ends on a "/" boundary.  A bare
    string-prefix test would wrongly let cookie path "/foo" match request
    path "/foobar".
    """
    _debug("- checking cookie path=%s", path)
    req_path = request_path(request)
    if req_path == path:
        return True
    prefix_len = len(path)
    if (req_path.startswith(path) and
            (path.endswith("/") or
             req_path[prefix_len:prefix_len + 1] == "/")):
        return True
    _debug(" %s does not path-match %s", req_path, path)
    return False
1176
1177
def vals_sorted_by_key(adict):
    """Return a lazy iterator over adict's values, in sorted-key order."""
    ordered_keys = sorted(adict.keys())
    lookup = adict.get
    return map(lookup, ordered_keys)
1181
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    Any value exposing an ``items`` attribute is treated as a nested
    mapping and descended into; every other value is yielded as a leaf.
    """
    for key in sorted(mapping.keys()):
        value = mapping.get(key)
        try:
            value.items
        except AttributeError:
            is_nested = False
        else:
            is_nested = True
        if is_nested:
            for leaf in deepvalues(value):
                yield leaf
        else:
            yield value
1197
1198
# Sentinel passed as the default (second) argument to dict.get(), so that a
# missing key can be told apart from a key whose stored value is None.
class Absent:
    pass
1202
1203class CookieJar:
1204 """Collection of HTTP cookies.
1205
1206 You may not need to know about this class: try
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001207 urllib.request.build_opener(HTTPCookieProcessor).open(url).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001208 """
1209
1210 non_word_re = re.compile(r"\W")
1211 quote_re = re.compile(r"([\"\\])")
1212 strict_domain_re = re.compile(r"\.?[^.]*")
1213 domain_re = re.compile(r"[^.]*")
1214 dots_re = re.compile(r"^\.+")
1215
Antoine Pitroufd036452008-08-19 17:56:33 +00001216 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001217
1218 def __init__(self, policy=None):
1219 if policy is None:
1220 policy = DefaultCookiePolicy()
1221 self._policy = policy
1222
1223 self._cookies_lock = _threading.RLock()
1224 self._cookies = {}
1225
def set_policy(self, policy):
    """Replace the policy object consulted by this jar."""
    self._policy = policy
1228
1229 def _cookies_for_domain(self, domain, request):
1230 cookies = []
1231 if not self._policy.domain_return_ok(domain, request):
1232 return []
Thomas Wouters477c8d52006-05-27 19:21:47 +00001233 _debug("Checking %s for cookies to return", domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001234 cookies_by_path = self._cookies[domain]
1235 for path in cookies_by_path.keys():
1236 if not self._policy.path_return_ok(path, request):
1237 continue
1238 cookies_by_name = cookies_by_path[path]
1239 for cookie in cookies_by_name.values():
1240 if not self._policy.return_ok(cookie, request):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001241 _debug(" not returning cookie")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001242 continue
Thomas Wouters477c8d52006-05-27 19:21:47 +00001243 _debug(" it's a match")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001244 cookies.append(cookie)
1245 return cookies
1246
1247 def _cookies_for_request(self, request):
1248 """Return a list of cookies to be returned to server."""
1249 cookies = []
1250 for domain in self._cookies.keys():
1251 cookies.extend(self._cookies_for_domain(domain, request))
1252 return cookies
1253
1254 def _cookie_attrs(self, cookies):
1255 """Return a list of cookie-attributes to be returned to server.
1256
1257 like ['foo="bar"; $Path="/"', ...]
1258
1259 The $Version attribute is also added when appropriate (currently only
1260 once per request).
1261
1262 """
1263 # add cookies in order of most specific (ie. longest) path first
Raymond Hettinger70b64fc2008-01-30 20:15:17 +00001264 cookies.sort(key=lambda a: len(a.path), reverse=True)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001265
1266 version_set = False
1267
1268 attrs = []
1269 for cookie in cookies:
1270 # set version of Cookie header
1271 # XXX
1272 # What should it be if multiple matching Set-Cookie headers have
1273 # different versions themselves?
1274 # Answer: there is no answer; was supposed to be settled by
1275 # RFC 2965 errata, but that may never appear...
1276 version = cookie.version
1277 if not version_set:
1278 version_set = True
1279 if version > 0:
1280 attrs.append("$Version=%s" % version)
1281
1282 # quote cookie value if necessary
1283 # (not for Netscape protocol, which already has any quotes
1284 # intact, due to the poorly-specified Netscape Cookie: syntax)
1285 if ((cookie.value is not None) and
1286 self.non_word_re.search(cookie.value) and version > 0):
1287 value = self.quote_re.sub(r"\\\1", cookie.value)
1288 else:
1289 value = cookie.value
1290
1291 # add cookie-attributes to be returned in Cookie header
1292 if cookie.value is None:
1293 attrs.append(cookie.name)
1294 else:
1295 attrs.append("%s=%s" % (cookie.name, value))
1296 if version > 0:
1297 if cookie.path_specified:
1298 attrs.append('$Path="%s"' % cookie.path)
1299 if cookie.domain.startswith("."):
1300 domain = cookie.domain
1301 if (not cookie.domain_initial_dot and
1302 domain.startswith(".")):
1303 domain = domain[1:]
1304 attrs.append('$Domain="%s"' % domain)
1305 if cookie.port is not None:
1306 p = "$Port"
1307 if cookie.port_specified:
1308 p = p + ('="%s"' % cookie.port)
1309 attrs.append(p)
1310
1311 return attrs
1312
def add_cookie_header(self, request):
    """Add correct Cookie: header to request (urllib.request.Request object).

    The Cookie2 header is also added unless policy.hide_cookie2 is true.

    """
    _debug("add_cookie_header")
    with self._cookies_lock:
        # Share one timestamp between the jar and its policy.
        self._policy._now = self._now = int(time.time())

        cookies = self._cookies_for_request(request)
        attrs = self._cookie_attrs(cookies)

        # Never overwrite a Cookie header that is already on the request.
        if attrs and not request.has_header("Cookie"):
            request.add_unredirected_header("Cookie", "; ".join(attrs))

        # if necessary, advertise that we know RFC 2965
        policy = self._policy
        if (policy.rfc2965 and not policy.hide_cookie2 and
                not request.has_header("Cookie2")):
            if any(cookie.version != 1 for cookie in cookies):
                request.add_unredirected_header("Cookie2", '$Version="1"')

    self.clear_expired_cookies()
1345
1346 def _normalized_cookie_tuples(self, attrs_set):
1347 """Return list of tuples containing normalised cookie information.
1348
1349 attrs_set is the list of lists of key,value pairs extracted from
1350 the Set-Cookie or Set-Cookie2 headers.
1351
1352 Tuples are name, value, standard, rest, where name and value are the
1353 cookie name and value, standard is a dictionary containing the standard
1354 cookie-attributes (discard, secure, version, expires or max-age,
1355 domain, path and port) and rest is a dictionary containing the rest of
1356 the cookie-attributes.
1357
1358 """
1359 cookie_tuples = []
1360
1361 boolean_attrs = "discard", "secure"
1362 value_attrs = ("version",
1363 "expires", "max-age",
1364 "domain", "path", "port",
1365 "comment", "commenturl")
1366
1367 for cookie_attrs in attrs_set:
1368 name, value = cookie_attrs[0]
1369
1370 # Build dictionary of standard cookie-attributes (standard) and
1371 # dictionary of other cookie-attributes (rest).
1372
1373 # Note: expiry time is normalised to seconds since epoch. V0
1374 # cookies should have the Expires cookie-attribute, and V1 cookies
1375 # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1376 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1377 # accept either (but prefer Max-Age).
1378 max_age_set = False
1379
1380 bad_cookie = False
1381
1382 standard = {}
1383 rest = {}
1384 for k, v in cookie_attrs[1:]:
1385 lc = k.lower()
1386 # don't lose case distinction for unknown fields
1387 if lc in value_attrs or lc in boolean_attrs:
1388 k = lc
1389 if k in boolean_attrs and v is None:
1390 # boolean cookie-attribute is present, but has no value
1391 # (like "discard", rather than "port=80")
1392 v = True
1393 if k in standard:
1394 # only first value is significant
1395 continue
1396 if k == "domain":
1397 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001398 _debug(" missing value for domain attribute")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001399 bad_cookie = True
1400 break
1401 # RFC 2965 section 3.3.3
1402 v = v.lower()
1403 if k == "expires":
1404 if max_age_set:
1405 # Prefer max-age to expires (like Mozilla)
1406 continue
1407 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001408 _debug(" missing or invalid value for expires "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001409 "attribute: treating as session cookie")
1410 continue
1411 if k == "max-age":
1412 max_age_set = True
1413 try:
1414 v = int(v)
1415 except ValueError:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001416 _debug(" missing or invalid (non-numeric) value for "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001417 "max-age attribute")
1418 bad_cookie = True
1419 break
1420 # convert RFC 2965 Max-Age to seconds since epoch
1421 # XXX Strictly you're supposed to follow RFC 2616
1422 # age-calculation rules. Remember that zero Max-Age is a
1423 # is a request to discard (old and new) cookie, though.
1424 k = "expires"
1425 v = self._now + v
1426 if (k in value_attrs) or (k in boolean_attrs):
1427 if (v is None and
Raymond Hettingerdbecd932005-02-06 06:57:08 +00001428 k not in ("port", "comment", "commenturl")):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001429 _debug(" missing value for %s attribute" % k)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001430 bad_cookie = True
1431 break
1432 standard[k] = v
1433 else:
1434 rest[k] = v
1435
1436 if bad_cookie:
1437 continue
1438
1439 cookie_tuples.append((name, value, standard, rest))
1440
1441 return cookie_tuples
1442
def _cookie_from_cookie_tuple(self, tup, request):
    """Build a Cookie from one normalised (name, value, standard, rest) tuple.

    Missing path, domain and port attributes are defaulted from request.
    Returns None when the version attribute is not an integer, or when the
    expiry time is not later than self._now (in which case any stored
    cookie with the same domain, path and name is also cleared).
    """
    # standard is dict of standard cookie-attributes, rest is dict of the
    # rest of them
    name, value, standard, rest = tup

    domain = standard.get("domain", Absent)
    path = standard.get("path", Absent)
    port = standard.get("port", Absent)
    expires = standard.get("expires", Absent)

    # set the easy defaults
    version = standard.get("version", None)
    if version is not None:
        try:
            version = int(version)
        except ValueError:
            return None  # invalid version, ignore cookie
    secure = standard.get("secure", False)
    # (discard is also set if expires is Absent)
    discard = standard.get("discard", False)
    comment = standard.get("comment", None)
    comment_url = standard.get("commenturl", None)

    # set default path
    path_specified = path is not Absent and path != ""
    if path_specified:
        path = escape_path(path)
    else:
        path = request_path(request)
        last_slash = path.rfind("/")
        if last_slash != -1:
            # Netscape spec parts company from reality here: version 0
            # drops the trailing slash, other versions keep it.
            keep = last_slash if version == 0 else last_slash + 1
            path = path[:keep]
        if not path:
            path = "/"

    # set default domain, remembering whether it started with a dot
    domain_specified = domain is not Absent
    if domain_specified:
        domain_initial_dot = bool(domain.startswith("."))
        if not domain_initial_dot:
            domain = "." + domain
    else:
        domain_initial_dot = False
        req_host, erhn = eff_request_host(request)
        domain = erhn

    # set default port
    port_specified = False
    if port is Absent:
        # No port attr present.  Cookie can be sent back on any port.
        port = None
    elif port is None:
        # Port attr present, but has no value: default to request port.
        # Cookie should then only be sent back on that port.
        port = request_port(request)
    else:
        port_specified = True
        port = re.sub(r"\s+", "", port)

    # set default expires and discard
    if expires is Absent:
        expires = None
        discard = True
    elif expires <= self._now:
        # Expiry date in past is request to delete cookie.  This can't be
        # in DefaultCookiePolicy, because can't delete cookies there.
        try:
            self.clear(domain, path, name)
        except KeyError:
            pass
        _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
               domain, path, name)
        return None

    return Cookie(version,
                  name, value,
                  port, port_specified,
                  domain, domain_specified, domain_initial_dot,
                  path, path_specified,
                  secure,
                  expires,
                  discard,
                  comment,
                  comment_url,
                  rest)
1534
1535 def _cookies_from_attrs_set(self, attrs_set, request):
1536 cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1537
1538 cookies = []
1539 for tup in cookie_tuples:
1540 cookie = self._cookie_from_cookie_tuple(tup, request)
1541 if cookie: cookies.append(cookie)
1542 return cookies
1543
Neal Norwitz71dad722005-12-23 21:43:48 +00001544 def _process_rfc2109_cookies(self, cookies):
1545 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1546 if rfc2109_as_ns is None:
1547 rfc2109_as_ns = not self._policy.rfc2965
1548 for cookie in cookies:
1549 if cookie.version == 1:
1550 cookie.rfc2109 = True
Tim Peters536cf992005-12-25 23:18:31 +00001551 if rfc2109_as_ns:
Neal Norwitz71dad722005-12-23 21:43:48 +00001552 # treat 2109 cookies as Netscape cookies rather than
1553 # as RFC2965 cookies
1554 cookie.version = 0
1555
def make_cookies(self, response, request):
    """Return sequence of Cookie objects extracted from response object."""
    # get cookie-attributes for RFC 2965 and Netscape protocols
    headers = response.info()
    rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
    ns_hdrs = headers.get_all("Set-Cookie", [])

    rfc2965 = self._policy.rfc2965
    netscape = self._policy.netscape

    # no relevant cookie headers (or protocols enabled): quick exit
    if not ((rfc2965_hdrs or ns_hdrs) and
            (ns_hdrs or rfc2965) and
            (rfc2965_hdrs or netscape) and
            (netscape or rfc2965)):
        return []

    try:
        cookies = self._cookies_from_attrs_set(
            split_header_words(rfc2965_hdrs), request)
    except Exception:
        _warn_unhandled_exception()
        cookies = []

    if not (ns_hdrs and netscape):
        return cookies

    try:
        # RFC 2109 and Netscape cookies
        ns_cookies = self._cookies_from_attrs_set(
            parse_ns_headers(ns_hdrs), request)
    except Exception:
        _warn_unhandled_exception()
        ns_cookies = []
    self._process_rfc2109_cookies(ns_cookies)

    # Look for Netscape cookies (from Set-Cookie headers) that match
    # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
    # For each match, keep the RFC 2965 cookie and ignore the Netscape
    # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
    # bundled in with the Netscape cookies for this purpose, which is
    # reasonable behaviour.
    if rfc2965:
        claimed = set((cookie.domain, cookie.path, cookie.name)
                      for cookie in cookies)
        ns_cookies = [
            ns_cookie for ns_cookie in ns_cookies
            if (ns_cookie.domain, ns_cookie.path, ns_cookie.name)
            not in claimed]

    if ns_cookies:
        cookies.extend(ns_cookies)

    return cookies
1609
def set_cookie_if_ok(self, cookie, request):
    """Set a cookie if policy says it's OK to do so."""
    with self._cookies_lock:
        # Share one timestamp between the jar and its policy.
        self._policy._now = self._now = int(time.time())

        accepted = self._policy.set_ok(cookie, request)
        if accepted:
            self.set_cookie(cookie)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001622
def set_cookie(self, cookie):
    """Set a cookie, without checking whether or not it should be set."""
    store = self._cookies
    with self._cookies_lock:
        # Storage is nested as store[domain][path][name]; create the
        # intermediate levels on demand.
        by_path = store.setdefault(cookie.domain, {})
        by_name = by_path.setdefault(cookie.path, {})
        by_name[cookie.name] = cookie
1635
def extract_cookies(self, response, request):
    """Extract cookies from response, where allowable given the request."""
    _debug("extract_cookies: %s", response.info())
    with self._cookies_lock:
        # Share one timestamp between the jar and its policy.
        self._policy._now = self._now = int(time.time())

        for cookie in self.make_cookies(response, request):
            if not self._policy.set_ok(cookie, request):
                continue
            _debug(" setting cookie: %s", cookie)
            self.set_cookie(cookie)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001649
def clear(self, domain=None, path=None, name=None):
    """Clear some cookies.

    Invoking this method without arguments will clear all cookies.  If
    given a single argument, only cookies belonging to that domain will be
    removed.  If given two arguments, cookies belonging to the specified
    path within that domain are removed.  If given three arguments, then
    the cookie with the specified name, path and domain is removed.

    Raises KeyError if no matching cookie exists, and ValueError if a
    more specific argument is given without the ones it depends on.

    """
    if name is not None:
        if domain is None or path is None:
            raise ValueError(
                "domain and path must be given to remove a cookie by name")
        del self._cookies[domain][path][name]
        return
    if path is not None:
        if domain is None:
            raise ValueError(
                "domain must be given to remove cookies by path")
        del self._cookies[domain][path]
        return
    if domain is not None:
        del self._cookies[domain]
        return
    self._cookies = {}
1676
def clear_session_cookies(self):
    """Discard all session cookies.

    Note that the .save() method won't save session cookies anyway, unless
    you ask otherwise by passing a true ignore_discard argument.

    """
    with self._cookies_lock:
        for cookie in self:
            if not cookie.discard:
                continue
            self.clear(cookie.domain, cookie.path, cookie.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001691
def clear_expired_cookies(self):
    """Discard all expired cookies.

    Every cookie for which cookie.is_expired(now) is true is removed via
    self.clear(domain, path, name), where ``now`` is a single time.time()
    reading taken once per call.  The jar's lock is held throughout.

    You probably don't need to call this method: expired cookies are never
    sent back to the server (provided you're using DefaultCookiePolicy),
    this method is called by CookieJar itself every so often, and the
    .save() method won't save expired cookies anyway (unless you ask
    otherwise by passing a true ignore_expires argument).

    """
    with self._cookies_lock:
        now = time.time()
        # Collect first, delete afterwards: clear() mutates the store that
        # iterating over self walks, so removing during iteration could
        # skip cookies or fail, depending on how the iterator behaves.
        expired = [cookie for cookie in self if cookie.is_expired(now)]
        for cookie in expired:
            self.clear(cookie.domain, cookie.path, cookie.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001710
def __iter__(self):
    """Iterate over the jar's contents, as produced by deepvalues()."""
    cookie_store = self._cookies
    return deepvalues(cookie_store)
1713
def __len__(self):
    """Return number of contained cookies."""
    # Counted by walking the jar; no separate size is maintained here.
    return sum(1 for _ in self)
1719
def __repr__(self):
    """Return "<CLASS[...]>" listing the repr() of each contained cookie."""
    rendered = ", ".join([repr(cookie) for cookie in self])
    return "<%s[%s]>" % (self.__class__, rendered)
1724
def __str__(self):
    """Return "<CLASS[...]>" listing the str() of each contained cookie."""
    rendered = ", ".join([str(cookie) for cookie in self])
    return "<%s[%s]>" % (self.__class__, rendered)
1729
1730
# LoadError is a subclass of IOError purely for backwards-compatibility
# with Python 2.4.0.
class LoadError(IOError):
    """Error raised when a cookies file cannot be loaded."""
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001733
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    This base class handles filename bookkeeping, opening the file for
    load() and rolling back on a failed revert().  save() is not
    implemented here, and load() delegates the actual parsing to
    self._really_load(), which this class does not define.

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename: default file used by load() and revert() when they are
            called without one; must be string-like (or None)
        delayload: stored, coerced to bool, as self.delayload
        policy: passed through to CookieJar.__init__

        Raises ValueError if filename is not None and not string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            # Duck-type check: anything that can be concatenated with a str
            # is accepted.
            try:
                filename + ""
            except Exception:
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file."""
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  The file is opened in text mode, handed to
        self._really_load() and always closed afterwards.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with self._cookies_lock:
            # Keep a deep copy so a failed load can be rolled back.
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                self._cookies = old_state
                raise
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001793
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001794
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    Actually, the format is extended a bit -- see module docstring.

    The cookie is flattened into an ordered list of (key, value) pairs --
    name/value first, then the standard attributes, then any extra
    attributes from cookie._rest in sorted key order, and finally the
    version -- which is handed to join_header_words().

    """
    pairs = [(cookie.name, cookie.value),
             ("path", cookie.path),
             ("domain", cookie.domain)]
    if cookie.port is not None:
        pairs.append(("port", cookie.port))

    # Flag attributes are written as a bare key (value None) when set.
    flags = (("path_spec", cookie.path_specified),
             ("port_spec", cookie.port_specified),
             ("domain_dot", cookie.domain_initial_dot),
             ("secure", cookie.secure))
    for flag_key, flag_on in flags:
        if flag_on:
            pairs.append((flag_key, None))

    if cookie.expires:
        pairs.append(("expires", time2isoz(float(cookie.expires))))
    if cookie.discard:
        pairs.append(("discard", None))
    if cookie.comment:
        pairs.append(("comment", cookie.comment))
    if cookie.comment_url:
        pairs.append(("commenturl", cookie.comment_url))

    pairs.extend((key, str(cookie._rest[key]))
                 for key in sorted(cookie._rest.keys()))

    pairs.append(("version", str(cookie.version)))

    return join_header_words([pairs])
1822
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        ignore_discard and ignore_expires: see docstring for FileCookieJar.save

        """
        now = time.time()
        lines = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            lines.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        # The trailing empty element makes the result end with a newline
        # whenever at least one cookie was written.
        return "\n".join(lines + [""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename (default self.filename) in LWP format.

        Raises ValueError if no filename is given and self.filename is None.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with open(filename, "w") as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse "Set-Cookie3:" lines from the open file f into this jar.

        The first line must match self.magic_re, otherwise LoadError is
        raised.  Lines that do not start with "Set-Cookie3:" are skipped.
        Cookies with discard set, or already expired, are dropped unless
        ignore_discard / ignore_expires is true.

        IOError propagates unchanged; any other exception while parsing is
        reported as a LoadError naming the file and the offending line.

        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        try:
            while True:
                line = f.readline()
                if line == "":
                    break
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    # The first pair is the cookie itself; the remaining
                    # pairs are its attributes.
                    name, value = data[0]
                    standard = {}
                    rest = {}
                    for k in boolean_attrs:
                        standard[k] = False
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            # A bare flag (no value) means "true".
                            if v is None:
                                v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    # No expiry time (absent, or iso2time() returned None):
                    # treat the cookie as a session cookie.
                    if expires is None:
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)

        except IOError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1945
1946
class MozillaCookieJar(FileCookieJar):
    """

    WARNING: you may want to backup your browser's cookies file if you use
    this class to save cookies.  I *think* it works, but there have been
    bugs in the past!

    This class differs from CookieJar only in the format it uses to save and
    load cookies to and from a file.  This class uses the Mozilla/Netscape
    `cookies.txt' format.  lynx uses this file format, too.

    Don't expect cookies saved while the browser is running to be noticed by
    the browser (in fact, Mozilla on unix will overwrite your saved cookies if
    you change them on disk while it's running; on Windows, you probably can't
    save at all while the browser is running).

    Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
    Netscape cookies on saving.

    In particular, the cookie version and port number information is lost,
    together with information about whether or not Path, Port and Discard were
    specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
    domain as set in the HTTP header started with a dot (yes, I'm aware some
    domains in Netscape files start with a dot and some don't -- trust me, you
    really don't want to know any more about this).

    Note that though Mozilla and Netscape use the same format, they use
    slightly different headers.  The class saves cookies using the Netscape
    header by default (Mozilla can cope with that).

    """
    magic_re = re.compile("#( Netscape)? HTTP Cookie File")
    # Written verbatim at the top of every saved file.  The comment lines
    # start at column 0 so the magic line is the very first text in the file.
    header = """\
# Netscape HTTP Cookie File
# http://www.netscape.com/newsref/std/cookie_spec.html
# This is a generated file! Do not edit.

"""

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse a Netscape-format cookies file from the open file f.

        The first line must match self.magic_re, otherwise LoadError is
        raised.  Each remaining non-comment, non-blank line must hold seven
        tab-separated fields: domain, domain_specified, path, secure,
        expires, name, value.  Cookies with discard set, or already
        expired, are dropped unless ignore_discard / ignore_expires is true.

        IOError propagates unchanged; any other exception while parsing is
        reported as a LoadError naming the file and the offending line.

        """
        now = time.time()

        magic = f.readline()
        if not self.magic_re.search(magic):
            # f is not closed here: FileCookieJar.load() opened it and
            # closes it on every exit path.
            raise LoadError(
                "%r does not look like a Netscape format cookies file" %
                filename)

        try:
            while True:
                line = f.readline()
                if line == "":
                    break

                # last field may be absent, so keep any trailing tab
                if line.endswith("\n"):
                    line = line[:-1]

                # skip comments and blank lines XXX what is $ for?
                stripped = line.strip()
                if stripped.startswith(("#", "$")) or stripped == "":
                    continue

                (domain, domain_specified, path, secure, expires,
                 name, value) = line.split("\t")
                secure = (secure == "TRUE")
                domain_specified = (domain_specified == "TRUE")
                if name == "":
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = value
                    value = None

                initial_dot = domain.startswith(".")
                # Explicit check rather than assert, so malformed input is
                # still rejected when Python runs with -O.  The handler
                # below turns this into a LoadError.
                if domain_specified != initial_dot:
                    raise ValueError(
                        "domain_specified flag does not match the "
                        "domain's leading dot")

                discard = False
                if expires == "":
                    expires = None
                    discard = True

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                if not ignore_discard and c.discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                self.set_cookie(c)

        except IOError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Netscape format cookies file %r: %r" %
                            (filename, line))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename (default self.filename) in Netscape format.

        Cookies with discard set, or already expired, are skipped unless
        ignore_discard / ignore_expires is true.

        Raises ValueError if no filename is given and self.filename is None.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with open(filename, "w") as f:
            f.write(self.header)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = "TRUE" if cookie.secure else "FALSE"
                if cookie.domain.startswith("."):
                    initial_dot = "TRUE"
                else:
                    initial_dot = "FALSE"
                if cookie.expires is not None:
                    expires = str(cookie.expires)
                else:
                    expires = ""
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ""
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    "\t".join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) +
                    "\n")