blob: e7f0b4b321378c68423a07755b4483f7c52bf19a [file] [log] [blame]
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001"""HTTP cookie handling for web clients.
2
3This module has (now fairly distant) origins in Gisle Aas' Perl module
4HTTP::Cookies, from the libwww-perl library.
5
6Docstrings, comments and debug strings in this code refer to the
7attributes of the HTTP cookie system as cookie-attributes, to distinguish
8them clearly from Python attributes.
9
Thomas Wouters477c8d52006-05-27 19:21:47 +000010Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
11distributed with the Python standard library, but are available from
12http://wwwsearch.sf.net/):
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000013
14 CookieJar____
15 / \ \
16 FileCookieJar \ \
17 / | \ \ \
18 MozillaCookieJar | LWPCookieJar \ \
19 | | \
20 | ---MSIEBase | \
21 | / | | \
22 | / MSIEDBCookieJar BSDDBCookieJar
23 |/
24 MSIECookieJar
25
26"""
27
Thomas Wouters477c8d52006-05-27 19:21:47 +000028__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
29 'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
30
Jeremy Hylton1afc1692008-06-18 20:49:58 +000031import copy
Victor Stinner628225c2011-03-21 02:38:51 +010032import datetime
Jeremy Hylton1afc1692008-06-18 20:49:58 +000033import re
34import time
35import urllib.parse, urllib.request
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000036try:
37 import threading as _threading
38except ImportError:
39 import dummy_threading as _threading
Georg Brandl24420152008-05-26 16:32:26 +000040import http.client # only for the default HTTP port
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000041from calendar import timegm
42
Thomas Wouters477c8d52006-05-27 19:21:47 +000043debug = False # set to True to enable debugging via the logging module
44logger = None
45
46def _debug(*args):
47 if not debug:
48 return
49 global logger
50 if not logger:
51 import logging
Georg Brandl24420152008-05-26 16:32:26 +000052 logger = logging.getLogger("http.cookiejar")
Thomas Wouters477c8d52006-05-27 19:21:47 +000053 return logger.debug(*args)
54
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000055
Georg Brandl24420152008-05-26 16:32:26 +000056DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000057MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
58 "instance initialised with one)")
59
Thomas Wouters477c8d52006-05-27 19:21:47 +000060def _warn_unhandled_exception():
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000061 # There are a few catch-all except: statements in this module, for
Thomas Wouters477c8d52006-05-27 19:21:47 +000062 # catching input that's bad in unexpected ways. Warn if any
63 # exceptions are caught there.
Jeremy Hylton7ecf3dc2008-05-10 20:38:40 +000064 import io, warnings, traceback
Guido van Rossum34d19282007-08-09 01:03:29 +000065 f = io.StringIO()
Andrew M. Kuchlingae40c2f2004-07-10 18:32:12 +000066 traceback.print_exc(None, f)
67 msg = f.getvalue()
Georg Brandl24420152008-05-26 16:32:26 +000068 warnings.warn("http.cookiejar bug!\n%s" % msg, stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000069
70
71# Date/time conversion
72# -----------------------------------------------------------------------------
73
74EPOCH_YEAR = 1970
75def _timegm(tt):
76 year, month, mday, hour, min, sec = tt[:6]
77 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
78 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
79 return timegm(tt)
80 else:
81 return None
82
83DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
84MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
85 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
86MONTHS_LOWER = []
87for month in MONTHS: MONTHS_LOWER.append(month.lower())
88
89def time2isoz(t=None):
90 """Return a string representing time in seconds since epoch, t.
91
92 If the function is called without an argument, it will use the current
93 time.
94
95 The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
96 representing Universal Time (UTC, aka GMT). An example of this format is:
97
98 1994-11-24 08:49:37Z
99
100 """
Victor Stinner628225c2011-03-21 02:38:51 +0100101 if t is None:
102 dt = datetime.datetime.utcnow()
103 else:
104 dt = datetime.datetime.utcfromtimestamp(t)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000105 return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
Victor Stinner628225c2011-03-21 02:38:51 +0100106 dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000107
108def time2netscape(t=None):
109 """Return a string representing time in seconds since epoch, t.
110
111 If the function is called without an argument, it will use the current
112 time.
113
114 The format of the returned string is like this:
115
116 Wed, DD-Mon-YYYY HH:MM:SS GMT
117
118 """
Victor Stinner628225c2011-03-21 02:38:51 +0100119 if t is None:
120 dt = datetime.datetime.utcnow()
121 else:
122 dt = datetime.datetime.utcfromtimestamp(t)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000123 return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % (
Victor Stinner628225c2011-03-21 02:38:51 +0100124 DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1],
125 dt.year, dt.hour, dt.minute, dt.second)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000126
127
128UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}
129
Antoine Pitroufd036452008-08-19 17:56:33 +0000130TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000131def offset_from_tz_string(tz):
132 offset = None
133 if tz in UTC_ZONES:
134 offset = 0
135 else:
136 m = TIMEZONE_RE.search(tz)
137 if m:
138 offset = 3600 * int(m.group(2))
139 if m.group(3):
140 offset = offset + 60 * int(m.group(3))
141 if m.group(1) == '-':
142 offset = -offset
143 return offset
144
145def _str2time(day, mon, yr, hr, min, sec, tz):
146 # translate month name to number
147 # month numbers start with 1 (January)
148 try:
149 mon = MONTHS_LOWER.index(mon.lower())+1
150 except ValueError:
151 # maybe it's already a number
152 try:
153 imon = int(mon)
154 except ValueError:
155 return None
156 if 1 <= imon <= 12:
157 mon = imon
158 else:
159 return None
160
161 # make sure clock elements are defined
162 if hr is None: hr = 0
163 if min is None: min = 0
164 if sec is None: sec = 0
165
166 yr = int(yr)
167 day = int(day)
168 hr = int(hr)
169 min = int(min)
170 sec = int(sec)
171
172 if yr < 1000:
173 # find "obvious" year
174 cur_yr = time.localtime(time.time())[0]
175 m = cur_yr % 100
176 tmp = yr
177 yr = yr + cur_yr - m
178 m = m - tmp
179 if abs(m) > 50:
180 if m > 0: yr = yr + 100
181 else: yr = yr - 100
182
183 # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
184 t = _timegm((yr, mon, day, hr, min, sec, tz))
185
186 if t is not None:
187 # adjust time using timezone string, to get absolute time since epoch
188 if tz is None:
189 tz = "UTC"
190 tz = tz.upper()
191 offset = offset_from_tz_string(tz)
192 if offset is None:
193 return None
194 t = t - offset
195
196 return t
197
198STRICT_DATE_RE = re.compile(
199 r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
Antoine Pitroufd036452008-08-19 17:56:33 +0000200 "(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000201WEEKDAY_RE = re.compile(
Antoine Pitroufd036452008-08-19 17:56:33 +0000202 r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000203LOOSE_HTTP_DATE_RE = re.compile(
204 r"""^
205 (\d\d?) # day
206 (?:\s+|[-\/])
207 (\w+) # month
208 (?:\s+|[-\/])
209 (\d+) # year
210 (?:
211 (?:\s+|:) # separator before clock
212 (\d\d?):(\d\d) # hour:min
213 (?::(\d\d))? # optional seconds
214 )? # optional clock
215 \s*
216 ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
217 \s*
218 (?:\(\w+\))? # ASCII representation of timezone in parens.
Antoine Pitroufd036452008-08-19 17:56:33 +0000219 \s*$""", re.X | re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000220def http2time(text):
221 """Returns time in seconds since epoch of time represented by a string.
222
223 Return value is an integer.
224
225 None is returned if the format of str is unrecognized, the time is outside
226 the representable range, or the timezone string is not recognized. If the
227 string contains no timezone, UTC is assumed.
228
229 The timezone in the string may be numerical (like "-0800" or "+0100") or a
230 string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the
231 timezone strings equivalent to UTC (zero offset) are known to the function.
232
233 The function loosely parses the following formats:
234
235 Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format
236 Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format
237 Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format
238 09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday)
239 08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday)
240 08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday)
241
242 The parser ignores leading and trailing whitespace. The time may be
243 absent.
244
245 If the year is given with only 2 digits, the function will select the
246 century that makes the year closest to the current date.
247
248 """
249 # fast exit for strictly conforming string
250 m = STRICT_DATE_RE.search(text)
251 if m:
252 g = m.groups()
253 mon = MONTHS_LOWER.index(g[1].lower()) + 1
254 tt = (int(g[2]), mon, int(g[0]),
255 int(g[3]), int(g[4]), float(g[5]))
256 return _timegm(tt)
257
258 # No, we need some messy parsing...
259
260 # clean up
261 text = text.lstrip()
262 text = WEEKDAY_RE.sub("", text, 1) # Useless weekday
263
264 # tz is time zone specifier string
265 day, mon, yr, hr, min, sec, tz = [None]*7
266
267 # loose regexp parse
268 m = LOOSE_HTTP_DATE_RE.search(text)
269 if m is not None:
270 day, mon, yr, hr, min, sec, tz = m.groups()
271 else:
272 return None # bad format
273
274 return _str2time(day, mon, yr, hr, min, sec, tz)
275
276ISO_DATE_RE = re.compile(
277 """^
278 (\d{4}) # year
279 [-\/]?
280 (\d\d?) # numerical month
281 [-\/]?
282 (\d\d?) # day
283 (?:
284 (?:\s+|[-:Tt]) # separator before clock
285 (\d\d?):?(\d\d) # hour:min
286 (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional)
287 )? # optional clock
288 \s*
289 ([-+]?\d\d?:?(:?\d\d)?
290 |Z|z)? # timezone (Z is "zero meridian", i.e. GMT)
Antoine Pitroufd036452008-08-19 17:56:33 +0000291 \s*$""", re.X | re. ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000292def iso2time(text):
293 """
294 As for http2time, but parses the ISO 8601 formats:
295
296 1994-02-03 14:15:29 -0100 -- ISO 8601 format
297 1994-02-03 14:15:29 -- zone is optional
298 1994-02-03 -- only date
299 1994-02-03T14:15:29 -- Use T as separator
300 19940203T141529Z -- ISO 8601 compact format
301 19940203 -- only date
302
303 """
304 # clean up
305 text = text.lstrip()
306
307 # tz is time zone specifier string
308 day, mon, yr, hr, min, sec, tz = [None]*7
309
310 # loose regexp parse
311 m = ISO_DATE_RE.search(text)
312 if m is not None:
313 # XXX there's an extra bit of the timezone I'm ignoring here: is
314 # this the right thing to do?
315 yr, mon, day, hr, min, sec, tz, _ = m.groups()
316 else:
317 return None # bad format
318
319 return _str2time(day, mon, yr, hr, min, sec, tz)
320
321
322# Header parsing
323# -----------------------------------------------------------------------------
324
325def unmatched(match):
326 """Return unmatched part of re.Match object."""
327 start, end = match.span(0)
328 return match.string[:start]+match.string[end:]
329
330HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)")
331HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
332HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)")
333HEADER_ESCAPE_RE = re.compile(r"\\(.)")
334def split_header_words(header_values):
335 r"""Parse header values into a list of lists containing key,value pairs.
336
337 The function knows how to deal with ",", ";" and "=" as well as quoted
338 values after "=". A list of space separated tokens are parsed as if they
339 were separated by ";".
340
341 If the header_values passed as argument contains multiple values, then they
342 are treated as if they were a single value separated by comma ",".
343
344 This means that this function is useful for parsing header fields that
345 follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
346 the requirement for tokens).
347
348 headers = #header
349 header = (token | parameter) *( [";"] (token | parameter))
350
351 token = 1*<any CHAR except CTLs or separators>
352 separators = "(" | ")" | "<" | ">" | "@"
353 | "," | ";" | ":" | "\" | <">
354 | "/" | "[" | "]" | "?" | "="
355 | "{" | "}" | SP | HT
356
357 quoted-string = ( <"> *(qdtext | quoted-pair ) <"> )
358 qdtext = <any TEXT except <">>
359 quoted-pair = "\" CHAR
360
361 parameter = attribute "=" value
362 attribute = token
363 value = token | quoted-string
364
365 Each header is represented by a list of key/value pairs. The value for a
366 simple token (not part of a parameter) is None. Syntactically incorrect
367 headers will not necessarily be parsed as you would want.
368
369 This is easier to describe with some examples:
370
371 >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
372 [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
373 >>> split_header_words(['text/html; charset="iso-8859-1"'])
374 [[('text/html', None), ('charset', 'iso-8859-1')]]
375 >>> split_header_words([r'Basic realm="\"foo\bar\""'])
376 [[('Basic', None), ('realm', '"foobar"')]]
377
378 """
Guido van Rossum3172c5d2007-10-16 18:12:55 +0000379 assert not isinstance(header_values, str)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000380 result = []
381 for text in header_values:
382 orig_text = text
383 pairs = []
384 while text:
385 m = HEADER_TOKEN_RE.search(text)
386 if m:
387 text = unmatched(m)
388 name = m.group(1)
389 m = HEADER_QUOTED_VALUE_RE.search(text)
390 if m: # quoted value
391 text = unmatched(m)
392 value = m.group(1)
393 value = HEADER_ESCAPE_RE.sub(r"\1", value)
394 else:
395 m = HEADER_VALUE_RE.search(text)
396 if m: # unquoted value
397 text = unmatched(m)
398 value = m.group(1)
399 value = value.rstrip()
400 else:
401 # no value, a lone token
402 value = None
403 pairs.append((name, value))
404 elif text.lstrip().startswith(","):
405 # concatenated headers, as per RFC 2616 section 4.2
406 text = text.lstrip()[1:]
407 if pairs: result.append(pairs)
408 pairs = []
409 else:
410 # skip junk
411 non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
412 assert nr_junk_chars > 0, (
413 "split_header_words bug: '%s', '%s', %s" %
414 (orig_text, text, pairs))
415 text = non_junk
416 if pairs: result.append(pairs)
417 return result
418
419HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
420def join_header_words(lists):
421 """Do the inverse (almost) of the conversion done by split_header_words.
422
423 Takes a list of lists of (key, value) pairs and produces a single header
424 value. Attribute values are quoted if needed.
425
426 >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
427 'text/plain; charset="iso-8859/1"'
428 >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
429 'text/plain, charset="iso-8859/1"'
430
431 """
432 headers = []
433 for pairs in lists:
434 attr = []
435 for k, v in pairs:
436 if v is not None:
437 if not re.search(r"^\w+$", v):
438 v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v) # escape " and \
439 v = '"%s"' % v
440 k = "%s=%s" % (k, v)
441 attr.append(k)
442 if attr: headers.append("; ".join(attr))
443 return ", ".join(headers)
444
445def parse_ns_headers(ns_headers):
446 """Ad-hoc parser for Netscape protocol cookie-attributes.
447
448 The old Netscape cookie format for Set-Cookie can for instance contain
449 an unquoted "," in the expires field, so we have to use this ad-hoc
450 parser instead of split_header_words.
451
452 XXX This may not make the best possible effort to parse all the crap
453 that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient
454 parser is probably better, so could do worse than following that if
455 this ever gives any trouble.
456
457 Currently, this is also used for parsing RFC 2109 cookies.
458
459 """
460 known_attrs = ("expires", "domain", "path", "secure",
461 # RFC 2109 attrs (may turn up in Netscape cookies, too)
462 "port", "max-age")
463
464 result = []
465 for ns_header in ns_headers:
466 pairs = []
467 version_set = False
Martin v. Löwis4ea3ead2005-03-03 10:48:12 +0000468 for ii, param in enumerate(re.split(r";\s*", ns_header)):
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000469 param = param.rstrip()
470 if param == "": continue
471 if "=" not in param:
Martin v. Löwisc5574e82005-03-03 10:57:37 +0000472 k, v = param, None
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000473 else:
474 k, v = re.split(r"\s*=\s*", param, 1)
475 k = k.lstrip()
Martin v. Löwis4ea3ead2005-03-03 10:48:12 +0000476 if ii != 0:
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000477 lc = k.lower()
478 if lc in known_attrs:
479 k = lc
480 if k == "version":
Neal Norwitz71dad722005-12-23 21:43:48 +0000481 # This is an RFC 2109 cookie.
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000482 version_set = True
483 if k == "expires":
484 # convert expires date to seconds since epoch
485 if v.startswith('"'): v = v[1:]
486 if v.endswith('"'): v = v[:-1]
487 v = http2time(v) # None if invalid
488 pairs.append((k, v))
489
490 if pairs:
491 if not version_set:
492 pairs.append(("version", "0"))
493 result.append(pairs)
494
495 return result
496
497
Antoine Pitroufd036452008-08-19 17:56:33 +0000498IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000499def is_HDN(text):
500 """Return True if text is a host domain name."""
501 # XXX
502 # This may well be wrong. Which RFC is HDN defined in, if any (for
503 # the purposes of RFC 2965)?
504 # For the current implementation, what about IPv6? Remember to look
505 # at other uses of IPV4_RE also, if change this.
506 if IPV4_RE.search(text):
507 return False
508 if text == "":
509 return False
510 if text[0] == "." or text[-1] == ".":
511 return False
512 return True
513
514def domain_match(A, B):
515 """Return True if domain A domain-matches domain B, according to RFC 2965.
516
517 A and B may be host domain names or IP addresses.
518
519 RFC 2965, section 1:
520
521 Host names can be specified either as an IP address or a HDN string.
522 Sometimes we compare one host name with another. (Such comparisons SHALL
523 be case-insensitive.) Host A's name domain-matches host B's if
524
525 * their host name strings string-compare equal; or
526
527 * A is a HDN string and has the form NB, where N is a non-empty
528 name string, B has the form .B', and B' is a HDN string. (So,
529 x.y.com domain-matches .Y.com but not Y.com.)
530
531 Note that domain-match is not a commutative operation: a.b.c.com
532 domain-matches .c.com, but not the reverse.
533
534 """
535 # Note that, if A or B are IP addresses, the only relevant part of the
536 # definition of the domain-match algorithm is the direct string-compare.
537 A = A.lower()
538 B = B.lower()
539 if A == B:
540 return True
541 if not is_HDN(A):
542 return False
543 i = A.rfind(B)
544 if i == -1 or i == 0:
545 # A does not have form NB, or N is the empty string
546 return False
547 if not B.startswith("."):
548 return False
549 if not is_HDN(B[1:]):
550 return False
551 return True
552
553def liberal_is_HDN(text):
554 """Return True if text is a sort-of-like a host domain name.
555
556 For accepting/blocking domains.
557
558 """
559 if IPV4_RE.search(text):
560 return False
561 return True
562
563def user_domain_match(A, B):
564 """For blocking/accepting domains.
565
566 A and B may be host domain names or IP addresses.
567
568 """
569 A = A.lower()
570 B = B.lower()
571 if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
572 if A == B:
573 # equal IP addresses
574 return True
575 return False
576 initial_dot = B.startswith(".")
577 if initial_dot and A.endswith(B):
578 return True
579 if not initial_dot and A == B:
580 return True
581 return False
582
Antoine Pitroufd036452008-08-19 17:56:33 +0000583cut_port_re = re.compile(r":\d+$", re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000584def request_host(request):
585 """Return request-host, as defined by RFC 2965.
586
587 Variation from RFC: returned value is lowercased, for convenient
588 comparison.
589
590 """
591 url = request.get_full_url()
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000592 host = urllib.parse.urlparse(url)[1]
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000593 if host == "":
594 host = request.get_header("Host", "")
595
596 # remove port, if present
597 host = cut_port_re.sub("", host, 1)
598 return host.lower()
599
600def eff_request_host(request):
601 """Return a tuple (request-host, effective request-host name).
602
603 As defined by RFC 2965, except both are lowercased.
604
605 """
606 erhn = req_host = request_host(request)
607 if req_host.find(".") == -1 and not IPV4_RE.search(req_host):
608 erhn = req_host + ".local"
609 return req_host, erhn
610
611def request_path(request):
Gregory P. Smith2c4973d2010-07-25 19:53:20 +0000612 """Path component of request-URI, as defined by RFC 2965."""
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000613 url = request.get_full_url()
Gregory P. Smith2c4973d2010-07-25 19:53:20 +0000614 parts = urllib.parse.urlsplit(url)
615 path = escape_path(parts.path)
616 if not path.startswith("/"):
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000617 # fix bad RFC 2396 absoluteURI
Gregory P. Smith2c4973d2010-07-25 19:53:20 +0000618 path = "/" + path
619 return path
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000620
621def request_port(request):
622 host = request.get_host()
623 i = host.find(':')
624 if i >= 0:
625 port = host[i+1:]
626 try:
627 int(port)
628 except ValueError:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000629 _debug("nonnumeric port: '%s'", port)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000630 return None
631 else:
632 port = DEFAULT_HTTP_PORT
633 return port
634
635# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
636# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
637HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
638ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
639def uppercase_escaped_char(match):
640 return "%%%s" % match.group(1).upper()
641def escape_path(path):
642 """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
643 # There's no knowing what character encoding was used to create URLs
644 # containing %-escapes, but since we have to pick one to escape invalid
645 # path characters, we pick UTF-8, as recommended in the HTML 4.0
646 # specification:
647 # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
648 # And here, kind of: draft-fielding-uri-rfc2396bis-03
649 # (And in draft IRI specification: draft-duerst-iri-05)
650 # (And here, for new URI schemes: RFC 2718)
Jeremy Hylton1afc1692008-06-18 20:49:58 +0000651 path = urllib.parse.quote(path, HTTP_PATH_SAFE)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000652 path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path)
653 return path
654
655def reach(h):
656 """Return reach of host h, as defined by RFC 2965, section 1.
657
658 The reach R of a host name H is defined as follows:
659
660 * If
661
662 - H is the host domain name of a host; and,
663
664 - H has the form A.B; and
665
666 - A has no embedded (that is, interior) dots; and
667
668 - B has at least one embedded dot, or B is the string "local".
669 then the reach of H is .B.
670
671 * Otherwise, the reach of H is H.
672
673 >>> reach("www.acme.com")
674 '.acme.com'
675 >>> reach("acme.com")
676 'acme.com'
677 >>> reach("acme.local")
678 '.local'
679
680 """
681 i = h.find(".")
682 if i >= 0:
683 #a = h[:i] # this line is only here to show what a is
684 b = h[i+1:]
685 i = b.find(".")
686 if is_HDN(h) and (i >= 0 or b == "local"):
687 return "."+b
688 return h
689
690def is_third_party(request):
691 """
692
693 RFC 2965, section 3.3.6:
694
695 An unverifiable transaction is to a third-party host if its request-
696 host U does not domain-match the reach R of the request-host O in the
697 origin transaction.
698
699 """
700 req_host = request_host(request)
701 if not domain_match(req_host, reach(request.get_origin_req_host())):
702 return True
703 else:
704 return False
705
706
707class Cookie:
708 """HTTP Cookie.
709
710 This class represents both Netscape and RFC 2965 cookies.
711
712 This is deliberately a very simple class. It just holds attributes. It's
713 possible to construct Cookie instances that don't comply with the cookie
714 standards. CookieJar.make_cookies is the factory function for Cookie
715 objects -- it deals with cookie parsing, supplying defaults, and
716 normalising to the representation used in this class. CookiePolicy is
717 responsible for checking them to see whether they should be accepted from
718 and returned to the server.
719
720 Note that the port may be present in the headers, but unspecified ("Port"
721 rather than"Port=80", for example); if this is the case, port is None.
722
723 """
724
725 def __init__(self, version, name, value,
726 port, port_specified,
727 domain, domain_specified, domain_initial_dot,
728 path, path_specified,
729 secure,
730 expires,
731 discard,
732 comment,
733 comment_url,
Neal Norwitz71dad722005-12-23 21:43:48 +0000734 rest,
735 rfc2109=False,
736 ):
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000737
738 if version is not None: version = int(version)
739 if expires is not None: expires = int(expires)
740 if port is None and port_specified is True:
741 raise ValueError("if port is None, port_specified must be false")
742
743 self.version = version
744 self.name = name
745 self.value = value
746 self.port = port
747 self.port_specified = port_specified
748 # normalise case, as per RFC 2965 section 3.3.3
749 self.domain = domain.lower()
750 self.domain_specified = domain_specified
751 # Sigh. We need to know whether the domain given in the
752 # cookie-attribute had an initial dot, in order to follow RFC 2965
753 # (as clarified in draft errata). Needed for the returned $Domain
754 # value.
755 self.domain_initial_dot = domain_initial_dot
756 self.path = path
757 self.path_specified = path_specified
758 self.secure = secure
759 self.expires = expires
760 self.discard = discard
761 self.comment = comment
762 self.comment_url = comment_url
Neal Norwitz71dad722005-12-23 21:43:48 +0000763 self.rfc2109 = rfc2109
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000764
765 self._rest = copy.copy(rest)
766
767 def has_nonstandard_attr(self, name):
768 return name in self._rest
769 def get_nonstandard_attr(self, name, default=None):
770 return self._rest.get(name, default)
771 def set_nonstandard_attr(self, name, value):
772 self._rest[name] = value
773
774 def is_expired(self, now=None):
775 if now is None: now = time.time()
776 if (self.expires is not None) and (self.expires <= now):
777 return True
778 return False
779
780 def __str__(self):
781 if self.port is None: p = ""
782 else: p = ":"+self.port
783 limit = self.domain + p + self.path
784 if self.value is not None:
785 namevalue = "%s=%s" % (self.name, self.value)
786 else:
787 namevalue = self.name
788 return "<Cookie %s for %s>" % (namevalue, limit)
789
790 def __repr__(self):
791 args = []
Raymond Hettingerdbecd932005-02-06 06:57:08 +0000792 for name in ("version", "name", "value",
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000793 "port", "port_specified",
794 "domain", "domain_specified", "domain_initial_dot",
795 "path", "path_specified",
796 "secure", "expires", "discard", "comment", "comment_url",
Raymond Hettingerdbecd932005-02-06 06:57:08 +0000797 ):
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000798 attr = getattr(self, name)
799 args.append("%s=%s" % (name, repr(attr)))
800 args.append("rest=%s" % repr(self._rest))
Neal Norwitz71dad722005-12-23 21:43:48 +0000801 args.append("rfc2109=%s" % repr(self.rfc2109))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000802 return "Cookie(%s)" % ", ".join(args)
803
804
805class CookiePolicy:
806 """Defines which cookies get accepted from and returned to server.
807
808 May also modify cookies, though this is probably a bad idea.
809
810 The subclass DefaultCookiePolicy defines the standard rules for Netscape
811 and RFC 2965 cookies -- override that if you want a customised policy.
812
813 """
814 def set_ok(self, cookie, request):
815 """Return true if (and only if) cookie should be accepted from server.
816
817 Currently, pre-expired cookies never get this far -- the CookieJar
818 class deletes such cookies itself.
819
820 """
821 raise NotImplementedError()
822
823 def return_ok(self, cookie, request):
824 """Return true if (and only if) cookie should be returned to server."""
825 raise NotImplementedError()
826
827 def domain_return_ok(self, domain, request):
828 """Return false if cookies should not be returned, given cookie domain.
829 """
830 return True
831
832 def path_return_ok(self, path, request):
833 """Return false if cookies should not be returned, given cookie path.
834 """
835 return True
836
837
838class DefaultCookiePolicy(CookiePolicy):
839 """Implements the standard rules for accepting and returning cookies."""
840
841 DomainStrictNoDots = 1
842 DomainStrictNonDomain = 2
843 DomainRFC2965Match = 4
844
845 DomainLiberal = 0
846 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
847
848 def __init__(self,
849 blocked_domains=None, allowed_domains=None,
850 netscape=True, rfc2965=False,
Neal Norwitz71dad722005-12-23 21:43:48 +0000851 rfc2109_as_netscape=None,
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000852 hide_cookie2=False,
853 strict_domain=False,
854 strict_rfc2965_unverifiable=True,
855 strict_ns_unverifiable=False,
856 strict_ns_domain=DomainLiberal,
857 strict_ns_set_initial_dollar=False,
858 strict_ns_set_path=False,
859 ):
860 """Constructor arguments should be passed as keyword arguments only."""
861 self.netscape = netscape
862 self.rfc2965 = rfc2965
Neal Norwitz71dad722005-12-23 21:43:48 +0000863 self.rfc2109_as_netscape = rfc2109_as_netscape
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000864 self.hide_cookie2 = hide_cookie2
865 self.strict_domain = strict_domain
866 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
867 self.strict_ns_unverifiable = strict_ns_unverifiable
868 self.strict_ns_domain = strict_ns_domain
869 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
870 self.strict_ns_set_path = strict_ns_set_path
871
872 if blocked_domains is not None:
873 self._blocked_domains = tuple(blocked_domains)
874 else:
875 self._blocked_domains = ()
876
877 if allowed_domains is not None:
878 allowed_domains = tuple(allowed_domains)
879 self._allowed_domains = allowed_domains
880
881 def blocked_domains(self):
882 """Return the sequence of blocked domains (as a tuple)."""
883 return self._blocked_domains
884 def set_blocked_domains(self, blocked_domains):
885 """Set the sequence of blocked domains."""
886 self._blocked_domains = tuple(blocked_domains)
887
888 def is_blocked(self, domain):
889 for blocked_domain in self._blocked_domains:
890 if user_domain_match(domain, blocked_domain):
891 return True
892 return False
893
894 def allowed_domains(self):
895 """Return None, or the sequence of allowed domains (as a tuple)."""
896 return self._allowed_domains
897 def set_allowed_domains(self, allowed_domains):
898 """Set the sequence of allowed domains, or None."""
899 if allowed_domains is not None:
900 allowed_domains = tuple(allowed_domains)
901 self._allowed_domains = allowed_domains
902
903 def is_not_allowed(self, domain):
904 if self._allowed_domains is None:
905 return False
906 for allowed_domain in self._allowed_domains:
907 if user_domain_match(domain, allowed_domain):
908 return False
909 return True
910
911 def set_ok(self, cookie, request):
912 """
913 If you override .set_ok(), be sure to call this method. If it returns
914 false, so should your subclass (assuming your subclass wants to be more
915 strict about which cookies to accept).
916
917 """
Thomas Wouters477c8d52006-05-27 19:21:47 +0000918 _debug(" - checking cookie %s=%s", cookie.name, cookie.value)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000919
920 assert cookie.name is not None
921
922 for n in "version", "verifiability", "name", "path", "domain", "port":
923 fn_name = "set_ok_"+n
924 fn = getattr(self, fn_name)
925 if not fn(cookie, request):
926 return False
927
928 return True
929
930 def set_ok_version(self, cookie, request):
931 if cookie.version is None:
932 # Version is always set to 0 by parse_ns_headers if it's a Netscape
933 # cookie, so this must be an invalid RFC 2965 cookie.
Thomas Wouters477c8d52006-05-27 19:21:47 +0000934 _debug(" Set-Cookie2 without version attribute (%s=%s)",
935 cookie.name, cookie.value)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000936 return False
937 if cookie.version > 0 and not self.rfc2965:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000938 _debug(" RFC 2965 cookies are switched off")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000939 return False
940 elif cookie.version == 0 and not self.netscape:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000941 _debug(" Netscape cookies are switched off")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000942 return False
943 return True
944
945 def set_ok_verifiability(self, cookie, request):
946 if request.is_unverifiable() and is_third_party(request):
947 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000948 _debug(" third-party RFC 2965 cookie during "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000949 "unverifiable transaction")
950 return False
951 elif cookie.version == 0 and self.strict_ns_unverifiable:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000952 _debug(" third-party Netscape cookie during "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000953 "unverifiable transaction")
954 return False
955 return True
956
957 def set_ok_name(self, cookie, request):
958 # Try and stop servers setting V0 cookies designed to hack other
959 # servers that know both V0 and V1 protocols.
960 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
961 cookie.name.startswith("$")):
Thomas Wouters477c8d52006-05-27 19:21:47 +0000962 _debug(" illegal name (starts with '$'): '%s'", cookie.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000963 return False
964 return True
965
966 def set_ok_path(self, cookie, request):
967 if cookie.path_specified:
968 req_path = request_path(request)
969 if ((cookie.version > 0 or
970 (cookie.version == 0 and self.strict_ns_set_path)) and
971 not req_path.startswith(cookie.path)):
Thomas Wouters477c8d52006-05-27 19:21:47 +0000972 _debug(" path attribute %s is not a prefix of request "
973 "path %s", cookie.path, req_path)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000974 return False
975 return True
976
977 def set_ok_domain(self, cookie, request):
978 if self.is_blocked(cookie.domain):
Thomas Wouters477c8d52006-05-27 19:21:47 +0000979 _debug(" domain %s is in user block-list", cookie.domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000980 return False
981 if self.is_not_allowed(cookie.domain):
Thomas Wouters477c8d52006-05-27 19:21:47 +0000982 _debug(" domain %s is not in user allow-list", cookie.domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000983 return False
984 if cookie.domain_specified:
985 req_host, erhn = eff_request_host(request)
986 domain = cookie.domain
987 if self.strict_domain and (domain.count(".") >= 2):
Thomas Wouters477c8d52006-05-27 19:21:47 +0000988 # XXX This should probably be compared with the Konqueror
989 # (kcookiejar.cpp) and Mozilla implementations, but it's a
990 # losing battle.
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000991 i = domain.rfind(".")
992 j = domain.rfind(".", 0, i)
993 if j == 0: # domain like .foo.bar
994 tld = domain[i+1:]
995 sld = domain[j+1:i]
Thomas Wouters477c8d52006-05-27 19:21:47 +0000996 if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
997 "gov", "mil", "int", "aero", "biz", "cat", "coop",
998 "info", "jobs", "mobi", "museum", "name", "pro",
999 "travel", "eu") and len(tld) == 2:
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001000 # domain like .co.uk
Thomas Wouters477c8d52006-05-27 19:21:47 +00001001 _debug(" country-code second level domain %s", domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001002 return False
1003 if domain.startswith("."):
1004 undotted_domain = domain[1:]
1005 else:
1006 undotted_domain = domain
1007 embedded_dots = (undotted_domain.find(".") >= 0)
1008 if not embedded_dots and domain != ".local":
Thomas Wouters477c8d52006-05-27 19:21:47 +00001009 _debug(" non-local domain %s contains no embedded dot",
1010 domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001011 return False
1012 if cookie.version == 0:
1013 if (not erhn.endswith(domain) and
1014 (not erhn.startswith(".") and
1015 not ("."+erhn).endswith(domain))):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001016 _debug(" effective request-host %s (even with added "
1017 "initial dot) does not end end with %s",
1018 erhn, domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001019 return False
1020 if (cookie.version > 0 or
1021 (self.strict_ns_domain & self.DomainRFC2965Match)):
1022 if not domain_match(erhn, domain):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001023 _debug(" effective request-host %s does not domain-match "
1024 "%s", erhn, domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001025 return False
1026 if (cookie.version > 0 or
1027 (self.strict_ns_domain & self.DomainStrictNoDots)):
1028 host_prefix = req_host[:-len(domain)]
1029 if (host_prefix.find(".") >= 0 and
1030 not IPV4_RE.search(req_host)):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001031 _debug(" host prefix %s for domain %s contains a dot",
1032 host_prefix, domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001033 return False
1034 return True
1035
1036 def set_ok_port(self, cookie, request):
1037 if cookie.port_specified:
1038 req_port = request_port(request)
1039 if req_port is None:
1040 req_port = "80"
1041 else:
1042 req_port = str(req_port)
1043 for p in cookie.port.split(","):
1044 try:
1045 int(p)
1046 except ValueError:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001047 _debug(" bad port %s (not numeric)", p)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001048 return False
1049 if p == req_port:
1050 break
1051 else:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001052 _debug(" request port (%s) not found in %s",
1053 req_port, cookie.port)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001054 return False
1055 return True
1056
1057 def return_ok(self, cookie, request):
1058 """
1059 If you override .return_ok(), be sure to call this method. If it
1060 returns false, so should your subclass (assuming your subclass wants to
1061 be more strict about which cookies to return).
1062
1063 """
1064 # Path has already been checked by .path_return_ok(), and domain
1065 # blocking done by .domain_return_ok().
Thomas Wouters477c8d52006-05-27 19:21:47 +00001066 _debug(" - checking cookie %s=%s", cookie.name, cookie.value)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001067
1068 for n in "version", "verifiability", "secure", "expires", "port", "domain":
1069 fn_name = "return_ok_"+n
1070 fn = getattr(self, fn_name)
1071 if not fn(cookie, request):
1072 return False
1073 return True
1074
1075 def return_ok_version(self, cookie, request):
1076 if cookie.version > 0 and not self.rfc2965:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001077 _debug(" RFC 2965 cookies are switched off")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001078 return False
1079 elif cookie.version == 0 and not self.netscape:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001080 _debug(" Netscape cookies are switched off")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001081 return False
1082 return True
1083
1084 def return_ok_verifiability(self, cookie, request):
1085 if request.is_unverifiable() and is_third_party(request):
1086 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001087 _debug(" third-party RFC 2965 cookie during unverifiable "
1088 "transaction")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001089 return False
1090 elif cookie.version == 0 and self.strict_ns_unverifiable:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001091 _debug(" third-party Netscape cookie during unverifiable "
1092 "transaction")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001093 return False
1094 return True
1095
1096 def return_ok_secure(self, cookie, request):
1097 if cookie.secure and request.get_type() != "https":
Thomas Wouters477c8d52006-05-27 19:21:47 +00001098 _debug(" secure cookie with non-secure request")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001099 return False
1100 return True
1101
1102 def return_ok_expires(self, cookie, request):
1103 if cookie.is_expired(self._now):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001104 _debug(" cookie expired")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001105 return False
1106 return True
1107
1108 def return_ok_port(self, cookie, request):
1109 if cookie.port:
1110 req_port = request_port(request)
1111 if req_port is None:
1112 req_port = "80"
1113 for p in cookie.port.split(","):
1114 if p == req_port:
1115 break
1116 else:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001117 _debug(" request port %s does not match cookie port %s",
1118 req_port, cookie.port)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001119 return False
1120 return True
1121
1122 def return_ok_domain(self, cookie, request):
1123 req_host, erhn = eff_request_host(request)
1124 domain = cookie.domain
1125
1126 # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
1127 if (cookie.version == 0 and
1128 (self.strict_ns_domain & self.DomainStrictNonDomain) and
1129 not cookie.domain_specified and domain != erhn):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001130 _debug(" cookie with unspecified domain does not string-compare "
1131 "equal to request domain")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001132 return False
1133
1134 if cookie.version > 0 and not domain_match(erhn, domain):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001135 _debug(" effective request-host name %s does not domain-match "
1136 "RFC 2965 cookie domain %s", erhn, domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001137 return False
1138 if cookie.version == 0 and not ("."+erhn).endswith(domain):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001139 _debug(" request-host %s does not match Netscape cookie domain "
1140 "%s", req_host, domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001141 return False
1142 return True
1143
1144 def domain_return_ok(self, domain, request):
1145 # Liberal check of. This is here as an optimization to avoid
1146 # having to load lots of MSIE cookie files unless necessary.
1147 req_host, erhn = eff_request_host(request)
1148 if not req_host.startswith("."):
Raymond Hettingerbab41432005-02-05 01:31:19 +00001149 req_host = "."+req_host
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001150 if not erhn.startswith("."):
Raymond Hettingerbab41432005-02-05 01:31:19 +00001151 erhn = "."+erhn
1152 if not (req_host.endswith(domain) or erhn.endswith(domain)):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001153 #_debug(" request domain %s does not match cookie domain %s",
1154 # req_host, domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001155 return False
1156
1157 if self.is_blocked(domain):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001158 _debug(" domain %s is in user block-list", domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001159 return False
1160 if self.is_not_allowed(domain):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001161 _debug(" domain %s is not in user allow-list", domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001162 return False
1163
1164 return True
1165
1166 def path_return_ok(self, path, request):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001167 _debug("- checking cookie path=%s", path)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001168 req_path = request_path(request)
1169 if not req_path.startswith(path):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001170 _debug(" %s does not path-match %s", req_path, path)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001171 return False
1172 return True
1173
1174
1175def vals_sorted_by_key(adict):
Guido van Rossumcc2b0162007-02-11 06:12:03 +00001176 keys = sorted(adict.keys())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001177 return map(adict.get, keys)
1178
1179def deepvalues(mapping):
1180 """Iterates over nested mapping, depth-first, in sorted order by key."""
1181 values = vals_sorted_by_key(mapping)
1182 for obj in values:
1183 mapping = False
1184 try:
1185 obj.items
1186 except AttributeError:
1187 pass
1188 else:
1189 mapping = True
1190 for subobj in deepvalues(obj):
1191 yield subobj
1192 if not mapping:
1193 yield obj
1194
1195
1196# Used as second parameter to dict.get() method, to distinguish absent
1197# dict key from one with a None value.
1198class Absent: pass
1199
1200class CookieJar:
1201 """Collection of HTTP cookies.
1202
1203 You may not need to know about this class: try
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001204 urllib.request.build_opener(HTTPCookieProcessor).open(url).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001205 """
1206
1207 non_word_re = re.compile(r"\W")
1208 quote_re = re.compile(r"([\"\\])")
1209 strict_domain_re = re.compile(r"\.?[^.]*")
1210 domain_re = re.compile(r"[^.]*")
1211 dots_re = re.compile(r"^\.+")
1212
Antoine Pitroufd036452008-08-19 17:56:33 +00001213 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001214
1215 def __init__(self, policy=None):
1216 if policy is None:
1217 policy = DefaultCookiePolicy()
1218 self._policy = policy
1219
1220 self._cookies_lock = _threading.RLock()
1221 self._cookies = {}
1222
1223 def set_policy(self, policy):
1224 self._policy = policy
1225
1226 def _cookies_for_domain(self, domain, request):
1227 cookies = []
1228 if not self._policy.domain_return_ok(domain, request):
1229 return []
Thomas Wouters477c8d52006-05-27 19:21:47 +00001230 _debug("Checking %s for cookies to return", domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001231 cookies_by_path = self._cookies[domain]
1232 for path in cookies_by_path.keys():
1233 if not self._policy.path_return_ok(path, request):
1234 continue
1235 cookies_by_name = cookies_by_path[path]
1236 for cookie in cookies_by_name.values():
1237 if not self._policy.return_ok(cookie, request):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001238 _debug(" not returning cookie")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001239 continue
Thomas Wouters477c8d52006-05-27 19:21:47 +00001240 _debug(" it's a match")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001241 cookies.append(cookie)
1242 return cookies
1243
1244 def _cookies_for_request(self, request):
1245 """Return a list of cookies to be returned to server."""
1246 cookies = []
1247 for domain in self._cookies.keys():
1248 cookies.extend(self._cookies_for_domain(domain, request))
1249 return cookies
1250
1251 def _cookie_attrs(self, cookies):
1252 """Return a list of cookie-attributes to be returned to server.
1253
1254 like ['foo="bar"; $Path="/"', ...]
1255
1256 The $Version attribute is also added when appropriate (currently only
1257 once per request).
1258
1259 """
1260 # add cookies in order of most specific (ie. longest) path first
Raymond Hettinger70b64fc2008-01-30 20:15:17 +00001261 cookies.sort(key=lambda a: len(a.path), reverse=True)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001262
1263 version_set = False
1264
1265 attrs = []
1266 for cookie in cookies:
1267 # set version of Cookie header
1268 # XXX
1269 # What should it be if multiple matching Set-Cookie headers have
1270 # different versions themselves?
1271 # Answer: there is no answer; was supposed to be settled by
1272 # RFC 2965 errata, but that may never appear...
1273 version = cookie.version
1274 if not version_set:
1275 version_set = True
1276 if version > 0:
1277 attrs.append("$Version=%s" % version)
1278
1279 # quote cookie value if necessary
1280 # (not for Netscape protocol, which already has any quotes
1281 # intact, due to the poorly-specified Netscape Cookie: syntax)
1282 if ((cookie.value is not None) and
1283 self.non_word_re.search(cookie.value) and version > 0):
1284 value = self.quote_re.sub(r"\\\1", cookie.value)
1285 else:
1286 value = cookie.value
1287
1288 # add cookie-attributes to be returned in Cookie header
1289 if cookie.value is None:
1290 attrs.append(cookie.name)
1291 else:
1292 attrs.append("%s=%s" % (cookie.name, value))
1293 if version > 0:
1294 if cookie.path_specified:
1295 attrs.append('$Path="%s"' % cookie.path)
1296 if cookie.domain.startswith("."):
1297 domain = cookie.domain
1298 if (not cookie.domain_initial_dot and
1299 domain.startswith(".")):
1300 domain = domain[1:]
1301 attrs.append('$Domain="%s"' % domain)
1302 if cookie.port is not None:
1303 p = "$Port"
1304 if cookie.port_specified:
1305 p = p + ('="%s"' % cookie.port)
1306 attrs.append(p)
1307
1308 return attrs
1309
1310 def add_cookie_header(self, request):
Georg Brandl029986a2008-06-23 11:44:14 +00001311 """Add correct Cookie: header to request (urllib.request.Request object).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001312
1313 The Cookie2 header is also added unless policy.hide_cookie2 is true.
1314
1315 """
Thomas Wouters477c8d52006-05-27 19:21:47 +00001316 _debug("add_cookie_header")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001317 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001318 try:
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001319
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001320 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001321
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001322 cookies = self._cookies_for_request(request)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001323
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001324 attrs = self._cookie_attrs(cookies)
1325 if attrs:
1326 if not request.has_header("Cookie"):
1327 request.add_unredirected_header(
1328 "Cookie", "; ".join(attrs))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001329
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001330 # if necessary, advertise that we know RFC 2965
1331 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1332 not request.has_header("Cookie2")):
1333 for cookie in cookies:
1334 if cookie.version != 1:
1335 request.add_unredirected_header("Cookie2", '$Version="1"')
1336 break
1337
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001338 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001339 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001340
1341 self.clear_expired_cookies()
1342
1343 def _normalized_cookie_tuples(self, attrs_set):
1344 """Return list of tuples containing normalised cookie information.
1345
1346 attrs_set is the list of lists of key,value pairs extracted from
1347 the Set-Cookie or Set-Cookie2 headers.
1348
1349 Tuples are name, value, standard, rest, where name and value are the
1350 cookie name and value, standard is a dictionary containing the standard
1351 cookie-attributes (discard, secure, version, expires or max-age,
1352 domain, path and port) and rest is a dictionary containing the rest of
1353 the cookie-attributes.
1354
1355 """
1356 cookie_tuples = []
1357
1358 boolean_attrs = "discard", "secure"
1359 value_attrs = ("version",
1360 "expires", "max-age",
1361 "domain", "path", "port",
1362 "comment", "commenturl")
1363
1364 for cookie_attrs in attrs_set:
1365 name, value = cookie_attrs[0]
1366
1367 # Build dictionary of standard cookie-attributes (standard) and
1368 # dictionary of other cookie-attributes (rest).
1369
1370 # Note: expiry time is normalised to seconds since epoch. V0
1371 # cookies should have the Expires cookie-attribute, and V1 cookies
1372 # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1373 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1374 # accept either (but prefer Max-Age).
1375 max_age_set = False
1376
1377 bad_cookie = False
1378
1379 standard = {}
1380 rest = {}
1381 for k, v in cookie_attrs[1:]:
1382 lc = k.lower()
1383 # don't lose case distinction for unknown fields
1384 if lc in value_attrs or lc in boolean_attrs:
1385 k = lc
1386 if k in boolean_attrs and v is None:
1387 # boolean cookie-attribute is present, but has no value
1388 # (like "discard", rather than "port=80")
1389 v = True
1390 if k in standard:
1391 # only first value is significant
1392 continue
1393 if k == "domain":
1394 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001395 _debug(" missing value for domain attribute")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001396 bad_cookie = True
1397 break
1398 # RFC 2965 section 3.3.3
1399 v = v.lower()
1400 if k == "expires":
1401 if max_age_set:
1402 # Prefer max-age to expires (like Mozilla)
1403 continue
1404 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001405 _debug(" missing or invalid value for expires "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001406 "attribute: treating as session cookie")
1407 continue
1408 if k == "max-age":
1409 max_age_set = True
1410 try:
1411 v = int(v)
1412 except ValueError:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001413 _debug(" missing or invalid (non-numeric) value for "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001414 "max-age attribute")
1415 bad_cookie = True
1416 break
1417 # convert RFC 2965 Max-Age to seconds since epoch
1418 # XXX Strictly you're supposed to follow RFC 2616
1419 # age-calculation rules. Remember that zero Max-Age is a
1420 # is a request to discard (old and new) cookie, though.
1421 k = "expires"
1422 v = self._now + v
1423 if (k in value_attrs) or (k in boolean_attrs):
1424 if (v is None and
Raymond Hettingerdbecd932005-02-06 06:57:08 +00001425 k not in ("port", "comment", "commenturl")):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001426 _debug(" missing value for %s attribute" % k)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001427 bad_cookie = True
1428 break
1429 standard[k] = v
1430 else:
1431 rest[k] = v
1432
1433 if bad_cookie:
1434 continue
1435
1436 cookie_tuples.append((name, value, standard, rest))
1437
1438 return cookie_tuples
1439
1440 def _cookie_from_cookie_tuple(self, tup, request):
1441 # standard is dict of standard cookie-attributes, rest is dict of the
1442 # rest of them
1443 name, value, standard, rest = tup
1444
1445 domain = standard.get("domain", Absent)
1446 path = standard.get("path", Absent)
1447 port = standard.get("port", Absent)
1448 expires = standard.get("expires", Absent)
1449
1450 # set the easy defaults
1451 version = standard.get("version", None)
1452 if version is not None: version = int(version)
1453 secure = standard.get("secure", False)
1454 # (discard is also set if expires is Absent)
1455 discard = standard.get("discard", False)
1456 comment = standard.get("comment", None)
1457 comment_url = standard.get("commenturl", None)
1458
1459 # set default path
1460 if path is not Absent and path != "":
1461 path_specified = True
1462 path = escape_path(path)
1463 else:
1464 path_specified = False
1465 path = request_path(request)
1466 i = path.rfind("/")
1467 if i != -1:
1468 if version == 0:
1469 # Netscape spec parts company from reality here
1470 path = path[:i]
1471 else:
1472 path = path[:i+1]
1473 if len(path) == 0: path = "/"
1474
1475 # set default domain
1476 domain_specified = domain is not Absent
1477 # but first we have to remember whether it starts with a dot
1478 domain_initial_dot = False
1479 if domain_specified:
1480 domain_initial_dot = bool(domain.startswith("."))
1481 if domain is Absent:
1482 req_host, erhn = eff_request_host(request)
1483 domain = erhn
1484 elif not domain.startswith("."):
1485 domain = "."+domain
1486
1487 # set default port
1488 port_specified = False
1489 if port is not Absent:
1490 if port is None:
1491 # Port attr present, but has no value: default to request port.
1492 # Cookie should then only be sent back on that port.
1493 port = request_port(request)
1494 else:
1495 port_specified = True
1496 port = re.sub(r"\s+", "", port)
1497 else:
1498 # No port attr present. Cookie can be sent back on any port.
1499 port = None
1500
1501 # set default expires and discard
1502 if expires is Absent:
1503 expires = None
1504 discard = True
1505 elif expires <= self._now:
1506 # Expiry date in past is request to delete cookie. This can't be
1507 # in DefaultCookiePolicy, because can't delete cookies there.
1508 try:
1509 self.clear(domain, path, name)
1510 except KeyError:
1511 pass
Thomas Wouters477c8d52006-05-27 19:21:47 +00001512 _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1513 domain, path, name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001514 return None
1515
1516 return Cookie(version,
1517 name, value,
1518 port, port_specified,
1519 domain, domain_specified, domain_initial_dot,
1520 path, path_specified,
1521 secure,
1522 expires,
1523 discard,
1524 comment,
1525 comment_url,
1526 rest)
1527
1528 def _cookies_from_attrs_set(self, attrs_set, request):
1529 cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1530
1531 cookies = []
1532 for tup in cookie_tuples:
1533 cookie = self._cookie_from_cookie_tuple(tup, request)
1534 if cookie: cookies.append(cookie)
1535 return cookies
1536
Neal Norwitz71dad722005-12-23 21:43:48 +00001537 def _process_rfc2109_cookies(self, cookies):
1538 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1539 if rfc2109_as_ns is None:
1540 rfc2109_as_ns = not self._policy.rfc2965
1541 for cookie in cookies:
1542 if cookie.version == 1:
1543 cookie.rfc2109 = True
Tim Peters536cf992005-12-25 23:18:31 +00001544 if rfc2109_as_ns:
Neal Norwitz71dad722005-12-23 21:43:48 +00001545 # treat 2109 cookies as Netscape cookies rather than
1546 # as RFC2965 cookies
1547 cookie.version = 0
1548
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001549 def make_cookies(self, response, request):
1550 """Return sequence of Cookie objects extracted from response object."""
1551 # get cookie-attributes for RFC 2965 and Netscape protocols
1552 headers = response.info()
Barry Warsaw820c1202008-06-12 04:06:45 +00001553 rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
1554 ns_hdrs = headers.get_all("Set-Cookie", [])
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001555
1556 rfc2965 = self._policy.rfc2965
1557 netscape = self._policy.netscape
1558
1559 if ((not rfc2965_hdrs and not ns_hdrs) or
1560 (not ns_hdrs and not rfc2965) or
1561 (not rfc2965_hdrs and not netscape) or
1562 (not netscape and not rfc2965)):
1563 return [] # no relevant cookie headers: quick exit
1564
1565 try:
1566 cookies = self._cookies_from_attrs_set(
1567 split_header_words(rfc2965_hdrs), request)
Thomas Wouters477c8d52006-05-27 19:21:47 +00001568 except Exception:
1569 _warn_unhandled_exception()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001570 cookies = []
1571
1572 if ns_hdrs and netscape:
1573 try:
Neal Norwitz71dad722005-12-23 21:43:48 +00001574 # RFC 2109 and Netscape cookies
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001575 ns_cookies = self._cookies_from_attrs_set(
1576 parse_ns_headers(ns_hdrs), request)
Thomas Wouters477c8d52006-05-27 19:21:47 +00001577 except Exception:
1578 _warn_unhandled_exception()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001579 ns_cookies = []
Neal Norwitz71dad722005-12-23 21:43:48 +00001580 self._process_rfc2109_cookies(ns_cookies)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001581
1582 # Look for Netscape cookies (from Set-Cookie headers) that match
1583 # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1584 # For each match, keep the RFC 2965 cookie and ignore the Netscape
1585 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are
1586 # bundled in with the Netscape cookies for this purpose, which is
1587 # reasonable behaviour.
1588 if rfc2965:
1589 lookup = {}
1590 for cookie in cookies:
1591 lookup[(cookie.domain, cookie.path, cookie.name)] = None
1592
1593 def no_matching_rfc2965(ns_cookie, lookup=lookup):
1594 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1595 return key not in lookup
1596 ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1597
1598 if ns_cookies:
1599 cookies.extend(ns_cookies)
1600
1601 return cookies
1602
1603 def set_cookie_if_ok(self, cookie, request):
1604 """Set a cookie if policy says it's OK to do so."""
1605 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001606 try:
1607 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001608
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001609 if self._policy.set_ok(cookie, request):
1610 self.set_cookie(cookie)
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001611
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001612
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001613 finally:
1614 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001615
1616 def set_cookie(self, cookie):
1617 """Set a cookie, without checking whether or not it should be set."""
1618 c = self._cookies
1619 self._cookies_lock.acquire()
1620 try:
1621 if cookie.domain not in c: c[cookie.domain] = {}
1622 c2 = c[cookie.domain]
1623 if cookie.path not in c2: c2[cookie.path] = {}
1624 c3 = c2[cookie.path]
1625 c3[cookie.name] = cookie
1626 finally:
1627 self._cookies_lock.release()
1628
1629 def extract_cookies(self, response, request):
1630 """Extract cookies from response, where allowable given the request."""
Thomas Wouters477c8d52006-05-27 19:21:47 +00001631 _debug("extract_cookies: %s", response.info())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001632 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001633 try:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001634 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001635
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001636 for cookie in self.make_cookies(response, request):
1637 if self._policy.set_ok(cookie, request):
1638 _debug(" setting cookie: %s", cookie)
1639 self.set_cookie(cookie)
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001640 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001641 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001642
1643 def clear(self, domain=None, path=None, name=None):
1644 """Clear some cookies.
1645
1646 Invoking this method without arguments will clear all cookies. If
1647 given a single argument, only cookies belonging to that domain will be
1648 removed. If given two arguments, cookies belonging to the specified
1649 path within that domain are removed. If given three arguments, then
1650 the cookie with the specified name, path and domain is removed.
1651
1652 Raises KeyError if no matching cookie exists.
1653
1654 """
1655 if name is not None:
1656 if (domain is None) or (path is None):
1657 raise ValueError(
1658 "domain and path must be given to remove a cookie by name")
1659 del self._cookies[domain][path][name]
1660 elif path is not None:
1661 if domain is None:
1662 raise ValueError(
1663 "domain must be given to remove cookies by path")
1664 del self._cookies[domain][path]
1665 elif domain is not None:
1666 del self._cookies[domain]
1667 else:
1668 self._cookies = {}
1669
1670 def clear_session_cookies(self):
1671 """Discard all session cookies.
1672
1673 Note that the .save() method won't save session cookies anyway, unless
1674 you ask otherwise by passing a true ignore_discard argument.
1675
1676 """
1677 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001678 try:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001679 for cookie in self:
1680 if cookie.discard:
1681 self.clear(cookie.domain, cookie.path, cookie.name)
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001682 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001683 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001684
1685 def clear_expired_cookies(self):
1686 """Discard all expired cookies.
1687
1688 You probably don't need to call this method: expired cookies are never
1689 sent back to the server (provided you're using DefaultCookiePolicy),
1690 this method is called by CookieJar itself every so often, and the
1691 .save() method won't save expired cookies anyway (unless you ask
1692 otherwise by passing a true ignore_expires argument).
1693
1694 """
1695 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001696 try:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001697 now = time.time()
1698 for cookie in self:
1699 if cookie.is_expired(now):
1700 self.clear(cookie.domain, cookie.path, cookie.name)
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001701 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001702 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001703
1704 def __iter__(self):
1705 return deepvalues(self._cookies)
1706
1707 def __len__(self):
1708 """Return number of contained cookies."""
1709 i = 0
1710 for cookie in self: i = i + 1
1711 return i
1712
1713 def __repr__(self):
1714 r = []
1715 for cookie in self: r.append(repr(cookie))
1716 return "<%s[%s]>" % (self.__class__, ", ".join(r))
1717
1718 def __str__(self):
1719 r = []
1720 for cookie in self: r.append(str(cookie))
1721 return "<%s[%s]>" % (self.__class__, ", ".join(r))
1722
1723
Neal Norwitz3e7de592005-12-23 21:24:35 +00001724# derives from IOError for backwards-compatibility with Python 2.4.0
1725class LoadError(IOError): pass
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001726
1727class FileCookieJar(CookieJar):
1728 """CookieJar that can be loaded from and saved to a file."""
1729
1730 def __init__(self, filename=None, delayload=False, policy=None):
1731 """
1732 Cookies are NOT loaded from the named file until either the .load() or
1733 .revert() method is called.
1734
1735 """
1736 CookieJar.__init__(self, policy)
1737 if filename is not None:
1738 try:
1739 filename+""
1740 except:
1741 raise ValueError("filename must be string-like")
1742 self.filename = filename
1743 self.delayload = bool(delayload)
1744
1745 def save(self, filename=None, ignore_discard=False, ignore_expires=False):
1746 """Save cookies to a file."""
1747 raise NotImplementedError()
1748
1749 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1750 """Load cookies from a file."""
1751 if filename is None:
1752 if self.filename is not None: filename = self.filename
1753 else: raise ValueError(MISSING_FILENAME_TEXT)
1754
1755 f = open(filename)
1756 try:
1757 self._really_load(f, filename, ignore_discard, ignore_expires)
1758 finally:
1759 f.close()
1760
1761 def revert(self, filename=None,
1762 ignore_discard=False, ignore_expires=False):
1763 """Clear all cookies and reload cookies from a saved file.
1764
1765 Raises LoadError (or IOError) if reversion is not successful; the
1766 object's state will not be altered if this happens.
1767
1768 """
1769 if filename is None:
1770 if self.filename is not None: filename = self.filename
1771 else: raise ValueError(MISSING_FILENAME_TEXT)
1772
1773 self._cookies_lock.acquire()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001774 try:
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001775
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001776 old_state = copy.deepcopy(self._cookies)
1777 self._cookies = {}
1778 try:
1779 self.load(filename, ignore_discard, ignore_expires)
1780 except (LoadError, IOError):
1781 self._cookies = old_state
1782 raise
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001783
1784 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001785 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001786
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001787
1788def lwp_cookie_str(cookie):
1789 """Return string representation of Cookie in an the LWP cookie file format.
1790
1791 Actually, the format is extended a bit -- see module docstring.
1792
1793 """
1794 h = [(cookie.name, cookie.value),
1795 ("path", cookie.path),
1796 ("domain", cookie.domain)]
1797 if cookie.port is not None: h.append(("port", cookie.port))
1798 if cookie.path_specified: h.append(("path_spec", None))
1799 if cookie.port_specified: h.append(("port_spec", None))
1800 if cookie.domain_initial_dot: h.append(("domain_dot", None))
1801 if cookie.secure: h.append(("secure", None))
1802 if cookie.expires: h.append(("expires",
1803 time2isoz(float(cookie.expires))))
1804 if cookie.discard: h.append(("discard", None))
1805 if cookie.comment: h.append(("comment", cookie.comment))
1806 if cookie.comment_url: h.append(("commenturl", cookie.comment_url))
1807
1808 keys = sorted(cookie._rest.keys())
1809 for k in keys:
1810 h.append((k, str(cookie._rest[k])))
1811
1812 h.append(("version", str(cookie.version)))
1813
1814 return join_header_words([h])
1815
1816class LWPCookieJar(FileCookieJar):
1817 """
1818 The LWPCookieJar saves a sequence of"Set-Cookie3" lines.
1819 "Set-Cookie3" is the format used by the libwww-perl libary, not known
1820 to be compatible with any browser, but which is easy to read and
1821 doesn't lose information about RFC 2965 cookies.
1822
1823 Additional methods
1824
1825 as_lwp_str(ignore_discard=True, ignore_expired=True)
1826
1827 """
1828
1829 def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
1830 """Return cookies as a string of "\n"-separated "Set-Cookie3" headers.
1831
1832 ignore_discard and ignore_expires: see docstring for FileCookieJar.save
1833
1834 """
1835 now = time.time()
1836 r = []
1837 for cookie in self:
1838 if not ignore_discard and cookie.discard:
1839 continue
1840 if not ignore_expires and cookie.is_expired(now):
1841 continue
1842 r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
1843 return "\n".join(r+[""])
1844
1845 def save(self, filename=None, ignore_discard=False, ignore_expires=False):
1846 if filename is None:
1847 if self.filename is not None: filename = self.filename
1848 else: raise ValueError(MISSING_FILENAME_TEXT)
1849
1850 f = open(filename, "w")
1851 try:
1852 # There really isn't an LWP Cookies 2.0 format, but this indicates
1853 # that there is extra information in here (domain_dot and
1854 # port_spec) while still being compatible with libwww-perl, I hope.
1855 f.write("#LWP-Cookies-2.0\n")
1856 f.write(self.as_lwp_str(ignore_discard, ignore_expires))
1857 finally:
1858 f.close()
1859
1860 def _really_load(self, f, filename, ignore_discard, ignore_expires):
1861 magic = f.readline()
Antoine Pitroufd036452008-08-19 17:56:33 +00001862 if not self.magic_re.search(magic):
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001863 msg = ("%r does not look like a Set-Cookie3 (LWP) format "
1864 "file" % filename)
1865 raise LoadError(msg)
1866
1867 now = time.time()
1868
1869 header = "Set-Cookie3:"
1870 boolean_attrs = ("port_spec", "path_spec", "domain_dot",
1871 "secure", "discard")
1872 value_attrs = ("version",
1873 "port", "path", "domain",
1874 "expires",
1875 "comment", "commenturl")
1876
1877 try:
1878 while 1:
1879 line = f.readline()
1880 if line == "": break
1881 if not line.startswith(header):
1882 continue
1883 line = line[len(header):].strip()
1884
1885 for data in split_header_words([line]):
1886 name, value = data[0]
1887 standard = {}
1888 rest = {}
1889 for k in boolean_attrs:
1890 standard[k] = False
1891 for k, v in data[1:]:
1892 if k is not None:
1893 lc = k.lower()
1894 else:
1895 lc = None
1896 # don't lose case distinction for unknown fields
1897 if (lc in value_attrs) or (lc in boolean_attrs):
1898 k = lc
1899 if k in boolean_attrs:
1900 if v is None: v = True
1901 standard[k] = v
1902 elif k in value_attrs:
1903 standard[k] = v
1904 else:
1905 rest[k] = v
1906
1907 h = standard.get
1908 expires = h("expires")
1909 discard = h("discard")
1910 if expires is not None:
1911 expires = iso2time(expires)
1912 if expires is None:
1913 discard = True
1914 domain = h("domain")
1915 domain_specified = domain.startswith(".")
1916 c = Cookie(h("version"), name, value,
1917 h("port"), h("port_spec"),
1918 domain, domain_specified, h("domain_dot"),
1919 h("path"), h("path_spec"),
1920 h("secure"),
1921 expires,
1922 discard,
1923 h("comment"),
1924 h("commenturl"),
1925 rest)
1926 if not ignore_discard and c.discard:
1927 continue
1928 if not ignore_expires and c.is_expired(now):
1929 continue
1930 self.set_cookie(c)
1931
1932 except IOError:
1933 raise
1934 except Exception:
1935 _warn_unhandled_exception()
1936 raise LoadError("invalid Set-Cookie3 format file %r: %r" %
1937 (filename, line))
1938
1939
1940class MozillaCookieJar(FileCookieJar):
1941 """
1942
1943 WARNING: you may want to backup your browser's cookies file if you use
1944 this class to save cookies. I *think* it works, but there have been
1945 bugs in the past!
1946
1947 This class differs from CookieJar only in the format it uses to save and
1948 load cookies to and from a file. This class uses the Mozilla/Netscape
1949 `cookies.txt' format. lynx uses this file format, too.
1950
1951 Don't expect cookies saved while the browser is running to be noticed by
1952 the browser (in fact, Mozilla on unix will overwrite your saved cookies if
1953 you change them on disk while it's running; on Windows, you probably can't
1954 save at all while the browser is running).
1955
1956 Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
1957 Netscape cookies on saving.
1958
1959 In particular, the cookie version and port number information is lost,
1960 together with information about whether or not Path, Port and Discard were
1961 specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
1962 domain as set in the HTTP header started with a dot (yes, I'm aware some
1963 domains in Netscape files start with a dot and some don't -- trust me, you
1964 really don't want to know any more about this).
1965
1966 Note that though Mozilla and Netscape use the same format, they use
1967 slightly different headers. The class saves cookies using the Netscape
1968 header by default (Mozilla can cope with that).
1969
1970 """
Antoine Pitroufd036452008-08-19 17:56:33 +00001971 magic_re = re.compile("#( Netscape)? HTTP Cookie File")
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001972 header = """\
Georg Brandle5d518f2010-08-01 19:09:07 +00001973# Netscape HTTP Cookie File
1974# http://www.netscape.com/newsref/std/cookie_spec.html
1975# This is a generated file! Do not edit.
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001976
1977"""
1978
1979 def _really_load(self, f, filename, ignore_discard, ignore_expires):
1980 now = time.time()
1981
1982 magic = f.readline()
Antoine Pitroufd036452008-08-19 17:56:33 +00001983 if not self.magic_re.search(magic):
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001984 f.close()
1985 raise LoadError(
1986 "%r does not look like a Netscape format cookies file" %
1987 filename)
1988
1989 try:
1990 while 1:
1991 line = f.readline()
1992 if line == "": break
1993
1994 # last field may be absent, so keep any trailing tab
1995 if line.endswith("\n"): line = line[:-1]
1996
1997 # skip comments and blank lines XXX what is $ for?
1998 if (line.strip().startswith(("#", "$")) or
1999 line.strip() == ""):
2000 continue
2001
2002 domain, domain_specified, path, secure, expires, name, value = \
2003 line.split("\t")
2004 secure = (secure == "TRUE")
2005 domain_specified = (domain_specified == "TRUE")
2006 if name == "":
2007 # cookies.txt regards 'Set-Cookie: foo' as a cookie
2008 # with no name, whereas http.cookiejar regards it as a
2009 # cookie with no value.
2010 name = value
2011 value = None
2012
2013 initial_dot = domain.startswith(".")
2014 assert domain_specified == initial_dot
2015
2016 discard = False
2017 if expires == "":
2018 expires = None
2019 discard = True
2020
2021 # assume path_specified is false
2022 c = Cookie(0, name, value,
2023 None, False,
2024 domain, domain_specified, initial_dot,
2025 path, False,
2026 secure,
2027 expires,
2028 discard,
2029 None,
2030 None,
2031 {})
2032 if not ignore_discard and c.discard:
2033 continue
2034 if not ignore_expires and c.is_expired(now):
2035 continue
2036 self.set_cookie(c)
2037
2038 except IOError:
2039 raise
2040 except Exception:
2041 _warn_unhandled_exception()
2042 raise LoadError("invalid Netscape format cookies file %r: %r" %
2043 (filename, line))
2044
2045 def save(self, filename=None, ignore_discard=False, ignore_expires=False):
2046 if filename is None:
2047 if self.filename is not None: filename = self.filename
2048 else: raise ValueError(MISSING_FILENAME_TEXT)
2049
2050 f = open(filename, "w")
2051 try:
2052 f.write(self.header)
2053 now = time.time()
2054 for cookie in self:
2055 if not ignore_discard and cookie.discard:
2056 continue
2057 if not ignore_expires and cookie.is_expired(now):
2058 continue
2059 if cookie.secure: secure = "TRUE"
2060 else: secure = "FALSE"
2061 if cookie.domain.startswith("."): initial_dot = "TRUE"
2062 else: initial_dot = "FALSE"
2063 if cookie.expires is not None:
2064 expires = str(cookie.expires)
2065 else:
2066 expires = ""
2067 if cookie.value is None:
2068 # cookies.txt regards 'Set-Cookie: foo' as a cookie
2069 # with no name, whereas http.cookiejar regards it as a
2070 # cookie with no value.
2071 name = ""
2072 value = cookie.name
2073 else:
2074 name = cookie.name
2075 value = cookie.value
2076 f.write(
2077 "\t".join([cookie.domain, initial_dot, cookie.path,
2078 secure, expires, name, value])+
2079 "\n")
2080 finally:
2081 f.close()