blob: 97383d603e69677a80079ace91b0c3fd3d3b77d7 [file] [log] [blame]
r"""HTTP cookie handling for web clients.

This module has (now fairly distant) origins in Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
distributed with the Python standard library, but are available from
http://wwwsearch.sf.net/):

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

"""
27
# Public names exported by a star-import of this module.
__all__ = [
    'Cookie',
    'CookieJar',
    'CookiePolicy',
    'DefaultCookiePolicy',
    'FileCookieJar',
    'LWPCookieJar',
    'LoadError',
    'MozillaCookieJar',
]
30
Jeremy Hylton1afc1692008-06-18 20:49:58 +000031import copy
Victor Stinner628225c2011-03-21 02:38:51 +010032import datetime
Jeremy Hylton1afc1692008-06-18 20:49:58 +000033import re
34import time
35import urllib.parse, urllib.request
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000036try:
37 import threading as _threading
38except ImportError:
39 import dummy_threading as _threading
Georg Brandl24420152008-05-26 16:32:26 +000040import http.client # only for the default HTTP port
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000041from calendar import timegm
42
debug = False   # set to True to enable debugging via the logging module
logger = None   # created lazily by _debug() the first time it is needed

def _debug(*args):
    """Forward args to the "http.cookiejar" logger when debugging is enabled.

    Does nothing while the module-level ``debug`` flag is false.  The logger
    is looked up on first use and cached in the module-level ``logger``.
    """
    global logger
    if debug:
        if not logger:
            import logging
            logger = logging.getLogger("http.cookiejar")
        return logger.debug(*args)
    return None
54
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000055
# The standard HTTP port, held as a string.
DEFAULT_HTTP_PORT = "%d" % http.client.HTTP_PORT
# Error text for operations that need a filename when none is available.
MISSING_FILENAME_TEXT = (
    "a filename was not supplied "
    "(nor was the CookieJar instance initialised with one)")
59
def _warn_unhandled_exception():
    """Issue a warning carrying the traceback of the current exception.

    Intended to be called from the catch-all ``except`` clauses in this
    module, which guard against input that is bad in unexpected ways, so
    that anything swallowed there is still reported.
    """
    import warnings, traceback
    # format_exc() yields the same text print_exc() would write to a file.
    details = traceback.format_exc()
    warnings.warn("http.cookiejar bug!\n%s" % details, stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000069
70
71# Date/time conversion
72# -----------------------------------------------------------------------------
73
EPOCH_YEAR = 1970
def _timegm(tt):
    """Convert a UTC time tuple to seconds since the epoch, or return None.

    The first six items of tt (year, month, day, hour, minute, second) are
    range-checked; if they pass, tt is handed to calendar.timegm().  None is
    returned when any field is out of range.  That includes years before
    EPOCH_YEAR and years after datetime.MAXYEAR, which calendar.timegm()
    cannot represent.
    """
    year, month, mday, hour, min, sec = tt[:6]
    in_range = (
        (EPOCH_YEAR <= year <= datetime.MAXYEAR) and (1 <= month <= 12) and
        (1 <= mday <= 31) and (0 <= hour <= 24) and (0 <= min <= 59) and
        (0 <= sec <= 61))
    if not in_range:
        return None
    return timegm(tt)
82
# Abbreviated weekday names, indexed like datetime.weekday() (Monday == 0).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# Abbreviated month names; index 0 is January.
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copy of MONTHS, used for case-insensitive month lookup.
MONTHS_LOWER = [month.lower() for month in MONTHS]
88
def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    # Use timezone-aware UTC datetimes rather than the deprecated
    # utcnow()/utcfromtimestamp(); the formatted fields are the same.
    if t is None:
        dt = datetime.datetime.now(datetime.timezone.utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, datetime.timezone.utc)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000107
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    # Use timezone-aware UTC datetimes rather than the deprecated
    # utcnow()/utcfromtimestamp(); the formatted fields are the same.
    if t is None:
        dt = datetime.datetime.now(datetime.timezone.utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, datetime.timezone.utc)
    # The weekday is followed by a comma, as in the documented format.
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1],
        dt.year, dt.hour, dt.minute, dt.second)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000126
127
# Timezone names treated as zero offset from UTC.
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric offset: optional sign, one or two hour digits, optional minutes.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)
def offset_from_tz_string(tz):
    """Return the UTC offset in seconds described by tz, or None.

    tz is either one of the names in UTC_ZONES (offset 0) or a numeric
    offset such as "-0800", "+01:30" or "+1".  None is returned for
    anything else.
    """
    if tz in UTC_ZONES:
        return 0
    m = TIMEZONE_RE.search(tz)
    if not m:
        return None
    sign, hours, minutes = m.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds = seconds + 60 * int(minutes)
    if sign == '-':
        seconds = -seconds
    return seconds
144
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Turn parsed date fields into seconds since the epoch, or None.

    The arguments are the (string or None) groups produced by the date
    regular expressions: mon may be a month name or number, the clock
    fields default to 0 when None, and tz is a timezone string (UTC is
    assumed when None).  None is returned if the month, the date or the
    timezone is not recognised or is out of range.
    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    if yr > datetime.MAXYEAR:
        # calendar.timegm() cannot represent such a year
        return None
    day = int(day)
    hr = int(hr)
    min = int(min)
    # ISO_DATE_RE allows fractional seconds ("29.5"); drop the fraction
    # instead of letting int() raise ValueError on it.
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
197
# Matches only the strictly conforming HTTP date form, for example
# "Wed, 09 Feb 1994 22:23:32 GMT".  Groups: day, month name, year, hour,
# minute, second.  Both literals are raw so "\d" reaches the regex engine
# unchanged.
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
# Matches a leading weekday name with optional comma and whitespace.
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
# Loose date form.  Groups: day, month, year, hour, minute, second, timezone.
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X | re.ASCII)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # Three letters shaped like a month name that are not one
            # (e.g. "Foo"): an unrecognised date, not an error.
            return None
        # The pattern guarantees two-digit seconds, so int() keeps the
        # result an integer as documented.
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
275
# ISO 8601 style dates.  Groups: year, month, day, hour, minute, second,
# timezone, and the trailing digits of the timezone (a nested group).
# The pattern is a raw string so the regex escapes are not treated as
# (invalid) string escapes.
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X | re.ASCII)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # loose regexp parse, ignoring leading whitespace
    m = ISO_DATE_RE.search(text.lstrip())
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    yr, mon, day, hr, min, sec, tz, _ = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
320
321
322# Header parsing
323# -----------------------------------------------------------------------------
324
def unmatched(match):
    """Return unmatched part of re.Match object."""
    whole = match.string
    return whole[:match.start(0)] + whole[match.end(0):]
329
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, str)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            # All HEADER_*_RE patterns are anchored with "^", so a match
            # always starts at index 0 and the unconsumed remainder is
            # simply everything after the match.
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = text[m.end():]
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = text[m.end():]
                    value = m.group(1)
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = text[m.end():]
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk (raw string: "\s" is a regex escape)
                non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result
418
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    def render(key, value):
        # A pair without a value is rendered as the bare key.
        if value is None:
            return key
        if not re.search(r"^\w+$", value):
            # escape " and \, then wrap the value in double quotes
            value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
        return "%s=%s" % (key, value)

    headers = []
    for pairs in lists:
        attr = [render(k, v) for k, v in pairs]
        if attr:
            headers.append("; ".join(attr))
    return ", ".join(headers)
444
def strip_quotes(text):
    """Return text without one leading and one trailing double quote.

    Each quote is removed only if present; the trailing quote is looked
    for after the leading one has been removed.
    """
    without_leading = text[1:] if text.startswith('"') else text
    if without_leading.endswith('"'):
        return without_leading[:-1]
    return without_leading
451
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    Returns one list of (key, value) pairs per non-empty header.  Known
    attribute names are lower-cased, a valueless attribute has the value
    None, "expires" values are converted with http2time (None if invalid),
    and a ("version", "0") pair is appended when the header carries no
    version attribute.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            if ii != 0:
                # the first pair is the cookie name/value; leave it alone
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    # A bare "version" has no value to unquote.
                    if v is not None:
                        v = strip_quotes(v)
                    version_set = True
                if k == "expires" and v is not None:
                    # convert expires date to seconds since epoch
                    v = http2time(strip_quotes(v))  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
502
503
# Text ending in a dot followed by digits is treated as an IPv4 address.
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if text == "" or IPV4_RE.search(text):
        return False
    # a host domain name neither starts nor ends with a dot
    return not (text.startswith(".") or text.endswith("."))
519
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not A.endswith(B):
        # A does not have form NB.  B must be a suffix of A, not merely
        # occur somewhere inside it ("a.acme.com.evil.org" must not match
        # ".acme.com").  N is non-empty because A != B.
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(A):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
558
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.

    """
    # anything that does not look like an IPv4 address is accepted
    return IPV4_RE.search(text) is None
568
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.

    """
    A = A.lower()
    B = B.lower()
    if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
        # at least one is an IP address: only exact equality matches
        return A == B
    if B.startswith("."):
        # a pattern with an initial dot matches any name ending with it
        return A.endswith(B)
    return A == B
588
# A trailing ":<digits>" port suffix.
cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    host = urllib.parse.urlparse(request.get_full_url()).netloc
    if host == "":
        # no host in the URL: fall back to the Host header
        host = request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", host, 1).lower()
605
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        # a dotless host name gets ".local" appended
        return req_host, req_host + ".local"
    return req_host, req_host
616
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965."""
    parts = urllib.parse.urlsplit(request.get_full_url())
    path = escape_path(parts.path)
    if path.startswith("/"):
        return path
    # fix bad RFC 2396 absoluteURI
    return "/" + path
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000626
def request_port(request):
    """Return the port of the request's host, as a string.

    The port is whatever follows the first ":" in the request's host; when
    there is none, DEFAULT_HTTP_PORT is returned.  None is returned (and a
    debug message logged) if the port is not numeric.
    """
    # Request.get_host() no longer exists in urllib.request; the host
    # attribute is the supported spelling.  Fall back to get_host() for
    # request objects that only provide the old method.
    try:
        host = request.host
    except AttributeError:
        host = request.get_host()
    i = host.find(':')
    if i >= 0:
        port = host[i+1:]
        try:
            int(port)
        except ValueError:
            _debug("nonnumeric port: '%s'", port)
            return None
    else:
        port = DEFAULT_HTTP_PORT
    return port
640
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# A %-escape: a percent sign followed by two hexadecimal digits.
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the %-escape matched by ESCAPED_CHAR_RE with upper-case digits."""
    return "%" + match.group(1).upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = urllib.parse.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
660
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # split at the first dot: h == a + "." + b when a dot is present
    _a, dot, b = h.partition(".")
    if dot:
        if is_HDN(h) and ("." in b or b == "local"):
            return "." + b
    return h
695
def is_third_party(request):
    """Return True if the request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    # Request.get_origin_req_host() no longer exists in urllib.request; the
    # origin_req_host attribute is the supported spelling.  Fall back to the
    # method for request objects that only provide the old interface.
    try:
        origin_req_host = request.origin_req_host
    except AttributeError:
        origin_req_host = request.get_origin_req_host()
    return not domain_match(req_host, reach(origin_req_host))
711
712
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than"Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes on the instance.

        version and expires are converted to int unless None, domain is
        lower-cased, and rest (the non-standard cookie-attributes) is
        shallow-copied.  Raises ValueError if port is None while
        port_specified is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return True if the non-standard cookie-attribute name is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard cookie-attribute name, or default."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the non-standard cookie-attribute name to value."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time at or before now.

        now defaults to the current time; a cookie whose expires is None
        never expires.
        """
        if now is None:
            now = time.time()
        return (self.expires is not None) and (self.expires <= now)

    def __str__(self):
        port_part = "" if self.port is None else ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        public_names = ("version", "name", "value",
                        "port", "port_specified",
                        "domain", "domain_specified", "domain_initial_dot",
                        "path", "path_specified",
                        "secure", "expires", "discard", "comment",
                        "comment_url",
                        )
        args = ["%s=%s" % (name, repr(getattr(self, name)))
                for name in public_names]
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "Cookie(%s)" % ", ".join(args)
809
810
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation accepts every domain.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation accepts every path.
        """
        return True
842
843
844class DefaultCookiePolicy(CookiePolicy):
845 """Implements the standard rules for accepting and returning cookies."""
846
847 DomainStrictNoDots = 1
848 DomainStrictNonDomain = 2
849 DomainRFC2965Match = 4
850
851 DomainLiberal = 0
852 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
853
854 def __init__(self,
855 blocked_domains=None, allowed_domains=None,
856 netscape=True, rfc2965=False,
Neal Norwitz71dad722005-12-23 21:43:48 +0000857 rfc2109_as_netscape=None,
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000858 hide_cookie2=False,
859 strict_domain=False,
860 strict_rfc2965_unverifiable=True,
861 strict_ns_unverifiable=False,
862 strict_ns_domain=DomainLiberal,
863 strict_ns_set_initial_dollar=False,
864 strict_ns_set_path=False,
865 ):
866 """Constructor arguments should be passed as keyword arguments only."""
867 self.netscape = netscape
868 self.rfc2965 = rfc2965
Neal Norwitz71dad722005-12-23 21:43:48 +0000869 self.rfc2109_as_netscape = rfc2109_as_netscape
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000870 self.hide_cookie2 = hide_cookie2
871 self.strict_domain = strict_domain
872 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
873 self.strict_ns_unverifiable = strict_ns_unverifiable
874 self.strict_ns_domain = strict_ns_domain
875 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
876 self.strict_ns_set_path = strict_ns_set_path
877
878 if blocked_domains is not None:
879 self._blocked_domains = tuple(blocked_domains)
880 else:
881 self._blocked_domains = ()
882
883 if allowed_domains is not None:
884 allowed_domains = tuple(allowed_domains)
885 self._allowed_domains = allowed_domains
886
887 def blocked_domains(self):
888 """Return the sequence of blocked domains (as a tuple)."""
889 return self._blocked_domains
890 def set_blocked_domains(self, blocked_domains):
891 """Set the sequence of blocked domains."""
892 self._blocked_domains = tuple(blocked_domains)
893
894 def is_blocked(self, domain):
895 for blocked_domain in self._blocked_domains:
896 if user_domain_match(domain, blocked_domain):
897 return True
898 return False
899
def allowed_domains(self):
    """Return the allow-list as a tuple, or None if no allow-list is set."""
    current = self._allowed_domains
    return current
def set_allowed_domains(self, allowed_domains):
    """Set the allow-list from an iterable of domains, or clear it with None."""
    self._allowed_domains = (
        None if allowed_domains is None else tuple(allowed_domains))
908
def is_not_allowed(self, domain):
    """Return True if an allow-list is set and domain matches none of it.

    With no allow-list (None) every domain is permitted, so the answer is
    False.
    """
    allow_list = self._allowed_domains
    if allow_list is None:
        return False
    return not any(user_domain_match(domain, allowed)
                   for allowed in allow_list)
916
def set_ok(self, cookie, request):
    """
    If you override .set_ok(), be sure to call this method.  If it returns
    false, so should your subclass (assuming your subclass wants to be more
    strict about which cookies to accept).

    Runs the set_ok_* checks in a fixed order and stops at the first one
    that rejects the cookie.
    """
    _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    assert cookie.name is not None

    checks = ("version", "verifiability", "name", "path", "domain", "port")
    # all() on a generator short-circuits, so later checks are neither
    # looked up nor called once one has failed.
    return all(getattr(self, "set_ok_" + check)(cookie, request)
               for check in checks)
935
def set_ok_version(self, cookie, request):
    """Return True if the cookie's protocol version is enabled by policy."""
    version = cookie.version
    if version is None:
        # Version is always set to 0 by parse_ns_headers if it's a Netscape
        # cookie, so this must be an invalid RFC 2965 cookie.
        _debug(" Set-Cookie2 without version attribute (%s=%s)",
               cookie.name, cookie.value)
        return False
    if version > 0:
        if not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
    elif version == 0 and not self.netscape:
        _debug(" Netscape cookies are switched off")
        return False
    return True
950
def set_ok_verifiability(self, cookie, request):
    """Reject third-party cookies on unverifiable requests when policy says so.

    Only applies when the request is both unverifiable and third-party;
    which strictness switch is consulted depends on the cookie version.
    """
    if not (request.is_unverifiable() and is_third_party(request)):
        return True
    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        _debug(" third-party RFC 2965 cookie during "
               "unverifiable transaction")
        return False
    if cookie.version == 0 and self.strict_ns_unverifiable:
        _debug(" third-party Netscape cookie during "
               "unverifiable transaction")
        return False
    return True
962
def set_ok_name(self, cookie, request):
    """Reject Netscape cookies whose name starts with '$' under strict policy."""
    # Try and stop servers setting V0 cookies designed to hack other
    # servers that know both V0 and V1 protocols.
    suspicious = (cookie.version == 0 and
                  self.strict_ns_set_initial_dollar and
                  cookie.name.startswith("$"))
    if suspicious:
        _debug(" illegal name (starts with '$'): '%s'", cookie.name)
        return False
    return True
971
def set_ok_path(self, cookie, request):
    """Check an explicitly given Path attribute against the request path.

    The prefix test is applied to RFC 2965 cookies always, and to Netscape
    cookies only when strict_ns_set_path is on.  Cookies without an
    explicit path always pass.
    """
    if not cookie.path_specified:
        return True
    req_path = request_path(request)
    must_be_prefix = (cookie.version > 0 or
                      (cookie.version == 0 and self.strict_ns_set_path))
    if must_be_prefix and not req_path.startswith(cookie.path):
        _debug(" path attribute %s is not a prefix of request "
               "path %s", cookie.path, req_path)
        return False
    return True
982
def set_ok_domain(self, cookie, request):
    """Return True if the cookie's domain is acceptable for this request.

    The user block-list and allow-list are consulted first.  The remaining
    checks apply only when the Domain attribute was given explicitly; which
    of them run depends on the cookie version and on the strict_domain and
    strict_ns_domain policy settings.
    """
    domain = cookie.domain
    if self.is_blocked(domain):
        _debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        _debug(" domain %s is not in user allow-list", domain)
        return False
    if not cookie.domain_specified:
        return True

    req_host, erhn = eff_request_host(request)
    version = cookie.version

    if self.strict_domain and domain.count(".") >= 2:
        # XXX This should probably be compared with the Konqueror
        # (kcookiejar.cpp) and Mozilla implementations, but it's a
        # losing battle.
        last_dot = domain.rfind(".")
        if domain.rfind(".", 0, last_dot) == 0:  # domain like .foo.bar
            tld = domain[last_dot+1:]
            sld = domain[1:last_dot]
            if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
               "gov", "mil", "int", "aero", "biz", "cat", "coop",
               "info", "jobs", "mobi", "museum", "name", "pro",
               "travel", "eu") and len(tld) == 2:
                # domain like .co.uk
                _debug(" country-code second level domain %s", domain)
                return False

    undotted_domain = domain[1:] if domain.startswith(".") else domain
    if "." not in undotted_domain and domain != ".local":
        _debug(" non-local domain %s contains no embedded dot",
               domain)
        return False

    # Netscape cookies: the effective request-host, with or without an
    # added leading dot, has to end with the cookie domain.
    if version == 0 and not (erhn.endswith(domain) or
                             erhn.startswith(".") or
                             ("."+erhn).endswith(domain)):
        _debug(" effective request-host %s (even with added "
               "initial dot) does not end with %s",
               erhn, domain)
        return False

    if version > 0 or (self.strict_ns_domain & self.DomainRFC2965Match):
        if not domain_match(erhn, domain):
            _debug(" effective request-host %s does not domain-match "
                   "%s", erhn, domain)
            return False

    if version > 0 or (self.strict_ns_domain & self.DomainStrictNoDots):
        host_prefix = req_host[:-len(domain)]
        if "." in host_prefix and not IPV4_RE.search(req_host):
            _debug(" host prefix %s for domain %s contains a dot",
                   host_prefix, domain)
            return False
    return True
1041
def set_ok_port(self, cookie, request):
    """Check an explicit Port attribute against the port of the request.

    Every entry of the comma-separated port list examined before a match
    must be numeric, and one entry must equal the request port ("80" when
    request_port() gives None).  Cookies without an explicit port pass.
    """
    if not cookie.port_specified:
        return True

    port = request_port(request)
    req_port = "80" if port is None else str(port)

    for candidate in cookie.port.split(","):
        try:
            int(candidate)
        except ValueError:
            _debug(" bad port %s (not numeric)", candidate)
            return False
        if candidate == req_port:
            return True

    _debug(" request port (%s) not found in %s",
           req_port, cookie.port)
    return False
1062
def return_ok(self, cookie, request):
    """
    If you override .return_ok(), be sure to call this method.  If it
    returns false, so should your subclass (assuming your subclass wants to
    be more strict about which cookies to return).

    Runs the return_ok_* checks in a fixed order and stops at the first
    one that rejects the cookie.
    """
    # Path has already been checked by .path_return_ok(), and domain
    # blocking done by .domain_return_ok().
    _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    checks = ("version", "verifiability", "secure", "expires", "port",
              "domain")
    # all() on a generator short-circuits at the first failing check.
    return all(getattr(self, "return_ok_" + check)(cookie, request)
               for check in checks)
1080
def return_ok_version(self, cookie, request):
    """Return True if cookies of this protocol version may be returned."""
    version = cookie.version
    if version > 0:
        if not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
    elif version == 0 and not self.netscape:
        _debug(" Netscape cookies are switched off")
        return False
    return True
1089
def return_ok_verifiability(self, cookie, request):
    """Withhold third-party cookies on unverifiable requests when policy says so.

    Only applies when the request is both unverifiable and third-party;
    which strictness switch is consulted depends on the cookie version.
    """
    if not (request.is_unverifiable() and is_third_party(request)):
        return True
    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        _debug(" third-party RFC 2965 cookie during unverifiable "
               "transaction")
        return False
    if cookie.version == 0 and self.strict_ns_unverifiable:
        _debug(" third-party Netscape cookie during unverifiable "
               "transaction")
        return False
    return True
1101
def return_ok_secure(self, cookie, request):
    """Return False for a secure cookie on a request whose type is not https."""
    insecure_request = request.get_type() != "https" if cookie.secure else False
    if insecure_request:
        _debug(" secure cookie with non-secure request")
        return False
    return True
1107
def return_ok_expires(self, cookie, request):
    """Return False if the cookie is expired as of self._now."""
    expired = cookie.is_expired(self._now)
    if expired:
        _debug(" cookie expired")
        return False
    return True
1113
def return_ok_port(self, cookie, request):
    """Return True if the request port is among the cookie's ports.

    A cookie with no port restriction (falsy cookie.port) always passes.
    "80" is used when request_port() gives None.
    """
    if not cookie.port:
        return True

    req_port = request_port(request)
    if req_port is None:
        req_port = "80"

    if req_port not in cookie.port.split(","):
        _debug(" request port %s does not match cookie port %s",
               req_port, cookie.port)
        return False
    return True
1127
def return_ok_domain(self, cookie, request):
    """Return True if the cookie's domain permits returning it on request.

    RFC 2965 cookies must domain-match the effective request-host name.
    Netscape cookies must be a dot-delimited suffix of it: the comparison
    uses a dotted form of the cookie domain, so a host-only cookie stored
    for "example.com" is not returned to "pythonicexample.com".
    """
    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    # Host-only cookies are stored without a leading dot.  Add one for the
    # suffix comparison so that it can only succeed on a label boundary.
    if domain and not domain.startswith("."):
        dotdomain = "." + domain
    else:
        dotdomain = domain

    # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
    if (cookie.version == 0 and
        (self.strict_ns_domain & self.DomainStrictNonDomain) and
        not cookie.domain_specified and domain != erhn):
        _debug(" cookie with unspecified domain does not string-compare "
               "equal to request domain")
        return False

    if cookie.version > 0 and not domain_match(erhn, domain):
        _debug(" effective request-host name %s does not domain-match "
               "RFC 2965 cookie domain %s", erhn, domain)
        return False
    if cookie.version == 0 and not ("."+erhn).endswith(dotdomain):
        _debug(" request-host %s does not match Netscape cookie domain "
               "%s", req_host, domain)
        return False
    return True
1149
def domain_return_ok(self, domain, request):
    """Cheap pre-check: could cookies stored under domain go to request?

    Returns False when neither the request host nor the effective
    request-host name ends with domain on a label boundary, or when the
    domain is excluded by the user block-list or allow-list.
    """
    # Liberal check of domain.  This is here as an optimization to avoid
    # having to load lots of MSIE cookie files unless necessary.
    req_host, erhn = eff_request_host(request)
    if not req_host.startswith("."):
        req_host = "."+req_host
    if not erhn.startswith("."):
        erhn = "."+erhn
    # Compare dotted forms on both sides; otherwise a host-only cookie
    # domain such as "example.com" would also match "pythonicexample.com".
    if domain and not domain.startswith("."):
        dotdomain = "." + domain
    else:
        dotdomain = domain
    if not (req_host.endswith(dotdomain) or erhn.endswith(dotdomain)):
        #_debug(" request domain %s does not match cookie domain %s",
        #       req_host, domain)
        return False

    if self.is_blocked(domain):
        _debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        _debug(" domain %s is not in user allow-list", domain)
        return False

    return True
1171
def path_return_ok(self, path, request):
    """Return True if the request path path-matches the cookie path.

    The request path matches when it equals path, or when path is a prefix
    of it that ends on a "/" boundary.  A bare prefix test is not enough:
    it would let cookie path "/foo" match request path "/foobar".
    """
    _debug("- checking cookie path=%s", path)
    req_path = request_path(request)
    if req_path == path:
        return True
    pathlen = len(path)
    if (req_path.startswith(path) and
        (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")):
        return True

    _debug(" %s does not path-match %s", req_path, path)
    return False
1179
1180
def vals_sorted_by_key(adict):
    """Return a lazy iterator over adict's values, ordered by sorted key."""
    ordered_keys = sorted(adict)
    # map() stays lazy: each value is fetched when the iterator is advanced.
    return map(adict.get, ordered_keys)
1184
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key."""
    # The key order is fixed up front; each value is looked up only when
    # its turn comes.
    for key in sorted(mapping.keys()):
        item = mapping.get(key)
        if hasattr(item, "items"):
            # Anything exposing .items is treated as a nested mapping.
            for leaf in deepvalues(item):
                yield leaf
        else:
            yield item
1200
1201
# Sentinel passed as the default (second) argument to dict.get(), so that a
# missing key can be told apart from a key whose stored value is None.
class Absent:
    pass
1205
1206class CookieJar:
1207 """Collection of HTTP cookies.
1208
1209 You may not need to know about this class: try
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001210 urllib.request.build_opener(HTTPCookieProcessor).open(url).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001211 """
1212
1213 non_word_re = re.compile(r"\W")
1214 quote_re = re.compile(r"([\"\\])")
1215 strict_domain_re = re.compile(r"\.?[^.]*")
1216 domain_re = re.compile(r"[^.]*")
1217 dots_re = re.compile(r"^\.+")
1218
Antoine Pitroufd036452008-08-19 17:56:33 +00001219 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001220
def __init__(self, policy=None):
    """Create an empty jar governed by policy.

    A DefaultCookiePolicy instance is created when policy is None.
    """
    self._policy = DefaultCookiePolicy() if policy is None else policy

    # Re-entrant lock taken by the methods that read or modify the store.
    self._cookies_lock = _threading.RLock()
    # Nested dicts: domain -> path -> name -> cookie.
    self._cookies = {}
1228
def set_policy(self, policy):
    """Replace the policy object this jar consults."""
    self._policy = policy
1231
def _cookies_for_domain(self, domain, request):
    """Return the cookies stored under domain that may go back on request.

    The policy is asked three times, from coarse to fine: once for the
    domain, once per stored path, and once per cookie.
    """
    policy = self._policy
    if not policy.domain_return_ok(domain, request):
        return []
    _debug("Checking %s for cookies to return", domain)

    matches = []
    by_path = self._cookies[domain]
    for path in by_path.keys():
        if not policy.path_return_ok(path, request):
            continue
        for cookie in by_path[path].values():
            if policy.return_ok(cookie, request):
                _debug(" it's a match")
                matches.append(cookie)
            else:
                _debug(" not returning cookie")
    return matches
1249
def _cookies_for_request(self, request):
    """Return a list of cookies to be returned to server."""
    matches = []
    # Gather the per-domain results, in the store's domain order.
    for domain in self._cookies:
        matches += self._cookies_for_domain(domain, request)
    return matches
1256
def _cookie_attrs(self, cookies):
    """Return a list of cookie-attributes to be returned to server.

    like ['foo="bar"; $Path="/"', ...]

    The $Version attribute is also added when appropriate (currently only
    once per request).

    Note that the cookies list is sorted in place.
    """
    # add cookies in order of most specific (ie. longest) path first
    cookies.sort(key=lambda c: len(c.path), reverse=True)

    attrs = []
    for position, cookie in enumerate(cookies):
        version = cookie.version

        # set version of Cookie header
        # XXX
        # What should it be if multiple matching Set-Cookie headers have
        # different versions themselves?
        # Answer: there is no answer; was supposed to be settled by
        # RFC 2965 errata, but that may never appear...
        # Only the first cookie decides whether $Version is emitted.
        if position == 0 and version > 0:
            attrs.append("$Version=%s" % version)

        # add cookie-attributes to be returned in Cookie header
        raw_value = cookie.value
        if raw_value is None:
            attrs.append(cookie.name)
        else:
            # quote cookie value if necessary
            # (not for Netscape protocol, which already has any quotes
            # intact, due to the poorly-specified Netscape Cookie: syntax)
            if self.non_word_re.search(raw_value) and version > 0:
                value = self.quote_re.sub(r"\\\1", raw_value)
            else:
                value = raw_value
            attrs.append("%s=%s" % (cookie.name, value))

        if version <= 0:
            continue

        if cookie.path_specified:
            attrs.append('$Path="%s"' % cookie.path)
        if cookie.domain.startswith("."):
            domain = cookie.domain
            # Drop the dot again if the server did not send one itself.
            if not cookie.domain_initial_dot:
                domain = domain[1:]
            attrs.append('$Domain="%s"' % domain)
        if cookie.port is not None:
            port_attr = "$Port"
            if cookie.port_specified:
                port_attr += '="%s"' % cookie.port
            attrs.append(port_attr)

    return attrs
1315
def add_cookie_header(self, request):
    """Add correct Cookie: header to request (urllib.request.Request object).

    The Cookie2 header is also added unless policy.hide_cookie2 is true.

    Headers already present on the request are left untouched.  Expired
    cookies are cleared from the jar afterwards.
    """
    _debug("add_cookie_header")
    with self._cookies_lock:
        self._policy._now = self._now = int(time.time())

        cookies = self._cookies_for_request(request)

        attrs = self._cookie_attrs(cookies)
        if attrs and not request.has_header("Cookie"):
            request.add_unredirected_header("Cookie", "; ".join(attrs))

        # if necessary, advertise that we know RFC 2965
        policy = self._policy
        advertise = (policy.rfc2965 and not policy.hide_cookie2 and
                     not request.has_header("Cookie2"))
        if advertise and any(cookie.version != 1 for cookie in cookies):
            request.add_unredirected_header("Cookie2", '$Version="1"')

    self.clear_expired_cookies()
1348
def _normalized_cookie_tuples(self, attrs_set):
    """Return list of tuples containing normalised cookie information.

    attrs_set is the list of lists of key,value pairs extracted from
    the Set-Cookie or Set-Cookie2 headers.

    Tuples are name, value, standard, rest, where name and value are the
    cookie name and value, standard is a dictionary containing the standard
    cookie-attributes (discard, secure, version, expires or max-age,
    domain, path and port) and rest is a dictionary containing the rest of
    the cookie-attributes.

    A cookie with a missing domain value, a missing or non-numeric max-age,
    or a missing value for another standard attribute (other than port,
    comment and commenturl) is dropped from the result.
    """
    cookie_tuples = []

    boolean_attrs = "discard", "secure"
    value_attrs = ("version",
                   "expires", "max-age",
                   "domain", "path", "port",
                   "comment", "commenturl")

    for cookie_attrs in attrs_set:
        name, value = cookie_attrs[0]

        # Build dictionary of standard cookie-attributes (standard) and
        # dictionary of other cookie-attributes (rest).

        # Note: expiry time is normalised to seconds since epoch.  V0
        # cookies should have the Expires cookie-attribute, and V1 cookies
        # should have Max-Age, but since V1 includes RFC 2109 cookies (and
        # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
        # accept either (but prefer Max-Age).
        max_age_set = False

        bad_cookie = False

        standard = {}
        rest = {}
        for k, v in cookie_attrs[1:]:
            lc = k.lower()
            # don't lose case distinction for unknown fields
            if lc in value_attrs or lc in boolean_attrs:
                k = lc
            if k in boolean_attrs and v is None:
                # boolean cookie-attribute is present, but has no value
                # (like "discard", rather than "port=80")
                v = True
            if k in standard:
                # only first value is significant
                continue
            if k == "domain":
                if v is None:
                    _debug(" missing value for domain attribute")
                    bad_cookie = True
                    break
                # RFC 2965 section 3.3.3
                v = v.lower()
            if k == "expires":
                if max_age_set:
                    # Prefer max-age to expires (like Mozilla)
                    continue
                if v is None:
                    _debug(" missing or invalid value for expires "
                           "attribute: treating as session cookie")
                    continue
            if k == "max-age":
                max_age_set = True
                try:
                    v = int(v)
                except (TypeError, ValueError):
                    # TypeError covers a max-age attribute with no value
                    # at all (v is None); ValueError a non-numeric one.
                    _debug(" missing or invalid (non-numeric) value for "
                           "max-age attribute")
                    bad_cookie = True
                    break
                # convert RFC 2965 Max-Age to seconds since epoch
                # XXX Strictly you're supposed to follow RFC 2616
                #   age-calculation rules.  Remember that zero Max-Age
                #   is a request to discard (old and new) cookie, though.
                k = "expires"
                v = self._now + v
            if (k in value_attrs) or (k in boolean_attrs):
                if (v is None and
                    k not in ("port", "comment", "commenturl")):
                    _debug(" missing value for %s attribute", k)
                    bad_cookie = True
                    break
                standard[k] = v
            else:
                rest[k] = v

        if bad_cookie:
            continue

        cookie_tuples.append((name, value, standard, rest))

    return cookie_tuples
1445
def _cookie_from_cookie_tuple(self, tup, request):
    """Build a Cookie from one normalised (name, value, standard, rest) tuple.

    Attributes the server left out are defaulted from the request.  Returns
    None when the version is not an integer, or when the expiry time is not
    in the future; in the latter case any stored cookie with the same
    domain, path and name is cleared from the jar.
    """
    # standard is dict of standard cookie-attributes, rest is dict of the
    # rest of them
    name, value, standard, rest = tup

    domain = standard.get("domain", Absent)
    path = standard.get("path", Absent)
    port = standard.get("port", Absent)
    expires = standard.get("expires", Absent)

    # set the easy defaults
    version = standard.get("version", None)
    if version is not None:
        try:
            version = int(version)
        except ValueError:
            return None  # invalid version, ignore cookie
    secure = standard.get("secure", False)
    # (discard is also set if expires is Absent)
    discard = standard.get("discard", False)
    comment = standard.get("comment", None)
    comment_url = standard.get("commenturl", None)

    # set default path
    if path is Absent or path == "":
        path_specified = False
        path = request_path(request)
        last_slash = path.rfind("/")
        if last_slash != -1:
            if version == 0:
                # Netscape spec parts company from reality here
                path = path[:last_slash]
            else:
                path = path[:last_slash+1]
        if not path:
            path = "/"
    else:
        path_specified = True
        path = escape_path(path)

    # set default domain, remembering whether the server's value started
    # with a dot before one is added
    domain_specified = domain is not Absent
    if domain_specified:
        domain_initial_dot = bool(domain.startswith("."))
        if not domain_initial_dot:
            domain = "."+domain
    else:
        domain_initial_dot = False
        req_host, erhn = eff_request_host(request)
        domain = erhn

    # set default port
    port_specified = False
    if port is Absent:
        # No port attr present.  Cookie can be sent back on any port.
        port = None
    elif port is None:
        # Port attr present, but has no value: default to request port.
        # Cookie should then only be sent back on that port.
        port = request_port(request)
    else:
        port_specified = True
        port = re.sub(r"\s+", "", port)

    # set default expires and discard
    if expires is Absent:
        expires = None
        discard = True
    elif expires <= self._now:
        # Expiry date in past is request to delete cookie.  This can't be
        # in DefaultCookiePolicy, because can't delete cookies there.
        try:
            self.clear(domain, path, name)
        except KeyError:
            pass
        _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
               domain, path, name)
        return None

    return Cookie(version,
                  name, value,
                  port, port_specified,
                  domain, domain_specified, domain_initial_dot,
                  path, path_specified,
                  secure,
                  expires,
                  discard,
                  comment,
                  comment_url,
                  rest)
1537
def _cookies_from_attrs_set(self, attrs_set, request):
    """Turn parsed header attributes into a list of cookies.

    Tuples for which _cookie_from_cookie_tuple() returns a false value
    are left out.
    """
    candidates = (self._cookie_from_cookie_tuple(tup, request)
                  for tup in self._normalized_cookie_tuples(attrs_set))
    return [cookie for cookie in candidates if cookie]
1546
def _process_rfc2109_cookies(self, cookies):
    """Flag version-1 cookies as RFC 2109 and optionally downgrade them.

    The policy's rfc2109_as_netscape setting decides whether they are
    downgraded to version 0; when it is None or missing, they are
    downgraded exactly when RFC 2965 handling is switched off.
    """
    downgrade = getattr(self._policy, 'rfc2109_as_netscape', None)
    if downgrade is None:
        downgrade = not self._policy.rfc2965
    for cookie in cookies:
        if cookie.version != 1:
            continue
        cookie.rfc2109 = True
        if downgrade:
            # treat 2109 cookies as Netscape cookies rather than
            # as RFC2965 cookies
            cookie.version = 0
1558
def make_cookies(self, response, request):
    """Return sequence of Cookie objects extracted from response object."""
    # get cookie-attributes for RFC 2965 and Netscape protocols
    headers = response.info()
    rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
    ns_hdrs = headers.get_all("Set-Cookie", [])

    rfc2965 = self._policy.rfc2965
    netscape = self._policy.netscape

    if ((not rfc2965_hdrs and not ns_hdrs) or
        (not ns_hdrs and not rfc2965) or
        (not rfc2965_hdrs and not netscape) or
        (not netscape and not rfc2965)):
        return []  # no relevant cookie headers: quick exit

    try:
        cookies = self._cookies_from_attrs_set(
            split_header_words(rfc2965_hdrs), request)
    except Exception:
        _warn_unhandled_exception()
        cookies = []

    if not (ns_hdrs and netscape):
        return cookies

    try:
        # RFC 2109 and Netscape cookies
        ns_cookies = self._cookies_from_attrs_set(
            parse_ns_headers(ns_hdrs), request)
    except Exception:
        _warn_unhandled_exception()
        ns_cookies = []
    self._process_rfc2109_cookies(ns_cookies)

    # Look for Netscape cookies (from Set-Cookie headers) that match
    # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
    # For each match, keep the RFC 2965 cookie and ignore the Netscape
    # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
    # bundled in with the Netscape cookies for this purpose, which is
    # reasonable behaviour.
    if rfc2965:
        taken = {(c.domain, c.path, c.name) for c in cookies}
        ns_cookies = [c for c in ns_cookies
                      if (c.domain, c.path, c.name) not in taken]

    if ns_cookies:
        cookies.extend(ns_cookies)

    return cookies
1612
def set_cookie_if_ok(self, cookie, request):
    """Set a cookie if policy says it's OK to do so."""
    with self._cookies_lock:
        # Refresh the timestamp kept on both the policy and the jar.
        self._policy._now = self._now = int(time.time())

        if self._policy.set_ok(cookie, request):
            self.set_cookie(cookie)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001625
def set_cookie(self, cookie):
    """Set a cookie, without checking whether or not it should be set."""
    store = self._cookies
    with self._cookies_lock:
        # The store is nested: domain -> path -> name -> cookie.  Missing
        # levels are created on demand.
        by_path = store.setdefault(cookie.domain, {})
        by_name = by_path.setdefault(cookie.path, {})
        by_name[cookie.name] = cookie
1638
def extract_cookies(self, response, request):
    """Extract cookies from response, where allowable given the request."""
    _debug("extract_cookies: %s", response.info())
    with self._cookies_lock:
        # Refresh the timestamp kept on both the policy and the jar.
        self._policy._now = self._now = int(time.time())

        for cookie in self.make_cookies(response, request):
            if not self._policy.set_ok(cookie, request):
                continue
            _debug(" setting cookie: %s", cookie)
            self.set_cookie(cookie)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001652
def clear(self, domain=None, path=None, name=None):
    """Clear some cookies.

    Invoking this method without arguments will clear all cookies.  If
    given a single argument, only cookies belonging to that domain will be
    removed.  If given two arguments, cookies belonging to the specified
    path within that domain are removed.  If given three arguments, then
    the cookie with the specified name, path and domain is removed.

    Raises KeyError if no matching cookie exists, and ValueError if a
    name or path is given without the arguments that precede it.

    """
    if name is not None:
        if (domain is None) or (path is None):
            raise ValueError(
                "domain and path must be given to remove a cookie by name")
        del self._cookies[domain][path][name]
        return

    if path is not None:
        if domain is None:
            raise ValueError(
                "domain must be given to remove cookies by path")
        del self._cookies[domain][path]
        return

    if domain is not None:
        del self._cookies[domain]
        return

    self._cookies = {}
1679
1680 def clear_session_cookies(self):
1681 """Discard all session cookies.
1682
1683 Note that the .save() method won't save session cookies anyway, unless
1684 you ask otherwise by passing a true ignore_discard argument.
1685
1686 """
1687 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001688 try:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001689 for cookie in self:
1690 if cookie.discard:
1691 self.clear(cookie.domain, cookie.path, cookie.name)
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001692 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001693 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001694
1695 def clear_expired_cookies(self):
1696 """Discard all expired cookies.
1697
1698 You probably don't need to call this method: expired cookies are never
1699 sent back to the server (provided you're using DefaultCookiePolicy),
1700 this method is called by CookieJar itself every so often, and the
1701 .save() method won't save expired cookies anyway (unless you ask
1702 otherwise by passing a true ignore_expires argument).
1703
1704 """
1705 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001706 try:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001707 now = time.time()
1708 for cookie in self:
1709 if cookie.is_expired(now):
1710 self.clear(cookie.domain, cookie.path, cookie.name)
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001711 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001712 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001713
1714 def __iter__(self):
1715 return deepvalues(self._cookies)
1716
1717 def __len__(self):
1718 """Return number of contained cookies."""
1719 i = 0
1720 for cookie in self: i = i + 1
1721 return i
1722
1723 def __repr__(self):
1724 r = []
1725 for cookie in self: r.append(repr(cookie))
1726 return "<%s[%s]>" % (self.__class__, ", ".join(r))
1727
1728 def __str__(self):
1729 r = []
1730 for cookie in self: r.append(str(cookie))
1731 return "<%s[%s]>" % (self.__class__, ", ".join(r))
1732
1733
# derives from IOError for backwards-compatibility with Python 2.4.0
class LoadError(IOError):
    """Raised when a cookies file is not in the expected format."""
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001736
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename, when given, must support concatenation with a str;
        otherwise ValueError is raised.  delayload is stored as a bool and
        policy is handed to CookieJar.__init__.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                filename + ""
            except Exception:
                # Only ordinary errors mean "not string-like"; a bare except
                # would also turn KeyboardInterrupt/SystemExit into
                # ValueError.
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented here; this method always raises NotImplementedError.
        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised when neither
        is set.  The opened file is passed to self._really_load() together
        with the ignore_discard and ignore_expires flags.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with self._cookies_lock:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except Exception:
                # Roll back on any failed load, not just LoadError/IOError,
                # so an unexpected error cannot leave the jar emptied.  The
                # original exception still propagates unchanged.
                self._cookies = old_state
                raise
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001796
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001797
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    Actually, the format is extended a bit -- see module docstring.

    """
    fields = [(cookie.name, cookie.value),
              ("path", cookie.path),
              ("domain", cookie.domain)]
    if cookie.port is not None:
        fields.append(("port", cookie.port))

    # Flag-style attributes are emitted as a bare key (value None).
    flags = (("path_spec", cookie.path_specified),
             ("port_spec", cookie.port_specified),
             ("domain_dot", cookie.domain_initial_dot),
             ("secure", cookie.secure))
    for key, present in flags:
        if present:
            fields.append((key, None))

    if cookie.expires:
        fields.append(("expires", time2isoz(float(cookie.expires))))
    if cookie.discard:
        fields.append(("discard", None))
    if cookie.comment:
        fields.append(("comment", cookie.comment))
    if cookie.comment_url:
        fields.append(("commenturl", cookie.comment_url))

    # Non-standard attributes follow in sorted key order, stringified.
    fields.extend((key, str(cookie._rest[key]))
                  for key in sorted(cookie._rest))

    fields.append(("version", str(cookie.version)))

    return join_header_words([fields])
1825
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        ignore_discard and ignore_expires: see docstring for FileCookieJar.save

        """
        now = time.time()
        lines = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            lines.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        # The trailing empty element makes the result end with a newline
        # whenever at least one header was produced.
        return "\n".join(lines + [""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename in the Set-Cookie3 format.

        filename defaults to self.filename; ValueError is raised when neither
        is set.  ignore_discard and ignore_expires are forwarded to
        as_lwp_str().
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename, "w") as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Read Set-Cookie3 lines from the open file f into the jar.

        The first line must match self.magic_re (not defined in this class),
        otherwise LoadError is raised.  Subsequent lines that do not start
        with "Set-Cookie3:" are skipped.  Cookies with discard set, or already
        expired, are dropped unless the matching ignore_* flag is true.

        IOError propagates unchanged; any other exception raised while
        parsing is reported as LoadError naming the file and current line.
        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound up front so the error handler below can always report it,
        # even when the very first readline() in the loop fails.
        line = ""
        try:
            while True:
                line = f.readline()
                if line == "":
                    break
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    # Boolean attributes default to False when absent.
                    standard = dict.fromkeys(boolean_attrs, False)
                    rest = {}
                    for k, v in data[1:]:
                        lc = k.lower() if k is not None else None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            # A bare flag (no value) means "true".
                            if v is None:
                                v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    # No usable expiry: treat the cookie as discardable.
                    if expires is None:
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)

        except IOError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1948
1949
class MozillaCookieJar(FileCookieJar):
    """

    WARNING: you may want to backup your browser's cookies file if you use
    this class to save cookies. I *think* it works, but there have been
    bugs in the past!

    This class differs from CookieJar only in the format it uses to save and
    load cookies to and from a file. This class uses the Mozilla/Netscape
    `cookies.txt' format. lynx uses this file format, too.

    Don't expect cookies saved while the browser is running to be noticed by
    the browser (in fact, Mozilla on unix will overwrite your saved cookies if
    you change them on disk while it's running; on Windows, you probably can't
    save at all while the browser is running).

    Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
    Netscape cookies on saving.

    In particular, the cookie version and port number information is lost,
    together with information about whether or not Path, Port and Discard were
    specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
    domain as set in the HTTP header started with a dot (yes, I'm aware some
    domains in Netscape files start with a dot and some don't -- trust me, you
    really don't want to know any more about this).

    Note that though Mozilla and Netscape use the same format, they use
    slightly different headers. The class saves cookies using the Netscape
    header by default (Mozilla can cope with that).

    """
    magic_re = re.compile("#( Netscape)? HTTP Cookie File")
    # Written verbatim at the top of every saved file; its first line is what
    # magic_re recognises on load.
    # NOTE(review): confirm the exact spacing inside this literal against the
    # canonical file before merging.
    header = """\
# Netscape HTTP Cookie File
# http://www.netscape.com/newsref/std/cookie_spec.html
# This is a generated file! Do not edit.

"""

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Read tab-separated cookie records from the open file f.

        The first line must match magic_re; otherwise f is closed and
        LoadError is raised.  Blank lines and lines starting with "#" or "$"
        are skipped.  Every other line must hold seven tab-separated fields:
        domain, domain flag, path, secure flag, expires, name, value.
        Cookies with discard set, or already expired, are dropped unless the
        matching ignore_* flag is true.

        IOError propagates unchanged; any other exception raised while
        parsing is reported as LoadError naming the file and current line.
        """
        now = time.time()

        magic = f.readline()
        if not self.magic_re.search(magic):
            f.close()
            raise LoadError(
                "%r does not look like a Netscape format cookies file" %
                filename)

        # Bound up front so the error handler below can always report it,
        # even when the very first readline() in the loop fails.
        line = ""
        try:
            while True:
                line = f.readline()
                if line == "":
                    break

                # last field may be absent, so keep any trailing tab
                if line.endswith("\n"):
                    line = line[:-1]

                # skip comments and blank lines XXX what is $ for?
                stripped = line.strip()
                if stripped.startswith(("#", "$")) or stripped == "":
                    continue

                domain, domain_specified, path, secure, expires, name, value = \
                    line.split("\t")
                secure = (secure == "TRUE")
                domain_specified = (domain_specified == "TRUE")
                if name == "":
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = value
                    value = None

                initial_dot = domain.startswith(".")
                # Explicit check rather than assert, so the validation also
                # runs under -O; the handler below turns it into LoadError
                # just as it did the AssertionError.
                if domain_specified != initial_dot:
                    raise ValueError(
                        "domain flag does not match leading dot of %r" %
                        domain)

                discard = False
                if expires == "":
                    expires = None
                    discard = True

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                if not ignore_discard and c.discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                self.set_cookie(c)

        except IOError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Netscape format cookies file %r: %r" %
                            (filename, line))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename in the Netscape cookies.txt format.

        filename defaults to self.filename; ValueError is raised when neither
        is set.  Cookies with discard set, or already expired, are left out
        unless the matching ignore_* flag is true.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename, "w") as f:
            f.write(self.header)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = "TRUE" if cookie.secure else "FALSE"
                initial_dot = ("TRUE" if cookie.domain.startswith(".")
                               else "FALSE")
                if cookie.expires is not None:
                    expires = str(cookie.expires)
                else:
                    expires = ""
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ""
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    "\t".join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) +
                    "\n")