blob: 4dc468bacc9ac678da872b4e8935a9dab11e38d6 [file] [log] [blame]
Éric Araujo23760e92011-11-07 17:52:48 +01001r"""HTTP cookie handling for web clients.
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00002
3This module has (now fairly distant) origins in Gisle Aas' Perl module
4HTTP::Cookies, from the libwww-perl library.
5
6Docstrings, comments and debug strings in this code refer to the
7attributes of the HTTP cookie system as cookie-attributes, to distinguish
8them clearly from Python attributes.
9
Thomas Wouters477c8d52006-05-27 19:21:47 +000010Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
11distributed with the Python standard library, but are available from
12http://wwwsearch.sf.net/):
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000013
14 CookieJar____
15 / \ \
16 FileCookieJar \ \
17 / | \ \ \
18 MozillaCookieJar | LWPCookieJar \ \
19 | | \
20 | ---MSIEBase | \
21 | / | | \
22 | / MSIEDBCookieJar BSDDBCookieJar
23 |/
24 MSIECookieJar
25
26"""
27
# Public names of this module, i.e. what "from <module> import *" exports.
__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
           'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
30
Jeremy Hylton1afc1692008-06-18 20:49:58 +000031import copy
Victor Stinner628225c2011-03-21 02:38:51 +010032import datetime
Jeremy Hylton1afc1692008-06-18 20:49:58 +000033import re
34import time
35import urllib.parse, urllib.request
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000036try:
37 import threading as _threading
Brett Cannoncd171c82013-07-04 17:43:24 -040038except ImportError:
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000039 import dummy_threading as _threading
Georg Brandl24420152008-05-26 16:32:26 +000040import http.client # only for the default HTTP port
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000041from calendar import timegm
42
debug = False   # set to True to enable debugging via the logging module
logger = None


def _debug(*args):
    """Forward args to the module logger's debug() when debugging is on.

    Does nothing (and returns None) while the module-level ``debug`` flag
    is false.  The "http.cookiejar" logger is created lazily on first use
    and cached in the module-level ``logger`` variable.
    """
    global logger
    if debug:
        if not logger:
            # Imported lazily so logging is only loaded when it is needed.
            import logging
            logger = logging.getLogger("http.cookiejar")
        return logger.debug(*args)
    return None
54
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000055
# The default HTTP port taken from http.client, kept as a string.
DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT)
# Message text for a missing filename; presumably used by the file-based
# cookie jars defined later in this module -- usage is not visible here.
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
59
def _warn_unhandled_exception():
    """Emit a warning containing the traceback of the current exception.

    There are a few catch-all except: statements in this module, for
    catching input that's bad in unexpected ways.  This helper is meant to
    be called from such a handler so that the swallowed exception is at
    least reported.  The warning is attributed to the caller (stacklevel 2).
    """
    import io
    import traceback
    import warnings

    buf = io.StringIO()
    traceback.print_exc(file=buf)
    warnings.warn("http.cookiejar bug!\n%s" % buf.getvalue(), stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000069
70
71# Date/time conversion
72# -----------------------------------------------------------------------------
73
EPOCH_YEAR = 1970


def _timegm(tt):
    """Range-check a UTC time tuple and convert it with calendar.timegm.

    Only the first six items (year, month, day, hour, minute, second) are
    checked.  Returns None when any of them is outside the accepted range;
    note that hour 24 and seconds up to 61 are accepted.
    """
    year, month, mday, hour, minute, sec = tt[:6]
    # Guard clauses are kept in the same order as the fields of the tuple.
    if not year >= EPOCH_YEAR:
        return None
    if not 1 <= month <= 12:
        return None
    if not 1 <= mday <= 31:
        return None
    if not 0 <= hour <= 24:
        return None
    if not 0 <= minute <= 59:
        return None
    if not 0 <= sec <= 61:
        return None
    return timegm(tt)
82
# Abbreviated English weekday names, Monday first (matches datetime.weekday()).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# Abbreviated English month names, January first.
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased month names, used for case-insensitive month lookup.
# Built with a comprehension so no loop variable is left behind at module
# level.
MONTHS_LOWER = [month.lower() for month in MONTHS]
88
def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    # Timezone-aware constructors are used instead of the deprecated
    # utcnow()/utcfromtimestamp(); the broken-down UTC fields are the same.
    if t is None:
        dt = datetime.datetime.now(tz=datetime.timezone.utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=datetime.timezone.utc)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000107
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    # Timezone-aware constructors are used instead of the deprecated
    # utcnow()/utcfromtimestamp(); the broken-down UTC fields are the same.
    if t is None:
        dt = datetime.datetime.now(tz=datetime.timezone.utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=datetime.timezone.utc)
    # The weekday name is followed by a comma, as in the documented format.
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1],
        dt.year, dt.hour, dt.minute, dt.second)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000126
127
# Timezone names that mean "no offset from UTC" (only the keys are used).
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric zone such as "+0100", "-08:00" or "5": sign, hours, minutes.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)


def offset_from_tz_string(tz):
    """Return the UTC offset, in seconds, described by the string tz.

    Names listed in UTC_ZONES give 0.  Numeric offsets are converted from
    hours (and optional minutes), negated when the sign is "-".  Anything
    else is unrecognised and gives None.
    """
    if tz in UTC_ZONES:
        return 0
    found = TIMEZONE_RE.search(tz)
    if not found:
        return None
    sign, hours, minutes = found.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds += 60 * int(minutes)
    return -seconds if sign == '-' else seconds
144
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert already-split date fields to seconds since the epoch.

    All fields arrive as strings (or None for the optional clock fields and
    for tz).  mon may be an English month abbreviation or a number.  Returns
    None when the month is unusable, when the date fails the range checks
    in _timegm, or when the timezone string is not recognised.
    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower()) + 1
    except ValueError:
        # maybe it's already a number
        try:
            mon = int(mon)
        except ValueError:
            return None
        if not 1 <= mon <= 12:
            return None

    yr = int(yr)
    day = int(day)
    # missing clock elements default to zero
    hr = 0 if hr is None else int(hr)
    min = 0 if min is None else int(min)
    # Seconds may carry a fractional part (the ISO pattern allows "29.5");
    # int() alone would raise ValueError on that, so go through float() and
    # truncate.
    sec = 0 if sec is None else int(float(sec))

    if yr < 1000:
        # find "obvious" year: pick the century that puts the year within
        # 50 years of the current one
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0:
                yr = yr + 100
            else:
                yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
197
# Exactly "Wdy, DD Mon YYYY HH:MM:SS GMT".  Both fragments are raw strings so
# that "\d" is never treated as a (invalid) string escape.
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
# Leading weekday name (abbreviated or full), optional comma and whitespace.
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X | re.ASCII)


def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is a number of seconds (on the strict fast path the seconds
    field is parsed with float()).

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # The pattern only checks the shape of the month ("Jaa" passes);
            # an unknown name is an unrecognised date, not an error.
            return None
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), float(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
275
# Raw string: the pattern is full of regex escapes (\d, \s, \/, \.) that must
# reach the re module untouched.
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X | re.ASCII)


def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # clean up
    text = text.lstrip()

    # loose regexp parse
    m = ISO_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    yr, mon, day, hr, min, sec, tz, _ = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
320
321
322# Header parsing
323# -----------------------------------------------------------------------------
324
def unmatched(match):
    """Return the searched string with the matched span removed."""
    subject = match.string
    return subject[:match.start(0)] + subject[match.end(0):]
329
# Each pattern is anchored at the start of the text still to be parsed.
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")


def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    # A bare string would be iterated character by character.
    assert not isinstance(header_values, str)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    # drop the backslash of every quoted-pair
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs:
                    result.append(pairs)
                pairs = []
            else:
                # skip junk (raw pattern: "\s" is a regex escape, not a
                # string escape)
                non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs:
            result.append(pairs)
    return result
418
# Characters that must be backslash-escaped inside a quoted value.
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")


def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Pairs within one list are joined with "; ", the lists themselves
    with ", ".  Attribute values are quoted if needed; a value of None
    produces the bare key.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    def render(key, value):
        # A lone token has no "=value" part.
        if value is None:
            return key
        if not re.search(r"^\w+$", value):
            # escape " and \, then wrap the value in double quotes
            value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
        return "%s=%s" % (key, value)

    headers = []
    for pairs in lists:
        rendered = [render(key, value) for key, value in pairs]
        if rendered:
            headers.append("; ".join(rendered))
    return ", ".join(headers)
444
def strip_quotes(text):
    """Return text without one leading and one trailing double quote.

    Each end is handled independently, so an unbalanced quote is removed
    too.
    """
    unquoted = text[1:] if text.startswith('"') else text
    return unquoted[:-1] if unquoted.endswith('"') else unquoted
451
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    Each header is split on ";" into (key, value) pairs; a parameter without
    "=" gets the value None.  Known attribute names (all parameters except
    the first, which is the cookie's own name=value) are lower-cased, the
    "expires" value is converted to seconds since the epoch (None if
    invalid), and ("version", "0") is appended when no version attribute
    was present.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "":
                continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, maxsplit=1)
                k = k.lstrip()
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    # A bare "version" (no "=") has no value to unquote.
                    if v is not None:
                        v = strip_quotes(v)
                    version_set = True
                elif k == "expires":
                    # convert expires date to seconds since epoch; a bare
                    # "expires" (no "=") simply stays None
                    if v is not None:
                        v = http2time(strip_quotes(v))  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
502
503
# Text ending in ".<digits>" is treated as an IPv4 address.
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)


def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text) or text == "":
        return False
    # a host domain name neither starts nor ends with a dot
    return not (text[0] == "." or text[-1] == ".")
519
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    if not A.endswith(B):
        # A does not have form NB.  B must be a suffix of A; merely occurring
        # somewhere inside A (e.g. "www.acme.com.evil.org" vs ".acme.com") is
        # not enough.  N is necessarily non-empty here because A != B.
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
558
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  The only thing rejected is text that
    IPV4_RE recognises as an IP address.

    """
    return not IPV4_RE.search(text)
568
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  If B starts with a dot, A matches when it ends with
    B; in every other case (including IP addresses) the two must be equal.

    """
    host = A.lower()
    pattern = B.lower()
    both_names = liberal_is_HDN(host) and liberal_is_HDN(pattern)
    if both_names and pattern.startswith("."):
        return host.endswith(pattern)
    # IP addresses, and patterns without an initial dot, must match exactly
    return host == pattern
588
# A trailing ":<digits>" port specification.
cut_port_re = re.compile(r":\d+$", re.ASCII)


def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the request's full URL, falling back to its
    "Host" header when the URL has no network location.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urllib.parse.urlparse(request.get_full_url())[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", netloc, 1).lower()
605
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective name
    gets ".local" appended when the request-host contains no dot and is not
    recognised by IPV4_RE.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
616
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965.

    The path is passed through escape_path and always starts with "/".
    """
    split_url = urllib.parse.urlsplit(request.get_full_url())
    escaped = escape_path(split_url.path)
    # fix bad RFC 2396 absoluteURI by supplying the missing leading slash
    return escaped if escaped.startswith("/") else "/" + escaped
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000626
def request_port(request):
    """Return the port of the request's host, as a string.

    The port is whatever follows the first ":" in request.host.  Without a
    colon, DEFAULT_HTTP_PORT is returned; a port that is not an integer is
    logged via _debug and None is returned.
    """
    _, colon, port = request.host.partition(':')
    if not colon:
        return DEFAULT_HTTP_PORT
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
640
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# One %XX escape; the group captures the two hex digits.
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")


def uppercase_escaped_char(match):
    """Return the matched %XX escape with its hex digits uppercased."""
    return "%" + match.group(1).upper()


def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = urllib.parse.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
660
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split at the first dot: everything before it is A, the rest is B.
    _a, dot, b = h.partition(".")
    if dot and is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
695
def is_third_party(request):
    """Return True if the request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    target = request_host(request)
    origin_reach = reach(request.origin_req_host)
    return not domain_match(target, origin_reach)
711
712
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    # Attribute names shown by __repr__, in constructor order.
    _REPR_ATTRS = ("version", "name", "value",
                   "port", "port_specified",
                   "domain", "domain_specified", "domain_initial_dot",
                   "path", "path_specified",
                   "secure", "expires", "discard", "comment", "comment_url",
                   )

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes.

        version and expires are converted to int unless they are None.
        The domain is lower-cased.  rest holds the non-standard
        cookie-attributes; a shallow copy of it is kept.

        Raises ValueError if port is None while port_specified is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version, self.name, self.value = version, name, value
        self.port, self.port_specified = port, port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path, self.path_specified = path, path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment, self.comment_url = comment, comment_url
        self.rfc2109 = rfc2109

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return whether the non-standard cookie-attribute name is present."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard cookie-attribute name, or default."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the non-standard cookie-attribute name to value."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        now defaults to the current time; a cookie whose expires is None
        never expires.
        """
        if now is None:
            now = time.time()
        return (self.expires is not None) and (self.expires <= now)

    def __str__(self):
        port_part = "" if self.port is None else ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        args = ["%s=%r" % (attr, getattr(self, attr))
                for attr in self._REPR_ATTRS]
        args.append("rest=%r" % (self._rest,))
        args.append("rfc2109=%r" % (self.rfc2109,))
        return "Cookie(%s)" % ", ".join(args)
809
810
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    set_ok and return_ok must be overridden; the two *_return_ok checks
    accept everything by default.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation always returns True.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation always returns True.
        """
        return True
842
843
class DefaultCookiePolicy(CookiePolicy):
    """Implements the standard rules for accepting and returning cookies.

    The set_ok_* methods decide whether a cookie received from a server may
    be stored; domain_return_ok(), path_return_ok() and the return_ok_*
    methods decide whether a stored cookie may be sent back with a request.
    """

    # Bit flags accepted by the strict_ns_domain constructor argument.
    DomainStrictNoDots = 1
    DomainStrictNonDomain = 2
    DomainRFC2965Match = 4

    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain

    def __init__(self,
                 blocked_domains=None, allowed_domains=None,
                 netscape=True, rfc2965=False,
                 rfc2109_as_netscape=None,
                 hide_cookie2=False,
                 strict_domain=False,
                 strict_rfc2965_unverifiable=True,
                 strict_ns_unverifiable=False,
                 strict_ns_domain=DomainLiberal,
                 strict_ns_set_initial_dollar=False,
                 strict_ns_set_path=False,
                 ):
        """Constructor arguments should be passed as keyword arguments only."""
        self.netscape = netscape
        self.rfc2965 = rfc2965
        self.rfc2109_as_netscape = rfc2109_as_netscape
        self.hide_cookie2 = hide_cookie2
        self.strict_domain = strict_domain
        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
        self.strict_ns_unverifiable = strict_ns_unverifiable
        self.strict_ns_domain = strict_ns_domain
        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
        self.strict_ns_set_path = strict_ns_set_path

        if blocked_domains is not None:
            self._blocked_domains = tuple(blocked_domains)
        else:
            self._blocked_domains = ()

        # None (as opposed to an empty tuple) means "no allow-list".
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def blocked_domains(self):
        """Return the sequence of blocked domains (as a tuple)."""
        return self._blocked_domains
    def set_blocked_domains(self, blocked_domains):
        """Set the sequence of blocked domains."""
        self._blocked_domains = tuple(blocked_domains)

    def is_blocked(self, domain):
        """Return True if domain matches any entry of the block-list."""
        for blocked_domain in self._blocked_domains:
            if user_domain_match(domain, blocked_domain):
                return True
        return False

    def allowed_domains(self):
        """Return None, or the sequence of allowed domains (as a tuple)."""
        return self._allowed_domains
    def set_allowed_domains(self, allowed_domains):
        """Set the sequence of allowed domains, or None."""
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def is_not_allowed(self, domain):
        """Return True if an allow-list is set and domain matches none of it."""
        if self._allowed_domains is None:
            return False
        for allowed_domain in self._allowed_domains:
            if user_domain_match(domain, allowed_domain):
                return False
        return True

    def set_ok(self, cookie, request):
        """
        If you override .set_ok(), be sure to call this method.  If it returns
        false, so should your subclass (assuming your subclass wants to be more
        strict about which cookies to accept).

        """
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        assert cookie.name is not None

        # Each check is dispatched to the matching set_ok_<name> method; the
        # first one that rejects the cookie ends the evaluation.
        for n in "version", "verifiability", "name", "path", "domain", "port":
            fn_name = "set_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False

        return True

    def set_ok_version(self, cookie, request):
        """Reject cookies whose protocol version is missing or switched off."""
        if cookie.version is None:
            # Version is always set to 0 by parse_ns_headers if it's a Netscape
            # cookie, so this must be an invalid RFC 2965 cookie.
            _debug(" Set-Cookie2 without version attribute (%s=%s)",
                   cookie.name, cookie.value)
            return False
        if cookie.version > 0 and not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
        return True

    def set_ok_verifiability(self, cookie, request):
        """Reject third-party cookies on unverifiable requests when strict."""
        if request.unverifiable and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug(" third-party RFC 2965 cookie during "
                       "unverifiable transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug(" third-party Netscape cookie during "
                       "unverifiable transaction")
                return False
        return True

    def set_ok_name(self, cookie, request):
        """Optionally reject Netscape cookies whose name starts with '$'."""
        # Try and stop servers setting V0 cookies designed to hack other
        # servers that know both V0 and V1 protocols.
        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
            cookie.name.startswith("$")):
            _debug(" illegal name (starts with '$'): '%s'", cookie.name)
            return False
        return True

    def set_ok_path(self, cookie, request):
        """Reject a cookie whose explicit path does not path-match the request.

        The check applies to version > 0 cookies, and to version 0 cookies
        only when strict_ns_set_path is true.
        """
        if cookie.path_specified:
            req_path = request_path(request)
            # Use path_return_ok() so that setting and returning cookies agree
            # on what "path-match" means (a plain prefix test would let
            # "/foo" match "/foobar").
            if ((cookie.version > 0 or
                 (cookie.version == 0 and self.strict_ns_set_path)) and
                not self.path_return_ok(cookie.path, request)):
                _debug(" path attribute %s is not a prefix of request "
                       "path %s", cookie.path, req_path)
                return False
        return True

    def set_ok_domain(self, cookie, request):
        """Apply block/allow lists and the domain sanity checks to a cookie."""
        if self.is_blocked(cookie.domain):
            _debug(" domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug(" domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                # XXX This should probably be compared with the Konqueror
                # (kcookiejar.cpp) and Mozilla implementations, but it's a
                # losing battle.
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
                       "info", "jobs", "mobi", "museum", "name", "pro",
                       "travel", "eu") and len(tld) == 2:
                        # domain like .co.uk
                        _debug(" country-code second level domain %s", domain)
                        return False
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                _debug(" non-local domain %s contains no embedded dot",
                       domain)
                return False
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    _debug(" effective request-host %s (even with added "
                           "initial dot) does not end with %s",
                           erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    _debug(" effective request-host %s does not domain-match "
                           "%s", erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    _debug(" host prefix %s for domain %s contains a dot",
                           host_prefix, domain)
                    return False
        return True

    def set_ok_port(self, cookie, request):
        """Require the request port to appear in an explicit cookie port list."""
        if cookie.port_specified:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            else:
                req_port = str(req_port)
            for p in cookie.port.split(","):
                try:
                    int(p)
                except ValueError:
                    _debug(" bad port %s (not numeric)", p)
                    return False
                if p == req_port:
                    break
            else:
                # Loop finished without a break: no listed port matched.
                _debug(" request port (%s) not found in %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok(self, cookie, request):
        """
        If you override .return_ok(), be sure to call this method.  If it
        returns false, so should your subclass (assuming your subclass wants to
        be more strict about which cookies to return).

        """
        # Path has already been checked by .path_return_ok(), and domain
        # blocking done by .domain_return_ok().
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        for n in "version", "verifiability", "secure", "expires", "port", "domain":
            fn_name = "return_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False
        return True

    def return_ok_version(self, cookie, request):
        """Refuse to return cookies whose protocol version is switched off."""
        if cookie.version > 0 and not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
        return True

    def return_ok_verifiability(self, cookie, request):
        """Refuse third-party cookies on unverifiable requests when strict."""
        if request.unverifiable and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug(" third-party RFC 2965 cookie during unverifiable "
                       "transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug(" third-party Netscape cookie during unverifiable "
                       "transaction")
                return False
        return True

    def return_ok_secure(self, cookie, request):
        """Return secure cookies only with requests whose type is "https"."""
        if cookie.secure and request.type != "https":
            _debug(" secure cookie with non-secure request")
            return False
        return True

    def return_ok_expires(self, cookie, request):
        """Refuse cookies that are expired relative to self._now."""
        # self._now is not set in this class; it has to be assigned on the
        # policy before this check is used.
        if cookie.is_expired(self._now):
            _debug(" cookie expired")
            return False
        return True

    def return_ok_port(self, cookie, request):
        """Require the request port to be listed when the cookie has ports."""
        if cookie.port:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            for p in cookie.port.split(","):
                if p == req_port:
                    break
            else:
                _debug(" request port %s does not match cookie port %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok_domain(self, cookie, request):
        """Check that the request host is entitled to receive the cookie."""
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        # Compare against a dot-prefixed domain so that a cookie for
        # "example.com" is not returned to "evilexample.com".
        if domain and not domain.startswith("."):
            dotdomain = "." + domain
        else:
            dotdomain = domain

        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
        if (cookie.version == 0 and
            (self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            _debug(" cookie with unspecified domain does not string-compare "
                   "equal to request domain")
            return False

        if cookie.version > 0 and not domain_match(erhn, domain):
            _debug(" effective request-host name %s does not domain-match "
                   "RFC 2965 cookie domain %s", erhn, domain)
            return False
        if cookie.version == 0 and not ("."+erhn).endswith(dotdomain):
            _debug(" request-host %s does not match Netscape cookie domain "
                   "%s", req_host, domain)
            return False
        return True

    def domain_return_ok(self, domain, request):
        """Return false if no cookie stored under domain should be returned."""
        # Liberal check of domain.  This is here as an optimization to avoid
        # having to load lots of MSIE cookie files unless necessary.
        req_host, erhn = eff_request_host(request)
        if not req_host.startswith("."):
            req_host = "."+req_host
        if not erhn.startswith("."):
            erhn = "."+erhn
        # A suffix test on an undotted domain would let "example.com" match
        # the unrelated host "evilexample.com"; anchor it on a label boundary.
        if domain and not domain.startswith("."):
            dotdomain = "." + domain
        else:
            dotdomain = domain
        if not (req_host.endswith(dotdomain) or erhn.endswith(dotdomain)):
            #_debug(" request domain %s does not match cookie domain %s",
            #       req_host, domain)
            return False

        if self.is_blocked(domain):
            _debug(" domain %s is in user block-list", domain)
            return False
        if self.is_not_allowed(domain):
            _debug(" domain %s is not in user allow-list", domain)
            return False

        return True

    def path_return_ok(self, path, request):
        """Return false if the request path does not path-match path.

        The cookie path matches when it equals the request path, or is a
        prefix of it that ends at a "/" boundary.
        """
        _debug("- checking cookie path=%s", path)
        req_path = request_path(request)
        pathlen = len(path)
        if req_path == path:
            return True
        elif (req_path.startswith(path) and
              (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")):
            return True

        _debug(" %s does not path-match %s", req_path, path)
        return False
1180
def vals_sorted_by_key(adict):
    """Return an iterator over the values of adict, ordered by sorted key."""
    return map(adict.get, sorted(adict.keys()))
1184
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    Any value exposing an .items attribute is treated as a nested mapping and
    descended into; every other value is yielded as a leaf.
    """
    for obj in vals_sorted_by_key(mapping):
        if hasattr(obj, "items"):
            yield from deepvalues(obj)
        else:
            yield obj
1199
1200
# Sentinel passed as the default to dict.get(), so that a key that is missing
# can be told apart from a key that is present with a None value.
class Absent:
    pass
1204
1205class CookieJar:
1206 """Collection of HTTP cookies.
1207
1208 You may not need to know about this class: try
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001209 urllib.request.build_opener(HTTPCookieProcessor).open(url).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001210 """
1211
1212 non_word_re = re.compile(r"\W")
1213 quote_re = re.compile(r"([\"\\])")
1214 strict_domain_re = re.compile(r"\.?[^.]*")
1215 domain_re = re.compile(r"[^.]*")
1216 dots_re = re.compile(r"^\.+")
1217
Antoine Pitroufd036452008-08-19 17:56:33 +00001218 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001219
1220 def __init__(self, policy=None):
1221 if policy is None:
1222 policy = DefaultCookiePolicy()
1223 self._policy = policy
1224
1225 self._cookies_lock = _threading.RLock()
1226 self._cookies = {}
1227
1228 def set_policy(self, policy):
1229 self._policy = policy
1230
1231 def _cookies_for_domain(self, domain, request):
1232 cookies = []
1233 if not self._policy.domain_return_ok(domain, request):
1234 return []
Thomas Wouters477c8d52006-05-27 19:21:47 +00001235 _debug("Checking %s for cookies to return", domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001236 cookies_by_path = self._cookies[domain]
1237 for path in cookies_by_path.keys():
1238 if not self._policy.path_return_ok(path, request):
1239 continue
1240 cookies_by_name = cookies_by_path[path]
1241 for cookie in cookies_by_name.values():
1242 if not self._policy.return_ok(cookie, request):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001243 _debug(" not returning cookie")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001244 continue
Thomas Wouters477c8d52006-05-27 19:21:47 +00001245 _debug(" it's a match")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001246 cookies.append(cookie)
1247 return cookies
1248
1249 def _cookies_for_request(self, request):
1250 """Return a list of cookies to be returned to server."""
1251 cookies = []
1252 for domain in self._cookies.keys():
1253 cookies.extend(self._cookies_for_domain(domain, request))
1254 return cookies
1255
1256 def _cookie_attrs(self, cookies):
1257 """Return a list of cookie-attributes to be returned to server.
1258
1259 like ['foo="bar"; $Path="/"', ...]
1260
1261 The $Version attribute is also added when appropriate (currently only
1262 once per request).
1263
1264 """
1265 # add cookies in order of most specific (ie. longest) path first
Raymond Hettinger70b64fc2008-01-30 20:15:17 +00001266 cookies.sort(key=lambda a: len(a.path), reverse=True)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001267
1268 version_set = False
1269
1270 attrs = []
1271 for cookie in cookies:
1272 # set version of Cookie header
1273 # XXX
1274 # What should it be if multiple matching Set-Cookie headers have
1275 # different versions themselves?
1276 # Answer: there is no answer; was supposed to be settled by
1277 # RFC 2965 errata, but that may never appear...
1278 version = cookie.version
1279 if not version_set:
1280 version_set = True
1281 if version > 0:
1282 attrs.append("$Version=%s" % version)
1283
1284 # quote cookie value if necessary
1285 # (not for Netscape protocol, which already has any quotes
1286 # intact, due to the poorly-specified Netscape Cookie: syntax)
1287 if ((cookie.value is not None) and
1288 self.non_word_re.search(cookie.value) and version > 0):
1289 value = self.quote_re.sub(r"\\\1", cookie.value)
1290 else:
1291 value = cookie.value
1292
1293 # add cookie-attributes to be returned in Cookie header
1294 if cookie.value is None:
1295 attrs.append(cookie.name)
1296 else:
1297 attrs.append("%s=%s" % (cookie.name, value))
1298 if version > 0:
1299 if cookie.path_specified:
1300 attrs.append('$Path="%s"' % cookie.path)
1301 if cookie.domain.startswith("."):
1302 domain = cookie.domain
1303 if (not cookie.domain_initial_dot and
1304 domain.startswith(".")):
1305 domain = domain[1:]
1306 attrs.append('$Domain="%s"' % domain)
1307 if cookie.port is not None:
1308 p = "$Port"
1309 if cookie.port_specified:
1310 p = p + ('="%s"' % cookie.port)
1311 attrs.append(p)
1312
1313 return attrs
1314
1315 def add_cookie_header(self, request):
Georg Brandl029986a2008-06-23 11:44:14 +00001316 """Add correct Cookie: header to request (urllib.request.Request object).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001317
1318 The Cookie2 header is also added unless policy.hide_cookie2 is true.
1319
1320 """
Thomas Wouters477c8d52006-05-27 19:21:47 +00001321 _debug("add_cookie_header")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001322 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001323 try:
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001324
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001325 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001326
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001327 cookies = self._cookies_for_request(request)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001328
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001329 attrs = self._cookie_attrs(cookies)
1330 if attrs:
1331 if not request.has_header("Cookie"):
1332 request.add_unredirected_header(
1333 "Cookie", "; ".join(attrs))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001334
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001335 # if necessary, advertise that we know RFC 2965
1336 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1337 not request.has_header("Cookie2")):
1338 for cookie in cookies:
1339 if cookie.version != 1:
1340 request.add_unredirected_header("Cookie2", '$Version="1"')
1341 break
1342
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001343 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001344 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001345
1346 self.clear_expired_cookies()
1347
1348 def _normalized_cookie_tuples(self, attrs_set):
1349 """Return list of tuples containing normalised cookie information.
1350
1351 attrs_set is the list of lists of key,value pairs extracted from
1352 the Set-Cookie or Set-Cookie2 headers.
1353
1354 Tuples are name, value, standard, rest, where name and value are the
1355 cookie name and value, standard is a dictionary containing the standard
1356 cookie-attributes (discard, secure, version, expires or max-age,
1357 domain, path and port) and rest is a dictionary containing the rest of
1358 the cookie-attributes.
1359
1360 """
1361 cookie_tuples = []
1362
1363 boolean_attrs = "discard", "secure"
1364 value_attrs = ("version",
1365 "expires", "max-age",
1366 "domain", "path", "port",
1367 "comment", "commenturl")
1368
1369 for cookie_attrs in attrs_set:
1370 name, value = cookie_attrs[0]
1371
1372 # Build dictionary of standard cookie-attributes (standard) and
1373 # dictionary of other cookie-attributes (rest).
1374
1375 # Note: expiry time is normalised to seconds since epoch. V0
1376 # cookies should have the Expires cookie-attribute, and V1 cookies
1377 # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1378 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1379 # accept either (but prefer Max-Age).
1380 max_age_set = False
1381
1382 bad_cookie = False
1383
1384 standard = {}
1385 rest = {}
1386 for k, v in cookie_attrs[1:]:
1387 lc = k.lower()
1388 # don't lose case distinction for unknown fields
1389 if lc in value_attrs or lc in boolean_attrs:
1390 k = lc
1391 if k in boolean_attrs and v is None:
1392 # boolean cookie-attribute is present, but has no value
1393 # (like "discard", rather than "port=80")
1394 v = True
1395 if k in standard:
1396 # only first value is significant
1397 continue
1398 if k == "domain":
1399 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001400 _debug(" missing value for domain attribute")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001401 bad_cookie = True
1402 break
1403 # RFC 2965 section 3.3.3
1404 v = v.lower()
1405 if k == "expires":
1406 if max_age_set:
1407 # Prefer max-age to expires (like Mozilla)
1408 continue
1409 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001410 _debug(" missing or invalid value for expires "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001411 "attribute: treating as session cookie")
1412 continue
1413 if k == "max-age":
1414 max_age_set = True
1415 try:
1416 v = int(v)
1417 except ValueError:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001418 _debug(" missing or invalid (non-numeric) value for "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001419 "max-age attribute")
1420 bad_cookie = True
1421 break
1422 # convert RFC 2965 Max-Age to seconds since epoch
1423 # XXX Strictly you're supposed to follow RFC 2616
1424 # age-calculation rules. Remember that zero Max-Age is a
1425 # is a request to discard (old and new) cookie, though.
1426 k = "expires"
1427 v = self._now + v
1428 if (k in value_attrs) or (k in boolean_attrs):
1429 if (v is None and
Raymond Hettingerdbecd932005-02-06 06:57:08 +00001430 k not in ("port", "comment", "commenturl")):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001431 _debug(" missing value for %s attribute" % k)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001432 bad_cookie = True
1433 break
1434 standard[k] = v
1435 else:
1436 rest[k] = v
1437
1438 if bad_cookie:
1439 continue
1440
1441 cookie_tuples.append((name, value, standard, rest))
1442
1443 return cookie_tuples
1444
1445 def _cookie_from_cookie_tuple(self, tup, request):
1446 # standard is dict of standard cookie-attributes, rest is dict of the
1447 # rest of them
1448 name, value, standard, rest = tup
1449
1450 domain = standard.get("domain", Absent)
1451 path = standard.get("path", Absent)
1452 port = standard.get("port", Absent)
1453 expires = standard.get("expires", Absent)
1454
1455 # set the easy defaults
1456 version = standard.get("version", None)
Benjamin Peterson3e5cd1d2010-06-27 21:45:24 +00001457 if version is not None:
1458 try:
1459 version = int(version)
1460 except ValueError:
1461 return None # invalid version, ignore cookie
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001462 secure = standard.get("secure", False)
1463 # (discard is also set if expires is Absent)
1464 discard = standard.get("discard", False)
1465 comment = standard.get("comment", None)
1466 comment_url = standard.get("commenturl", None)
1467
1468 # set default path
1469 if path is not Absent and path != "":
1470 path_specified = True
1471 path = escape_path(path)
1472 else:
1473 path_specified = False
1474 path = request_path(request)
1475 i = path.rfind("/")
1476 if i != -1:
1477 if version == 0:
1478 # Netscape spec parts company from reality here
1479 path = path[:i]
1480 else:
1481 path = path[:i+1]
1482 if len(path) == 0: path = "/"
1483
1484 # set default domain
1485 domain_specified = domain is not Absent
1486 # but first we have to remember whether it starts with a dot
1487 domain_initial_dot = False
1488 if domain_specified:
1489 domain_initial_dot = bool(domain.startswith("."))
1490 if domain is Absent:
1491 req_host, erhn = eff_request_host(request)
1492 domain = erhn
1493 elif not domain.startswith("."):
1494 domain = "."+domain
1495
1496 # set default port
1497 port_specified = False
1498 if port is not Absent:
1499 if port is None:
1500 # Port attr present, but has no value: default to request port.
1501 # Cookie should then only be sent back on that port.
1502 port = request_port(request)
1503 else:
1504 port_specified = True
1505 port = re.sub(r"\s+", "", port)
1506 else:
1507 # No port attr present. Cookie can be sent back on any port.
1508 port = None
1509
1510 # set default expires and discard
1511 if expires is Absent:
1512 expires = None
1513 discard = True
1514 elif expires <= self._now:
1515 # Expiry date in past is request to delete cookie. This can't be
1516 # in DefaultCookiePolicy, because can't delete cookies there.
1517 try:
1518 self.clear(domain, path, name)
1519 except KeyError:
1520 pass
Thomas Wouters477c8d52006-05-27 19:21:47 +00001521 _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1522 domain, path, name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001523 return None
1524
1525 return Cookie(version,
1526 name, value,
1527 port, port_specified,
1528 domain, domain_specified, domain_initial_dot,
1529 path, path_specified,
1530 secure,
1531 expires,
1532 discard,
1533 comment,
1534 comment_url,
1535 rest)
1536
1537 def _cookies_from_attrs_set(self, attrs_set, request):
1538 cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1539
1540 cookies = []
1541 for tup in cookie_tuples:
1542 cookie = self._cookie_from_cookie_tuple(tup, request)
1543 if cookie: cookies.append(cookie)
1544 return cookies
1545
Neal Norwitz71dad722005-12-23 21:43:48 +00001546 def _process_rfc2109_cookies(self, cookies):
1547 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1548 if rfc2109_as_ns is None:
1549 rfc2109_as_ns = not self._policy.rfc2965
1550 for cookie in cookies:
1551 if cookie.version == 1:
1552 cookie.rfc2109 = True
Tim Peters536cf992005-12-25 23:18:31 +00001553 if rfc2109_as_ns:
Neal Norwitz71dad722005-12-23 21:43:48 +00001554 # treat 2109 cookies as Netscape cookies rather than
1555 # as RFC2965 cookies
1556 cookie.version = 0
1557
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001558 def make_cookies(self, response, request):
1559 """Return sequence of Cookie objects extracted from response object."""
1560 # get cookie-attributes for RFC 2965 and Netscape protocols
1561 headers = response.info()
Barry Warsaw820c1202008-06-12 04:06:45 +00001562 rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
1563 ns_hdrs = headers.get_all("Set-Cookie", [])
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001564
1565 rfc2965 = self._policy.rfc2965
1566 netscape = self._policy.netscape
1567
1568 if ((not rfc2965_hdrs and not ns_hdrs) or
1569 (not ns_hdrs and not rfc2965) or
1570 (not rfc2965_hdrs and not netscape) or
1571 (not netscape and not rfc2965)):
1572 return [] # no relevant cookie headers: quick exit
1573
1574 try:
1575 cookies = self._cookies_from_attrs_set(
1576 split_header_words(rfc2965_hdrs), request)
Thomas Wouters477c8d52006-05-27 19:21:47 +00001577 except Exception:
1578 _warn_unhandled_exception()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001579 cookies = []
1580
1581 if ns_hdrs and netscape:
1582 try:
Neal Norwitz71dad722005-12-23 21:43:48 +00001583 # RFC 2109 and Netscape cookies
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001584 ns_cookies = self._cookies_from_attrs_set(
1585 parse_ns_headers(ns_hdrs), request)
Thomas Wouters477c8d52006-05-27 19:21:47 +00001586 except Exception:
1587 _warn_unhandled_exception()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001588 ns_cookies = []
Neal Norwitz71dad722005-12-23 21:43:48 +00001589 self._process_rfc2109_cookies(ns_cookies)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001590
1591 # Look for Netscape cookies (from Set-Cookie headers) that match
1592 # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1593 # For each match, keep the RFC 2965 cookie and ignore the Netscape
1594 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are
1595 # bundled in with the Netscape cookies for this purpose, which is
1596 # reasonable behaviour.
1597 if rfc2965:
1598 lookup = {}
1599 for cookie in cookies:
1600 lookup[(cookie.domain, cookie.path, cookie.name)] = None
1601
1602 def no_matching_rfc2965(ns_cookie, lookup=lookup):
1603 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1604 return key not in lookup
1605 ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1606
1607 if ns_cookies:
1608 cookies.extend(ns_cookies)
1609
1610 return cookies
1611
1612 def set_cookie_if_ok(self, cookie, request):
1613 """Set a cookie if policy says it's OK to do so."""
1614 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001615 try:
1616 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001617
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001618 if self._policy.set_ok(cookie, request):
1619 self.set_cookie(cookie)
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001620
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001621
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001622 finally:
1623 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001624
1625 def set_cookie(self, cookie):
1626 """Set a cookie, without checking whether or not it should be set."""
1627 c = self._cookies
1628 self._cookies_lock.acquire()
1629 try:
1630 if cookie.domain not in c: c[cookie.domain] = {}
1631 c2 = c[cookie.domain]
1632 if cookie.path not in c2: c2[cookie.path] = {}
1633 c3 = c2[cookie.path]
1634 c3[cookie.name] = cookie
1635 finally:
1636 self._cookies_lock.release()
1637
def extract_cookies(self, response, request):
    """Extract cookies from response, where allowable given the request.

    Every cookie produced by make_cookies() is offered to the policy's
    set_ok(); the accepted ones are stored with set_cookie().  The current
    time (whole seconds) is recorded on the policy and the jar first, and
    the work is done while holding the jar's cookie lock.

    """
    _debug("extract_cookies: %s", response.info())
    with self._cookies_lock:
        now = int(time.time())
        self._policy._now = now
        self._now = now

        for candidate in self.make_cookies(response, request):
            if not self._policy.set_ok(candidate, request):
                continue
            _debug(" setting cookie: %s", candidate)
            self.set_cookie(candidate)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001651
def clear(self, domain=None, path=None, name=None):
    """Clear some cookies.

    With no arguments every cookie is removed.  With only a domain, all
    cookies of that domain are removed.  With a domain and a path, the
    cookies stored under that path of the domain are removed.  With all
    three arguments, the single cookie identified by name, path and domain
    is removed.

    Raises ValueError if a name is given without both domain and path, or
    a path is given without a domain.

    Raises KeyError if no matching cookie exists.

    """
    if name is not None:
        if domain is None or path is None:
            raise ValueError(
                "domain and path must be given to remove a cookie by name")
        del self._cookies[domain][path][name]
        return

    if path is not None:
        if domain is None:
            raise ValueError(
                "domain must be given to remove cookies by path")
        del self._cookies[domain][path]
        return

    if domain is not None:
        del self._cookies[domain]
        return

    # No selector at all: start over with an empty store.
    self._cookies = {}
1678
def clear_session_cookies(self):
    """Discard all session cookies.

    A cookie counts as a session cookie when its discard attribute is
    true; each such cookie is removed through clear() while the jar's
    cookie lock is held.

    Note that the .save() method won't save session cookies anyway, unless
    you ask otherwise by passing a true ignore_discard argument.

    """
    with self._cookies_lock:
        for cookie in self:
            if not cookie.discard:
                continue
            self.clear(cookie.domain, cookie.path, cookie.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001693
def clear_expired_cookies(self):
    """Discard all expired cookies.

    Each cookie is asked is_expired() against a single timestamp taken at
    the start of the sweep; expired ones are removed through clear().  The
    sweep runs while holding the jar's cookie lock.

    You probably don't need to call this method: expired cookies are never
    sent back to the server (provided you're using DefaultCookiePolicy),
    this method is called by CookieJar itself every so often, and the
    .save() method won't save expired cookies anyway (unless you ask
    otherwise by passing a true ignore_expires argument).

    """
    with self._cookies_lock:
        reference_time = time.time()
        for cookie in self:
            if not cookie.is_expired(reference_time):
                continue
            self.clear(cookie.domain, cookie.path, cookie.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001712
def __iter__(self):
    """Return what deepvalues() yields for the jar's nested cookie store."""
    store = self._cookies
    return deepvalues(store)
1715
def __len__(self):
    """Return number of contained cookies."""
    # Counting requires a full pass over the jar's iterator.
    return sum(1 for _ in self)
1721
def __repr__(self):
    """Return "<CLASS[...]>" listing the repr() of every cookie in the jar."""
    rendered = [repr(cookie) for cookie in self]
    return "<%s[%s]>" % (self.__class__, ", ".join(rendered))
1726
def __str__(self):
    """Return "<CLASS[...]>" listing the str() of every cookie in the jar."""
    rendered = ", ".join(map(str, self))
    return "<%s[%s]>" % (self.__class__, rendered)
1731
1732
# LoadError derives from OSError for backwards-compatibility with Python 2.4.0
class LoadError(OSError):
    """Raised when a cookies file does not have the expected format."""
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001735
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    load() and revert() are implemented here on top of a
    _really_load(f, filename, ignore_discard, ignore_expires) method that
    this class does not define; save() raises NotImplementedError here.

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename, when given, must support concatenation with a str,
        otherwise ValueError is raised.  It is stored as self.filename and
        is the default file name for load() and revert().  delayload is
        stored as a bool.  policy is passed on to CookieJar.__init__().

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                filename + ""
            except Exception:
                # Deliberately not a bare "except:", so KeyboardInterrupt and
                # SystemExit are not rewritten into ValueError.
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented in this class; always raises NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised when both
        are None.  The file is opened for reading and handed, together with
        the remaining arguments, to self._really_load().

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        filename defaults to self.filename; ValueError is raised when both
        are None.

        Whatever exception load() raises (LoadError, another OSError, or
        e.g. a decoding error) is re-raised after the previous cookies have
        been put back, so the object's state is not altered if reversion
        is not successful.

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with self._cookies_lock:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except Exception:
                # Roll back on every failure, not only OSError; otherwise a
                # non-OSError from load() would leave the jar emptied or
                # partially loaded.
                self._cookies = old_state
                raise
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001792
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001793
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    The cookie is flattened into a list of (key, value) pairs -- name/value
    first, then path and domain, optional attributes, the entries of
    cookie._rest sorted by key, and finally the version -- and that list is
    rendered with join_header_words().

    Actually, the format is extended a bit -- see module docstring.

    """
    pairs = [
        (cookie.name, cookie.value),
        ("path", cookie.path),
        ("domain", cookie.domain),
    ]
    if cookie.port is not None:
        pairs.append(("port", cookie.port))

    # Flags are written as bare keys (value None) and only when set.
    flags = (
        ("path_spec", cookie.path_specified),
        ("port_spec", cookie.port_specified),
        ("domain_dot", cookie.domain_initial_dot),
        ("secure", cookie.secure),
    )
    pairs.extend((key, None) for key, enabled in flags if enabled)

    if cookie.expires:
        pairs.append(("expires", time2isoz(float(cookie.expires))))
    if cookie.discard:
        pairs.append(("discard", None))
    if cookie.comment:
        pairs.append(("comment", cookie.comment))
    if cookie.comment_url:
        pairs.append(("commenturl", cookie.comment_url))

    extras = cookie._rest
    pairs.extend((key, str(extras[key])) for key in sorted(extras.keys()))

    pairs.append(("version", str(cookie.version)))

    return join_header_words([pairs])
1821
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        Cookies marked discard are left out unless ignore_discard is true,
        and expired cookies are left out unless ignore_expires is true.

        ignore_discard and ignore_expires: see docstring for FileCookieJar.save

        """
        now = time.time()

        def wanted(cookie):
            if not ignore_discard and cookie.discard:
                return False
            if not ignore_expires and cookie.is_expired(now):
                return False
            return True

        headers = ["Set-Cookie3: %s" % lwp_cookie_str(cookie)
                   for cookie in self if wanted(cookie)]
        # The empty last item makes the joined text end with a newline.
        headers.append("")
        return "\n".join(headers)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename (default self.filename) in LWP format.

        Raises ValueError when neither filename nor self.filename is set.

        """
        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename, "w") as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Read "Set-Cookie3:" lines from f and store the cookies they describe.

        The first line must match self.magic_re, otherwise LoadError is
        raised.  Lines that do not start with "Set-Cookie3:" are skipped.
        OSError propagates unchanged; any other exception raised while
        parsing is reported through _warn_unhandled_exception() and turned
        into LoadError.

        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        try:
            # readline() returns "" only at end of file.
            for line in iter(f.readline, ""):
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    # Every boolean attribute starts out False.
                    standard = dict.fromkeys(boolean_attrs, False)
                    rest = {}
                    for k, v in data[1:]:
                        lc = k.lower() if k is not None else None
                        # don't lose case distinction for unknown fields
                        if lc in value_attrs or lc in boolean_attrs:
                            k = lc
                        if k in boolean_attrs:
                            # A bare flag (no value) means "true".
                            standard[k] = True if v is None else v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    if expires is None:
                        # No usable expiry: treat as a session cookie.
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)
        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1940
1941
class MozillaCookieJar(FileCookieJar):
    """

    WARNING: you may want to backup your browser's cookies file if you use
    this class to save cookies.  I *think* it works, but there have been
    bugs in the past!

    This class differs from CookieJar only in the format it uses to save and
    load cookies to and from a file.  This class uses the Mozilla/Netscape
    `cookies.txt' format.  lynx uses this file format, too.

    Don't expect cookies saved while the browser is running to be noticed by
    the browser (in fact, Mozilla on unix will overwrite your saved cookies if
    you change them on disk while it's running; on Windows, you probably can't
    save at all while the browser is running).

    Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
    Netscape cookies on saving.

    In particular, the cookie version and port number information is lost,
    together with information about whether or not Path, Port and Discard were
    specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
    domain as set in the HTTP header started with a dot (yes, I'm aware some
    domains in Netscape files start with a dot and some don't -- trust me, you
    really don't want to know any more about this).

    Note that though Mozilla and Netscape use the same format, they use
    slightly different headers.  The class saves cookies using the Netscape
    header by default (Mozilla can cope with that).

    """
    magic_re = re.compile("#( Netscape)? HTTP Cookie File")
    header = """\
# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.

"""

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Read tab-separated cookie lines from f and store the cookies.

        The first line must match self.magic_re, otherwise f is closed and
        LoadError is raised.  Blank lines and lines starting with "#" or
        "$" are skipped.  Every other line must split on tabs into exactly
        seven fields: domain, domain-specified flag, path, secure flag,
        expiry, name and value.  OSError propagates unchanged; any other
        exception raised while parsing -- including the domain-flag
        consistency check -- is reported through
        _warn_unhandled_exception() and turned into LoadError.

        """
        now = time.time()

        magic = f.readline()
        if not self.magic_re.search(magic):
            f.close()
            raise LoadError(
                "%r does not look like a Netscape format cookies file" %
                filename)

        try:
            while 1:
                line = f.readline()
                if line == "": break

                # last field may be absent, so keep any trailing tab
                if line.endswith("\n"): line = line[:-1]

                # skip comments and blank lines XXX what is $ for?
                if (line.strip().startswith(("#", "$")) or
                    line.strip() == ""):
                    continue

                domain, domain_specified, path, secure, expires, name, value = \
                        line.split("\t")
                secure = (secure == "TRUE")
                domain_specified = (domain_specified == "TRUE")
                if name == "":
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = value
                    value = None

                initial_dot = domain.startswith(".")
                # Explicit check instead of "assert": asserts are stripped
                # under -O, which would silently disable this validation of
                # file content.  The ValueError is converted to LoadError by
                # the handler below, exactly like the AssertionError was.
                if domain_specified != initial_dot:
                    raise ValueError(
                        "domain-specified flag does not match leading dot "
                        "of domain %r" % domain)

                discard = False
                if expires == "":
                    # No expiry recorded: treat as a session cookie.
                    expires = None
                    discard = True

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                if not ignore_discard and c.discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                self.set_cookie(c)

        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Netscape format cookies file %r: %r" %
                            (filename, line))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename (default self.filename) as cookies.txt.

        self.header is written first, then one tab-separated line per
        cookie.  Cookies marked discard are left out unless ignore_discard
        is true, and expired cookies are left out unless ignore_expires is
        true.  Raises ValueError when neither filename nor self.filename
        is set.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename, "w") as f:
            f.write(self.header)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure: secure = "TRUE"
                else: secure = "FALSE"
                if cookie.domain.startswith("."): initial_dot = "TRUE"
                else: initial_dot = "FALSE"
                if cookie.expires is not None:
                    expires = str(cookie.expires)
                else:
                    expires = ""
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ""
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    "\t".join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value])+
                    "\n")