blob: e0f1032b2816d7f6206fa5edc00a95918a110a50 [file] [log] [blame]
r"""HTTP cookie handling for web clients.

This module has (now fairly distant) origins in Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
distributed with the Python standard library, but are available from
http://wwwsearch.sf.net/):

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

"""
27
# Names exported by ``from <this module> import *``.
__all__ = [
    'Cookie',
    'CookieJar',
    'CookiePolicy',
    'DefaultCookiePolicy',
    'FileCookieJar',
    'LWPCookieJar',
    'LoadError',
    'MozillaCookieJar',
]
30
Jeremy Hylton1afc1692008-06-18 20:49:58 +000031import copy
Victor Stinner628225c2011-03-21 02:38:51 +010032import datetime
Jeremy Hylton1afc1692008-06-18 20:49:58 +000033import re
34import time
35import urllib.parse, urllib.request
Antoine Pitroua6a4dc82017-09-07 18:56:24 +020036import threading as _threading
Georg Brandl24420152008-05-26 16:32:26 +000037import http.client # only for the default HTTP port
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000038from calendar import timegm
39
Thomas Wouters477c8d52006-05-27 19:21:47 +000040debug = False # set to True to enable debugging via the logging module
41logger = None
42
43def _debug(*args):
44 if not debug:
45 return
46 global logger
47 if not logger:
48 import logging
Georg Brandl24420152008-05-26 16:32:26 +000049 logger = logging.getLogger("http.cookiejar")
Thomas Wouters477c8d52006-05-27 19:21:47 +000050 return logger.debug(*args)
51
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000052
# The default HTTP port, kept as a string so it can be compared directly
# with the port text that request_port() extracts from a host string.
DEFAULT_HTTP_PORT = "%d" % http.client.HTTP_PORT

# Error text for operations that need a filename when none is available.
MISSING_FILENAME_TEXT = (
    "a filename was not supplied (nor was the CookieJar "
    "instance initialised with one)"
)
56
Thomas Wouters477c8d52006-05-27 19:21:47 +000057def _warn_unhandled_exception():
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000058 # There are a few catch-all except: statements in this module, for
Thomas Wouters477c8d52006-05-27 19:21:47 +000059 # catching input that's bad in unexpected ways. Warn if any
60 # exceptions are caught there.
Jeremy Hylton7ecf3dc2008-05-10 20:38:40 +000061 import io, warnings, traceback
Guido van Rossum34d19282007-08-09 01:03:29 +000062 f = io.StringIO()
Andrew M. Kuchlingae40c2f2004-07-10 18:32:12 +000063 traceback.print_exc(None, f)
64 msg = f.getvalue()
Georg Brandl24420152008-05-26 16:32:26 +000065 warnings.warn("http.cookiejar bug!\n%s" % msg, stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000066
67
68# Date/time conversion
69# -----------------------------------------------------------------------------
70
71EPOCH_YEAR = 1970
72def _timegm(tt):
73 year, month, mday, hour, min, sec = tt[:6]
74 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
75 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
76 return timegm(tt)
77 else:
78 return None
79
# Day and month abbreviations as used in HTTP dates.  DAYS is indexed by
# datetime.weekday() (Monday == 0); MONTHS by month number minus one.
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copy of MONTHS for case-insensitive lookups.  Built with a
# comprehension so that no stray loop variable is left in the module
# namespace.
MONTHS_LOWER = [name.lower() for name in MONTHS]
85
def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    # Use timezone-aware constructors: datetime.utcnow() and
    # datetime.utcfromtimestamp() are deprecated.  The formatted fields are
    # the same UTC wall-clock values.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000104
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    # Use timezone-aware constructors: datetime.utcnow() and
    # datetime.utcfromtimestamp() are deprecated.  The formatted fields are
    # the same UTC wall-clock values.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1],
        dt.year, dt.hour, dt.minute, dt.second)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000123
124
# Timezone names that mean "zero offset"; only the keys are used.
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric offsets: optional sign, one or two hour digits, optional colon,
# optional two minute digits.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)


def offset_from_tz_string(tz):
    """Return the UTC offset in seconds described by *tz*, or None.

    *tz* is either one of the names in UTC_ZONES (offset 0) or a numeric
    offset such as "-0800", "+01:30" or "2".  None is returned for anything
    else.  Names are matched exactly as given (case-sensitively).
    """
    if tz in UTC_ZONES:
        return 0
    parsed = TIMEZONE_RE.search(tz)
    if not parsed:
        return None
    sign, hours, minutes = parsed.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds += 60 * int(minutes)
    return -seconds if sign == '-' else seconds
141
142def _str2time(day, mon, yr, hr, min, sec, tz):
Berker Peksag20be53e2016-03-14 05:48:02 +0200143 yr = int(yr)
144 if yr > datetime.MAXYEAR:
145 return None
146
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000147 # translate month name to number
148 # month numbers start with 1 (January)
149 try:
150 mon = MONTHS_LOWER.index(mon.lower())+1
151 except ValueError:
152 # maybe it's already a number
153 try:
154 imon = int(mon)
155 except ValueError:
156 return None
157 if 1 <= imon <= 12:
158 mon = imon
159 else:
160 return None
161
162 # make sure clock elements are defined
163 if hr is None: hr = 0
164 if min is None: min = 0
165 if sec is None: sec = 0
166
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000167 day = int(day)
168 hr = int(hr)
169 min = int(min)
170 sec = int(sec)
171
172 if yr < 1000:
173 # find "obvious" year
174 cur_yr = time.localtime(time.time())[0]
175 m = cur_yr % 100
176 tmp = yr
177 yr = yr + cur_yr - m
178 m = m - tmp
179 if abs(m) > 50:
180 if m > 0: yr = yr + 100
181 else: yr = yr - 100
182
183 # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
184 t = _timegm((yr, mon, day, hr, min, sec, tz))
185
186 if t is not None:
187 # adjust time using timezone string, to get absolute time since epoch
188 if tz is None:
189 tz = "UTC"
190 tz = tz.upper()
191 offset = offset_from_tz_string(tz)
192 if offset is None:
193 return None
194 t = t - offset
195
196 return t
197
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X | re.ASCII)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # STRICT_DATE_RE only checks the *shape* of the month ("Foo"
            # matches); an unknown month is an unrecognised date, not an
            # error to propagate to the caller.
            return None
        # Seconds are parsed as int so that the result is an integer, as
        # documented and as returned by the loose-parsing path below.
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
275
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X | re.ASCII)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    Leading whitespace is ignored.  None is returned when the text does
    not match ISO_DATE_RE.

    """
    parsed = ISO_DATE_RE.search(text.lstrip())
    if parsed is None:
        return None  # bad format

    # XXX the pattern has an eighth group (an extra bit of the timezone)
    # which is ignored here: is this the right thing to do?
    year, month, mday, hour, minute, second, zone = parsed.groups()[:7]
    return _str2time(mday, month, year, hour, minute, second, zone)
320
321
322# Header parsing
323# -----------------------------------------------------------------------------
324
def unmatched(match):
    """Return unmatched part of re.Match object.

    That is, the searched string with the text of the whole match cut out.
    """
    subject = match.string
    first, last = match.span(0)
    return "".join((subject[:first], subject[last:]))
329
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, str)
    parsed = []
    for header in header_values:
        remaining = header
        current = []
        while remaining:
            token = HEADER_TOKEN_RE.search(remaining)
            if token is None:
                stripped = remaining.lstrip()
                if stripped.startswith(","):
                    # concatenated headers, as per RFC 2616 section 4.2
                    remaining = stripped[1:]
                    if current:
                        parsed.append(current)
                    current = []
                else:
                    # skip junk
                    non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", remaining)
                    assert nr_junk_chars > 0, (
                        "split_header_words bug: '%s', '%s', %s" %
                        (header, remaining, current))
                    remaining = non_junk
                continue

            name = token.group(1)
            # Drop the text consumed by the match and keep the rest.
            remaining = remaining[:token.start()] + remaining[token.end():]

            quoted = HEADER_QUOTED_VALUE_RE.search(remaining)
            if quoted is not None:
                # quoted value: cut it out and undo backslash escapes
                remaining = remaining[:quoted.start()] + remaining[quoted.end():]
                value = HEADER_ESCAPE_RE.sub(r"\1", quoted.group(1))
            else:
                bare = HEADER_VALUE_RE.search(remaining)
                if bare is not None:
                    # unquoted value
                    remaining = remaining[:bare.start()] + remaining[bare.end():]
                    value = bare.group(1).rstrip()
                else:
                    # no value, a lone token
                    value = None
            current.append((name, value))
        if current:
            parsed.append(current)
    return parsed
418
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed, that is, unless they
    consist solely of one or more word characters.  A value of None
    produces a bare key.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859-1")]])
    'text/plain; charset="iso-8859-1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859-1")]])
    'text/plain, charset="iso-8859-1"'

    """
    headers = []
    for pairs in lists:
        attr = []
        for k, v in pairs:
            if v is not None:
                # fullmatch rather than ^...$: "$" also matches just before
                # a trailing newline, which would let a value such as
                # "x\n" through unquoted.
                if not re.fullmatch(r"\w+", v):
                    v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v)  # escape " and \
                    v = '"%s"' % v
                k = "%s=%s" % (k, v)
            attr.append(k)
        if attr: headers.append("; ".join(attr))
    return ", ".join(headers)
444
def strip_quotes(text):
    """Return *text* without one leading and one trailing double quote.

    Each end is handled independently, so an unbalanced quote is removed
    too.  The trailing quote is looked for after the leading one has been
    removed.
    """
    start = 1 if text.startswith('"') else 0
    text = text[start:]
    if text.endswith('"'):
        return text[:-1]
    return text
451
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    Each header yields a list of (key, value) pairs.  The first pair is the
    cookie name and value, kept verbatim.  For later pairs, keys that are
    known cookie-attributes are lower-cased, "version" values have quotes
    stripped and "expires" values are converted with http2time.  A value is
    None when no "=" was present.  A ("version", "0") pair is appended when
    the header sets no version.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    parsed = []
    for header in ns_headers:
        attrs = []
        saw_version = False

        # XXX: The following does not strictly adhere to RFCs in that empty
        # names and values are legal (the former will only appear once and will
        # be overwritten if multiple occurrences are present). This is
        # mostly to deal with backwards compatibility.
        for index, chunk in enumerate(header.split(';')):
            name, equals, raw = chunk.strip().partition('=')
            name = name.strip()

            if not name:
                if index == 0:
                    # no cookie name at all: ignore the whole header
                    break
                continue

            # allow for a distinction between present and empty and missing
            # altogether
            value = raw.strip() if equals else None

            if index != 0:
                lowered = name.lower()
                if lowered in known_attrs:
                    name = lowered

                if name == "version":
                    # This is an RFC 2109 cookie.
                    if value is not None:
                        value = strip_quotes(value)
                    saw_version = True
                elif name == "expires":
                    # convert expires date to seconds since epoch
                    if value is not None:
                        value = http2time(strip_quotes(value))  # None if invalid
            attrs.append((name, value))

        if attrs:
            if not saw_version:
                attrs.append(("version", "0"))
            parsed.append(attrs)

    return parsed
518
519
# Matches text ending in a dot followed by digits, used as the module's
# test for "looks like an IPv4 address".
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
def is_HDN(text):
    """Return True if text is a host domain name.

    Text is rejected when it matches IPV4_RE, is empty, or starts or ends
    with a dot.
    """
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text) or text == "":
        return False
    return not (text[0] == "." or text[-1] == ".")
535
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not B.startswith("."):
        # B does not have the form .B'
        return False
    if not A.endswith(B):
        # A does not have the form NB.  B must be a *suffix* of A: merely
        # occurring somewhere inside A (x.y.com.evil.org vs .y.com) is not
        # a match.  N is non-empty here because A != B.
        return False
    if not is_HDN(A):
        return False
    return is_HDN(B[1:])
574
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  Anything that does not match IPV4_RE
    is accepted.

    """
    return not IPV4_RE.search(text)
584
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  If B starts with a dot, A matches when it ends with
    B; in every other case (including when either looks like an IP
    address) A matches only when it equals B.

    """
    host = A.lower()
    pattern = B.lower()
    if liberal_is_HDN(host) and liberal_is_HDN(pattern):
        if pattern.startswith("."):
            return host.endswith(pattern)
        return host == pattern
    # At least one side looks like an IP address: exact match only.
    return host == pattern
604
cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the request's full URL, falling back to its
    "Host" header when the URL has no network location.  Any trailing
    ":port" is removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urllib.parse.urlparse(request.get_full_url())[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", netloc, 1).lower()
621
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective name
    is the request-host with ".local" appended when the request-host
    contains no dot and does not match IPV4_RE; otherwise the two are equal.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
632
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965.

    The path of the request's full URL is passed through escape_path() and
    always starts with "/" in the returned value.
    """
    parts = urllib.parse.urlsplit(request.get_full_url())
    escaped = escape_path(parts.path)
    if escaped.startswith("/"):
        return escaped
    # fix bad RFC 2396 absoluteURI
    return "/" + escaped
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000642
def request_port(request):
    """Return the port of the request's host as a string.

    The port is whatever follows the port-separating colon in
    ``request.host``; DEFAULT_HTTP_PORT is returned when there is none.
    None is returned (and a debug message logged) when the port text is
    not an integer.
    """
    host = request.host
    if host.startswith("["):
        # Bracketed IPv6 literal such as "[::1]:8080": the colons inside
        # the brackets belong to the address, so look for the port
        # separator only after the closing bracket.  (If there is no
        # closing bracket this searches from the start, as for any other
        # host.)
        i = host.find(":", host.find("]") + 1)
    else:
        i = host.find(":")
    if i >= 0:
        port = host[i+1:]
        try:
            int(port)
        except ValueError:
            _debug("nonnumeric port: '%s'", port)
            return None
    else:
        port = DEFAULT_HTTP_PORT
    return port
656
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")


def uppercase_escaped_char(match):
    """Return the %XX escape matched by ESCAPED_CHAR_RE with upper-case hex."""
    return "%" + match.group(1).upper()


def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = urllib.parse.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
676
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split at the first dot: "a" is everything before it, "b" the rest.
    a, dot, b = h.partition(".")
    if dot and is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
711
def is_third_party(request):
    """Return True if the request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    The origin request-host is read from ``request.origin_req_host``.

    """
    origin_reach = reach(request.origin_req_host)
    return not domain_match(request_host(request), origin_reach)
727
728
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than"Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the given cookie-attributes on the instance.

        version is converted with int() and expires with int(float()) unless
        they are None; domain is lower-cased.  rest is a mapping of
        nonstandard cookie-attributes, of which a shallow copy is kept.

        Raises ValueError if port is None while port_specified is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(float(expires))
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return whether the nonstandard cookie-attribute *name* is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the nonstandard cookie-attribute *name*, or *default*."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the nonstandard cookie-attribute *name* to *value*."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= *now*.

        *now* defaults to the current time.  A cookie whose expires
        attribute is None never counts as expired.
        """
        if now is None:
            now = time.time()
        if self.expires is None:
            return False
        if self.expires <= now:
            return True
        return False

    def __str__(self):
        port_part = "" if self.port is None else ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        attr_names = (
            "version", "name", "value",
            "port", "port_specified",
            "domain", "domain_specified", "domain_initial_dot",
            "path", "path_specified",
            "secure", "expires", "discard", "comment", "comment_url",
        )
        args = ["%s=%s" % (attr_name, repr(getattr(self, attr_name)))
                for attr_name in attr_names]
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "%s(%s)" % (self.__class__.__name__, ", ".join(args))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000825
826
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customized policy.

    This base class leaves set_ok() and return_ok() unimplemented and lets
    every domain and path through domain_return_ok() / path_return_ok().

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        Raises NotImplementedError here; subclasses must override.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        Raises NotImplementedError here; subclasses must override.
        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation always returns True.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation always returns True.
        """
        return True
858
859
class DefaultCookiePolicy(CookiePolicy):
    """Implements the standard rules for accepting and returning cookies.

    A cookie is accepted only when every set_ok_* check passes, and is
    returned only when domain_return_ok(), path_return_ok() and every
    return_ok_* check pass.  The ``_now`` attribute read by
    return_ok_expires() is not set here; it is expected to have been
    assigned on the policy before return_ok() is called.

    """

    # Bit flags that may be OR-ed together in strict_ns_domain.
    DomainStrictNoDots = 1
    DomainStrictNonDomain = 2
    DomainRFC2965Match = 4

    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain

    def __init__(self,
                 blocked_domains=None, allowed_domains=None,
                 netscape=True, rfc2965=False,
                 rfc2109_as_netscape=None,
                 hide_cookie2=False,
                 strict_domain=False,
                 strict_rfc2965_unverifiable=True,
                 strict_ns_unverifiable=False,
                 strict_ns_domain=DomainLiberal,
                 strict_ns_set_initial_dollar=False,
                 strict_ns_set_path=False,
                 ):
        """Constructor arguments should be passed as keyword arguments only.

        blocked_domains and allowed_domains are iterables of domains; they
        are stored as tuples.  allowed_domains may be None, in which case
        no allow-list is applied.  The remaining arguments are stored
        unchanged as attributes of the same name.

        """
        self.netscape = netscape
        self.rfc2965 = rfc2965
        self.rfc2109_as_netscape = rfc2109_as_netscape
        self.hide_cookie2 = hide_cookie2
        self.strict_domain = strict_domain
        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
        self.strict_ns_unverifiable = strict_ns_unverifiable
        self.strict_ns_domain = strict_ns_domain
        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
        self.strict_ns_set_path = strict_ns_set_path

        if blocked_domains is not None:
            self._blocked_domains = tuple(blocked_domains)
        else:
            self._blocked_domains = ()

        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def blocked_domains(self):
        """Return the sequence of blocked domains (as a tuple)."""
        return self._blocked_domains

    def set_blocked_domains(self, blocked_domains):
        """Set the sequence of blocked domains."""
        self._blocked_domains = tuple(blocked_domains)

    def is_blocked(self, domain):
        """Return True if domain matches any entry of the block-list."""
        for blocked_domain in self._blocked_domains:
            if user_domain_match(domain, blocked_domain):
                return True
        return False

    def allowed_domains(self):
        """Return None, or the sequence of allowed domains (as a tuple)."""
        return self._allowed_domains

    def set_allowed_domains(self, allowed_domains):
        """Set the sequence of allowed domains, or None."""
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def is_not_allowed(self, domain):
        """Return True if an allow-list exists and domain matches none of it.

        With no allow-list (None) every domain is allowed, so the result is
        False.

        """
        if self._allowed_domains is None:
            return False
        for allowed_domain in self._allowed_domains:
            if user_domain_match(domain, allowed_domain):
                return False
        return True

    def set_ok(self, cookie, request):
        """
        If you override .set_ok(), be sure to call this method.  If it returns
        false, so should your subclass (assuming your subclass wants to be more
        strict about which cookies to accept).

        """
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        assert cookie.name is not None

        # Each check is a method named set_ok_<aspect>; the first one that
        # fails rejects the cookie.
        for n in "version", "verifiability", "name", "path", "domain", "port":
            fn_name = "set_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False

        return True

    def set_ok_version(self, cookie, request):
        """Reject cookies whose protocol version is missing or switched off."""
        if cookie.version is None:
            # Version is always set to 0 by parse_ns_headers if it's a Netscape
            # cookie, so this must be an invalid RFC 2965 cookie.
            _debug(" Set-Cookie2 without version attribute (%s=%s)",
                   cookie.name, cookie.value)
            return False
        if cookie.version > 0 and not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
        return True

    def set_ok_verifiability(self, cookie, request):
        """Reject third-party cookies on unverifiable requests when strict."""
        if request.unverifiable and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug(" third-party RFC 2965 cookie during "
                       "unverifiable transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug(" third-party Netscape cookie during "
                       "unverifiable transaction")
                return False
        return True

    def set_ok_name(self, cookie, request):
        """Optionally reject version 0 cookies whose name starts with '$'."""
        # Try and stop servers setting V0 cookies designed to hack other
        # servers that know both V0 and V1 protocols.
        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
            cookie.name.startswith("$")):
            _debug(" illegal name (starts with '$'): '%s'", cookie.name)
            return False
        return True

    def set_ok_path(self, cookie, request):
        """Reject a cookie whose explicit path does not match the request.

        Applied to cookies with version > 0 always, and to version 0
        cookies only when strict_ns_set_path is set.  The comparison is
        delegated to path_return_ok() so that setting and returning use
        the same segment-aware path match.

        """
        if cookie.path_specified:
            req_path = request_path(request)
            if ((cookie.version > 0 or
                 (cookie.version == 0 and self.strict_ns_set_path)) and
                not self.path_return_ok(cookie.path, request)):
                _debug(" path attribute %s is not a prefix of request "
                       "path %s", cookie.path, req_path)
                return False
        return True

    def set_ok_domain(self, cookie, request):
        """Check the cookie domain against block/allow lists and the host.

        The host-related checks only run for cookies that specified a
        domain explicitly.

        """
        if self.is_blocked(cookie.domain):
            _debug(" domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug(" domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                # XXX This should probably be compared with the Konqueror
                # (kcookiejar.cpp) and Mozilla implementations, but it's a
                # losing battle.
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
                       "info", "jobs", "mobi", "museum", "name", "pro",
                       "travel", "eu") and len(tld) == 2:
                        # domain like .co.uk
                        _debug(" country-code second level domain %s", domain)
                        return False
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                _debug(" non-local domain %s contains no embedded dot",
                       domain)
                return False
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    _debug(" effective request-host %s (even with added "
                           "initial dot) does not end with %s",
                           erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    _debug(" effective request-host %s does not domain-match "
                           "%s", erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                # What is left of the request host once the cookie domain
                # is cut off its end must not itself contain a dot.
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    _debug(" host prefix %s for domain %s contains a dot",
                           host_prefix, domain)
                    return False
        return True

    def set_ok_port(self, cookie, request):
        """Require the request port to appear in an explicit port list.

        Every entry seen before a match must be numeric, otherwise the
        cookie is rejected.

        """
        if cookie.port_specified:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            else:
                req_port = str(req_port)
            for p in cookie.port.split(","):
                try:
                    int(p)
                except ValueError:
                    _debug(" bad port %s (not numeric)", p)
                    return False
                if p == req_port:
                    break
            else:
                # Loop ended without a break: no entry matched.
                _debug(" request port (%s) not found in %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok(self, cookie, request):
        """
        If you override .return_ok(), be sure to call this method.  If it
        returns false, so should your subclass (assuming your subclass wants to
        be more strict about which cookies to return).

        """
        # Path has already been checked by .path_return_ok(), and domain
        # blocking done by .domain_return_ok().
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        for n in "version", "verifiability", "secure", "expires", "port", "domain":
            fn_name = "return_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False
        return True

    def return_ok_version(self, cookie, request):
        """Refuse cookies whose protocol version is switched off."""
        if cookie.version > 0 and not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
        return True

    def return_ok_verifiability(self, cookie, request):
        """Refuse third-party cookies on unverifiable requests when strict."""
        if request.unverifiable and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug(" third-party RFC 2965 cookie during unverifiable "
                       "transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug(" third-party Netscape cookie during unverifiable "
                       "transaction")
                return False
        return True

    def return_ok_secure(self, cookie, request):
        """Refuse secure cookies unless the request type is "https"."""
        if cookie.secure and request.type != "https":
            _debug(" secure cookie with non-secure request")
            return False
        return True

    def return_ok_expires(self, cookie, request):
        """Refuse cookies that have expired as of self._now."""
        if cookie.is_expired(self._now):
            _debug(" cookie expired")
            return False
        return True

    def return_ok_port(self, cookie, request):
        """Require the request port to appear in the cookie's port list."""
        if cookie.port:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            for p in cookie.port.split(","):
                if p == req_port:
                    break
            else:
                _debug(" request port %s does not match cookie port %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok_domain(self, cookie, request):
        """Check that the request host is covered by the cookie domain."""
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        # Compare on a label boundary: without the leading dot a cookie
        # stored for "example.com" would also suffix-match the host
        # "badexample.com".
        if domain and not domain.startswith("."):
            dotted_domain = "." + domain
        else:
            dotted_domain = domain

        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
        if (cookie.version == 0 and
            (self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            _debug(" cookie with unspecified domain does not string-compare "
                   "equal to request domain")
            return False

        if cookie.version > 0 and not domain_match(erhn, domain):
            _debug(" effective request-host name %s does not domain-match "
                   "RFC 2965 cookie domain %s", erhn, domain)
            return False
        if cookie.version == 0 and not ("."+erhn).endswith(dotted_domain):
            _debug(" request-host %s does not match Netscape cookie domain "
                   "%s", req_host, domain)
            return False
        return True

    def domain_return_ok(self, domain, request):
        """Return False if no cookie stored under domain may be returned.

        Performs a suffix comparison of the request host against the
        cookie domain, then applies the block-list and allow-list.

        """
        # Liberal check of domain.  This is here as an optimization to avoid
        # having to load lots of MSIE cookie files unless necessary.
        req_host, erhn = eff_request_host(request)
        if not req_host.startswith("."):
            req_host = "."+req_host
        if not erhn.startswith("."):
            erhn = "."+erhn
        # Anchor the suffix comparison on a dot so that "example.com" does
        # not match the unrelated host "badexample.com".
        if domain and not domain.startswith("."):
            dotted_domain = "." + domain
        else:
            dotted_domain = domain
        if not (req_host.endswith(dotted_domain) or
                erhn.endswith(dotted_domain)):
            #_debug(" request domain %s does not match cookie domain %s",
            #       req_host, domain)
            return False

        if self.is_blocked(domain):
            _debug(" domain %s is in user block-list", domain)
            return False
        if self.is_not_allowed(domain):
            _debug(" domain %s is not in user allow-list", domain)
            return False

        return True

    def path_return_ok(self, path, request):
        """Return False if the request path does not path-match path.

        The request path matches when it equals the cookie path, or when
        the cookie path is a prefix of it that ends on a "/" boundary
        (either the cookie path ends with "/" or the next character of the
        request path is "/").  A plain prefix test would wrongly let the
        cookie path "/foo" match the request path "/foobar".

        """
        _debug("- checking cookie path=%s", path)
        req_path = request_path(request)
        pathlen = len(path)
        if req_path == path:
            return True
        if (req_path.startswith(path) and
            (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")):
            return True

        _debug(" %s does not path-match %s", req_path, path)
        return False
1195
1196
def vals_sorted_by_key(adict):
    """Return a lazy iterator over adict's values, in sorted-key order."""
    ordered_keys = list(adict.keys())
    ordered_keys.sort()
    return map(adict.get, ordered_keys)
1200
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    Any value that has an ``items`` attribute is treated as a nested
    mapping and descended into; every other value is yielded as a leaf.

    """
    for item in vals_sorted_by_key(mapping):
        if hasattr(item, "items"):
            yield from deepvalues(item)
        else:
            yield item
1215
1216
class Absent:
    """Sentinel used as the second parameter to the dict.get() method.

    It distinguishes an absent dict key from one with a None value.

    """
1220
1221class CookieJar:
1222 """Collection of HTTP cookies.
1223
1224 You may not need to know about this class: try
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001225 urllib.request.build_opener(HTTPCookieProcessor).open(url).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001226 """
1227
1228 non_word_re = re.compile(r"\W")
1229 quote_re = re.compile(r"([\"\\])")
1230 strict_domain_re = re.compile(r"\.?[^.]*")
1231 domain_re = re.compile(r"[^.]*")
1232 dots_re = re.compile(r"^\.+")
1233
Antoine Pitroufd036452008-08-19 17:56:33 +00001234 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001235
1236 def __init__(self, policy=None):
1237 if policy is None:
1238 policy = DefaultCookiePolicy()
1239 self._policy = policy
1240
1241 self._cookies_lock = _threading.RLock()
1242 self._cookies = {}
1243
def set_policy(self, policy):
    """Replace the policy object consulted by this jar."""
    self._policy = policy
1246
def _cookies_for_domain(self, domain, request):
    """Return the cookies stored under domain that may go with request.

    The policy is consulted at three levels: once for the domain, once per
    stored path, and once per cookie.

    """
    if not self._policy.domain_return_ok(domain, request):
        return []
    _debug("Checking %s for cookies to return", domain)

    matches = []
    for path, cookies_by_name in self._cookies[domain].items():
        if not self._policy.path_return_ok(path, request):
            continue
        for cookie in cookies_by_name.values():
            if self._policy.return_ok(cookie, request):
                _debug(" it's a match")
                matches.append(cookie)
            else:
                _debug(" not returning cookie")
    return matches
1264
1265 def _cookies_for_request(self, request):
1266 """Return a list of cookies to be returned to server."""
1267 cookies = []
1268 for domain in self._cookies.keys():
1269 cookies.extend(self._cookies_for_domain(domain, request))
1270 return cookies
1271
1272 def _cookie_attrs(self, cookies):
1273 """Return a list of cookie-attributes to be returned to server.
1274
1275 like ['foo="bar"; $Path="/"', ...]
1276
1277 The $Version attribute is also added when appropriate (currently only
1278 once per request).
1279
1280 """
1281 # add cookies in order of most specific (ie. longest) path first
Raymond Hettinger70b64fc2008-01-30 20:15:17 +00001282 cookies.sort(key=lambda a: len(a.path), reverse=True)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001283
1284 version_set = False
1285
1286 attrs = []
1287 for cookie in cookies:
1288 # set version of Cookie header
1289 # XXX
1290 # What should it be if multiple matching Set-Cookie headers have
1291 # different versions themselves?
1292 # Answer: there is no answer; was supposed to be settled by
1293 # RFC 2965 errata, but that may never appear...
1294 version = cookie.version
1295 if not version_set:
1296 version_set = True
1297 if version > 0:
1298 attrs.append("$Version=%s" % version)
1299
1300 # quote cookie value if necessary
1301 # (not for Netscape protocol, which already has any quotes
1302 # intact, due to the poorly-specified Netscape Cookie: syntax)
1303 if ((cookie.value is not None) and
1304 self.non_word_re.search(cookie.value) and version > 0):
1305 value = self.quote_re.sub(r"\\\1", cookie.value)
1306 else:
1307 value = cookie.value
1308
1309 # add cookie-attributes to be returned in Cookie header
1310 if cookie.value is None:
1311 attrs.append(cookie.name)
1312 else:
1313 attrs.append("%s=%s" % (cookie.name, value))
1314 if version > 0:
1315 if cookie.path_specified:
1316 attrs.append('$Path="%s"' % cookie.path)
1317 if cookie.domain.startswith("."):
1318 domain = cookie.domain
1319 if (not cookie.domain_initial_dot and
1320 domain.startswith(".")):
1321 domain = domain[1:]
1322 attrs.append('$Domain="%s"' % domain)
1323 if cookie.port is not None:
1324 p = "$Port"
1325 if cookie.port_specified:
1326 p = p + ('="%s"' % cookie.port)
1327 attrs.append(p)
1328
1329 return attrs
1330
def add_cookie_header(self, request):
    """Add correct Cookie: header to request (urllib.request.Request object).

    The Cookie2 header is also added unless policy.hide_cookie2 is true.
    Neither header is touched when the request already carries one.
    Expired cookies are cleared afterwards, once the lock is released.

    """
    _debug("add_cookie_header")
    with self._cookies_lock:
        # Jar and policy share the same timestamp.
        self._policy._now = self._now = int(time.time())

        cookies = self._cookies_for_request(request)
        attrs = self._cookie_attrs(cookies)

        if attrs and not request.has_header("Cookie"):
            request.add_unredirected_header("Cookie", "; ".join(attrs))

        # if necessary, advertise that we know RFC 2965
        advertise = (self._policy.rfc2965 and
                     not self._policy.hide_cookie2 and
                     not request.has_header("Cookie2"))
        if advertise and any(cookie.version != 1 for cookie in cookies):
            request.add_unredirected_header("Cookie2", '$Version="1"')

    self.clear_expired_cookies()
1363
1364 def _normalized_cookie_tuples(self, attrs_set):
1365 """Return list of tuples containing normalised cookie information.
1366
1367 attrs_set is the list of lists of key,value pairs extracted from
1368 the Set-Cookie or Set-Cookie2 headers.
1369
1370 Tuples are name, value, standard, rest, where name and value are the
1371 cookie name and value, standard is a dictionary containing the standard
1372 cookie-attributes (discard, secure, version, expires or max-age,
1373 domain, path and port) and rest is a dictionary containing the rest of
1374 the cookie-attributes.
1375
1376 """
1377 cookie_tuples = []
1378
1379 boolean_attrs = "discard", "secure"
1380 value_attrs = ("version",
1381 "expires", "max-age",
1382 "domain", "path", "port",
1383 "comment", "commenturl")
1384
1385 for cookie_attrs in attrs_set:
1386 name, value = cookie_attrs[0]
1387
1388 # Build dictionary of standard cookie-attributes (standard) and
1389 # dictionary of other cookie-attributes (rest).
1390
1391 # Note: expiry time is normalised to seconds since epoch. V0
1392 # cookies should have the Expires cookie-attribute, and V1 cookies
1393 # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1394 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1395 # accept either (but prefer Max-Age).
1396 max_age_set = False
1397
1398 bad_cookie = False
1399
1400 standard = {}
1401 rest = {}
1402 for k, v in cookie_attrs[1:]:
1403 lc = k.lower()
1404 # don't lose case distinction for unknown fields
1405 if lc in value_attrs or lc in boolean_attrs:
1406 k = lc
1407 if k in boolean_attrs and v is None:
1408 # boolean cookie-attribute is present, but has no value
1409 # (like "discard", rather than "port=80")
1410 v = True
1411 if k in standard:
1412 # only first value is significant
1413 continue
1414 if k == "domain":
1415 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001416 _debug(" missing value for domain attribute")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001417 bad_cookie = True
1418 break
1419 # RFC 2965 section 3.3.3
1420 v = v.lower()
1421 if k == "expires":
1422 if max_age_set:
1423 # Prefer max-age to expires (like Mozilla)
1424 continue
1425 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001426 _debug(" missing or invalid value for expires "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001427 "attribute: treating as session cookie")
1428 continue
1429 if k == "max-age":
1430 max_age_set = True
1431 try:
1432 v = int(v)
1433 except ValueError:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001434 _debug(" missing or invalid (non-numeric) value for "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001435 "max-age attribute")
1436 bad_cookie = True
1437 break
1438 # convert RFC 2965 Max-Age to seconds since epoch
1439 # XXX Strictly you're supposed to follow RFC 2616
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001440 # age-calculation rules. Remember that zero Max-Age
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001441 # is a request to discard (old and new) cookie, though.
1442 k = "expires"
1443 v = self._now + v
1444 if (k in value_attrs) or (k in boolean_attrs):
1445 if (v is None and
Raymond Hettingerdbecd932005-02-06 06:57:08 +00001446 k not in ("port", "comment", "commenturl")):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001447 _debug(" missing value for %s attribute" % k)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001448 bad_cookie = True
1449 break
1450 standard[k] = v
1451 else:
1452 rest[k] = v
1453
1454 if bad_cookie:
1455 continue
1456
1457 cookie_tuples.append((name, value, standard, rest))
1458
1459 return cookie_tuples
1460
def _cookie_from_cookie_tuple(self, tup, request):
    """Build a Cookie from one normalised (name, value, standard, rest) tuple.

    standard is the dict of standard cookie-attributes and rest the dict
    of all the others.  Attributes missing from standard are defaulted
    from request.  Returns None when the version is not an integer, or
    when the expiry time is not later than self._now; in the latter case
    the matching stored cookie, if any, is cleared first.

    """
    name, value, standard, rest = tup

    # the easy ones first
    version = standard.get("version", None)
    if version is not None:
        try:
            version = int(version)
        except ValueError:
            return None  # invalid version, ignore cookie
    secure = standard.get("secure", False)
    # (discard is also forced on below when expires is Absent)
    discard = standard.get("discard", False)
    comment = standard.get("comment", None)
    comment_url = standard.get("commenturl", None)

    # path: explicit and non-empty, or else derived from the request
    path = standard.get("path", Absent)
    path_specified = path is not Absent and path != ""
    if path_specified:
        path = escape_path(path)
    else:
        path = request_path(request)
        last_slash = path.rfind("/")
        if last_slash != -1:
            if version == 0:
                # Netscape spec parts company from reality here
                path = path[:last_slash]
            else:
                path = path[:last_slash+1]
        if len(path) == 0: path = "/"

    # domain: remember whether an explicit one began with a dot before
    # normalising it to the dotted form
    domain = standard.get("domain", Absent)
    domain_specified = domain is not Absent
    if domain_specified:
        domain_initial_dot = bool(domain.startswith("."))
        if not domain.startswith("."):
            domain = "."+domain
    else:
        domain_initial_dot = False
        req_host, erhn = eff_request_host(request)
        domain = erhn

    # port
    port = standard.get("port", Absent)
    port_specified = False
    if port is Absent:
        # No port attr present.  Cookie can be sent back on any port.
        port = None
    elif port is None:
        # Port attr present, but has no value: default to request port.
        # Cookie should then only be sent back on that port.
        port = request_port(request)
    else:
        port_specified = True
        port = re.sub(r"\s+", "", port)

    # expires and discard
    expires = standard.get("expires", Absent)
    if expires is Absent:
        expires = None
        discard = True
    elif expires <= self._now:
        # Expiry date in past is request to delete cookie.  This can't be
        # in DefaultCookiePolicy, because can't delete cookies there.
        try:
            self.clear(domain, path, name)
        except KeyError:
            pass
        _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
               domain, path, name)
        return None

    return Cookie(version,
                  name, value,
                  port, port_specified,
                  domain, domain_specified, domain_initial_dot,
                  path, path_specified,
                  secure,
                  expires,
                  discard,
                  comment,
                  comment_url,
                  rest)
1552
1553 def _cookies_from_attrs_set(self, attrs_set, request):
1554 cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1555
1556 cookies = []
1557 for tup in cookie_tuples:
1558 cookie = self._cookie_from_cookie_tuple(tup, request)
1559 if cookie: cookies.append(cookie)
1560 return cookies
1561
Neal Norwitz71dad722005-12-23 21:43:48 +00001562 def _process_rfc2109_cookies(self, cookies):
1563 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1564 if rfc2109_as_ns is None:
1565 rfc2109_as_ns = not self._policy.rfc2965
1566 for cookie in cookies:
1567 if cookie.version == 1:
1568 cookie.rfc2109 = True
Tim Peters536cf992005-12-25 23:18:31 +00001569 if rfc2109_as_ns:
Neal Norwitz71dad722005-12-23 21:43:48 +00001570 # treat 2109 cookies as Netscape cookies rather than
1571 # as RFC2965 cookies
1572 cookie.version = 0
1573
def make_cookies(self, response, request):
    """Return sequence of Cookie objects extracted from response object.

    Set-Cookie2 headers are parsed first; Set-Cookie headers are parsed
    only when the policy has netscape enabled.  A failure while parsing
    either group is reported through _warn_unhandled_exception() and that
    group then contributes no cookies.

    """
    # get cookie-attributes for RFC 2965 and Netscape protocols
    headers = response.info()
    rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
    ns_hdrs = headers.get_all("Set-Cookie", [])

    rfc2965 = self._policy.rfc2965
    netscape = self._policy.netscape

    if ((not rfc2965_hdrs and not ns_hdrs) or
        (not ns_hdrs and not rfc2965) or
        (not rfc2965_hdrs and not netscape) or
        (not netscape and not rfc2965)):
        return []  # no relevant cookie headers: quick exit

    try:
        cookies = self._cookies_from_attrs_set(
            split_header_words(rfc2965_hdrs), request)
    except Exception:
        _warn_unhandled_exception()
        cookies = []

    if ns_hdrs and netscape:
        try:
            # RFC 2109 and Netscape cookies
            ns_cookies = self._cookies_from_attrs_set(
                parse_ns_headers(ns_hdrs), request)
        except Exception:
            _warn_unhandled_exception()
            ns_cookies = []
        self._process_rfc2109_cookies(ns_cookies)

        # Look for Netscape cookies (from Set-Cookie headers) that match
        # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
        # For each match, keep the RFC 2965 cookie and ignore the Netscape
        # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
        # bundled in with the Netscape cookies for this purpose, which is
        # reasonable behaviour.
        if rfc2965:
            seen = {(cookie.domain, cookie.path, cookie.name)
                    for cookie in cookies}
            # A real list rather than a filter() iterator, whose truth
            # value is always true.
            ns_cookies = [
                ns_cookie for ns_cookie in ns_cookies
                if (ns_cookie.domain, ns_cookie.path,
                    ns_cookie.name) not in seen]

        cookies.extend(ns_cookies)

    return cookies
1627
def set_cookie_if_ok(self, cookie, request):
    """Set a cookie if policy says it's OK to do so.

    The jar's and the policy's ``_now`` are refreshed before the policy is
    asked; the whole operation runs while holding the cookies lock.

    """
    with self._cookies_lock:
        self._policy._now = self._now = int(time.time())

        if self._policy.set_ok(cookie, request):
            self.set_cookie(cookie)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001640
def set_cookie(self, cookie):
    """Set a cookie, without checking whether or not it should be set.

    The cookie is filed under [domain][path][name], creating the
    intermediate dictionaries on demand and replacing any cookie already
    stored under the same three keys.

    """
    store = self._cookies
    with self._cookies_lock:
        by_path = store.setdefault(cookie.domain, {})
        by_name = by_path.setdefault(cookie.path, {})
        by_name[cookie.name] = cookie
1653
def extract_cookies(self, response, request):
    """Store the cookies from *response* that the policy allows.

    Candidates come from make_cookies(response, request); each one that
    passes policy.set_ok(cookie, request) is handed to set_cookie().

    """
    _debug("extract_cookies: %s", response.info())
    with self._cookies_lock:
        # Same assignment order as a chained assignment: policy, then jar.
        current = int(time.time())
        self._policy._now = current
        self._now = current

        candidates = self.make_cookies(response, request)
        for candidate in candidates:
            if not self._policy.set_ok(candidate, request):
                continue
            _debug(" setting cookie: %s", candidate)
            self.set_cookie(candidate)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001667
def clear(self, domain=None, path=None, name=None):
    """Remove cookies, narrowing the selection with each argument given.

    With no arguments every cookie is dropped.  With only *domain*, all
    cookies of that domain go; adding *path* restricts removal to that
    path inside the domain; adding *name* removes the single cookie
    identified by domain, path and name.

    Raises ValueError when *name* is given without both *domain* and
    *path*, or *path* is given without *domain*.
    Raises KeyError if no matching cookie exists.

    """
    if name is not None:
        if domain is None or path is None:
            raise ValueError(
                "domain and path must be given to remove a cookie by name")
        del self._cookies[domain][path][name]
        return

    if path is not None:
        if domain is None:
            raise ValueError(
                "domain must be given to remove cookies by path")
        del self._cookies[domain][path]
        return

    if domain is not None:
        del self._cookies[domain]
        return

    # Nothing specified: start over with an empty store.
    self._cookies = {}
1694
def clear_session_cookies(self):
    """Remove every cookie whose ``discard`` flag is set.

    Note that the .save() method won't save session cookies anyway, unless
    you ask otherwise by passing a true ignore_discard argument.

    """
    with self._cookies_lock:
        for candidate in self:
            if not candidate.discard:
                continue
            self.clear(candidate.domain, candidate.path, candidate.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001709
def clear_expired_cookies(self):
    """Remove every cookie that reports itself expired right now.

    You probably don't need to call this method: expired cookies are never
    sent back to the server (provided you're using DefaultCookiePolicy),
    this method is called by CookieJar itself every so often, and the
    .save() method won't save expired cookies anyway (unless you ask
    otherwise by passing a true ignore_expires argument).

    """
    with self._cookies_lock:
        # One timestamp is used for the whole sweep.
        cutoff = time.time()
        for candidate in self:
            if candidate.is_expired(cutoff):
                self.clear(candidate.domain, candidate.path, candidate.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001728
def __iter__(self):
    """Return deepvalues() applied to the nested cookie mapping."""
    cookie_tree = self._cookies
    return deepvalues(cookie_tree)
1731
def __len__(self):
    """Return the number of cookies obtained by iterating over the jar."""
    return sum(1 for _ in self)
1737
def __repr__(self):
    """Return '<ClassName[...]>' listing the repr() of every cookie."""
    rendered = [repr(cookie) for cookie in self]
    return "<%s[%s]>" % (self.__class__.__name__, ", ".join(rendered))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001742
def __str__(self):
    """Return '<ClassName[...]>' listing the str() of every cookie."""
    rendered = [str(cookie) for cookie in self]
    return "<%s[%s]>" % (self.__class__.__name__, ", ".join(rendered))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001747
1748
# Subclasses OSError for backwards-compatibility with Python 2.4.0.
class LoadError(OSError):
    """Raised when a cookie file does not have the expected format."""
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001751
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """Create a jar with an optional default *filename*.

        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename: default file name used by load(), save() and revert() when
            they are called without one; must be string-like (it has to
            support concatenation with a str) or None.
        delayload: stored on the instance as a bool.
        policy: passed on to CookieJar.__init__.

        Raises ValueError if *filename* is given but is not string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                filename+""
            except Exception:
                # Was a bare "except:", which also turned KeyboardInterrupt
                # and SystemExit into ValueError.
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        *filename* defaults to self.filename.  The file is opened for reading
        and parsing is delegated to self._really_load(), which this class
        does not define itself.

        Raises ValueError if no file name is available.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or OSError) if reversion is not successful.  If
        loading fails with any exception, the previous cookies are put back
        before the exception propagates, so the object's state will not be
        altered.

        Raises ValueError if no file name is available.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with self._cookies_lock:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except BaseException:
                # Restore on *any* failure, not just OSError: e.g. a decoding
                # error raised while reading the first line is a ValueError
                # and previously left the jar empty.
                self._cookies = old_state
                raise
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001808
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001809
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    Actually, the format is extended a bit -- see module docstring.

    The result is built from a list of (key, value) pairs: the cookie's own
    name/value first, then path and domain, then the optional attributes,
    then the entries of cookie._rest in sorted key order, and finally the
    version.  Flag-style attributes are emitted with a value of None.

    """
    h = [(cookie.name, cookie.value),
         ("path", cookie.path),
         ("domain", cookie.domain)]
    if cookie.port is not None:
        h.append(("port", cookie.port))
    if cookie.path_specified:
        h.append(("path_spec", None))
    if cookie.port_specified:
        h.append(("port_spec", None))
    if cookie.domain_initial_dot:
        h.append(("domain_dot", None))
    if cookie.secure:
        h.append(("secure", None))
    # Test against None rather than truthiness: an expiry of 0 (the epoch)
    # is a real expiry time and must not be dropped, otherwise the cookie
    # comes back as a session cookie when the file is reloaded.
    if cookie.expires is not None:
        h.append(("expires", time2isoz(float(cookie.expires))))
    if cookie.discard:
        h.append(("discard", None))
    if cookie.comment:
        h.append(("comment", cookie.comment))
    if cookie.comment_url:
        h.append(("commenturl", cookie.comment_url))

    # Non-standard attributes, in a stable order.
    for k in sorted(cookie._rest):
        h.append((k, str(cookie._rest[k])))

    h.append(("version", str(cookie.version)))

    return join_header_words([h])
1837
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        ignore_discard and ignore_expires: see docstring for FileCookieJar.save

        """
        now = time.time()
        r = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        # The extra empty element makes the result end with a newline.
        return "\n".join(r+[""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the cookies to *filename* in "Set-Cookie3" format.

        *filename* defaults to self.filename; ValueError is raised if neither
        is available.  Cookies marked discard, or already expired, are left
        out unless the matching ignore_* flag is true.

        A newly created file is readable and writable by its owner only,
        because cookie files usually hold session credentials.

        """
        import os  # local import: only needed to create the file privately

        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        # Create with mode 0o600 instead of the umask default so other users
        # cannot read the cookies; an existing file keeps its permissions.
        fd = os.open(filename, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse "Set-Cookie3" lines from the open file *f* into the jar.

        The first line must match self.magic_re (an attribute not defined in
        this class), otherwise LoadError is raised.  Lines that do not start
        with "Set-Cookie3:" are skipped.  *filename* is only used in error
        messages.  Cookies marked discard, or already expired, are skipped
        unless the matching ignore_* flag is true.

        OSError propagates unchanged; any other exception raised while
        parsing is reported via _warn_unhandled_exception() and replaced by
        LoadError.

        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound up front so the error handler below can always format it,
        # even if the very first readline() in the loop raises.
        line = ""
        try:
            while 1:
                line = f.readline()
                if line == "": break
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    # The first pair is the cookie itself; the remaining
                    # pairs are its attributes.
                    name, value = data[0]
                    standard = {}
                    rest = {}
                    for k in boolean_attrs:
                        standard[k] = False
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            # A flag written without a value means "true".
                            if v is None: v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    if expires is None:
                        # No (usable) expiry: treat as a session cookie.
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)
        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1956
1957
class MozillaCookieJar(FileCookieJar):
    """

    WARNING: you may want to backup your browser's cookies file if you use
    this class to save cookies.  I *think* it works, but there have been
    bugs in the past!

    This class differs from CookieJar only in the format it uses to save and
    load cookies to and from a file.  This class uses the Mozilla/Netscape
    `cookies.txt' format.  lynx uses this file format, too.

    Don't expect cookies saved while the browser is running to be noticed by
    the browser (in fact, Mozilla on unix will overwrite your saved cookies if
    you change them on disk while it's running; on Windows, you probably can't
    save at all while the browser is running).

    Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
    Netscape cookies on saving.

    In particular, the cookie version and port number information is lost,
    together with information about whether or not Path, Port and Discard were
    specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
    domain as set in the HTTP header started with a dot (yes, I'm aware some
    domains in Netscape files start with a dot and some don't -- trust me, you
    really don't want to know any more about this).

    Note that though Mozilla and Netscape use the same format, they use
    slightly different headers.  The class saves cookies using the Netscape
    header by default (Mozilla can cope with that).

    """
    magic_re = re.compile("#( Netscape)? HTTP Cookie File")
    header = """\
# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.

"""

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse Netscape-format cookie lines from the open file *f*.

        The first line must match magic_re, otherwise LoadError is raised.
        Every other non-blank, non-comment line must hold exactly seven
        tab-separated fields: domain, domain_specified, path, secure,
        expires, name, value.  *filename* is only used in error messages.
        Cookies marked discard, or already expired, are skipped unless the
        matching ignore_* flag is true.

        OSError propagates unchanged; any other exception raised while
        parsing is reported via _warn_unhandled_exception() and replaced by
        LoadError.

        """
        now = time.time()

        magic = f.readline()
        if not self.magic_re.search(magic):
            raise LoadError(
                "%r does not look like a Netscape format cookies file" %
                filename)

        # Bound up front so the error handler below can always format it,
        # even if the very first readline() in the loop raises.
        line = ""
        try:
            while 1:
                line = f.readline()
                if line == "": break

                # last field may be absent, so keep any trailing tab
                if line.endswith("\n"): line = line[:-1]

                # skip comments and blank lines XXX what is $ for?
                stripped = line.strip()
                if stripped.startswith(("#", "$")) or stripped == "":
                    continue

                (domain, domain_specified, path, secure, expires, name,
                 value) = line.split("\t")
                secure = (secure == "TRUE")
                domain_specified = (domain_specified == "TRUE")
                if name == "":
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = value
                    value = None

                initial_dot = domain.startswith(".")
                # NOTE(review): this consistency check is an assert, so it
                # is skipped under "python -O"; otherwise a mismatch ends up
                # as LoadError via the handler below.
                assert domain_specified == initial_dot

                discard = False
                if expires == "":
                    # No expiry recorded: treat as a session cookie.
                    expires = None
                    discard = True

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                if not ignore_discard and c.discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                self.set_cookie(c)

        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Netscape format cookies file %r: %r" %
                            (filename, line))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the cookies to *filename* in Netscape cookies.txt format.

        *filename* defaults to self.filename; ValueError is raised if neither
        is available.  Cookies marked discard, or already expired, are left
        out unless the matching ignore_* flag is true.

        A newly created file is readable and writable by its owner only,
        because cookie files usually hold session credentials.

        """
        import os  # local import: only needed to create the file privately

        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        # Create with mode 0o600 instead of the umask default so other users
        # cannot read the cookies; an existing file keeps its permissions.
        fd = os.open(filename, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as f:
            f.write(self.header)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = "TRUE" if cookie.secure else "FALSE"
                if cookie.domain.startswith("."):
                    initial_dot = "TRUE"
                else:
                    initial_dot = "FALSE"
                if cookie.expires is not None:
                    expires = str(cookie.expires)
                else:
                    expires = ""
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ""
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    "\t".join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value])+
                    "\n")