blob: d63544a5f52e2325a77d1f5434e4ff9bea10f6f0 [file] [log] [blame]
r"""HTTP cookie handling for web clients.

This module has (now fairly distant) origins in Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
distributed with the Python standard library, but are available from
http://wwwsearch.sf.net/):

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

"""
27
Thomas Wouters477c8d52006-05-27 19:21:47 +000028__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
29 'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
30
Jeremy Hylton1afc1692008-06-18 20:49:58 +000031import copy
Victor Stinner628225c2011-03-21 02:38:51 +010032import datetime
Jeremy Hylton1afc1692008-06-18 20:49:58 +000033import re
34import time
35import urllib.parse, urllib.request
Antoine Pitroua6a4dc82017-09-07 18:56:24 +020036import threading as _threading
Georg Brandl24420152008-05-26 16:32:26 +000037import http.client # only for the default HTTP port
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000038from calendar import timegm
39
debug = False   # flip to True to route debug output through the logging module
logger = None   # looked up lazily the first time a debug message is emitted


def _debug(*args):
    """Pass *args* to the module logger's debug() when debugging is enabled.

    Returns None without doing anything while the module-level ``debug``
    flag is false.  The "http.cookiejar" logger is fetched on first use and
    cached in the module-level ``logger``.
    """
    global logger
    if debug:
        if not logger:
            import logging
            logger = logging.getLogger("http.cookiejar")
        return logger.debug(*args)
    return None
51
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000052
# The default HTTP port, rendered as a string.
DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT)
# Message text for the case where no filename is available.
MISSING_FILENAME_TEXT = (
    "a filename was not supplied (nor was the CookieJar instance "
    "initialised with one)"
)
56
def _warn_unhandled_exception():
    """Issue a warning containing the traceback of the current exception.

    This module has a few catch-all ``except:`` clauses for input that is
    bad in unexpected ways; they call this so that anything swallowed there
    is still reported.
    """
    import traceback
    import warnings
    details = traceback.format_exc()
    # stacklevel=2 attributes the warning to the caller of this helper.
    warnings.warn("http.cookiejar bug!\n%s" % details, stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000066
67
68# Date/time conversion
69# -----------------------------------------------------------------------------
70
EPOCH_YEAR = 1970


def _timegm(tt):
    """Return seconds since the epoch for UTC time tuple *tt*, or None.

    The first six fields (year, month, day, hour, minute, second) are
    range-checked; if any is out of range, or the year precedes EPOCH_YEAR,
    None is returned instead of calling calendar.timegm().
    """
    year, month, mday, hour, minute, sec = tt[:6]
    in_range = (
        year >= EPOCH_YEAR
        and 1 <= month <= 12
        and 1 <= mday <= 31
        and 0 <= hour <= 24
        and 0 <= minute <= 59
        and 0 <= sec <= 61
    )
    if not in_range:
        return None
    return timegm(tt)
79
# Abbreviated weekday names, indexed by datetime.weekday() (Monday == 0).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# Abbreviated month names; index 0 is January.
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased month names for case-insensitive lookup.  Built with a
# comprehension so no stray loop variable is left in the module namespace.
MONTHS_LOWER = [name.lower() for name in MONTHS]
85
def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    # Timezone-aware calls replace the deprecated utcnow() and
    # utcfromtimestamp(); the broken-down fields are the same.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000104
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    # Timezone-aware calls replace the deprecated utcnow() and
    # utcfromtimestamp(); the broken-down fields are the same.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1],
        dt.year, dt.hour, dt.minute, dt.second)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000123
124
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)


def offset_from_tz_string(tz):
    """Return the UTC offset in seconds described by timezone string *tz*.

    Names listed in UTC_ZONES give 0.  Numeric zones such as "-0800",
    "+01:00" or "5" are converted to signed seconds.  Anything else gives
    None.
    """
    if tz in UTC_ZONES:
        return 0
    parsed = TIMEZONE_RE.search(tz)
    if not parsed:
        return None
    sign, hours, minutes = parsed.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds += 60 * int(minutes)
    return -seconds if sign == '-' else seconds
141
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date/time fields to seconds since the epoch, or None.

    *day*, *yr*, *hr*, *min* and *sec* are numeric strings (the clock
    fields may be None, meaning 0); *mon* is a month name or number; *tz*
    is a timezone string or None (treated as UTC).  *sec* may carry a
    fractional part, which is discarded.

    None is returned when the year exceeds datetime.MAXYEAR, the month is
    not recognized, the fields fail _timegm()'s range checks, or the
    timezone is not understood by offset_from_tz_string().
    """
    yr = int(yr)
    if yr > datetime.MAXYEAR:
        return None

    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    day = int(day)
    hr = int(hr)
    min = int(min)
    # ISO_DATE_RE can capture fractional seconds ("29.5"); int() alone would
    # raise ValueError on those, so go through float() and truncate.
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
197
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X | re.ASCII)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # The regexp only checks the *shape* of the month ("Foo" passes);
            # an unknown name is an unrecognized format, not an error.
            return None
        # All fields are pure digit runs, so int() keeps the result an
        # integer as documented (and as the loose path below returns).
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
275
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X | re.ASCII)


def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # Leading whitespace is dropped; the pattern itself tolerates trailing
    # whitespace.
    found = ISO_DATE_RE.search(text.lstrip())
    if found is None:
        return None  # bad format

    # XXX the last group is an extra bit of the timezone that is ignored
    # here: is this the right thing to do?
    year, month, day, hour, minute, second, zone, _ = found.groups()
    return _str2time(day, month, year, hour, minute, second, zone)
320
321
322# Header parsing
323# -----------------------------------------------------------------------------
324
def unmatched(match):
    """Return the searched string with the span matched by *match* removed."""
    begin, stop = match.span(0)
    subject = match.string
    return subject[:begin] + subject[stop:]
329
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, str)
    result = []
    for header in header_values:
        remaining = header
        pairs = []
        while remaining:
            # Every pattern is anchored at the start of the string, so
            # consuming a match is just slicing off everything up to its end.
            token = HEADER_TOKEN_RE.search(remaining)
            if token:
                name = token.group(1)
                remaining = remaining[token.end():]
                quoted = HEADER_QUOTED_VALUE_RE.search(remaining)
                if quoted:
                    # quoted value: drop the backslash from each quoted-pair
                    remaining = remaining[quoted.end():]
                    value = HEADER_ESCAPE_RE.sub(r"\1", quoted.group(1))
                else:
                    plain = HEADER_VALUE_RE.search(remaining)
                    if plain:
                        # unquoted value
                        remaining = remaining[plain.end():]
                        value = plain.group(1).rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
                continue

            stripped = remaining.lstrip()
            if stripped.startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                remaining = stripped[1:]
                if pairs:
                    result.append(pairs)
                pairs = []
                continue

            # skip junk
            non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", remaining)
            assert nr_junk_chars > 0, (
                "split_header_words bug: '%s', '%s', %s" %
                (header, remaining, pairs))
            remaining = non_junk
        if pairs:
            result.append(pairs)
    return result
418
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859-1")]])
    'text/plain; charset="iso-8859-1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859-1")]])
    'text/plain, charset="iso-8859-1"'

    """
    def render(key, value):
        # A bare token has no value part at all.
        if value is None:
            return key
        if not re.search(r"^\w+$", value):
            # Not a plain word: escape " and \ and wrap in double quotes.
            value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
        return "%s=%s" % (key, value)

    headers = []
    for pairs in lists:
        rendered = [render(k, v) for k, v in pairs]
        if rendered:
            headers.append("; ".join(rendered))
    return ", ".join(headers)
444
def strip_quotes(text):
    """Return *text* without one leading and one trailing double quote.

    Each end is handled independently, so an unbalanced quote is removed
    too.
    """
    begin = 1 if text.startswith('"') else 0
    text = text[begin:]
    # The trailing check runs on the already-shortened string, so a lone '"'
    # is consumed only once.
    return text[:-1] if text.endswith('"') else text
451
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    Returns one list of (key, value) pairs per header that produced any
    pairs.  A value is None when the parameter had no "=".  A
    ("version", "0") pair is appended when the header set no version.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for header in ns_headers:
        pairs = []
        saw_version = False

        # XXX: The following does not strictly adhere to RFCs in that empty
        # names and values are legal (the former will only appear once and will
        # be overwritten if multiple occurrences are present). This is
        # mostly to deal with backwards compatibility.
        for index, chunk in enumerate(header.split(';')):
            name, eq, raw = chunk.strip().partition('=')
            name = name.strip()

            if not name:
                if index == 0:
                    # no cookie name at all: give up on this header
                    break
                continue

            # allow for a distinction between present and empty and missing
            # altogether
            value = raw.strip() if eq else None

            # The first parameter is the cookie's own name=value and is kept
            # verbatim; only later ones are treated as cookie-attributes.
            if index > 0:
                lowered = name.lower()
                if lowered in known_attrs:
                    name = lowered

                if name == "version":
                    # This is an RFC 2109 cookie.
                    saw_version = True
                    if value is not None:
                        value = strip_quotes(value)
                elif name == "expires" and value is not None:
                    # convert expires date to seconds since epoch
                    value = http2time(strip_quotes(value))  # None if invalid
            pairs.append((name, value))

        if not pairs:
            continue
        if not saw_version:
            pairs.append(("version", "0"))
        result.append(pairs)

    return result
518
519
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    ends_in_number = IPV4_RE.search(text) is not None
    if ends_in_number or text == "":
        return False
    # A leading or trailing dot disqualifies the name.
    return not (text[0] == "." or text[-1] == ".")
535
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # "A has the form NB" means A *ends* with B.  A substring search would
    # also accept B in the middle of A (e.g. "www.bank.com.evil.org" against
    # ".bank.com").  A != B here, so a suffix match implies N is non-empty.
    if not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
574
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  Only strings matching IPV4_RE are
    rejected.

    """
    return not IPV4_RE.search(text)
584
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  A pattern B with a leading dot matches any A that
    ends with it; otherwise the two must be equal.

    """
    host = A.lower()
    pattern = B.lower()
    if liberal_is_HDN(host) and liberal_is_HDN(pattern):
        if pattern.startswith("."):
            return host.endswith(pattern)
        return host == pattern
    # At least one side looks like an IP address: only exact equality counts.
    return host == pattern
604
cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urllib.parse.urlparse(request.get_full_url())[1]
    if netloc == "":
        # No host in the URL itself: fall back to the Host header.
        netloc = request.get_header("Host", "")

    # remove port, if present
    without_port = cut_port_re.sub("", netloc, 1)
    return without_port.lower()
621
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.

    """
    req_host = request_host(request)
    # A dotless host that does not match IPV4_RE gets ".local" appended.
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
632
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965."""
    raw_path = urllib.parse.urlsplit(request.get_full_url()).path
    escaped = escape_path(raw_path)
    # fix bad RFC 2396 absoluteURI
    return escaped if escaped.startswith("/") else "/" + escaped
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000642
def request_port(request):
    """Return the port of *request* as a string, or None if it is not numeric.

    The port is read from ``request.host``.  DEFAULT_HTTP_PORT is returned
    when the host carries no port.  A non-numeric port is reported through
    _debug() and yields None.
    """
    host = request.host
    search_from = 0
    if host.startswith("["):
        # Bracketed IPv6 literal ("[::1]:8080"): the colons inside the
        # brackets belong to the address, so only look after the "]".
        closing = host.find("]")
        if closing >= 0:
            search_from = closing + 1
    i = host.find(':', search_from)
    if i < 0:
        return DEFAULT_HTTP_PORT
    port = host[i+1:]
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
656
657# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
658# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")


def uppercase_escaped_char(match):
    """Return the %XX escape captured by *match* with upper-case hex digits."""
    return "%" + match.group(1).upper()


def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = urllib.parse.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
676
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split at the first dot: everything after it is B.
    _, dot, b = h.partition(".")
    if dot and is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
711
def is_third_party(request):
    """Return True if *request* is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    origin_reach = reach(request.origin_req_host)
    return not domain_match(req_host, origin_reach)
727
728
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than"Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes on the instance.

        *version* is converted with int() and *expires* with int(float())
        when not None.  *domain* is lower-cased.  *rest* (a mapping of
        nonstandard cookie-attributes) is shallow-copied.

        Raises ValueError if *port* is None while *port_specified* is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(float(expires))
        if port_specified is True and port is None:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        # Copy so later set_nonstandard_attr() calls don't touch the caller's
        # mapping.
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return True if nonstandard cookie-attribute *name* is present."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return nonstandard cookie-attribute *name*, or *default*."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set nonstandard cookie-attribute *name* to *value*."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time at or before *now*.

        *now* defaults to the current time.  A cookie whose expires is None
        is never expired.
        """
        if now is None:
            now = time.time()
        return self.expires is not None and self.expires <= now

    def __str__(self):
        port_part = "" if self.port is None else ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        field_names = (
            "version", "name", "value",
            "port", "port_specified",
            "domain", "domain_specified", "domain_initial_dot",
            "path", "path_specified",
            "secure", "expires", "discard", "comment", "comment_url",
        )
        args = ["%s=%r" % (field, getattr(self, field))
                for field in field_names]
        args.append("rest=%r" % (self._rest,))
        args.append("rfc2109=%r" % (self.rfc2109,))
        return "%s(%s)" % (self.__class__.__name__, ", ".join(args))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000825
826
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customized policy.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation always returns True.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation always returns True.
        """
        return True
858
859
class DefaultCookiePolicy(CookiePolicy):
    """Standard rules for accepting and returning cookies.

    set_ok() and return_ok() each run a fixed sequence of small checks
    (set_ok_* / return_ok_*); every check returns a true value to let the
    cookie through and False to veto it.  A veto is reported through the
    module's _debug() helper.

    """

    # Bit flags that may be OR-ed together in the strict_ns_domain setting.
    DomainStrictNoDots = 1
    DomainStrictNonDomain = 2
    DomainRFC2965Match = 4

    # Ready-made strict_ns_domain values.
    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots | DomainStrictNonDomain

    def __init__(self,
                 blocked_domains=None, allowed_domains=None,
                 netscape=True, rfc2965=False,
                 rfc2109_as_netscape=None,
                 hide_cookie2=False,
                 strict_domain=False,
                 strict_rfc2965_unverifiable=True,
                 strict_ns_unverifiable=False,
                 strict_ns_domain=DomainLiberal,
                 strict_ns_set_initial_dollar=False,
                 strict_ns_set_path=False,
                 ):
        """Constructor arguments should be passed as keyword arguments only.

        blocked_domains / allowed_domains are iterables of domains and are
        stored as tuples; allowed_domains=None means "no allow-list".  The
        remaining arguments are stored unchanged as attributes of the same
        name and are read by the set_ok_* / return_ok_* checks below
        (hide_cookie2 and rfc2109_as_netscape are read by CookieJar).

        """
        self.netscape = netscape
        self.rfc2965 = rfc2965
        self.rfc2109_as_netscape = rfc2109_as_netscape
        self.hide_cookie2 = hide_cookie2
        self.strict_domain = strict_domain
        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
        self.strict_ns_unverifiable = strict_ns_unverifiable
        self.strict_ns_domain = strict_ns_domain
        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
        self.strict_ns_set_path = strict_ns_set_path

        # An absent block-list is an empty tuple, whereas an absent allow-list
        # stays None so that is_not_allowed() can tell "no list" from "empty".
        self._blocked_domains = (
            () if blocked_domains is None else tuple(blocked_domains))
        self._allowed_domains = (
            None if allowed_domains is None else tuple(allowed_domains))

    def blocked_domains(self):
        """Return the sequence of blocked domains (as a tuple)."""
        return self._blocked_domains

    def set_blocked_domains(self, blocked_domains):
        """Set the sequence of blocked domains."""
        self._blocked_domains = tuple(blocked_domains)

    def is_blocked(self, domain):
        """Return True if domain matches any entry of the block-list."""
        return any(user_domain_match(domain, entry)
                   for entry in self._blocked_domains)

    def allowed_domains(self):
        """Return None, or the sequence of allowed domains (as a tuple)."""
        return self._allowed_domains

    def set_allowed_domains(self, allowed_domains):
        """Set the sequence of allowed domains, or None."""
        self._allowed_domains = (
            None if allowed_domains is None else tuple(allowed_domains))

    def is_not_allowed(self, domain):
        """Return True if an allow-list exists and domain matches none of it."""
        allow_list = self._allowed_domains
        if allow_list is None:
            return False
        return not any(user_domain_match(domain, entry)
                       for entry in allow_list)

    def set_ok(self, cookie, request):
        """Return True if every set_ok_* check accepts the cookie.

        If you override .set_ok(), be sure to call this method.  If it returns
        false, so should your subclass (assuming your subclass wants to be more
        strict about which cookies to accept).

        """
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        assert cookie.name is not None

        # Checks run in this order and stop at the first veto.
        checks = ("version", "verifiability", "name", "path", "domain", "port")
        return all(getattr(self, "set_ok_" + check)(cookie, request)
                   for check in checks)

    def set_ok_version(self, cookie, request):
        """Reject cookies whose protocol version is missing or switched off."""
        version = cookie.version
        if version is None:
            # Version is always set to 0 by parse_ns_headers if it's a Netscape
            # cookie, so this must be an invalid RFC 2965 cookie.
            _debug(" Set-Cookie2 without version attribute (%s=%s)",
                   cookie.name, cookie.value)
            return False
        if version > 0 and not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
        if version == 0 and not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
        return True

    def set_ok_verifiability(self, cookie, request):
        """Reject third-party cookies on unverifiable requests, if so configured."""
        if not (request.unverifiable and is_third_party(request)):
            return True
        if cookie.version > 0 and self.strict_rfc2965_unverifiable:
            _debug(" third-party RFC 2965 cookie during "
                   "unverifiable transaction")
            return False
        if cookie.version == 0 and self.strict_ns_unverifiable:
            _debug(" third-party Netscape cookie during "
                   "unverifiable transaction")
            return False
        return True

    def set_ok_name(self, cookie, request):
        """Reject V0 cookies named '$...' when strict_ns_set_initial_dollar is set."""
        # Try and stop servers setting V0 cookies designed to hack other
        # servers that know both V0 and V1 protocols.
        suspicious = (cookie.version == 0 and
                      self.strict_ns_set_initial_dollar and
                      cookie.name.startswith("$"))
        if suspicious:
            _debug(" illegal name (starts with '$'): '%s'", cookie.name)
            return False
        return True

    def set_ok_path(self, cookie, request):
        """Reject an explicit path that does not path-match the request.

        Always enforced for version > 0 cookies; for version 0 cookies only
        when strict_ns_set_path is set.

        """
        if not cookie.path_specified:
            return True
        req_path = request_path(request)
        enforce = (cookie.version > 0 or
                   (cookie.version == 0 and self.strict_ns_set_path))
        if enforce and not self.path_return_ok(cookie.path, request):
            _debug(" path attribute %s is not a prefix of request "
                   "path %s", cookie.path, req_path)
            return False
        return True

    def set_ok_domain(self, cookie, request):
        """Reject cookies whose domain is blocked or unsuitable for the request."""
        if self.is_blocked(cookie.domain):
            _debug(" domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug(" domain %s is not in user allow-list", cookie.domain)
            return False
        if not cookie.domain_specified:
            # Remaining checks only concern an explicit Domain attribute.
            return True

        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        if self.strict_domain and (domain.count(".") >= 2):
            # XXX This should probably be compared with the Konqueror
            # (kcookiejar.cpp) and Mozilla implementations, but it's a
            # losing battle.
            last_dot = domain.rfind(".")
            prev_dot = domain.rfind(".", 0, last_dot)
            if prev_dot == 0:  # domain like .foo.bar
                tld = domain[last_dot+1:]
                sld = domain[prev_dot+1:last_dot]
                if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                   "gov", "mil", "int", "aero", "biz", "cat", "coop",
                   "info", "jobs", "mobi", "museum", "name", "pro",
                   "travel", "eu") and len(tld) == 2:
                    # domain like .co.uk
                    _debug(" country-code second level domain %s", domain)
                    return False

        # Apart from the leading dot, the domain must contain another dot
        # (the literal ".local" is exempt).
        undotted_domain = domain[1:] if domain.startswith(".") else domain
        has_embedded_dot = "." in undotted_domain
        if not has_embedded_dot and domain != ".local":
            _debug(" non-local domain %s contains no embedded dot",
                   domain)
            return False

        if cookie.version == 0:
            tail_matches = (erhn.endswith(domain) or
                            erhn.startswith(".") or
                            ("."+erhn).endswith(domain))
            if not tail_matches:
                _debug(" effective request-host %s (even with added "
                       "initial dot) does not end with %s",
                       erhn, domain)
                return False

        if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
            if not domain_match(erhn, domain):
                _debug(" effective request-host %s does not domain-match "
                       "%s", erhn, domain)
                return False

        if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
            # What is left of the request host once the cookie domain has
            # been cut off its end.
            host_prefix = req_host[:-len(domain)]
            if "." in host_prefix and not IPV4_RE.search(req_host):
                _debug(" host prefix %s for domain %s contains a dot",
                       host_prefix, domain)
                return False

        return True

    def set_ok_port(self, cookie, request):
        """Reject an explicit port list that is malformed or omits the request port."""
        if not cookie.port_specified:
            return True

        req_port = request_port(request)
        req_port = "80" if req_port is None else str(req_port)

        # Entries are validated one at a time and scanning stops at the first
        # match, so entries after a match are never inspected.
        for candidate in cookie.port.split(","):
            try:
                int(candidate)
            except ValueError:
                _debug(" bad port %s (not numeric)", candidate)
                return False
            if candidate == req_port:
                break
        else:
            _debug(" request port (%s) not found in %s",
                   req_port, cookie.port)
            return False
        return True

    def return_ok(self, cookie, request):
        """Return True if every return_ok_* check lets the cookie be sent.

        If you override .return_ok(), be sure to call this method.  If it
        returns false, so should your subclass (assuming your subclass wants to
        be more strict about which cookies to return).

        """
        # Path has already been checked by .path_return_ok(), and domain
        # blocking done by .domain_return_ok().
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        # Checks run in this order and stop at the first veto.
        checks = ("version", "verifiability", "secure", "expires", "port",
                  "domain")
        return all(getattr(self, "return_ok_" + check)(cookie, request)
                   for check in checks)

    def return_ok_version(self, cookie, request):
        """Withhold cookies whose protocol version is switched off."""
        if cookie.version > 0 and not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
        if cookie.version == 0 and not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
        return True

    def return_ok_verifiability(self, cookie, request):
        """Withhold third-party cookies on unverifiable requests, if so configured."""
        if not (request.unverifiable and is_third_party(request)):
            return True
        if cookie.version > 0 and self.strict_rfc2965_unverifiable:
            _debug(" third-party RFC 2965 cookie during unverifiable "
                   "transaction")
            return False
        if cookie.version == 0 and self.strict_ns_unverifiable:
            _debug(" third-party Netscape cookie during unverifiable "
                   "transaction")
            return False
        return True

    def return_ok_secure(self, cookie, request):
        """Withhold secure cookies unless the request type is "https"."""
        if cookie.secure and request.type != "https":
            _debug(" secure cookie with non-secure request")
            return False
        return True

    def return_ok_expires(self, cookie, request):
        """Withhold cookies that have expired as of self._now."""
        # self._now is not set by __init__; CookieJar assigns it on the policy
        # before consulting it.
        if cookie.is_expired(self._now):
            _debug(" cookie expired")
            return False
        return True

    def return_ok_port(self, cookie, request):
        """Withhold cookies whose port list omits the request port."""
        if not cookie.port:
            return True
        req_port = request_port(request)
        if req_port is None:
            req_port = "80"
        if req_port not in cookie.port.split(","):
            _debug(" request port %s does not match cookie port %s",
                   req_port, cookie.port)
            return False
        return True

    def return_ok_domain(self, cookie, request):
        """Withhold cookies whose domain does not fit the request host."""
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        # Cookie domain with a leading dot forced on (an empty domain is left
        # as it is).
        needs_dot = domain and not domain.startswith(".")
        dotdomain = "." + domain if needs_dot else domain

        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
        if (cookie.version == 0 and
                (self.strict_ns_domain & self.DomainStrictNonDomain) and
                not cookie.domain_specified and domain != erhn):
            _debug(" cookie with unspecified domain does not string-compare "
                   "equal to request domain")
            return False

        if cookie.version > 0 and not domain_match(erhn, domain):
            _debug(" effective request-host name %s does not domain-match "
                   "RFC 2965 cookie domain %s", erhn, domain)
            return False
        if cookie.version == 0 and not ("."+erhn).endswith(dotdomain):
            _debug(" request-host %s does not match Netscape cookie domain "
                   "%s", req_host, domain)
            return False
        return True

    def domain_return_ok(self, domain, request):
        """Cheap pre-check: could any cookie stored under domain be returned?"""
        # Liberal check of the domain.  This is here as an optimization to
        # avoid having to load lots of MSIE cookie files unless necessary.
        req_host, erhn = eff_request_host(request)
        if not req_host.startswith("."):
            req_host = "."+req_host
        if not erhn.startswith("."):
            erhn = "."+erhn

        needs_dot = domain and not domain.startswith(".")
        dotdomain = "." + domain if needs_dot else domain

        if not (req_host.endswith(dotdomain) or erhn.endswith(dotdomain)):
            #_debug(" request domain %s does not match cookie domain %s",
            #       req_host, domain)
            return False

        if self.is_blocked(domain):
            _debug(" domain %s is in user block-list", domain)
            return False
        if self.is_not_allowed(domain):
            _debug(" domain %s is not in user allow-list", domain)
            return False

        return True

    def path_return_ok(self, path, request):
        """Return True if the request path path-matches the cookie path.

        That is: both are equal, or the request path starts with the cookie
        path and the boundary between them falls on a "/".

        """
        _debug("- checking cookie path=%s", path)
        req_path = request_path(request)
        pathlen = len(path)
        if req_path == path:
            return True
        if req_path.startswith(path):
            # Either the cookie path ends in "/", or the request path
            # continues with "/" right after it.
            if path.endswith("/") or req_path[pathlen:pathlen+1] == "/":
                return True

        _debug(" %s does not path-match %s", req_path, path)
        return False
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001209
def vals_sorted_by_key(adict):
    """Return an iterator over adict's values, in ascending order of their keys."""
    return map(adict.get, sorted(adict.keys()))
1213
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    A value that has an ``items`` attribute is treated as a nested mapping
    and descended into; every other value is yielded as it is.

    """
    for value in vals_sorted_by_key(mapping):
        if hasattr(value, "items"):
            yield from deepvalues(value)
        else:
            yield value
1228
1229
# Sentinel handed to dict.get() as its default, so that a key which is
# missing can be told apart from a key whose value is None.
class Absent:
    pass
1233
1234class CookieJar:
1235 """Collection of HTTP cookies.
1236
1237 You may not need to know about this class: try
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001238 urllib.request.build_opener(HTTPCookieProcessor).open(url).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001239 """
1240
1241 non_word_re = re.compile(r"\W")
1242 quote_re = re.compile(r"([\"\\])")
1243 strict_domain_re = re.compile(r"\.?[^.]*")
1244 domain_re = re.compile(r"[^.]*")
1245 dots_re = re.compile(r"^\.+")
1246
Antoine Pitroufd036452008-08-19 17:56:33 +00001247 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001248
1249 def __init__(self, policy=None):
1250 if policy is None:
1251 policy = DefaultCookiePolicy()
1252 self._policy = policy
1253
1254 self._cookies_lock = _threading.RLock()
1255 self._cookies = {}
1256
1257 def set_policy(self, policy):
1258 self._policy = policy
1259
1260 def _cookies_for_domain(self, domain, request):
1261 cookies = []
1262 if not self._policy.domain_return_ok(domain, request):
1263 return []
Thomas Wouters477c8d52006-05-27 19:21:47 +00001264 _debug("Checking %s for cookies to return", domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001265 cookies_by_path = self._cookies[domain]
1266 for path in cookies_by_path.keys():
1267 if not self._policy.path_return_ok(path, request):
1268 continue
1269 cookies_by_name = cookies_by_path[path]
1270 for cookie in cookies_by_name.values():
1271 if not self._policy.return_ok(cookie, request):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001272 _debug(" not returning cookie")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001273 continue
Thomas Wouters477c8d52006-05-27 19:21:47 +00001274 _debug(" it's a match")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001275 cookies.append(cookie)
1276 return cookies
1277
1278 def _cookies_for_request(self, request):
1279 """Return a list of cookies to be returned to server."""
1280 cookies = []
1281 for domain in self._cookies.keys():
1282 cookies.extend(self._cookies_for_domain(domain, request))
1283 return cookies
1284
1285 def _cookie_attrs(self, cookies):
1286 """Return a list of cookie-attributes to be returned to server.
1287
1288 like ['foo="bar"; $Path="/"', ...]
1289
1290 The $Version attribute is also added when appropriate (currently only
1291 once per request).
1292
1293 """
1294 # add cookies in order of most specific (ie. longest) path first
Raymond Hettinger70b64fc2008-01-30 20:15:17 +00001295 cookies.sort(key=lambda a: len(a.path), reverse=True)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001296
1297 version_set = False
1298
1299 attrs = []
1300 for cookie in cookies:
1301 # set version of Cookie header
1302 # XXX
1303 # What should it be if multiple matching Set-Cookie headers have
1304 # different versions themselves?
1305 # Answer: there is no answer; was supposed to be settled by
1306 # RFC 2965 errata, but that may never appear...
1307 version = cookie.version
1308 if not version_set:
1309 version_set = True
1310 if version > 0:
1311 attrs.append("$Version=%s" % version)
1312
1313 # quote cookie value if necessary
1314 # (not for Netscape protocol, which already has any quotes
1315 # intact, due to the poorly-specified Netscape Cookie: syntax)
1316 if ((cookie.value is not None) and
1317 self.non_word_re.search(cookie.value) and version > 0):
1318 value = self.quote_re.sub(r"\\\1", cookie.value)
1319 else:
1320 value = cookie.value
1321
1322 # add cookie-attributes to be returned in Cookie header
1323 if cookie.value is None:
1324 attrs.append(cookie.name)
1325 else:
1326 attrs.append("%s=%s" % (cookie.name, value))
1327 if version > 0:
1328 if cookie.path_specified:
1329 attrs.append('$Path="%s"' % cookie.path)
1330 if cookie.domain.startswith("."):
1331 domain = cookie.domain
1332 if (not cookie.domain_initial_dot and
1333 domain.startswith(".")):
1334 domain = domain[1:]
1335 attrs.append('$Domain="%s"' % domain)
1336 if cookie.port is not None:
1337 p = "$Port"
1338 if cookie.port_specified:
1339 p = p + ('="%s"' % cookie.port)
1340 attrs.append(p)
1341
1342 return attrs
1343
1344 def add_cookie_header(self, request):
Georg Brandl029986a2008-06-23 11:44:14 +00001345 """Add correct Cookie: header to request (urllib.request.Request object).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001346
1347 The Cookie2 header is also added unless policy.hide_cookie2 is true.
1348
1349 """
Thomas Wouters477c8d52006-05-27 19:21:47 +00001350 _debug("add_cookie_header")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001351 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001352 try:
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001353
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001354 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001355
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001356 cookies = self._cookies_for_request(request)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001357
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001358 attrs = self._cookie_attrs(cookies)
1359 if attrs:
1360 if not request.has_header("Cookie"):
1361 request.add_unredirected_header(
1362 "Cookie", "; ".join(attrs))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001363
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001364 # if necessary, advertise that we know RFC 2965
1365 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1366 not request.has_header("Cookie2")):
1367 for cookie in cookies:
1368 if cookie.version != 1:
1369 request.add_unredirected_header("Cookie2", '$Version="1"')
1370 break
1371
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001372 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001373 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001374
1375 self.clear_expired_cookies()
1376
1377 def _normalized_cookie_tuples(self, attrs_set):
1378 """Return list of tuples containing normalised cookie information.
1379
1380 attrs_set is the list of lists of key,value pairs extracted from
1381 the Set-Cookie or Set-Cookie2 headers.
1382
1383 Tuples are name, value, standard, rest, where name and value are the
1384 cookie name and value, standard is a dictionary containing the standard
1385 cookie-attributes (discard, secure, version, expires or max-age,
1386 domain, path and port) and rest is a dictionary containing the rest of
1387 the cookie-attributes.
1388
1389 """
1390 cookie_tuples = []
1391
1392 boolean_attrs = "discard", "secure"
1393 value_attrs = ("version",
1394 "expires", "max-age",
1395 "domain", "path", "port",
1396 "comment", "commenturl")
1397
1398 for cookie_attrs in attrs_set:
1399 name, value = cookie_attrs[0]
1400
1401 # Build dictionary of standard cookie-attributes (standard) and
1402 # dictionary of other cookie-attributes (rest).
1403
1404 # Note: expiry time is normalised to seconds since epoch. V0
1405 # cookies should have the Expires cookie-attribute, and V1 cookies
1406 # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1407 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1408 # accept either (but prefer Max-Age).
1409 max_age_set = False
1410
1411 bad_cookie = False
1412
1413 standard = {}
1414 rest = {}
1415 for k, v in cookie_attrs[1:]:
1416 lc = k.lower()
1417 # don't lose case distinction for unknown fields
1418 if lc in value_attrs or lc in boolean_attrs:
1419 k = lc
1420 if k in boolean_attrs and v is None:
1421 # boolean cookie-attribute is present, but has no value
1422 # (like "discard", rather than "port=80")
1423 v = True
1424 if k in standard:
1425 # only first value is significant
1426 continue
1427 if k == "domain":
1428 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001429 _debug(" missing value for domain attribute")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001430 bad_cookie = True
1431 break
1432 # RFC 2965 section 3.3.3
1433 v = v.lower()
1434 if k == "expires":
1435 if max_age_set:
1436 # Prefer max-age to expires (like Mozilla)
1437 continue
1438 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001439 _debug(" missing or invalid value for expires "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001440 "attribute: treating as session cookie")
1441 continue
1442 if k == "max-age":
1443 max_age_set = True
1444 try:
1445 v = int(v)
1446 except ValueError:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001447 _debug(" missing or invalid (non-numeric) value for "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001448 "max-age attribute")
1449 bad_cookie = True
1450 break
1451 # convert RFC 2965 Max-Age to seconds since epoch
1452 # XXX Strictly you're supposed to follow RFC 2616
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001453 # age-calculation rules. Remember that zero Max-Age
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001454 # is a request to discard (old and new) cookie, though.
1455 k = "expires"
1456 v = self._now + v
1457 if (k in value_attrs) or (k in boolean_attrs):
1458 if (v is None and
Raymond Hettingerdbecd932005-02-06 06:57:08 +00001459 k not in ("port", "comment", "commenturl")):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001460 _debug(" missing value for %s attribute" % k)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001461 bad_cookie = True
1462 break
1463 standard[k] = v
1464 else:
1465 rest[k] = v
1466
1467 if bad_cookie:
1468 continue
1469
1470 cookie_tuples.append((name, value, standard, rest))
1471
1472 return cookie_tuples
1473
1474 def _cookie_from_cookie_tuple(self, tup, request):
1475 # standard is dict of standard cookie-attributes, rest is dict of the
1476 # rest of them
1477 name, value, standard, rest = tup
1478
1479 domain = standard.get("domain", Absent)
1480 path = standard.get("path", Absent)
1481 port = standard.get("port", Absent)
1482 expires = standard.get("expires", Absent)
1483
1484 # set the easy defaults
1485 version = standard.get("version", None)
Benjamin Peterson3e5cd1d2010-06-27 21:45:24 +00001486 if version is not None:
1487 try:
1488 version = int(version)
1489 except ValueError:
1490 return None # invalid version, ignore cookie
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001491 secure = standard.get("secure", False)
1492 # (discard is also set if expires is Absent)
1493 discard = standard.get("discard", False)
1494 comment = standard.get("comment", None)
1495 comment_url = standard.get("commenturl", None)
1496
1497 # set default path
1498 if path is not Absent and path != "":
1499 path_specified = True
1500 path = escape_path(path)
1501 else:
1502 path_specified = False
1503 path = request_path(request)
1504 i = path.rfind("/")
1505 if i != -1:
1506 if version == 0:
1507 # Netscape spec parts company from reality here
1508 path = path[:i]
1509 else:
1510 path = path[:i+1]
1511 if len(path) == 0: path = "/"
1512
1513 # set default domain
1514 domain_specified = domain is not Absent
1515 # but first we have to remember whether it starts with a dot
1516 domain_initial_dot = False
1517 if domain_specified:
1518 domain_initial_dot = bool(domain.startswith("."))
1519 if domain is Absent:
1520 req_host, erhn = eff_request_host(request)
1521 domain = erhn
1522 elif not domain.startswith("."):
1523 domain = "."+domain
1524
1525 # set default port
1526 port_specified = False
1527 if port is not Absent:
1528 if port is None:
1529 # Port attr present, but has no value: default to request port.
1530 # Cookie should then only be sent back on that port.
1531 port = request_port(request)
1532 else:
1533 port_specified = True
1534 port = re.sub(r"\s+", "", port)
1535 else:
1536 # No port attr present. Cookie can be sent back on any port.
1537 port = None
1538
1539 # set default expires and discard
1540 if expires is Absent:
1541 expires = None
1542 discard = True
1543 elif expires <= self._now:
1544 # Expiry date in past is request to delete cookie. This can't be
1545 # in DefaultCookiePolicy, because can't delete cookies there.
1546 try:
1547 self.clear(domain, path, name)
1548 except KeyError:
1549 pass
Thomas Wouters477c8d52006-05-27 19:21:47 +00001550 _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1551 domain, path, name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001552 return None
1553
1554 return Cookie(version,
1555 name, value,
1556 port, port_specified,
1557 domain, domain_specified, domain_initial_dot,
1558 path, path_specified,
1559 secure,
1560 expires,
1561 discard,
1562 comment,
1563 comment_url,
1564 rest)
1565
1566 def _cookies_from_attrs_set(self, attrs_set, request):
1567 cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1568
1569 cookies = []
1570 for tup in cookie_tuples:
1571 cookie = self._cookie_from_cookie_tuple(tup, request)
1572 if cookie: cookies.append(cookie)
1573 return cookies
1574
Neal Norwitz71dad722005-12-23 21:43:48 +00001575 def _process_rfc2109_cookies(self, cookies):
1576 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1577 if rfc2109_as_ns is None:
1578 rfc2109_as_ns = not self._policy.rfc2965
1579 for cookie in cookies:
1580 if cookie.version == 1:
1581 cookie.rfc2109 = True
Tim Peters536cf992005-12-25 23:18:31 +00001582 if rfc2109_as_ns:
Neal Norwitz71dad722005-12-23 21:43:48 +00001583 # treat 2109 cookies as Netscape cookies rather than
1584 # as RFC2965 cookies
1585 cookie.version = 0
1586
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001587 def make_cookies(self, response, request):
1588 """Return sequence of Cookie objects extracted from response object."""
1589 # get cookie-attributes for RFC 2965 and Netscape protocols
1590 headers = response.info()
Barry Warsaw820c1202008-06-12 04:06:45 +00001591 rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
1592 ns_hdrs = headers.get_all("Set-Cookie", [])
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001593
1594 rfc2965 = self._policy.rfc2965
1595 netscape = self._policy.netscape
1596
1597 if ((not rfc2965_hdrs and not ns_hdrs) or
1598 (not ns_hdrs and not rfc2965) or
1599 (not rfc2965_hdrs and not netscape) or
1600 (not netscape and not rfc2965)):
1601 return [] # no relevant cookie headers: quick exit
1602
1603 try:
1604 cookies = self._cookies_from_attrs_set(
1605 split_header_words(rfc2965_hdrs), request)
Thomas Wouters477c8d52006-05-27 19:21:47 +00001606 except Exception:
1607 _warn_unhandled_exception()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001608 cookies = []
1609
1610 if ns_hdrs and netscape:
1611 try:
Neal Norwitz71dad722005-12-23 21:43:48 +00001612 # RFC 2109 and Netscape cookies
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001613 ns_cookies = self._cookies_from_attrs_set(
1614 parse_ns_headers(ns_hdrs), request)
Thomas Wouters477c8d52006-05-27 19:21:47 +00001615 except Exception:
1616 _warn_unhandled_exception()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001617 ns_cookies = []
Neal Norwitz71dad722005-12-23 21:43:48 +00001618 self._process_rfc2109_cookies(ns_cookies)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001619
1620 # Look for Netscape cookies (from Set-Cookie headers) that match
1621 # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1622 # For each match, keep the RFC 2965 cookie and ignore the Netscape
1623 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are
1624 # bundled in with the Netscape cookies for this purpose, which is
1625 # reasonable behaviour.
1626 if rfc2965:
1627 lookup = {}
1628 for cookie in cookies:
1629 lookup[(cookie.domain, cookie.path, cookie.name)] = None
1630
1631 def no_matching_rfc2965(ns_cookie, lookup=lookup):
1632 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1633 return key not in lookup
1634 ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1635
1636 if ns_cookies:
1637 cookies.extend(ns_cookies)
1638
1639 return cookies
1640
def set_cookie_if_ok(self, cookie, request):
    """Store *cookie* in the jar, but only if the policy accepts it.

    The timestamps kept on the policy and on the jar are refreshed before
    the policy is consulted.  The whole operation runs under the jar's
    cookie lock.
    """
    with self._cookies_lock:
        current = int(time.time())
        self._policy._now = current
        self._now = current

        # Guard clause: a rejected cookie is simply dropped.
        if not self._policy.set_ok(cookie, request):
            return
        self.set_cookie(cookie)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001653
def set_cookie(self, cookie):
    """Set a cookie, without checking whether or not it should be set.

    Cookies are kept in a three-level mapping: domain -> path -> name.
    Missing levels are created on demand, and an existing cookie with the
    same (domain, path, name) is replaced.
    """
    with self._cookies_lock:
        # Look up self._cookies only once the lock is held: other methods
        # rebind that attribute, and a reference taken before acquiring
        # the lock could point at a mapping that is no longer the jar's.
        by_path = self._cookies.setdefault(cookie.domain, {})
        by_name = by_path.setdefault(cookie.path, {})
        by_name[cookie.name] = cookie
1666
def extract_cookies(self, response, request):
    """Extract cookies from response, where allowable given the request.

    Every cookie produced by make_cookies() is offered to the policy;
    those it accepts are stored with set_cookie().  The loop runs under
    the jar's cookie lock.
    """
    _debug("extract_cookies: %s", response.info())
    with self._cookies_lock:
        current = int(time.time())
        self._policy._now = current
        self._now = current

        for candidate in self.make_cookies(response, request):
            if not self._policy.set_ok(candidate, request):
                continue
            _debug(" setting cookie: %s", candidate)
            self.set_cookie(candidate)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001680
def clear(self, domain=None, path=None, name=None):
    """Clear some cookies.

    With no arguments every cookie is removed.  With only a domain, all
    cookies of that domain go; with domain and path, all cookies under
    that path of the domain; with all three, the single cookie identified
    by name, path and domain.

    Raises ValueError when a narrower selector is given without the
    broader ones it depends on, and KeyError if no matching cookie exists.

    """
    if name is not None:
        if domain is None or path is None:
            raise ValueError(
                "domain and path must be given to remove a cookie by name")
        del self._cookies[domain][path][name]
        return

    if path is not None:
        if domain is None:
            raise ValueError(
                "domain must be given to remove cookies by path")
        del self._cookies[domain][path]
        return

    if domain is None:
        # No selector at all: start over with an empty store.
        self._cookies = {}
    else:
        del self._cookies[domain]
1707
def clear_session_cookies(self):
    """Discard all session cookies.

    A session cookie is one whose ``discard`` attribute is true.  Note
    that the .save() method won't save session cookies anyway, unless you
    ask otherwise by passing a true ignore_discard argument.

    """
    with self._cookies_lock:
        for candidate in self:
            if not candidate.discard:
                continue
            self.clear(candidate.domain, candidate.path, candidate.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001722
def clear_expired_cookies(self):
    """Discard all expired cookies.

    You probably don't need to call this method: expired cookies are never
    sent back to the server (provided you're using DefaultCookiePolicy),
    this method is called by CookieJar itself every so often, and the
    .save() method won't save expired cookies anyway (unless you ask
    otherwise by passing a true ignore_expires argument).

    """
    with self._cookies_lock:
        # One timestamp for the whole sweep, so every cookie is judged
        # against the same instant.
        cutoff = time.time()
        for candidate in self:
            if not candidate.is_expired(cutoff):
                continue
            self.clear(candidate.domain, candidate.path, candidate.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001741
def __iter__(self):
    """Iterate over the jar's contents, as produced by deepvalues()."""
    store = self._cookies
    return deepvalues(store)
1744
def __len__(self):
    """Return number of contained cookies."""
    # Counted by walking the jar's own iterator; nothing is cached.
    return sum(1 for _ in self)
1750
def __repr__(self):
    """Return ``<ClassName[...]>`` listing the repr() of every cookie."""
    listing = ", ".join([repr(cookie) for cookie in self])
    return "<%s[%s]>" % (self.__class__.__name__, listing)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001755
def __str__(self):
    """Return ``<ClassName[...]>`` listing the str() of every cookie."""
    listing = ", ".join([str(cookie) for cookie in self])
    return "<%s[%s]>" % (self.__class__.__name__, listing)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001760
1761
# derives from OSError for backwards-compatibility with Python 2.4.0
class LoadError(OSError):
    """Raised when a cookie file's contents cannot be loaded."""
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001764
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename, when given, must support concatenation with a str;
        otherwise ValueError is raised.  delayload is stored as a bool.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                filename + ""
            except Exception:
                # Not a bare except: KeyboardInterrupt / SystemExit must
                # not be turned into a ValueError.
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented here; always raises NotImplementedError.
        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised when
        neither is set.  The opened file is handed to self._really_load().
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or OSError) if reversion is not successful; the
        object's state will not be altered if this happens.  The previous
        cookies are likewise restored when loading fails with any other
        exception, which is then re-raised unchanged.

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with self._cookies_lock:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except BaseException:
                # load() can fail with more than OSError (a decoding
                # error, for instance); roll back on every failure so the
                # jar is never left emptied or half-loaded.
                self._cookies = old_state
                raise
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001821
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001822
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    Actually, the format is extended a bit -- see module docstring.

    The cookie's own name/value pair comes first, followed by path and
    domain, the optional attributes, the sorted extra attributes from
    cookie._rest, and finally the version.

    """
    pairs = [
        (cookie.name, cookie.value),
        ("path", cookie.path),
        ("domain", cookie.domain),
    ]
    if cookie.port is not None:
        pairs.append(("port", cookie.port))

    # Flag attributes are written without a value.
    for attr, key in (("path_specified", "path_spec"),
                      ("port_specified", "port_spec"),
                      ("domain_initial_dot", "domain_dot"),
                      ("secure", "secure")):
        if getattr(cookie, attr):
            pairs.append((key, None))

    if cookie.expires:
        pairs.append(("expires", time2isoz(float(cookie.expires))))
    if cookie.discard:
        pairs.append(("discard", None))
    if cookie.comment:
        pairs.append(("comment", cookie.comment))
    if cookie.comment_url:
        pairs.append(("commenturl", cookie.comment_url))

    pairs.extend([(key, str(cookie._rest[key]))
                  for key in sorted(cookie._rest.keys())])
    pairs.append(("version", str(cookie.version)))

    return join_header_words([pairs])
1850
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        ignore_discard and ignore_expires: see docstring for FileCookieJar.save

        """
        now = time.time()
        lines = [
            "Set-Cookie3: %s" % lwp_cookie_str(cookie)
            for cookie in self
            if (ignore_discard or not cookie.discard)
            and (ignore_expires or not cookie.is_expired(now))
        ]
        # The trailing empty item makes the result end with a newline.
        return "\n".join(lines + [""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to a file as "Set-Cookie3" lines.

        filename defaults to self.filename; ValueError is raised when
        neither is set.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename, "w") as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Read "Set-Cookie3" lines from the open file f into the jar.

        The first line must match self.magic_re, else LoadError is raised.
        Lines not starting with "Set-Cookie3:" are skipped.  OSError
        propagates unchanged; any other failure while reading or parsing
        is reported as LoadError.
        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound before the loop so that the error message in the handler
        # below can be built even when the very first read fails.
        line = ""
        try:
            for line in iter(f.readline, ""):
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    standard = dict.fromkeys(boolean_attrs, False)
                    rest = {}
                    for k, v in data[1:]:
                        lc = k.lower() if k is not None else None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            # A flag written without a value means "true".
                            if v is None:
                                v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    if expires is None:
                        # No usable expiry: mark the cookie as discardable.
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)
        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1969
1970
1971class MozillaCookieJar(FileCookieJar):
1972 """
1973
1974 WARNING: you may want to backup your browser's cookies file if you use
1975 this class to save cookies. I *think* it works, but there have been
1976 bugs in the past!
1977
1978 This class differs from CookieJar only in the format it uses to save and
1979 load cookies to and from a file. This class uses the Mozilla/Netscape
1980 `cookies.txt' format. lynx uses this file format, too.
1981
1982 Don't expect cookies saved while the browser is running to be noticed by
1983 the browser (in fact, Mozilla on unix will overwrite your saved cookies if
1984 you change them on disk while it's running; on Windows, you probably can't
1985 save at all while the browser is running).
1986
1987 Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
1988 Netscape cookies on saving.
1989
1990 In particular, the cookie version and port number information is lost,
1991 together with information about whether or not Path, Port and Discard were
1992 specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
1993 domain as set in the HTTP header started with a dot (yes, I'm aware some
1994 domains in Netscape files start with a dot and some don't -- trust me, you
1995 really don't want to know any more about this).
1996
1997 Note that though Mozilla and Netscape use the same format, they use
1998 slightly different headers. The class saves cookies using the Netscape
1999 header by default (Mozilla can cope with that).
2000
2001 """
Antoine Pitroufd036452008-08-19 17:56:33 +00002002 magic_re = re.compile("#( Netscape)? HTTP Cookie File")
Georg Brandl7c9b61b2008-05-26 17:56:51 +00002003 header = """\
Georg Brandl87a15642010-07-31 22:11:11 +00002004# Netscape HTTP Cookie File
Benjamin Petersonccedc222013-12-18 15:35:18 -06002005# http://curl.haxx.se/rfc/cookie_spec.html
Georg Brandl87a15642010-07-31 22:11:11 +00002006# This is a generated file! Do not edit.
Georg Brandl7c9b61b2008-05-26 17:56:51 +00002007
2008"""
2009
def _really_load(self, f, filename, ignore_discard, ignore_expires):
    """Read cookies in Netscape `cookies.txt' format from the open file f.

    The first line must match self.magic_re, else LoadError is raised.
    Every other line that is not blank and does not start with "#" or "$"
    must hold seven tab-separated fields: domain, domain_specified, path,
    secure, expires, name, value.  OSError propagates unchanged; any
    other failure while reading or parsing is reported as LoadError.
    """
    now = time.time()

    magic = f.readline()
    if not self.magic_re.search(magic):
        raise LoadError(
            "%r does not look like a Netscape format cookies file" %
            filename)

    # Bound before the loop so that the error message in the handler
    # below can be built even when the very first read fails.
    line = ""
    try:
        for line in iter(f.readline, ""):
            # last field may be absent, so keep any trailing tab
            if line.endswith("\n"):
                line = line[:-1]

            # skip comments and blank lines XXX what is $ for?
            stripped = line.strip()
            if stripped.startswith(("#", "$")) or stripped == "":
                continue

            domain, domain_specified, path, secure, expires, name, value = \
                line.split("\t")
            secure = (secure == "TRUE")
            domain_specified = (domain_specified == "TRUE")
            if name == "":
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name = value
                value = None

            initial_dot = domain.startswith(".")
            # Explicit check instead of `assert`, so the validation still
            # runs under -O.  The handler below turns it into LoadError.
            if domain_specified != initial_dot:
                raise ValueError(
                    "domain_specified flag does not match leading dot "
                    "of domain")

            discard = False
            if expires == "":
                expires = None
                discard = True

            # assume path_specified is false
            c = Cookie(0, name, value,
                       None, False,
                       domain, domain_specified, initial_dot,
                       path, False,
                       secure,
                       expires,
                       discard,
                       None,
                       None,
                       {})
            if not ignore_discard and c.discard:
                continue
            if not ignore_expires and c.is_expired(now):
                continue
            self.set_cookie(c)

    except OSError:
        raise
    except Exception:
        _warn_unhandled_exception()
        raise LoadError("invalid Netscape format cookies file %r: %r" %
                        (filename, line))
2074
def save(self, filename=None, ignore_discard=False, ignore_expires=False):
    """Write the jar to a file in Netscape `cookies.txt' format.

    filename defaults to self.filename; ValueError is raised when neither
    is set.  self.header is written first, then one tab-separated line
    per cookie.  Cookies marked discard, and expired cookies, are left
    out unless the matching ignore_* flag is true.
    """
    if filename is None:
        filename = self.filename
        if filename is None:
            raise ValueError(MISSING_FILENAME_TEXT)

    with open(filename, "w") as f:
        f.write(self.header)
        now = time.time()
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue

            secure = "TRUE" if cookie.secure else "FALSE"
            initial_dot = "TRUE" if cookie.domain.startswith(".") else "FALSE"
            expires = "" if cookie.expires is None else str(cookie.expires)
            if cookie.value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = "", cookie.name
            else:
                name, value = cookie.name, cookie.value

            fields = [cookie.domain, initial_dot, cookie.path,
                      secure, expires, name, value]
            f.write("\t".join(fields) + "\n")