blob: 00cb1250a07e86a71f27c465c6fc6d7d89ef1cff [file] [log] [blame]
r"""HTTP cookie handling for web clients.

This module has (now fairly distant) origins in Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
distributed with the Python standard library, but are available from
http://wwwsearch.sf.net/):

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

"""
27
# Names exported by a star import of this module.
__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
           'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
30
Jeremy Hylton1afc1692008-06-18 20:49:58 +000031import copy
Victor Stinner628225c2011-03-21 02:38:51 +010032import datetime
Jeremy Hylton1afc1692008-06-18 20:49:58 +000033import re
34import time
35import urllib.parse, urllib.request
Antoine Pitroua6a4dc82017-09-07 18:56:24 +020036import threading as _threading
Georg Brandl24420152008-05-26 16:32:26 +000037import http.client # only for the default HTTP port
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000038from calendar import timegm
39
Thomas Wouters477c8d52006-05-27 19:21:47 +000040debug = False # set to True to enable debugging via the logging module
41logger = None
42
43def _debug(*args):
44 if not debug:
45 return
46 global logger
47 if not logger:
48 import logging
Georg Brandl24420152008-05-26 16:32:26 +000049 logger = logging.getLogger("http.cookiejar")
Thomas Wouters477c8d52006-05-27 19:21:47 +000050 return logger.debug(*args)
51
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000052
# The standard HTTP port from http.client, kept as a string.
DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT)
# Message text for the case where no filename is available; presumably used
# by the file-backed cookie jars defined later in this module -- TODO confirm.
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
56
Thomas Wouters477c8d52006-05-27 19:21:47 +000057def _warn_unhandled_exception():
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000058 # There are a few catch-all except: statements in this module, for
Thomas Wouters477c8d52006-05-27 19:21:47 +000059 # catching input that's bad in unexpected ways. Warn if any
60 # exceptions are caught there.
Jeremy Hylton7ecf3dc2008-05-10 20:38:40 +000061 import io, warnings, traceback
Guido van Rossum34d19282007-08-09 01:03:29 +000062 f = io.StringIO()
Andrew M. Kuchlingae40c2f2004-07-10 18:32:12 +000063 traceback.print_exc(None, f)
64 msg = f.getvalue()
Georg Brandl24420152008-05-26 16:32:26 +000065 warnings.warn("http.cookiejar bug!\n%s" % msg, stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000066
67
68# Date/time conversion
69# -----------------------------------------------------------------------------
70
71EPOCH_YEAR = 1970
72def _timegm(tt):
73 year, month, mday, hour, min, sec = tt[:6]
74 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
75 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
76 return timegm(tt)
77 else:
78 return None
79
# Abbreviated English day names, indexed like datetime.weekday() (Monday is 0).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# Abbreviated English month names; index 0 is January.
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copy of MONTHS for case-insensitive lookups.  Built with a
# comprehension so that no stray loop variable is left in the module
# namespace.
MONTHS_LOWER = [month.lower() for month in MONTHS]
85
def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    # Use timezone-aware constructors: datetime.utcnow() and
    # datetime.utcfromtimestamp() are deprecated.  Only the broken-down
    # fields are read below, so the formatted result is the same.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000104
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    # Use timezone-aware constructors: datetime.utcnow() and
    # datetime.utcfromtimestamp() are deprecated.  Only the broken-down
    # fields are read below, so the formatted result is the same.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    # Day and month names come from the module's own tables rather than
    # strftime().
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1],
        dt.year, dt.hour, dt.minute, dt.second)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000123
124
# Timezone names that mean a zero offset from UTC (only the keys are used).
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Optional sign, one or two hour digits, optional colon, optional two
# minute digits.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)


def offset_from_tz_string(tz):
    """Return the UTC offset in seconds described by *tz*, or None.

    *tz* is either one of the names in UTC_ZONES (offset 0) or a numeric
    zone such as "-0800", "+01:00" or "5".  None is returned for anything
    else.
    """
    if tz in UTC_ZONES:
        return 0
    found = TIMEZONE_RE.search(tz)
    if found is None:
        return None
    sign, hours, minutes = found.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds += 60 * int(minutes)
    return -seconds if sign == '-' else seconds
141
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date fields to seconds since the epoch, or None.

    The arguments are the pieces captured by the date regular expressions
    in this module: *day*, *mon* and *yr* are strings; *hr*, *min*, *sec*
    and *tz* are strings or None when absent.  *mon* may be an English
    month abbreviation (any case) or a month number.  *sec* may carry a
    fractional part (as ISO 8601 input can); the fraction is discarded.

    None is returned when the year exceeds datetime.MAXYEAR, the month is
    not recognised, a field is out of range, or the timezone string is not
    understood.  A missing timezone is treated as UTC.
    """
    yr = int(yr)
    if yr > datetime.MAXYEAR:
        return None

    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    day = int(day)
    hr = int(hr)
    min = int(min)
    # Seconds may look like "29.5" when they come from an ISO 8601 string;
    # a plain int() would raise ValueError on that, so go through float()
    # and truncate.
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year: pick the century that puts the year closest
        # to the current one
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
197
# Exactly "Wed, 09 Feb 1994 22:23:32 GMT"; groups: day, month, year, h, m, s.
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
# A leading weekday name (abbreviated or longer) plus optional comma.
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X | re.ASCII)


def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # The strict pattern only checks the *shape* of the month
            # ("Foo" matches); an unknown name is an unrecognized date,
            # not an error.
            return None
        # The pattern guarantees two-digit seconds, so int() is exact and
        # keeps the documented integer return type.
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
275
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X | re.ASCII)


def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # Leading whitespace is not part of the pattern, so drop it first.
    found = ISO_DATE_RE.search(text.lstrip())
    if found is None:
        return None  # bad format

    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    year, month, day, hour, minute, second, zone, _ = found.groups()
    return _str2time(day, month, year, hour, minute, second, zone)
320
321
322# Header parsing
323# -----------------------------------------------------------------------------
324
def unmatched(match):
    """Return the searched string with the matched span cut out of it."""
    begin, finish = match.span()
    subject = match.string
    return subject[:begin] + subject[finish:]
329
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")


def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, str)
    parsed = []
    for header in header_values:
        remaining = header
        pairs = []
        while remaining:
            # Every pattern used here is anchored at the start of the string,
            # so "the text without the match" is simply what follows it.
            token = HEADER_TOKEN_RE.search(remaining)
            if token is not None:
                name = token.group(1)
                remaining = remaining[token.end():]
                quoted = HEADER_QUOTED_VALUE_RE.search(remaining)
                if quoted is not None:
                    # quoted value: drop the backslash of each quoted-pair
                    remaining = remaining[quoted.end():]
                    value = HEADER_ESCAPE_RE.sub(r"\1", quoted.group(1))
                else:
                    bare = HEADER_VALUE_RE.search(remaining)
                    if bare is not None:
                        # unquoted value
                        remaining = remaining[bare.end():]
                        value = bare.group(1).rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
                continue

            stripped = remaining.lstrip()
            if stripped.startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                remaining = stripped[1:]
                if pairs:
                    parsed.append(pairs)
                pairs = []
                continue

            # skip junk
            cleaned, nr_junk_chars = re.subn(r"^[=\s;]*", "", remaining)
            assert nr_junk_chars > 0, (
                "split_header_words bug: '%s', '%s', %s" %
                (header, remaining, pairs))
            remaining = cleaned
        if pairs:
            parsed.append(pairs)
    return parsed
418
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")


def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859-1")]])
    'text/plain; charset="iso-8859-1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859-1")]])
    'text/plain, charset="iso-8859-1"'

    """
    def render(key, value):
        # A pair without a value is emitted as the bare key.
        if value is None:
            return key
        if not re.search(r"^\w+$", value):
            # Not a plain word: escape " and \ and wrap in double quotes.
            escaped = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
            value = '"%s"' % escaped
        return "%s=%s" % (key, value)

    headers = []
    for pairs in lists:
        rendered = [render(key, value) for key, value in pairs]
        if rendered:
            headers.append("; ".join(rendered))
    return ", ".join(headers)
444
def strip_quotes(text):
    """Remove one leading and one trailing double quote from text, if present.

    The two ends are handled independently, so an unbalanced quote is
    removed as well.
    """
    without_lead = text[1:] if text.startswith('"') else text
    if without_lead.endswith('"'):
        return without_lead[:-1]
    return without_lead
451
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    Returns one list of (key, value) pairs per header that yielded any
    pairs.  A value is None when the parameter had no "=".  Names of known
    cookie-attributes are lower-cased, "expires" values are converted with
    http2time, and a ("version", "0") pair is appended when the header did
    not set a version itself.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False

        # XXX: The following does not strictly adhere to RFCs in that empty
        # names and values are legal (the former will only appear once and will
        # be overwritten if multiple occurrences are present). This is
        # mostly to deal with backwards compatibility.
        for position, param in enumerate(ns_header.split(';')):
            key, sep, val = param.strip().partition('=')
            key = key.strip()

            if not key:
                if position == 0:
                    # no name in the leading name=value part: give up on
                    # this header
                    break
                continue

            # allow for a distinction between present and empty and missing
            # altogether
            val = val.strip() if sep else None

            # The first parameter is the cookie's own name=value and is
            # kept verbatim; only later ones are cookie-attributes.
            if position != 0:
                lowered = key.lower()
                if lowered in known_attrs:
                    key = lowered

                if key == "version":
                    # This is an RFC 2109 cookie.
                    if val is not None:
                        val = strip_quotes(val)
                    version_set = True
                elif key == "expires" and val is not None:
                    # convert expires date to seconds since epoch
                    val = http2time(strip_quotes(val))  # None if invalid
            pairs.append((key, val))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
518
519
# Matches text ending in a dot followed by digits, which is how this module
# recognises an IPv4 address.
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)


def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    looks_like_ip = IPV4_RE.search(text) is not None
    if looks_like_ip or text == "":
        return False
    # A host domain name neither starts nor ends with a dot.
    return not (text.startswith(".") or text.endswith("."))
535
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # "A has the form NB" means B must be a *suffix* of A.  Merely finding B
    # somewhere inside A is not enough: "www.acme.com.evil.org" must not
    # match ".acme.com".  N is guaranteed non-empty here because A != B.
    if not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
574
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  Anything that does not look like an
    IPv4 address (per IPV4_RE) qualifies.

    """
    return IPV4_RE.search(text) is None
584
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  If B starts with a dot, A matches when it ends with
    B; in every other case A and B must be equal.

    """
    A = A.lower()
    B = B.lower()
    if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
        # at least one side looks like an IP address: only equality counts
        return A == B
    if B.startswith("."):
        return A.endswith(B)
    return A == B
604
# A trailing ":<digits>" port suffix.
cut_port_re = re.compile(r":\d+$", re.ASCII)


def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urllib.parse.urlparse(request.get_full_url()).netloc
    if netloc == "":
        # the URL carries no host part; fall back to the Host header
        netloc = request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", netloc, 1).lower()
621
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective name
    is the request-host with ".local" appended when the host contains no
    dot and does not look like an IPv4 address; otherwise the two are equal.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
632
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965."""
    raw_path = urllib.parse.urlsplit(request.get_full_url()).path
    path = escape_path(raw_path)
    # fix bad RFC 2396 absoluteURI: make sure the path is rooted
    return path if path.startswith("/") else "/" + path
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000642
def request_port(request):
    """Return the port of request.host as a string.

    DEFAULT_HTTP_PORT is returned when the host contains no colon.  If the
    text after the first colon is not an integer, the problem is reported
    through _debug and None is returned.
    """
    _, colon, port = request.host.partition(':')
    if not colon:
        return DEFAULT_HTTP_PORT
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
656
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")


def uppercase_escaped_char(match):
    """Return the %XX escape captured by match with upper-case hex digits."""
    return "%" + match.group(1).upper()


def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = urllib.parse.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
676
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split at the first dot: everything after it is B.
    _, dot, b = h.partition(".")
    if dot and is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
711
def is_third_party(request):
    """Return True if request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    origin_reach = reach(request.origin_req_host)
    return not domain_match(request_host(request), origin_reach)
727
728
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than"Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes on the instance.

        version and expires are converted to int when not None (expires may
        be given as anything float() accepts).  The domain is lower-cased.
        rest is a mapping of non-standard cookie-attributes; a shallow copy
        of it is kept.

        Raises ValueError if port is None while port_specified is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(float(expires))
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        # private copy, so later changes do not touch the caller's mapping
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return True if the non-standard cookie-attribute name is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard cookie-attribute name, or default."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the non-standard cookie-attribute name to value."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        now defaults to the current time; a cookie whose expires is None
        never counts as expired.
        """
        if now is None:
            now = time.time()
        return self.expires is not None and self.expires <= now

    def __str__(self):
        port_part = "" if self.port is None else ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        field_names = ("version", "name", "value",
                       "port", "port_specified",
                       "domain", "domain_specified", "domain_initial_dot",
                       "path", "path_specified",
                       "secure", "expires", "discard", "comment",
                       "comment_url",
                       )
        args = ["%s=%r" % (field, getattr(self, field))
                for field in field_names]
        # rest is stored under a private name, so it is listed by hand
        args.append("rest=%r" % (self._rest,))
        args.append("rfc2109=%r" % (self.rfc2109,))
        return "%s(%s)" % (self.__class__.__name__, ", ".join(args))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000825
826
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customized policy.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        Not implemented in this base class.

        """
        raise NotImplementedError

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        Not implemented in this base class.

        """
        raise NotImplementedError

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation always returns True.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation always returns True.
        """
        return True
858
859
class DefaultCookiePolicy(CookiePolicy):
    """Implements the standard rules for accepting and returning cookies.

    Both Netscape (version 0) and RFC 2965 (version > 0) cookies are handled;
    the constructor flags select which protocols are enabled and how strict
    the individual checks are.  Optional block and allow lists restrict the
    domains cookies are accepted from and returned to.

    Note: return_ok_expires() reads self._now, which this class never sets
    itself; the CookieJar using the policy assigns it before consulting the
    policy.

    """

    # Bit flags that may be OR-ed together for the strict_ns_domain argument.
    DomainStrictNoDots = 1
    DomainStrictNonDomain = 2
    DomainRFC2965Match = 4

    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain

    def __init__(self,
                 blocked_domains=None, allowed_domains=None,
                 netscape=True, rfc2965=False,
                 rfc2109_as_netscape=None,
                 hide_cookie2=False,
                 strict_domain=False,
                 strict_rfc2965_unverifiable=True,
                 strict_ns_unverifiable=False,
                 strict_ns_domain=DomainLiberal,
                 strict_ns_set_initial_dollar=False,
                 strict_ns_set_path=False,
                 ):
        """Constructor arguments should be passed as keyword arguments only."""
        self.netscape = netscape
        self.rfc2965 = rfc2965
        self.rfc2109_as_netscape = rfc2109_as_netscape
        self.hide_cookie2 = hide_cookie2
        self.strict_domain = strict_domain
        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
        self.strict_ns_unverifiable = strict_ns_unverifiable
        self.strict_ns_domain = strict_ns_domain
        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
        self.strict_ns_set_path = strict_ns_set_path

        # Both lists are stored as tuples so later mutation of the caller's
        # sequence does not affect the policy.
        if blocked_domains is not None:
            self._blocked_domains = tuple(blocked_domains)
        else:
            self._blocked_domains = ()

        # None (as opposed to an empty tuple) means "no allow-list at all".
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def blocked_domains(self):
        """Return the sequence of blocked domains (as a tuple)."""
        return self._blocked_domains

    def set_blocked_domains(self, blocked_domains):
        """Set the sequence of blocked domains."""
        self._blocked_domains = tuple(blocked_domains)

    def is_blocked(self, domain):
        """Return True if domain matches an entry of the block-list."""
        return any(user_domain_match(domain, blocked_domain)
                   for blocked_domain in self._blocked_domains)

    def allowed_domains(self):
        """Return None, or the sequence of allowed domains (as a tuple)."""
        return self._allowed_domains

    def set_allowed_domains(self, allowed_domains):
        """Set the sequence of allowed domains, or None."""
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def is_not_allowed(self, domain):
        """Return True if an allow-list is set and domain matches none of it."""
        if self._allowed_domains is None:
            return False
        return not any(user_domain_match(domain, allowed_domain)
                       for allowed_domain in self._allowed_domains)

    @staticmethod
    def _path_match(req_path, cookie_path):
        """Return True if req_path path-matches cookie_path.

        A plain prefix test is not enough: cookie path "/foo" must match
        "/foo" and "/foo/bar" but not "/foobar".  The prefix therefore has
        to end at a "/" boundary unless the two paths are identical.
        """
        if req_path == cookie_path:
            return True
        if not req_path.startswith(cookie_path):
            return False
        if cookie_path.endswith("/"):
            return True
        boundary = len(cookie_path)
        return req_path[boundary:boundary+1] == "/"

    def set_ok(self, cookie, request):
        """
        If you override .set_ok(), be sure to call this method.  If it returns
        false, so should your subclass (assuming your subclass wants to be more
        strict about which cookies to accept).

        """
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        assert cookie.name is not None

        # Run every set_ok_<aspect> check in turn; the first failure rejects
        # the cookie.
        for n in "version", "verifiability", "name", "path", "domain", "port":
            fn_name = "set_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False

        return True

    def set_ok_version(self, cookie, request):
        """Reject cookies with no version or with a disabled protocol."""
        if cookie.version is None:
            # Version is always set to 0 by parse_ns_headers if it's a Netscape
            # cookie, so this must be an invalid RFC 2965 cookie.
            _debug(" Set-Cookie2 without version attribute (%s=%s)",
                   cookie.name, cookie.value)
            return False
        if cookie.version > 0 and not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
        return True

    def set_ok_verifiability(self, cookie, request):
        """Reject third-party cookies on unverifiable requests when strict."""
        if request.unverifiable and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug(" third-party RFC 2965 cookie during "
                       "unverifiable transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug(" third-party Netscape cookie during "
                       "unverifiable transaction")
                return False
        return True

    def set_ok_name(self, cookie, request):
        """Optionally reject Netscape cookies whose name starts with '$'."""
        # Try and stop servers setting V0 cookies designed to hack other
        # servers that know both V0 and V1 protocols.
        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
            cookie.name.startswith("$")):
            _debug(" illegal name (starts with '$'): '%s'", cookie.name)
            return False
        return True

    def set_ok_path(self, cookie, request):
        """Check an explicitly specified cookie path against the request path.

        The check applies to RFC 2965 cookies always and to Netscape cookies
        only when strict_ns_set_path is set.
        """
        if cookie.path_specified:
            req_path = request_path(request)
            if ((cookie.version > 0 or
                 (cookie.version == 0 and self.strict_ns_set_path)) and
                not self._path_match(req_path, cookie.path)):
                _debug(" path attribute %s is not a prefix of request "
                       "path %s", cookie.path, req_path)
                return False
        return True

    def set_ok_domain(self, cookie, request):
        """Check the cookie domain against block/allow lists and the request."""
        if self.is_blocked(cookie.domain):
            _debug(" domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug(" domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                # XXX This should probably be compared with the Konqueror
                # (kcookiejar.cpp) and Mozilla implementations, but it's a
                # losing battle.
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
                       "info", "jobs", "mobi", "museum", "name", "pro",
                       "travel", "eu") and len(tld) == 2:
                        # domain like .co.uk
                        _debug(" country-code second level domain %s", domain)
                        return False
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                _debug(" non-local domain %s contains no embedded dot",
                       domain)
                return False
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    _debug(" effective request-host %s (even with added "
                           "initial dot) does not end with %s",
                           erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    _debug(" effective request-host %s does not domain-match "
                           "%s", erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                # Whatever precedes the cookie domain in the request host must
                # not itself contain a dot (unless the host is an IPv4 address).
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    _debug(" host prefix %s for domain %s contains a dot",
                           host_prefix, domain)
                    return False
        return True

    def set_ok_port(self, cookie, request):
        """Check an explicit port list: all numeric, and includes request port."""
        if cookie.port_specified:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            else:
                req_port = str(req_port)
            for p in cookie.port.split(","):
                try:
                    int(p)
                except ValueError:
                    _debug(" bad port %s (not numeric)", p)
                    return False
                if p == req_port:
                    break
            else:
                # Loop finished without finding the request port in the list.
                _debug(" request port (%s) not found in %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok(self, cookie, request):
        """
        If you override .return_ok(), be sure to call this method.  If it
        returns false, so should your subclass (assuming your subclass wants to
        be more strict about which cookies to return).

        """
        # Path has already been checked by .path_return_ok(), and domain
        # blocking done by .domain_return_ok().
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        for n in "version", "verifiability", "secure", "expires", "port", "domain":
            fn_name = "return_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False
        return True

    def return_ok_version(self, cookie, request):
        """Refuse to return cookies whose protocol is switched off."""
        if cookie.version > 0 and not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
        return True

    def return_ok_verifiability(self, cookie, request):
        """Refuse third-party cookies on unverifiable requests when strict."""
        if request.unverifiable and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug(" third-party RFC 2965 cookie during unverifiable "
                       "transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug(" third-party Netscape cookie during unverifiable "
                       "transaction")
                return False
        return True

    def return_ok_secure(self, cookie, request):
        """Return secure cookies only on https requests."""
        if cookie.secure and request.type != "https":
            _debug(" secure cookie with non-secure request")
            return False
        return True

    def return_ok_expires(self, cookie, request):
        """Refuse cookies that have expired as of self._now."""
        if cookie.is_expired(self._now):
            _debug(" cookie expired")
            return False
        return True

    def return_ok_port(self, cookie, request):
        """Return port-restricted cookies only to a port in the cookie's list."""
        if cookie.port:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            else:
                # Normalise exactly as set_ok_port() does, so the comparison
                # with the (string) entries of cookie.port is reliable.
                req_port = str(req_port)
            for p in cookie.port.split(","):
                if p == req_port:
                    break
            else:
                _debug(" request port %s does not match cookie port %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok_domain(self, cookie, request):
        """Check that the cookie domain is appropriate for the request host."""
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        # Compare on a dot boundary so that e.g. "example.com" does not match
        # a request host such as "badexample.com".
        if domain and not domain.startswith("."):
            dotdomain = "." + domain
        else:
            dotdomain = domain

        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
        if (cookie.version == 0 and
            (self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            _debug(" cookie with unspecified domain does not string-compare "
                   "equal to request domain")
            return False

        if cookie.version > 0 and not domain_match(erhn, domain):
            _debug(" effective request-host name %s does not domain-match "
                   "RFC 2965 cookie domain %s", erhn, domain)
            return False
        if cookie.version == 0 and not ("."+erhn).endswith(dotdomain):
            _debug(" request-host %s does not match Netscape cookie domain "
                   "%s", req_host, domain)
            return False
        return True

    def domain_return_ok(self, domain, request):
        """Cheap pre-check: could any cookie of this domain be returned?"""
        # Liberal check of the domain.  This is here as an optimization to
        # avoid having to load lots of MSIE cookie files unless necessary.
        req_host, erhn = eff_request_host(request)
        if not req_host.startswith("."):
            req_host = "."+req_host
        if not erhn.startswith("."):
            erhn = "."+erhn
        if domain and not domain.startswith("."):
            dotdomain = "." + domain
        else:
            dotdomain = domain
        if not (req_host.endswith(dotdomain) or erhn.endswith(dotdomain)):
            #_debug(" request domain %s does not match cookie domain %s",
            #       req_host, domain)
            return False

        if self.is_blocked(domain):
            _debug(" domain %s is in user block-list", domain)
            return False
        if self.is_not_allowed(domain):
            _debug(" domain %s is not in user allow-list", domain)
            return False

        return True

    def path_return_ok(self, path, request):
        """Return True if the request path path-matches the cookie path.

        The cookie path must equal the request path or be a prefix of it
        that ends on a "/" boundary, so "/foo" does not match "/foobar".
        """
        _debug("- checking cookie path=%s", path)
        req_path = request_path(request)
        if self._path_match(req_path, path):
            return True
        _debug(" %s does not path-match %s", req_path, path)
        return False
1204
1205
def vals_sorted_by_key(adict):
    """Return an iterator over the values of adict, ordered by sorted key."""
    ordered_keys = sorted(adict.keys())
    return map(adict.get, ordered_keys)
1209
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    Any value that has an ``items`` attribute is treated as a nested mapping
    and descended into; every other value is yielded as a leaf.
    """
    for value in vals_sorted_by_key(mapping):
        if hasattr(value, "items"):
            yield from deepvalues(value)
        else:
            yield value
1224
1225
# Sentinel passed as the second argument of dict.get(), so that a key that
# is missing can be told apart from a key whose value is None.
class Absent:
    pass
1229
1230class CookieJar:
1231 """Collection of HTTP cookies.
1232
1233 You may not need to know about this class: try
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001234 urllib.request.build_opener(HTTPCookieProcessor).open(url).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001235 """
1236
1237 non_word_re = re.compile(r"\W")
1238 quote_re = re.compile(r"([\"\\])")
1239 strict_domain_re = re.compile(r"\.?[^.]*")
1240 domain_re = re.compile(r"[^.]*")
1241 dots_re = re.compile(r"^\.+")
1242
Antoine Pitroufd036452008-08-19 17:56:33 +00001243 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001244
1245 def __init__(self, policy=None):
1246 if policy is None:
1247 policy = DefaultCookiePolicy()
1248 self._policy = policy
1249
1250 self._cookies_lock = _threading.RLock()
1251 self._cookies = {}
1252
1253 def set_policy(self, policy):
1254 self._policy = policy
1255
1256 def _cookies_for_domain(self, domain, request):
1257 cookies = []
1258 if not self._policy.domain_return_ok(domain, request):
1259 return []
Thomas Wouters477c8d52006-05-27 19:21:47 +00001260 _debug("Checking %s for cookies to return", domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001261 cookies_by_path = self._cookies[domain]
1262 for path in cookies_by_path.keys():
1263 if not self._policy.path_return_ok(path, request):
1264 continue
1265 cookies_by_name = cookies_by_path[path]
1266 for cookie in cookies_by_name.values():
1267 if not self._policy.return_ok(cookie, request):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001268 _debug(" not returning cookie")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001269 continue
Thomas Wouters477c8d52006-05-27 19:21:47 +00001270 _debug(" it's a match")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001271 cookies.append(cookie)
1272 return cookies
1273
1274 def _cookies_for_request(self, request):
1275 """Return a list of cookies to be returned to server."""
1276 cookies = []
1277 for domain in self._cookies.keys():
1278 cookies.extend(self._cookies_for_domain(domain, request))
1279 return cookies
1280
1281 def _cookie_attrs(self, cookies):
1282 """Return a list of cookie-attributes to be returned to server.
1283
1284 like ['foo="bar"; $Path="/"', ...]
1285
1286 The $Version attribute is also added when appropriate (currently only
1287 once per request).
1288
1289 """
1290 # add cookies in order of most specific (ie. longest) path first
Raymond Hettinger70b64fc2008-01-30 20:15:17 +00001291 cookies.sort(key=lambda a: len(a.path), reverse=True)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001292
1293 version_set = False
1294
1295 attrs = []
1296 for cookie in cookies:
1297 # set version of Cookie header
1298 # XXX
1299 # What should it be if multiple matching Set-Cookie headers have
1300 # different versions themselves?
1301 # Answer: there is no answer; was supposed to be settled by
1302 # RFC 2965 errata, but that may never appear...
1303 version = cookie.version
1304 if not version_set:
1305 version_set = True
1306 if version > 0:
1307 attrs.append("$Version=%s" % version)
1308
1309 # quote cookie value if necessary
1310 # (not for Netscape protocol, which already has any quotes
1311 # intact, due to the poorly-specified Netscape Cookie: syntax)
1312 if ((cookie.value is not None) and
1313 self.non_word_re.search(cookie.value) and version > 0):
1314 value = self.quote_re.sub(r"\\\1", cookie.value)
1315 else:
1316 value = cookie.value
1317
1318 # add cookie-attributes to be returned in Cookie header
1319 if cookie.value is None:
1320 attrs.append(cookie.name)
1321 else:
1322 attrs.append("%s=%s" % (cookie.name, value))
1323 if version > 0:
1324 if cookie.path_specified:
1325 attrs.append('$Path="%s"' % cookie.path)
1326 if cookie.domain.startswith("."):
1327 domain = cookie.domain
1328 if (not cookie.domain_initial_dot and
1329 domain.startswith(".")):
1330 domain = domain[1:]
1331 attrs.append('$Domain="%s"' % domain)
1332 if cookie.port is not None:
1333 p = "$Port"
1334 if cookie.port_specified:
1335 p = p + ('="%s"' % cookie.port)
1336 attrs.append(p)
1337
1338 return attrs
1339
1340 def add_cookie_header(self, request):
Georg Brandl029986a2008-06-23 11:44:14 +00001341 """Add correct Cookie: header to request (urllib.request.Request object).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001342
1343 The Cookie2 header is also added unless policy.hide_cookie2 is true.
1344
1345 """
Thomas Wouters477c8d52006-05-27 19:21:47 +00001346 _debug("add_cookie_header")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001347 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001348 try:
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001349
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001350 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001351
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001352 cookies = self._cookies_for_request(request)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001353
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001354 attrs = self._cookie_attrs(cookies)
1355 if attrs:
1356 if not request.has_header("Cookie"):
1357 request.add_unredirected_header(
1358 "Cookie", "; ".join(attrs))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001359
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001360 # if necessary, advertise that we know RFC 2965
1361 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1362 not request.has_header("Cookie2")):
1363 for cookie in cookies:
1364 if cookie.version != 1:
1365 request.add_unredirected_header("Cookie2", '$Version="1"')
1366 break
1367
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001368 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001369 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001370
1371 self.clear_expired_cookies()
1372
1373 def _normalized_cookie_tuples(self, attrs_set):
1374 """Return list of tuples containing normalised cookie information.
1375
1376 attrs_set is the list of lists of key,value pairs extracted from
1377 the Set-Cookie or Set-Cookie2 headers.
1378
1379 Tuples are name, value, standard, rest, where name and value are the
1380 cookie name and value, standard is a dictionary containing the standard
1381 cookie-attributes (discard, secure, version, expires or max-age,
1382 domain, path and port) and rest is a dictionary containing the rest of
1383 the cookie-attributes.
1384
1385 """
1386 cookie_tuples = []
1387
1388 boolean_attrs = "discard", "secure"
1389 value_attrs = ("version",
1390 "expires", "max-age",
1391 "domain", "path", "port",
1392 "comment", "commenturl")
1393
1394 for cookie_attrs in attrs_set:
1395 name, value = cookie_attrs[0]
1396
1397 # Build dictionary of standard cookie-attributes (standard) and
1398 # dictionary of other cookie-attributes (rest).
1399
1400 # Note: expiry time is normalised to seconds since epoch. V0
1401 # cookies should have the Expires cookie-attribute, and V1 cookies
1402 # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1403 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1404 # accept either (but prefer Max-Age).
1405 max_age_set = False
1406
1407 bad_cookie = False
1408
1409 standard = {}
1410 rest = {}
1411 for k, v in cookie_attrs[1:]:
1412 lc = k.lower()
1413 # don't lose case distinction for unknown fields
1414 if lc in value_attrs or lc in boolean_attrs:
1415 k = lc
1416 if k in boolean_attrs and v is None:
1417 # boolean cookie-attribute is present, but has no value
1418 # (like "discard", rather than "port=80")
1419 v = True
1420 if k in standard:
1421 # only first value is significant
1422 continue
1423 if k == "domain":
1424 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001425 _debug(" missing value for domain attribute")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001426 bad_cookie = True
1427 break
1428 # RFC 2965 section 3.3.3
1429 v = v.lower()
1430 if k == "expires":
1431 if max_age_set:
1432 # Prefer max-age to expires (like Mozilla)
1433 continue
1434 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001435 _debug(" missing or invalid value for expires "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001436 "attribute: treating as session cookie")
1437 continue
1438 if k == "max-age":
1439 max_age_set = True
1440 try:
1441 v = int(v)
1442 except ValueError:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001443 _debug(" missing or invalid (non-numeric) value for "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001444 "max-age attribute")
1445 bad_cookie = True
1446 break
1447 # convert RFC 2965 Max-Age to seconds since epoch
1448 # XXX Strictly you're supposed to follow RFC 2616
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001449 # age-calculation rules. Remember that zero Max-Age
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001450 # is a request to discard (old and new) cookie, though.
1451 k = "expires"
1452 v = self._now + v
1453 if (k in value_attrs) or (k in boolean_attrs):
1454 if (v is None and
Raymond Hettingerdbecd932005-02-06 06:57:08 +00001455 k not in ("port", "comment", "commenturl")):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001456 _debug(" missing value for %s attribute" % k)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001457 bad_cookie = True
1458 break
1459 standard[k] = v
1460 else:
1461 rest[k] = v
1462
1463 if bad_cookie:
1464 continue
1465
1466 cookie_tuples.append((name, value, standard, rest))
1467
1468 return cookie_tuples
1469
1470 def _cookie_from_cookie_tuple(self, tup, request):
1471 # standard is dict of standard cookie-attributes, rest is dict of the
1472 # rest of them
1473 name, value, standard, rest = tup
1474
1475 domain = standard.get("domain", Absent)
1476 path = standard.get("path", Absent)
1477 port = standard.get("port", Absent)
1478 expires = standard.get("expires", Absent)
1479
1480 # set the easy defaults
1481 version = standard.get("version", None)
Benjamin Peterson3e5cd1d2010-06-27 21:45:24 +00001482 if version is not None:
1483 try:
1484 version = int(version)
1485 except ValueError:
1486 return None # invalid version, ignore cookie
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001487 secure = standard.get("secure", False)
1488 # (discard is also set if expires is Absent)
1489 discard = standard.get("discard", False)
1490 comment = standard.get("comment", None)
1491 comment_url = standard.get("commenturl", None)
1492
1493 # set default path
1494 if path is not Absent and path != "":
1495 path_specified = True
1496 path = escape_path(path)
1497 else:
1498 path_specified = False
1499 path = request_path(request)
1500 i = path.rfind("/")
1501 if i != -1:
1502 if version == 0:
1503 # Netscape spec parts company from reality here
1504 path = path[:i]
1505 else:
1506 path = path[:i+1]
1507 if len(path) == 0: path = "/"
1508
1509 # set default domain
1510 domain_specified = domain is not Absent
1511 # but first we have to remember whether it starts with a dot
1512 domain_initial_dot = False
1513 if domain_specified:
1514 domain_initial_dot = bool(domain.startswith("."))
1515 if domain is Absent:
1516 req_host, erhn = eff_request_host(request)
1517 domain = erhn
1518 elif not domain.startswith("."):
1519 domain = "."+domain
1520
1521 # set default port
1522 port_specified = False
1523 if port is not Absent:
1524 if port is None:
1525 # Port attr present, but has no value: default to request port.
1526 # Cookie should then only be sent back on that port.
1527 port = request_port(request)
1528 else:
1529 port_specified = True
1530 port = re.sub(r"\s+", "", port)
1531 else:
1532 # No port attr present. Cookie can be sent back on any port.
1533 port = None
1534
1535 # set default expires and discard
1536 if expires is Absent:
1537 expires = None
1538 discard = True
1539 elif expires <= self._now:
1540 # Expiry date in past is request to delete cookie. This can't be
1541 # in DefaultCookiePolicy, because can't delete cookies there.
1542 try:
1543 self.clear(domain, path, name)
1544 except KeyError:
1545 pass
Thomas Wouters477c8d52006-05-27 19:21:47 +00001546 _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1547 domain, path, name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001548 return None
1549
1550 return Cookie(version,
1551 name, value,
1552 port, port_specified,
1553 domain, domain_specified, domain_initial_dot,
1554 path, path_specified,
1555 secure,
1556 expires,
1557 discard,
1558 comment,
1559 comment_url,
1560 rest)
1561
1562 def _cookies_from_attrs_set(self, attrs_set, request):
1563 cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1564
1565 cookies = []
1566 for tup in cookie_tuples:
1567 cookie = self._cookie_from_cookie_tuple(tup, request)
1568 if cookie: cookies.append(cookie)
1569 return cookies
1570
Neal Norwitz71dad722005-12-23 21:43:48 +00001571 def _process_rfc2109_cookies(self, cookies):
1572 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1573 if rfc2109_as_ns is None:
1574 rfc2109_as_ns = not self._policy.rfc2965
1575 for cookie in cookies:
1576 if cookie.version == 1:
1577 cookie.rfc2109 = True
Tim Peters536cf992005-12-25 23:18:31 +00001578 if rfc2109_as_ns:
Neal Norwitz71dad722005-12-23 21:43:48 +00001579 # treat 2109 cookies as Netscape cookies rather than
1580 # as RFC2965 cookies
1581 cookie.version = 0
1582
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001583 def make_cookies(self, response, request):
1584 """Return sequence of Cookie objects extracted from response object."""
1585 # get cookie-attributes for RFC 2965 and Netscape protocols
1586 headers = response.info()
Barry Warsaw820c1202008-06-12 04:06:45 +00001587 rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
1588 ns_hdrs = headers.get_all("Set-Cookie", [])
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001589
1590 rfc2965 = self._policy.rfc2965
1591 netscape = self._policy.netscape
1592
1593 if ((not rfc2965_hdrs and not ns_hdrs) or
1594 (not ns_hdrs and not rfc2965) or
1595 (not rfc2965_hdrs and not netscape) or
1596 (not netscape and not rfc2965)):
1597 return [] # no relevant cookie headers: quick exit
1598
1599 try:
1600 cookies = self._cookies_from_attrs_set(
1601 split_header_words(rfc2965_hdrs), request)
Thomas Wouters477c8d52006-05-27 19:21:47 +00001602 except Exception:
1603 _warn_unhandled_exception()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001604 cookies = []
1605
1606 if ns_hdrs and netscape:
1607 try:
Neal Norwitz71dad722005-12-23 21:43:48 +00001608 # RFC 2109 and Netscape cookies
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001609 ns_cookies = self._cookies_from_attrs_set(
1610 parse_ns_headers(ns_hdrs), request)
Thomas Wouters477c8d52006-05-27 19:21:47 +00001611 except Exception:
1612 _warn_unhandled_exception()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001613 ns_cookies = []
Neal Norwitz71dad722005-12-23 21:43:48 +00001614 self._process_rfc2109_cookies(ns_cookies)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001615
1616 # Look for Netscape cookies (from Set-Cookie headers) that match
1617 # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1618 # For each match, keep the RFC 2965 cookie and ignore the Netscape
1619 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are
1620 # bundled in with the Netscape cookies for this purpose, which is
1621 # reasonable behaviour.
1622 if rfc2965:
1623 lookup = {}
1624 for cookie in cookies:
1625 lookup[(cookie.domain, cookie.path, cookie.name)] = None
1626
1627 def no_matching_rfc2965(ns_cookie, lookup=lookup):
1628 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1629 return key not in lookup
1630 ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1631
1632 if ns_cookies:
1633 cookies.extend(ns_cookies)
1634
1635 return cookies
1636
def set_cookie_if_ok(self, cookie, request):
    """Store cookie in the jar, but only if the policy accepts it.

    The current time (whole seconds) is recorded on both the policy and
    the jar before the policy's set_ok(cookie, request) is consulted.
    The whole operation runs while holding the jar's cookie lock.

    """
    with self._cookies_lock:
        timestamp = int(time.time())
        self._policy._now = timestamp
        self._now = timestamp

        if not self._policy.set_ok(cookie, request):
            return
        self.set_cookie(cookie)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001649
def set_cookie(self, cookie):
    """Store cookie unconditionally, bypassing any policy check.

    Cookies are kept in nested dicts keyed by domain, then path, then
    name; a cookie already stored under the same three keys is replaced.

    """
    jar = self._cookies
    with self._cookies_lock:
        by_path = jar.setdefault(cookie.domain, {})
        by_name = by_path.setdefault(cookie.path, {})
        by_name[cookie.name] = cookie
1662
def extract_cookies(self, response, request):
    """Store every cookie from response that the policy allows for request.

    Candidate cookies come from self.make_cookies(response, request); each
    one is passed to the policy's set_ok() and stored with set_cookie()
    when accepted.  The current time (whole seconds) is recorded on both
    the policy and the jar first, and the cookie lock is held throughout.

    """
    _debug("extract_cookies: %s", response.info())
    with self._cookies_lock:
        timestamp = int(time.time())
        self._policy._now = timestamp
        self._now = timestamp

        for candidate in self.make_cookies(response, request):
            if not self._policy.set_ok(candidate, request):
                continue
            _debug(" setting cookie: %s", candidate)
            self.set_cookie(candidate)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001676
def clear(self, domain=None, path=None, name=None):
    """Remove cookies from the jar.

    With no arguments every cookie is removed.  With only domain, all
    cookies for that domain go; with domain and path, all cookies under
    that path in that domain; with all three, the single cookie having
    that domain, path and name.

    Raises ValueError if path is given without domain, or name without
    both domain and path.  Raises KeyError if no matching cookie exists.

    """
    if name is None and path is None and domain is None:
        self._cookies = {}
        return

    if name is not None:
        if domain is None or path is None:
            raise ValueError(
                "domain and path must be given to remove a cookie by name")
        del self._cookies[domain][path][name]
        return

    if path is not None:
        if domain is None:
            raise ValueError(
                "domain must be given to remove cookies by path")
        del self._cookies[domain][path]
        return

    # Only the domain was supplied.
    del self._cookies[domain]
1703
def clear_session_cookies(self):
    """Remove every cookie whose discard flag is set (session cookies).

    Note that the .save() method won't save session cookies anyway, unless
    you ask otherwise by passing a true ignore_discard argument.

    """
    with self._cookies_lock:
        for entry in self:
            if not entry.discard:
                continue
            self.clear(entry.domain, entry.path, entry.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001718
def clear_expired_cookies(self):
    """Remove every cookie that reports itself as expired right now.

    You probably don't need to call this method: expired cookies are never
    sent back to the server (provided you're using DefaultCookiePolicy),
    this method is called by CookieJar itself every so often, and the
    .save() method won't save expired cookies anyway (unless you ask
    otherwise by passing a true ignore_expires argument).

    """
    with self._cookies_lock:
        # One timestamp is taken up front and used for every cookie.
        reference_time = time.time()
        for entry in self:
            if not entry.is_expired(reference_time):
                continue
            self.clear(entry.domain, entry.path, entry.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001737
def __iter__(self):
    """Iterate over the contained cookies, as produced by deepvalues()."""
    stored = self._cookies
    return deepvalues(stored)
1740
def __len__(self):
    """Return number of contained cookies, counted by iterating the jar."""
    return sum(1 for _ in self)
1746
def __repr__(self):
    """Return "<ClassName[...]>" listing the repr() of each cookie."""
    parts = [repr(entry) for entry in self]
    return "<%s[%s]>" % (self.__class__.__name__, ", ".join(parts))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001751
def __str__(self):
    """Return "<ClassName[...]>" listing the str() of each cookie."""
    parts = [str(entry) for entry in self]
    return "<%s[%s]>" % (self.__class__.__name__, ", ".join(parts))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001756
1757
class LoadError(OSError):
    """Raised when a cookies file cannot be loaded.

    Derives from OSError for backwards-compatibility with Python 2.4.0.

    """
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001760
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    This class does not itself know any file format: save() raises
    NotImplementedError and load() delegates the parsing to
    self._really_load(), which this class does not define.

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename: default file used by load()/revert() when they are called
            without one; must support concatenation with a str, or be None.
        delayload: stored on the instance as a bool.
        policy: passed through to CookieJar.__init__.

        Raises ValueError if filename is given but is not string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                filename + ""
            except Exception:
                # Not a bare except: KeyboardInterrupt and SystemExit must
                # not be turned into a ValueError.
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file."""
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  The opened file is handed to self._really_load() together
        with the filename and the two ignore_* flags.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or OSError) if reversion is not successful; the
        object's state will not be altered if this happens.

        filename defaults to self.filename; ValueError is raised if neither
        is set.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        with self._cookies_lock:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except BaseException:
                # Put the previous cookies back whatever went wrong, not
                # only on OSError, so a failed revert never leaves the jar
                # empty or half-loaded.  The error is always re-raised.
                self._cookies = old_state
                raise
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001817
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001818
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    The cookie is flattened into a list of (key, value) pairs -- the
    name/value pair first, then path and domain, then whichever optional
    attributes are set, then the entries of cookie._rest in sorted key
    order, and finally the version -- and that list is rendered with
    join_header_words().

    Actually, the format is extended a bit -- see module docstring.

    """
    pairs = [(cookie.name, cookie.value),
             ("path", cookie.path),
             ("domain", cookie.domain)]

    if cookie.port is not None:
        pairs.append(("port", cookie.port))
    # Flag attributes carry no value of their own.
    if cookie.path_specified:
        pairs.append(("path_spec", None))
    if cookie.port_specified:
        pairs.append(("port_spec", None))
    if cookie.domain_initial_dot:
        pairs.append(("domain_dot", None))
    if cookie.secure:
        pairs.append(("secure", None))
    if cookie.expires:
        stamp = time2isoz(float(cookie.expires))
        pairs.append(("expires", stamp))
    if cookie.discard:
        pairs.append(("discard", None))
    if cookie.comment:
        pairs.append(("comment", cookie.comment))
    if cookie.comment_url:
        pairs.append(("commenturl", cookie.comment_url))

    pairs.extend((key, str(cookie._rest[key]))
                 for key in sorted(cookie._rest.keys()))

    pairs.append(("version", str(cookie.version)))

    return join_header_words([pairs])
1846
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        ignore_discard and ignore_expires: see docstring for FileCookieJar.save

        """
        now = time.time()
        r = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        # The trailing empty item makes the result end with a newline
        # whenever at least one header was produced.
        return "\n".join(r+[""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the cookies to filename in "Set-Cookie3" format.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  The file is overwritten: a "#LWP-Cookies-2.0" magic line
        followed by the output of as_lwp_str(ignore_discard, ignore_expires).

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename, "w") as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Populate the jar from an open "Set-Cookie3" format file.

        f is read with readline(); its first line must match self.magic_re.
        filename is only used in error messages.  Lines not starting with
        "Set-Cookie3:" are ignored.  Cookies marked discard are skipped
        unless ignore_discard is true, and cookies that are already expired
        are skipped unless ignore_expires is true.

        Raises LoadError if the magic line is missing or a line cannot be
        parsed; OSError from reading propagates unchanged.

        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound before the try so the LoadError message below can always be
        # formatted, even if the first readline() in the loop is what fails.
        line = ""
        try:
            while True:
                line = f.readline()
                if line == "": break
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    # Flag attributes default to False when absent.
                    standard = dict.fromkeys(boolean_attrs, False)
                    rest = {}
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            if v is None: v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    if expires is None:
                        # No usable expiry time: treat as a session cookie.
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)
        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1965
1966
1967class MozillaCookieJar(FileCookieJar):
1968 """
1969
    WARNING: you may want to back up your browser's cookies file if you use
    this class to save cookies.  I *think* it works, but there have been
    bugs in the past!
1973
1974 This class differs from CookieJar only in the format it uses to save and
1975 load cookies to and from a file. This class uses the Mozilla/Netscape
1976 `cookies.txt' format. lynx uses this file format, too.
1977
1978 Don't expect cookies saved while the browser is running to be noticed by
1979 the browser (in fact, Mozilla on unix will overwrite your saved cookies if
1980 you change them on disk while it's running; on Windows, you probably can't
1981 save at all while the browser is running).
1982
1983 Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
1984 Netscape cookies on saving.
1985
1986 In particular, the cookie version and port number information is lost,
1987 together with information about whether or not Path, Port and Discard were
1988 specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
1989 domain as set in the HTTP header started with a dot (yes, I'm aware some
1990 domains in Netscape files start with a dot and some don't -- trust me, you
1991 really don't want to know any more about this).
1992
1993 Note that though Mozilla and Netscape use the same format, they use
1994 slightly different headers. The class saves cookies using the Netscape
1995 header by default (Mozilla can cope with that).
1996
1997 """
Antoine Pitroufd036452008-08-19 17:56:33 +00001998 magic_re = re.compile("#( Netscape)? HTTP Cookie File")
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001999 header = """\
Georg Brandl87a15642010-07-31 22:11:11 +00002000# Netscape HTTP Cookie File
Benjamin Petersonccedc222013-12-18 15:35:18 -06002001# http://curl.haxx.se/rfc/cookie_spec.html
Georg Brandl87a15642010-07-31 22:11:11 +00002002# This is a generated file! Do not edit.
Georg Brandl7c9b61b2008-05-26 17:56:51 +00002003
2004"""
2005
def _really_load(self, f, filename, ignore_discard, ignore_expires):
    """Populate the jar from an open Netscape-format cookies file.

    f is read with readline(); its first line must match self.magic_re.
    filename is only used in error messages.  Each remaining line that is
    not blank and does not start with "#" or "$" must consist of seven
    tab-separated fields: domain, domain flag, path, secure flag, expiry
    time, name and value.  Cookies marked discard are skipped unless
    ignore_discard is true, and cookies that are already expired are
    skipped unless ignore_expires is true.

    Raises LoadError if the magic line is missing or a line cannot be
    parsed; OSError from reading propagates unchanged.

    """
    now = time.time()

    magic = f.readline()
    if not self.magic_re.search(magic):
        raise LoadError(
            "%r does not look like a Netscape format cookies file" %
            filename)

    # Bound before the try so the LoadError message below can always be
    # formatted, even if the first readline() in the loop is what fails.
    line = ""
    try:
        while True:
            line = f.readline()
            if line == "": break

            # last field may be absent, so keep any trailing tab
            if line.endswith("\n"): line = line[:-1]

            # skip comments and blank lines XXX what is $ for?
            if (line.strip().startswith(("#", "$")) or
                line.strip() == ""):
                continue

            domain, domain_specified, path, secure, expires, name, value = \
                    line.split("\t")
            secure = (secure == "TRUE")
            domain_specified = (domain_specified == "TRUE")
            if name == "":
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name = value
                value = None

            initial_dot = domain.startswith(".")
            # Explicit check rather than an assert, so inconsistent lines
            # are still rejected when Python runs with -O.  The error is
            # converted to LoadError by the handler below.
            if domain_specified != initial_dot:
                raise ValueError(
                    "domain flag does not match leading dot of domain")

            discard = False
            if expires == "":
                # No expiry time: treat as a session cookie.
                expires = None
                discard = True

            # assume path_specified is false
            c = Cookie(0, name, value,
                       None, False,
                       domain, domain_specified, initial_dot,
                       path, False,
                       secure,
                       expires,
                       discard,
                       None,
                       None,
                       {})
            if not ignore_discard and c.discard:
                continue
            if not ignore_expires and c.is_expired(now):
                continue
            self.set_cookie(c)

    except OSError:
        raise
    except Exception:
        _warn_unhandled_exception()
        raise LoadError("invalid Netscape format cookies file %r: %r" %
                        (filename, line))
2070
def save(self, filename=None, ignore_discard=False, ignore_expires=False):
    """Write the cookies to filename in Netscape cookies.txt format.

    filename defaults to self.filename; ValueError is raised if neither is
    set.  The file is overwritten: self.header comes first, then one line
    of seven tab-separated fields per cookie.  Cookies marked discard are
    left out unless ignore_discard is true, and expired cookies are left
    out unless ignore_expires is true.

    """
    if filename is None:
        if self.filename is None:
            raise ValueError(MISSING_FILENAME_TEXT)
        filename = self.filename

    with open(filename, "w") as f:
        f.write(self.header)
        now = time.time()
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue

            secure = "TRUE" if cookie.secure else "FALSE"
            initial_dot = "TRUE" if cookie.domain.startswith(".") else "FALSE"
            expires = "" if cookie.expires is None else str(cookie.expires)

            if cookie.value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = "", cookie.name
            else:
                name, value = cookie.name, cookie.value

            fields = [cookie.domain, initial_dot, cookie.path,
                      secure, expires, name, value]
            f.write("\t".join(fields) + "\n")