blob: 0ba8200f325a629ed4eb1b88ae4e277afdc9deb6 [file] [log] [blame]
Éric Araujo23760e92011-11-07 17:52:48 +01001r"""HTTP cookie handling for web clients.
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00002
3This module has (now fairly distant) origins in Gisle Aas' Perl module
4HTTP::Cookies, from the libwww-perl library.
5
6Docstrings, comments and debug strings in this code refer to the
7attributes of the HTTP cookie system as cookie-attributes, to distinguish
8them clearly from Python attributes.
9
Thomas Wouters477c8d52006-05-27 19:21:47 +000010Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
11distributed with the Python standard library, but are available from
12http://wwwsearch.sf.net/):
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000013
                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar
25
26"""
27
# Public names exported by a star-import of this module.
__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
           'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
30
Jeremy Hylton1afc1692008-06-18 20:49:58 +000031import copy
Victor Stinner628225c2011-03-21 02:38:51 +010032import datetime
Jeremy Hylton1afc1692008-06-18 20:49:58 +000033import re
34import time
35import urllib.parse, urllib.request
Antoine Pitroua6a4dc82017-09-07 18:56:24 +020036import threading as _threading
Georg Brandl24420152008-05-26 16:32:26 +000037import http.client # only for the default HTTP port
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000038from calendar import timegm
39
Thomas Wouters477c8d52006-05-27 19:21:47 +000040debug = False # set to True to enable debugging via the logging module
41logger = None
42
43def _debug(*args):
44 if not debug:
45 return
46 global logger
47 if not logger:
48 import logging
Georg Brandl24420152008-05-26 16:32:26 +000049 logger = logging.getLogger("http.cookiejar")
Thomas Wouters477c8d52006-05-27 19:21:47 +000050 return logger.debug(*args)
51
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000052
# The default HTTP port, kept as a string (ports are handled as strings
# by request_port() in this module).
DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT)
# Message text describing a missing filename; its users are outside this
# part of the file.
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
56
Thomas Wouters477c8d52006-05-27 19:21:47 +000057def _warn_unhandled_exception():
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000058 # There are a few catch-all except: statements in this module, for
Thomas Wouters477c8d52006-05-27 19:21:47 +000059 # catching input that's bad in unexpected ways. Warn if any
60 # exceptions are caught there.
Jeremy Hylton7ecf3dc2008-05-10 20:38:40 +000061 import io, warnings, traceback
Guido van Rossum34d19282007-08-09 01:03:29 +000062 f = io.StringIO()
Andrew M. Kuchlingae40c2f2004-07-10 18:32:12 +000063 traceback.print_exc(None, f)
64 msg = f.getvalue()
Georg Brandl24420152008-05-26 16:32:26 +000065 warnings.warn("http.cookiejar bug!\n%s" % msg, stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000066
67
68# Date/time conversion
69# -----------------------------------------------------------------------------
70
71EPOCH_YEAR = 1970
72def _timegm(tt):
73 year, month, mday, hour, min, sec = tt[:6]
74 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
75 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
76 return timegm(tt)
77 else:
78 return None
79
# Abbreviated English day names, indexed by datetime.weekday() (Monday == 0).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# Abbreviated English month names; index 0 is January.
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copy of MONTHS, used for case-insensitive month lookups.
# Built with a comprehension so no loop variable leaks into the module
# namespace.
MONTHS_LOWER = [name.lower() for name in MONTHS]
85
def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    # Timezone-aware calls replace the deprecated naive utcnow() /
    # utcfromtimestamp(); the formatted fields are the same UTC values.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000104
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    # Timezone-aware calls replace the deprecated naive utcnow() /
    # utcfromtimestamp(); the formatted fields are the same UTC values.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1],
        dt.year, dt.hour, dt.minute, dt.second)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000123
124
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)


def offset_from_tz_string(tz):
    """Return the UTC offset in seconds described by *tz*, or None.

    Names listed in UTC_ZONES give 0.  Otherwise *tz* must look like an
    optionally signed "h", "hh", "hhmm" or "hh:mm" value; anything else
    yields None.
    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if not match:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds += 60 * int(minutes)
    return -seconds if sign == '-' else seconds
141
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date/time fields to seconds since the epoch.

    All fields arrive as strings (or None for the optional clock and
    timezone parts), as captured by the date regexps in this module.

    day, yr -- numeric strings; years below 1000 are treated as
               abbreviated and mapped to the century closest to now
    mon     -- month name abbreviation (case-insensitive) or month number
    hr, min, sec -- clock fields; None means 0.  *sec* may carry a
               fractional part (ISO_DATE_RE allows one), which is truncated
    tz      -- timezone string understood by offset_from_tz_string(), or
               None for UTC

    Returns an integer, or None if the month, the timezone or the
    resulting time is not acceptable.
    """
    yr = int(yr)
    if yr > datetime.MAXYEAR:
        return None

    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if not 1 <= imon <= 12:
            return None
        mon = imon

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    day = int(day)
    hr = int(hr)
    min = int(min)
    # Go through float() so that fractional seconds such as "29.5" are
    # truncated instead of making int() raise ValueError.
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec))
    if t is None:
        return None

    # adjust time using timezone string, to get absolute time since epoch
    if tz is None:
        tz = "UTC"
    offset = offset_from_tz_string(tz.upper())
    if offset is None:
        return None
    return t - offset
197
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X | re.ASCII)


def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # The strict regexp only checks the *shape* of the month
            # ("Jab" matches); an unknown month is an unrecognized date.
            return None
        # Seconds are parsed with int() so the result is an integer, as
        # documented above.
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
275
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X | re. ASCII)


def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # Leading whitespace is dropped before the anchored regexp is applied.
    match = ISO_DATE_RE.search(text.lstrip())
    if match is None:
        return None  # bad format

    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    year, month, mday, hour, minute, second, zone, _ = match.groups()
    return _str2time(mday, month, year, hour, minute, second, zone)
320
321
322# Header parsing
323# -----------------------------------------------------------------------------
324
def unmatched(match):
    """Return unmatched part of re.Match object."""
    begin, finish = match.span(0)
    subject = match.string
    return subject[:begin] + subject[finish:]
329
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")


def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, str)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            token = HEADER_TOKEN_RE.search(text)
            if token:
                # Every header regexp is anchored at the start of the
                # string, so dropping the match means slicing at its end.
                text = text[token.end():]
                name = token.group(1)
                quoted = HEADER_QUOTED_VALUE_RE.search(text)
                if quoted:
                    # quoted value: remove the backslash escapes
                    text = text[quoted.end():]
                    value = HEADER_ESCAPE_RE.sub(r"\1", quoted.group(1))
                else:
                    plain = HEADER_VALUE_RE.search(text)
                    if plain:
                        # unquoted value
                        text = text[plain.end():]
                        value = plain.group(1).rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
                continue

            stripped = text.lstrip()
            if stripped.startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = stripped[1:]
                if pairs:
                    result.append(pairs)
                pairs = []
                continue

            # skip junk
            non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
            assert nr_junk_chars > 0, (
                "split_header_words bug: '%s', '%s', %s" %
                (orig_text, text, pairs))
            text = non_junk
        if pairs:
            result.append(pairs)
    return result
418
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")


def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859-1")]])
    'text/plain; charset="iso-8859-1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859-1")]])
    'text/plain, charset="iso-8859-1"'

    """
    def render(key, value):
        # A pair without a value is emitted as the bare key.
        if value is None:
            return key
        if not re.search(r"^\w+$", value):
            # escape " and \ and wrap the result in double quotes
            value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
        return "%s=%s" % (key, value)

    headers = []
    for pairs in lists:
        rendered = [render(key, value) for key, value in pairs]
        if rendered:
            headers.append("; ".join(rendered))
    return ", ".join(headers)
444
def strip_quotes(text):
    """Return *text* without one leading and one trailing double quote.

    Each end is handled independently: a quote is removed from the front if
    present, and then from the back of what remains if present.
    """
    body = text[1:] if text.startswith('"') else text
    return body[:-1] if body.endswith('"') else body
451
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    Returns one list of (key, value) pairs per header that produced any
    pairs.  A value is None when the parameter had no "=".  A
    ("version", "0") pair is appended when no version attribute followed
    the first pair.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False

        # XXX: The following does not strictly adhere to RFCs in that empty
        # names and values are legal (the former will only appear once and will
        # be overwritten if multiple occurrences are present). This is
        # mostly to deal with backwards compatibility.
        for index, param in enumerate(ns_header.split(';')):
            key, sep, val = param.strip().partition('=')
            key = key.strip()

            if not key:
                if index == 0:
                    # nameless first pair: give up on the whole header
                    break
                continue

            # allow for a distinction between present and empty and missing
            # altogether
            val = val.strip() if sep else None

            if index != 0:
                # Only attributes after the name=value pair are normalised.
                lowered = key.lower()
                if lowered in known_attrs:
                    key = lowered

                if key == "version":
                    # This is an RFC 2109 cookie.
                    if val is not None:
                        val = strip_quotes(val)
                    version_set = True
                elif key == "expires" and val is not None:
                    # convert expires date to seconds since epoch
                    val = http2time(strip_quotes(val))  # None if invalid
            pairs.append((key, val))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
518
519
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)


def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text) or text == "":
        # ends in ".<digits>" (taken to be an IP address), or is empty
        return False
    # a host domain name may neither start nor end with a dot
    return not (text[0] == "." or text[-1] == ".")
535
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    if not B.startswith("."):
        return False
    # A must have the form NB with N non-empty: B has to be a proper
    # *suffix* of A.  Merely containing B somewhere (e.g. "a.b.c" against
    # ".b") is not a domain-match.
    if len(A) <= len(B) or not A.endswith(B):
        return False
    return is_HDN(B[1:])
574
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  Anything that does not end in
    ".<digits>" (the IPV4_RE test) qualifies.

    """
    return not IPV4_RE.search(text)
584
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  When B starts with a dot, A matches if it ends with
    B; in every other case (including when either one looks like an IP
    address) the two must be equal.

    """
    A = A.lower()
    B = B.lower()
    both_names = liberal_is_HDN(A) and liberal_is_HDN(B)
    if both_names and B.startswith("."):
        return A.endswith(B)
    # IP addresses, or a pattern without an initial dot: exact match only
    return A == B
604
cut_port_re = re.compile(r":\d+$", re.ASCII)


def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urllib.parse.urlparse(request.get_full_url())[1]
    # Fall back to the Host header when the URL carries no network location.
    host = netloc if netloc != "" else request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", host, 1).lower()
621
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective
    name gets ".local" appended when the request-host contains no dot and
    does not match IPV4_RE.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
632
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965."""
    raw_path = urllib.parse.urlsplit(request.get_full_url()).path
    path = escape_path(raw_path)
    # fix bad RFC 2396 absoluteURI
    return path if path.startswith("/") else "/" + path
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000642
def request_port(request):
    """Return the port of request.host as a string.

    DEFAULT_HTTP_PORT is returned when the host carries no port.  None is
    returned (after a debug message) when the text following the port
    separator is not an integer.

    A bracketed IPv6 literal such as "[::1]:8080" is supported: the colons
    inside the brackets are not mistaken for the port separator.
    """
    host = request.host
    # For "[addr]..." start looking for the port separator after the
    # closing bracket.  If there is no closing bracket, find() gives -1 and
    # the search starts at 0, i.e. the host is treated like any other.
    search_from = host.find("]") + 1 if host.startswith("[") else 0
    i = host.find(':', search_from)
    if i < 0:
        return DEFAULT_HTTP_PORT
    port = host[i+1:]
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
656
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")


def uppercase_escaped_char(match):
    """Return the matched %-escape with its two hex digits uppercased."""
    return "%" + match.group(1).upper()


def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = urllib.parse.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
676
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split at the first dot: everything after it is B.
    _a, dot, b = h.partition(".")
    if dot and is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
711
def is_third_party(request):
    """Return True if the request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    origin_reach = reach(request.origin_req_host)
    return not domain_match(request_host(request), origin_reach)
727
728
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than"Port=80", for example); if this is the case, port is None.

    """

    # Attributes shown by __repr__, in constructor order ("rest" and
    # "rfc2109" are appended separately).
    _REPR_FIELDS = ("version", "name", "value",
                    "port", "port_specified",
                    "domain", "domain_specified", "domain_initial_dot",
                    "path", "path_specified",
                    "secure", "expires", "discard", "comment", "comment_url",
                    )

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes on the instance.

        *version* is converted with int() and *expires* with int(float())
        unless they are None; *domain* is lowercased; *rest* (the
        nonstandard cookie-attributes) is shallow-copied.

        Raises ValueError if port is None while port_specified is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(float(expires))
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return True if the nonstandard cookie-attribute *name* is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return nonstandard cookie-attribute *name*, or *default*."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set nonstandard cookie-attribute *name* to *value*."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= *now*.

        *now* defaults to the current time; a cookie whose expires is None
        never counts as expired.
        """
        if now is None:
            now = time.time()
        if self.expires is None:
            return False
        if self.expires <= now:
            return True
        return False

    def __str__(self):
        port_part = "" if self.port is None else ":"+self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        args = ["%s=%r" % (name, getattr(self, name))
                for name in self._REPR_FIELDS]
        args.append("rest=%r" % (self._rest,))
        args.append("rfc2109=%r" % (self.rfc2109,))
        return "%s(%s)" % (self.__class__.__name__, ", ".join(args))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000825
826
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customized policy.

    set_ok() and return_ok() must be provided by subclasses; the two
    *_return_ok() pre-checks accept everything unless overridden.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server."""
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation always returns True.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation always returns True.
        """
        return True
858
859
class DefaultCookiePolicy(CookiePolicy):
    """Implements the standard rules for accepting and returning cookies.

    The Domain* class constants are bit flags for the strict_ns_domain
    option, which tightens the handling of Netscape (version 0) cookies:

    DomainStrictNoDots     -- when setting a cookie, refuse it if the part of
                              the request host in front of the cookie domain
                              contains a dot.
    DomainStrictNonDomain  -- only return a cookie that had no explicit
                              domain to a host string-equal to the one that
                              set it.
    DomainRFC2965Match     -- when setting a cookie, require the RFC 2965
                              domain-match even for Netscape cookies.
    DomainLiberal / DomainStrict -- none / the first two flags combined.

    """

    DomainStrictNoDots = 1
    DomainStrictNonDomain = 2
    DomainRFC2965Match = 4

    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain

    def __init__(self,
                 blocked_domains=None, allowed_domains=None,
                 netscape=True, rfc2965=False,
                 rfc2109_as_netscape=None,
                 hide_cookie2=False,
                 strict_domain=False,
                 strict_rfc2965_unverifiable=True,
                 strict_ns_unverifiable=False,
                 strict_ns_domain=DomainLiberal,
                 strict_ns_set_initial_dollar=False,
                 strict_ns_set_path=False,
                 secure_protocols=("https", "wss")
                 ):
        """Constructor arguments should be passed as keyword arguments only.

        blocked_domains / allowed_domains -- iterables of domains that are
            copied into tuples; allowed_domains=None means "no allow-list".
        netscape / rfc2965 -- switch version 0 / version > 0 cookies on.
        rfc2109_as_netscape -- stored as given; CookieJar reads it when
            deciding whether to downgrade version 1 Set-Cookie cookies.
        hide_cookie2 -- stored as given; CookieJar reads it when deciding
            whether to add a Cookie2 header.
        strict_domain -- refuse cookies set for domains that look like
            country-code second-level domains (".co.uk").
        strict_rfc2965_unverifiable / strict_ns_unverifiable -- refuse
            third-party cookies during unverifiable transactions.
        strict_ns_domain -- bitwise OR of the Domain* flags.
        strict_ns_set_initial_dollar -- refuse Netscape cookies whose name
            starts with "$".
        strict_ns_set_path -- refuse Netscape cookies whose path attribute
            is not a prefix of the request path.
        secure_protocols -- request types over which secure cookies may be
            returned.
        """
        self.netscape = netscape
        self.rfc2965 = rfc2965
        self.rfc2109_as_netscape = rfc2109_as_netscape
        self.hide_cookie2 = hide_cookie2
        self.strict_domain = strict_domain
        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
        self.strict_ns_unverifiable = strict_ns_unverifiable
        self.strict_ns_domain = strict_ns_domain
        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
        self.strict_ns_set_path = strict_ns_set_path
        self.secure_protocols = secure_protocols

        if blocked_domains is not None:
            self._blocked_domains = tuple(blocked_domains)
        else:
            self._blocked_domains = ()

        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def blocked_domains(self):
        """Return the sequence of blocked domains (as a tuple)."""
        return self._blocked_domains
    def set_blocked_domains(self, blocked_domains):
        """Set the sequence of blocked domains."""
        self._blocked_domains = tuple(blocked_domains)

    def is_blocked(self, domain):
        """Return True if domain matches an entry of the block-list."""
        return any(user_domain_match(domain, blocked_domain)
                   for blocked_domain in self._blocked_domains)

    def allowed_domains(self):
        """Return None, or the sequence of allowed domains (as a tuple)."""
        return self._allowed_domains
    def set_allowed_domains(self, allowed_domains):
        """Set the sequence of allowed domains, or None."""
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def is_not_allowed(self, domain):
        """Return True if an allow-list exists and domain matches none of it."""
        if self._allowed_domains is None:
            return False
        return not any(user_domain_match(domain, allowed_domain)
                       for allowed_domain in self._allowed_domains)

    def set_ok(self, cookie, request):
        """
        If you override .set_ok(), be sure to call this method.  If it returns
        false, so should your subclass (assuming your subclass wants to be more
        strict about which cookies to accept).

        """
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        assert cookie.name is not None

        # Each check is looked up by name so that subclasses can override
        # the individual set_ok_* methods.
        for n in "version", "verifiability", "name", "path", "domain", "port":
            fn_name = "set_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False

        return True

    def set_ok_version(self, cookie, request):
        """Refuse cookies with no version or of a switched-off protocol."""
        if cookie.version is None:
            # Version is always set to 0 by parse_ns_headers if it's a Netscape
            # cookie, so this must be an invalid RFC 2965 cookie.
            _debug(" Set-Cookie2 without version attribute (%s=%s)",
                   cookie.name, cookie.value)
            return False
        if cookie.version > 0 and not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
        return True

    def set_ok_verifiability(self, cookie, request):
        """Refuse third-party cookies in unverifiable transactions if strict."""
        if request.unverifiable and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug(" third-party RFC 2965 cookie during "
                       "unverifiable transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug(" third-party Netscape cookie during "
                       "unverifiable transaction")
                return False
        return True

    def set_ok_name(self, cookie, request):
        """Refuse "$"-prefixed Netscape cookie names if so configured."""
        # Try and stop servers setting V0 cookies designed to hack other
        # servers that know both V0 and V1 protocols.
        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
            cookie.name.startswith("$")):
            _debug(" illegal name (starts with '$'): '%s'", cookie.name)
            return False
        return True

    def set_ok_path(self, cookie, request):
        """Refuse an explicit path that is not a prefix of the request path.

        Always enforced for version > 0 cookies; for Netscape cookies only
        when strict_ns_set_path is set.
        """
        if cookie.path_specified:
            req_path = request_path(request)
            if ((cookie.version > 0 or
                 (cookie.version == 0 and self.strict_ns_set_path)) and
                not req_path.startswith(cookie.path)):
                _debug(" path attribute %s is not a prefix of request "
                       "path %s", cookie.path, req_path)
                return False
        return True

    def set_ok_domain(self, cookie, request):
        """Apply block/allow lists and domain sanity checks to a new cookie."""
        if self.is_blocked(cookie.domain):
            _debug(" domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug(" domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                # XXX This should probably be compared with the Konqueror
                # (kcookiejar.cpp) and Mozilla implementations, but it's a
                # losing battle.
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
                       "info", "jobs", "mobi", "museum", "name", "pro",
                       "travel", "eu") and len(tld) == 2:
                        # domain like .co.uk
                        _debug(" country-code second level domain %s", domain)
                        return False
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                _debug(" non-local domain %s contains no embedded dot",
                       domain)
                return False
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    _debug(" effective request-host %s (even with added "
                           "initial dot) does not end with %s",
                           erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    _debug(" effective request-host %s does not domain-match "
                           "%s", erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                # What is left of the request host once the cookie domain
                # is cut off its end.
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    _debug(" host prefix %s for domain %s contains a dot",
                           host_prefix, domain)
                    return False
        return True

    def set_ok_port(self, cookie, request):
        """Require the request port to be listed in an explicit Port attr."""
        if cookie.port_specified:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            else:
                req_port = str(req_port)
            for p in cookie.port.split(","):
                try:
                    int(p)
                except ValueError:
                    _debug(" bad port %s (not numeric)", p)
                    return False
                if p == req_port:
                    break
            else:
                # Loop ended without a match.
                _debug(" request port (%s) not found in %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok(self, cookie, request):
        """
        If you override .return_ok(), be sure to call this method.  If it
        returns false, so should your subclass (assuming your subclass wants to
        be more strict about which cookies to return).

        """
        # Path has already been checked by .path_return_ok(), and domain
        # blocking done by .domain_return_ok().
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        for n in "version", "verifiability", "secure", "expires", "port", "domain":
            fn_name = "return_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False
        return True

    def return_ok_version(self, cookie, request):
        """Do not return cookies of a switched-off protocol."""
        if cookie.version > 0 and not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
        return True

    def return_ok_verifiability(self, cookie, request):
        """Withhold third-party cookies in unverifiable transactions if strict."""
        if request.unverifiable and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug(" third-party RFC 2965 cookie during unverifiable "
                       "transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug(" third-party Netscape cookie during unverifiable "
                       "transaction")
                return False
        return True

    def return_ok_secure(self, cookie, request):
        """Only return secure cookies over one of self.secure_protocols."""
        if cookie.secure and request.type not in self.secure_protocols:
            _debug(" secure cookie with non-secure request")
            return False
        return True

    def return_ok_expires(self, cookie, request):
        """Do not return cookies that are expired as of self._now.

        self._now is assigned by the CookieJar before it consults the policy.
        """
        if cookie.is_expired(self._now):
            _debug(" cookie expired")
            return False
        return True

    def return_ok_port(self, cookie, request):
        """Require the request port to be one of the cookie's ports, if any."""
        if cookie.port:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            for p in cookie.port.split(","):
                if p == req_port:
                    break
            else:
                _debug(" request port %s does not match cookie port %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok_domain(self, cookie, request):
        """Check that the request host is entitled to receive the cookie."""
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        # Compare against a dot-prefixed domain so that a host-only cookie
        # for "example.com" is not a suffix match for "pythonicexample.com".
        if domain and not domain.startswith("."):
            dotdomain = "." + domain
        else:
            dotdomain = domain

        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
        if (cookie.version == 0 and
            (self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            _debug(" cookie with unspecified domain does not string-compare "
                   "equal to request domain")
            return False

        if cookie.version > 0 and not domain_match(erhn, domain):
            _debug(" effective request-host name %s does not domain-match "
                   "RFC 2965 cookie domain %s", erhn, domain)
            return False
        if cookie.version == 0 and not ("."+erhn).endswith(dotdomain):
            _debug(" request-host %s does not match Netscape cookie domain "
                   "%s", req_host, domain)
            return False
        return True

    def domain_return_ok(self, domain, request):
        """Return False if no cookie stored under domain may go to request.

        A cheap pre-filter run once per stored domain, before any individual
        cookie is examined.
        """
        # Liberal check of.  This is here as an optimization to avoid
        # having to load lots of MSIE cookie files unless necessary.
        req_host, erhn = eff_request_host(request)
        if not req_host.startswith("."):
            req_host = "."+req_host
        if not erhn.startswith("."):
            erhn = "."+erhn
        # A bare suffix test would let "example.com" match the host
        # "pythonicexample.com"; anchor the comparison on a label boundary.
        if domain and not domain.startswith("."):
            dotdomain = "." + domain
        else:
            dotdomain = domain
        if not (req_host.endswith(dotdomain) or erhn.endswith(dotdomain)):
            #_debug(" request domain %s does not match cookie domain %s",
            #       req_host, domain)
            return False

        if self.is_blocked(domain):
            _debug(" domain %s is in user block-list", domain)
            return False
        if self.is_not_allowed(domain):
            _debug(" domain %s is not in user allow-list", domain)
            return False

        return True

    def path_return_ok(self, path, request):
        """Return False if cookies stored under path must not go to request.

        The cookie path matches when it equals the request path, or when it
        is a prefix of it that ends on a "/" boundary.  A plain prefix test
        would wrongly send a cookie for "/foo" along with "/foobar".
        """
        _debug("- checking cookie path=%s", path)
        req_path = request_path(request)
        if req_path == path:
            return True
        pathlen = len(path)
        if (req_path.startswith(path) and
            (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")):
            return True

        _debug(" %s does not path-match %s", req_path, path)
        return False
1197
1198
def vals_sorted_by_key(adict):
    """Return an iterator over adict's values, ordered by ascending key."""
    return map(adict.get, sorted(adict.keys()))
1202
def deepvalues(mapping):
    """Yield the leaf values of a nested mapping, depth-first, keys sorted.

    Any value that has an "items" attribute is treated as a nested mapping
    and descended into instead of being yielded itself.
    """
    for key in sorted(mapping.keys()):
        obj = mapping.get(key)
        if hasattr(obj, "items"):
            yield from deepvalues(obj)
        else:
            yield obj
1217
1218
class Absent:
    """Sentinel used as the second argument to dict.get().

    It lets callers tell a missing dict key apart from a key that is present
    with a None value.
    """
1222
1223class CookieJar:
1224 """Collection of HTTP cookies.
1225
1226 You may not need to know about this class: try
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001227 urllib.request.build_opener(HTTPCookieProcessor).open(url).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001228 """
1229
1230 non_word_re = re.compile(r"\W")
1231 quote_re = re.compile(r"([\"\\])")
1232 strict_domain_re = re.compile(r"\.?[^.]*")
1233 domain_re = re.compile(r"[^.]*")
1234 dots_re = re.compile(r"^\.+")
1235
Antoine Pitroufd036452008-08-19 17:56:33 +00001236 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001237
1238 def __init__(self, policy=None):
1239 if policy is None:
1240 policy = DefaultCookiePolicy()
1241 self._policy = policy
1242
1243 self._cookies_lock = _threading.RLock()
1244 self._cookies = {}
1245
    def set_policy(self, policy):
        """Replace the policy object consulted by this jar."""
        self._policy = policy
1248
1249 def _cookies_for_domain(self, domain, request):
1250 cookies = []
1251 if not self._policy.domain_return_ok(domain, request):
1252 return []
Thomas Wouters477c8d52006-05-27 19:21:47 +00001253 _debug("Checking %s for cookies to return", domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001254 cookies_by_path = self._cookies[domain]
1255 for path in cookies_by_path.keys():
1256 if not self._policy.path_return_ok(path, request):
1257 continue
1258 cookies_by_name = cookies_by_path[path]
1259 for cookie in cookies_by_name.values():
1260 if not self._policy.return_ok(cookie, request):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001261 _debug(" not returning cookie")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001262 continue
Thomas Wouters477c8d52006-05-27 19:21:47 +00001263 _debug(" it's a match")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001264 cookies.append(cookie)
1265 return cookies
1266
1267 def _cookies_for_request(self, request):
1268 """Return a list of cookies to be returned to server."""
1269 cookies = []
1270 for domain in self._cookies.keys():
1271 cookies.extend(self._cookies_for_domain(domain, request))
1272 return cookies
1273
1274 def _cookie_attrs(self, cookies):
1275 """Return a list of cookie-attributes to be returned to server.
1276
1277 like ['foo="bar"; $Path="/"', ...]
1278
1279 The $Version attribute is also added when appropriate (currently only
1280 once per request).
1281
1282 """
1283 # add cookies in order of most specific (ie. longest) path first
Raymond Hettinger70b64fc2008-01-30 20:15:17 +00001284 cookies.sort(key=lambda a: len(a.path), reverse=True)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001285
1286 version_set = False
1287
1288 attrs = []
1289 for cookie in cookies:
1290 # set version of Cookie header
1291 # XXX
1292 # What should it be if multiple matching Set-Cookie headers have
1293 # different versions themselves?
1294 # Answer: there is no answer; was supposed to be settled by
1295 # RFC 2965 errata, but that may never appear...
1296 version = cookie.version
1297 if not version_set:
1298 version_set = True
1299 if version > 0:
1300 attrs.append("$Version=%s" % version)
1301
1302 # quote cookie value if necessary
1303 # (not for Netscape protocol, which already has any quotes
1304 # intact, due to the poorly-specified Netscape Cookie: syntax)
1305 if ((cookie.value is not None) and
1306 self.non_word_re.search(cookie.value) and version > 0):
1307 value = self.quote_re.sub(r"\\\1", cookie.value)
1308 else:
1309 value = cookie.value
1310
1311 # add cookie-attributes to be returned in Cookie header
1312 if cookie.value is None:
1313 attrs.append(cookie.name)
1314 else:
1315 attrs.append("%s=%s" % (cookie.name, value))
1316 if version > 0:
1317 if cookie.path_specified:
1318 attrs.append('$Path="%s"' % cookie.path)
1319 if cookie.domain.startswith("."):
1320 domain = cookie.domain
1321 if (not cookie.domain_initial_dot and
1322 domain.startswith(".")):
1323 domain = domain[1:]
1324 attrs.append('$Domain="%s"' % domain)
1325 if cookie.port is not None:
1326 p = "$Port"
1327 if cookie.port_specified:
1328 p = p + ('="%s"' % cookie.port)
1329 attrs.append(p)
1330
1331 return attrs
1332
1333 def add_cookie_header(self, request):
Georg Brandl029986a2008-06-23 11:44:14 +00001334 """Add correct Cookie: header to request (urllib.request.Request object).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001335
1336 The Cookie2 header is also added unless policy.hide_cookie2 is true.
1337
1338 """
Thomas Wouters477c8d52006-05-27 19:21:47 +00001339 _debug("add_cookie_header")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001340 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001341 try:
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001342
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001343 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001344
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001345 cookies = self._cookies_for_request(request)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001346
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001347 attrs = self._cookie_attrs(cookies)
1348 if attrs:
1349 if not request.has_header("Cookie"):
1350 request.add_unredirected_header(
1351 "Cookie", "; ".join(attrs))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001352
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001353 # if necessary, advertise that we know RFC 2965
1354 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1355 not request.has_header("Cookie2")):
1356 for cookie in cookies:
1357 if cookie.version != 1:
1358 request.add_unredirected_header("Cookie2", '$Version="1"')
1359 break
1360
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001361 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001362 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001363
1364 self.clear_expired_cookies()
1365
1366 def _normalized_cookie_tuples(self, attrs_set):
1367 """Return list of tuples containing normalised cookie information.
1368
1369 attrs_set is the list of lists of key,value pairs extracted from
1370 the Set-Cookie or Set-Cookie2 headers.
1371
1372 Tuples are name, value, standard, rest, where name and value are the
1373 cookie name and value, standard is a dictionary containing the standard
1374 cookie-attributes (discard, secure, version, expires or max-age,
1375 domain, path and port) and rest is a dictionary containing the rest of
1376 the cookie-attributes.
1377
1378 """
1379 cookie_tuples = []
1380
1381 boolean_attrs = "discard", "secure"
1382 value_attrs = ("version",
1383 "expires", "max-age",
1384 "domain", "path", "port",
1385 "comment", "commenturl")
1386
1387 for cookie_attrs in attrs_set:
1388 name, value = cookie_attrs[0]
1389
1390 # Build dictionary of standard cookie-attributes (standard) and
1391 # dictionary of other cookie-attributes (rest).
1392
1393 # Note: expiry time is normalised to seconds since epoch. V0
1394 # cookies should have the Expires cookie-attribute, and V1 cookies
1395 # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1396 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1397 # accept either (but prefer Max-Age).
1398 max_age_set = False
1399
1400 bad_cookie = False
1401
1402 standard = {}
1403 rest = {}
1404 for k, v in cookie_attrs[1:]:
1405 lc = k.lower()
1406 # don't lose case distinction for unknown fields
1407 if lc in value_attrs or lc in boolean_attrs:
1408 k = lc
1409 if k in boolean_attrs and v is None:
1410 # boolean cookie-attribute is present, but has no value
1411 # (like "discard", rather than "port=80")
1412 v = True
1413 if k in standard:
1414 # only first value is significant
1415 continue
1416 if k == "domain":
1417 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001418 _debug(" missing value for domain attribute")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001419 bad_cookie = True
1420 break
1421 # RFC 2965 section 3.3.3
1422 v = v.lower()
1423 if k == "expires":
1424 if max_age_set:
1425 # Prefer max-age to expires (like Mozilla)
1426 continue
1427 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001428 _debug(" missing or invalid value for expires "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001429 "attribute: treating as session cookie")
1430 continue
1431 if k == "max-age":
1432 max_age_set = True
1433 try:
1434 v = int(v)
1435 except ValueError:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001436 _debug(" missing or invalid (non-numeric) value for "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001437 "max-age attribute")
1438 bad_cookie = True
1439 break
1440 # convert RFC 2965 Max-Age to seconds since epoch
1441 # XXX Strictly you're supposed to follow RFC 2616
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001442 # age-calculation rules. Remember that zero Max-Age
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001443 # is a request to discard (old and new) cookie, though.
1444 k = "expires"
1445 v = self._now + v
1446 if (k in value_attrs) or (k in boolean_attrs):
1447 if (v is None and
Raymond Hettingerdbecd932005-02-06 06:57:08 +00001448 k not in ("port", "comment", "commenturl")):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001449 _debug(" missing value for %s attribute" % k)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001450 bad_cookie = True
1451 break
1452 standard[k] = v
1453 else:
1454 rest[k] = v
1455
1456 if bad_cookie:
1457 continue
1458
1459 cookie_tuples.append((name, value, standard, rest))
1460
1461 return cookie_tuples
1462
1463 def _cookie_from_cookie_tuple(self, tup, request):
1464 # standard is dict of standard cookie-attributes, rest is dict of the
1465 # rest of them
1466 name, value, standard, rest = tup
1467
1468 domain = standard.get("domain", Absent)
1469 path = standard.get("path", Absent)
1470 port = standard.get("port", Absent)
1471 expires = standard.get("expires", Absent)
1472
1473 # set the easy defaults
1474 version = standard.get("version", None)
Benjamin Peterson3e5cd1d2010-06-27 21:45:24 +00001475 if version is not None:
1476 try:
1477 version = int(version)
1478 except ValueError:
1479 return None # invalid version, ignore cookie
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001480 secure = standard.get("secure", False)
1481 # (discard is also set if expires is Absent)
1482 discard = standard.get("discard", False)
1483 comment = standard.get("comment", None)
1484 comment_url = standard.get("commenturl", None)
1485
1486 # set default path
1487 if path is not Absent and path != "":
1488 path_specified = True
1489 path = escape_path(path)
1490 else:
1491 path_specified = False
1492 path = request_path(request)
1493 i = path.rfind("/")
1494 if i != -1:
1495 if version == 0:
1496 # Netscape spec parts company from reality here
1497 path = path[:i]
1498 else:
1499 path = path[:i+1]
1500 if len(path) == 0: path = "/"
1501
1502 # set default domain
1503 domain_specified = domain is not Absent
1504 # but first we have to remember whether it starts with a dot
1505 domain_initial_dot = False
1506 if domain_specified:
1507 domain_initial_dot = bool(domain.startswith("."))
1508 if domain is Absent:
1509 req_host, erhn = eff_request_host(request)
1510 domain = erhn
1511 elif not domain.startswith("."):
1512 domain = "."+domain
1513
1514 # set default port
1515 port_specified = False
1516 if port is not Absent:
1517 if port is None:
1518 # Port attr present, but has no value: default to request port.
1519 # Cookie should then only be sent back on that port.
1520 port = request_port(request)
1521 else:
1522 port_specified = True
1523 port = re.sub(r"\s+", "", port)
1524 else:
1525 # No port attr present. Cookie can be sent back on any port.
1526 port = None
1527
1528 # set default expires and discard
1529 if expires is Absent:
1530 expires = None
1531 discard = True
1532 elif expires <= self._now:
1533 # Expiry date in past is request to delete cookie. This can't be
1534 # in DefaultCookiePolicy, because can't delete cookies there.
1535 try:
1536 self.clear(domain, path, name)
1537 except KeyError:
1538 pass
Thomas Wouters477c8d52006-05-27 19:21:47 +00001539 _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1540 domain, path, name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001541 return None
1542
1543 return Cookie(version,
1544 name, value,
1545 port, port_specified,
1546 domain, domain_specified, domain_initial_dot,
1547 path, path_specified,
1548 secure,
1549 expires,
1550 discard,
1551 comment,
1552 comment_url,
1553 rest)
1554
1555 def _cookies_from_attrs_set(self, attrs_set, request):
1556 cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1557
1558 cookies = []
1559 for tup in cookie_tuples:
1560 cookie = self._cookie_from_cookie_tuple(tup, request)
1561 if cookie: cookies.append(cookie)
1562 return cookies
1563
Neal Norwitz71dad722005-12-23 21:43:48 +00001564 def _process_rfc2109_cookies(self, cookies):
1565 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1566 if rfc2109_as_ns is None:
1567 rfc2109_as_ns = not self._policy.rfc2965
1568 for cookie in cookies:
1569 if cookie.version == 1:
1570 cookie.rfc2109 = True
Tim Peters536cf992005-12-25 23:18:31 +00001571 if rfc2109_as_ns:
Neal Norwitz71dad722005-12-23 21:43:48 +00001572 # treat 2109 cookies as Netscape cookies rather than
1573 # as RFC2965 cookies
1574 cookie.version = 0
1575
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001576 def make_cookies(self, response, request):
1577 """Return sequence of Cookie objects extracted from response object."""
1578 # get cookie-attributes for RFC 2965 and Netscape protocols
1579 headers = response.info()
Barry Warsaw820c1202008-06-12 04:06:45 +00001580 rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
1581 ns_hdrs = headers.get_all("Set-Cookie", [])
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001582
1583 rfc2965 = self._policy.rfc2965
1584 netscape = self._policy.netscape
1585
1586 if ((not rfc2965_hdrs and not ns_hdrs) or
1587 (not ns_hdrs and not rfc2965) or
1588 (not rfc2965_hdrs and not netscape) or
1589 (not netscape and not rfc2965)):
1590 return [] # no relevant cookie headers: quick exit
1591
1592 try:
1593 cookies = self._cookies_from_attrs_set(
1594 split_header_words(rfc2965_hdrs), request)
Thomas Wouters477c8d52006-05-27 19:21:47 +00001595 except Exception:
1596 _warn_unhandled_exception()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001597 cookies = []
1598
1599 if ns_hdrs and netscape:
1600 try:
Neal Norwitz71dad722005-12-23 21:43:48 +00001601 # RFC 2109 and Netscape cookies
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001602 ns_cookies = self._cookies_from_attrs_set(
1603 parse_ns_headers(ns_hdrs), request)
Thomas Wouters477c8d52006-05-27 19:21:47 +00001604 except Exception:
1605 _warn_unhandled_exception()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001606 ns_cookies = []
Neal Norwitz71dad722005-12-23 21:43:48 +00001607 self._process_rfc2109_cookies(ns_cookies)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001608
1609 # Look for Netscape cookies (from Set-Cookie headers) that match
1610 # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1611 # For each match, keep the RFC 2965 cookie and ignore the Netscape
1612 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are
1613 # bundled in with the Netscape cookies for this purpose, which is
1614 # reasonable behaviour.
1615 if rfc2965:
1616 lookup = {}
1617 for cookie in cookies:
1618 lookup[(cookie.domain, cookie.path, cookie.name)] = None
1619
1620 def no_matching_rfc2965(ns_cookie, lookup=lookup):
1621 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1622 return key not in lookup
1623 ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1624
1625 if ns_cookies:
1626 cookies.extend(ns_cookies)
1627
1628 return cookies
1629
def set_cookie_if_ok(self, cookie, request):
    """Store *cookie* in the jar, provided the policy accepts it.

    The current time is recorded on both the policy and the jar before
    the policy's set_ok() is consulted.  The whole operation runs while
    holding the jar's cookie lock.
    """
    with self._cookies_lock:
        timestamp = int(time.time())
        self._policy._now = timestamp
        self._now = timestamp

        if not self._policy.set_ok(cookie, request):
            return
        self.set_cookie(cookie)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001642
def set_cookie(self, cookie):
    """Store *cookie* unconditionally, without consulting the policy.

    Cookies live in a three-level mapping keyed by domain, then path,
    then name; missing intermediate levels are created on demand and an
    existing cookie with the same triple is replaced.
    """
    # The store is looked up before the lock is taken, as it always has been.
    by_domain = self._cookies
    with self._cookies_lock:
        by_path = by_domain.setdefault(cookie.domain, {})
        by_name = by_path.setdefault(cookie.path, {})
        by_name[cookie.name] = cookie
1655
def extract_cookies(self, response, request):
    """Extract cookies from response, keeping those the policy allows.

    Every cookie produced by make_cookies() for this response/request
    pair is offered to the policy's set_ok(); accepted ones are stored
    with set_cookie().  The jar's cookie lock is held throughout.
    """
    _debug("extract_cookies: %s", response.info())
    with self._cookies_lock:
        timestamp = int(time.time())
        self._policy._now = timestamp
        self._now = timestamp

        for candidate in self.make_cookies(response, request):
            if not self._policy.set_ok(candidate, request):
                continue
            _debug(" setting cookie: %s", candidate)
            self.set_cookie(candidate)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001669
def clear(self, domain=None, path=None, name=None):
    """Remove cookies, narrowing the selection with each extra argument.

    With no arguments every cookie is dropped.  With only *domain*, all
    cookies for that domain go; adding *path* restricts removal to that
    path within the domain; adding *name* removes exactly one cookie.

    Raises ValueError if *name* is given without both *domain* and
    *path*, or *path* is given without *domain*.  Raises KeyError if no
    matching cookie exists.
    """
    if name is not None:
        if domain is None or path is None:
            raise ValueError(
                "domain and path must be given to remove a cookie by name")
        del self._cookies[domain][path][name]
        return

    if path is not None:
        if domain is None:
            raise ValueError(
                "domain must be given to remove cookies by path")
        del self._cookies[domain][path]
        return

    if domain is not None:
        del self._cookies[domain]
        return

    # Nothing specified: start over with an empty store.
    self._cookies = {}
1696
def clear_session_cookies(self):
    """Discard all session cookies.

    A cookie counts as a session cookie when its ``discard`` attribute
    is true.  Note that the .save() method won't save session cookies
    anyway, unless you ask otherwise by passing a true ignore_discard
    argument.
    """
    with self._cookies_lock:
        for entry in self:
            if not entry.discard:
                continue
            self.clear(entry.domain, entry.path, entry.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001711
def clear_expired_cookies(self):
    """Discard all expired cookies.

    Expiry is judged by each cookie's is_expired() against a single
    timestamp taken when the sweep starts.

    You probably don't need to call this method: expired cookies are never
    sent back to the server (provided you're using DefaultCookiePolicy),
    this method is called by CookieJar itself every so often, and the
    .save() method won't save expired cookies anyway (unless you ask
    otherwise by passing a true ignore_expires argument).
    """
    with self._cookies_lock:
        sweep_time = time.time()
        for entry in self:
            if not entry.is_expired(sweep_time):
                continue
            self.clear(entry.domain, entry.path, entry.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001730
def __iter__(self):
    """Return the iterator that deepvalues() builds over the cookie store."""
    nested_store = self._cookies
    return deepvalues(nested_store)
1733
def __len__(self):
    """Return number of contained cookies."""
    # Counted by walking the jar; no separate tally is kept.
    return sum(1 for _ in self)
1739
def __repr__(self):
    """Return ``<ClassName[...]>`` listing the repr() of every cookie."""
    rendered = ", ".join([repr(cookie) for cookie in self])
    return "<%s[%s]>" % (self.__class__.__name__, rendered)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001744
def __str__(self):
    """Return ``<ClassName[...]>`` listing the str() of every cookie."""
    rendered = ", ".join([str(cookie) for cookie in self])
    return "<%s[%s]>" % (self.__class__.__name__, rendered)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001749
1750
# Subclasses OSError for backwards-compatibility with Python 2.4.0.
class LoadError(OSError):
    """Raised when a cookies file cannot be parsed in the expected format."""
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001753
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """Create the jar without touching the file system.

        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename -- default path for load(), save() and revert(); must be
            string-like when given.
        delayload -- stored, coerced to bool, as ``self.delayload``.
        policy -- forwarded to CookieJar.__init__.

        Raises ValueError if *filename* is given but is not string-like.
        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                filename + ""
            except Exception:
                # Deliberately not a bare ``except``: KeyboardInterrupt and
                # SystemExit must not be reported as a bad filename.
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented here; always raises NotImplementedError.
        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        Uses ``self.filename`` when *filename* is None.  The file is opened
        and handed, with the remaining arguments, to ``self._really_load``,
        which this class does not define itself.

        Raises ValueError if no filename is available.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or OSError) if reversion is not successful; the
        object's state will not be altered if this happens.  The previous
        cookies are likewise restored if loading fails with any other
        exception.

        Raises ValueError if no filename is available.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with self._cookies_lock:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except BaseException:
                # Roll back on every failure, not just OSError: a decoding
                # error or an interrupt would otherwise leave the jar empty
                # or half-loaded, contradicting the documented guarantee.
                self._cookies = old_state
                raise
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001810
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001811
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    The result is a single header-words string, built with
    join_header_words(), whose first pair is the cookie's name and value.
    It is followed by path, domain, the optional standard attributes,
    any extra attributes from ``cookie._rest`` in sorted key order, and
    finally the version.  Besides the usual fields, the flags path_spec,
    port_spec and domain_dot are written when set.
    """
    h = [(cookie.name, cookie.value),
         ("path", cookie.path),
         ("domain", cookie.domain)]
    if cookie.port is not None:
        h.append(("port", cookie.port))
    if cookie.path_specified:
        h.append(("path_spec", None))
    if cookie.port_specified:
        h.append(("port_spec", None))
    if cookie.domain_initial_dot:
        h.append(("domain_dot", None))
    if cookie.secure:
        h.append(("secure", None))
    # Test against None rather than truthiness: an expiry of 0 (the epoch)
    # is a real timestamp and must not be dropped from the output.
    if cookie.expires is not None:
        h.append(("expires", time2isoz(float(cookie.expires))))
    if cookie.discard:
        h.append(("discard", None))
    if cookie.comment:
        h.append(("comment", cookie.comment))
    if cookie.comment_url:
        h.append(("commenturl", cookie.comment_url))

    for k in sorted(cookie._rest.keys()):
        h.append((k, str(cookie._rest[k])))

    h.append(("version", str(cookie.version)))

    return join_header_words([h])
1839
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        When ignore_discard is false, cookies whose ``discard`` flag is set
        are left out; when ignore_expires is false, cookies that report
        themselves expired are left out.  Every emitted line, including the
        last, ends with a newline.

        """
        now = time.time()
        r = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        return "\n".join(r + [""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to *filename* (default ``self.filename``).

        The file starts with a "#LWP-Cookies-2.0" line followed by the
        output of as_lwp_str().  Cookie files hold credentials, so a newly
        created file is requested with owner-only permissions (0o600).

        Raises ValueError if no filename is available.
        """
        # Imported here because this module does not otherwise need os.
        import os

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with os.fdopen(
            os.open(filename, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o600),
            "w",
        ) as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse "Set-Cookie3" lines from the open file *f* into the jar.

        The first line must match ``self.magic_re``.  After that, lines not
        starting with "Set-Cookie3:" are skipped.  *filename* is used only
        in error messages.  Cookies flagged discard / already expired are
        skipped unless the corresponding ignore_* argument is true.

        Raises LoadError if the magic line is missing or a line cannot be
        parsed; OSError from reading the file propagates unchanged.
        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound up front so the error message below can always be built,
        # even when the very first read is what fails.
        line = ""
        try:
            for line in iter(f.readline, ""):
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    standard = dict.fromkeys(boolean_attrs, False)
                    rest = {}
                    for k, v in data[1:]:
                        lc = k.lower() if k is not None else None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            if v is None:
                                v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    if expires is None:
                        # No usable expiry: treat as a session cookie.
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)
        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1958
1959
1960class MozillaCookieJar(FileCookieJar):
1961 """
1962
1963 WARNING: you may want to backup your browser's cookies file if you use
1964 this class to save cookies. I *think* it works, but there have been
1965 bugs in the past!
1966
1967 This class differs from CookieJar only in the format it uses to save and
1968 load cookies to and from a file. This class uses the Mozilla/Netscape
1969 `cookies.txt' format. lynx uses this file format, too.
1970
1971 Don't expect cookies saved while the browser is running to be noticed by
1972 the browser (in fact, Mozilla on unix will overwrite your saved cookies if
1973 you change them on disk while it's running; on Windows, you probably can't
1974 save at all while the browser is running).
1975
1976 Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
1977 Netscape cookies on saving.
1978
1979 In particular, the cookie version and port number information is lost,
1980 together with information about whether or not Path, Port and Discard were
1981 specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
1982 domain as set in the HTTP header started with a dot (yes, I'm aware some
1983 domains in Netscape files start with a dot and some don't -- trust me, you
1984 really don't want to know any more about this).
1985
1986 Note that though Mozilla and Netscape use the same format, they use
1987 slightly different headers. The class saves cookies using the Netscape
1988 header by default (Mozilla can cope with that).
1989
1990 """
Antoine Pitroufd036452008-08-19 17:56:33 +00001991 magic_re = re.compile("#( Netscape)? HTTP Cookie File")
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001992 header = """\
Georg Brandl87a15642010-07-31 22:11:11 +00001993# Netscape HTTP Cookie File
Benjamin Petersonccedc222013-12-18 15:35:18 -06001994# http://curl.haxx.se/rfc/cookie_spec.html
Georg Brandl87a15642010-07-31 22:11:11 +00001995# This is a generated file! Do not edit.
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001996
1997"""
1998
def _really_load(self, f, filename, ignore_discard, ignore_expires):
    """Parse a Mozilla/Netscape cookies.txt stream *f* into the jar.

    The first line must match ``self.magic_re``.  Each remaining line
    that is not blank and does not start with "#" or "$" must hold seven
    tab-separated fields: domain, domain flag, path, secure flag,
    expires, name, value.  *filename* is used only in error messages.
    Cookies flagged discard / already expired are skipped unless the
    corresponding ignore_* argument is true.

    Raises LoadError if the magic line is missing or a line is malformed;
    OSError from reading the file propagates unchanged.
    """
    now = time.time()

    magic = f.readline()
    if not self.magic_re.search(magic):
        raise LoadError(
            "%r does not look like a Netscape format cookies file" %
            filename)

    # Bound up front so the error message below can always be built,
    # even when the very first read is what fails.
    line = ""
    try:
        for line in iter(f.readline, ""):
            # last field may be absent, so keep any trailing tab
            if line.endswith("\n"):
                line = line[:-1]

            # skip comments and blank lines XXX what is $ for?
            stripped = line.strip()
            if stripped.startswith(("#", "$")) or stripped == "":
                continue

            domain, domain_specified, path, secure, expires, name, value = \
                line.split("\t")
            secure = (secure == "TRUE")
            domain_specified = (domain_specified == "TRUE")
            if name == "":
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name = value
                value = None

            initial_dot = domain.startswith(".")
            # Explicit check instead of ``assert`` so malformed input is
            # still rejected when Python runs with -O.
            if domain_specified != initial_dot:
                raise ValueError(
                    "domain flag does not agree with leading dot of domain")

            discard = False
            if expires == "":
                expires = None
                discard = True

            # assume path_specified is false
            c = Cookie(0, name, value,
                       None, False,
                       domain, domain_specified, initial_dot,
                       path, False,
                       secure,
                       expires,
                       discard,
                       None,
                       None,
                       {})
            if not ignore_discard and c.discard:
                continue
            if not ignore_expires and c.is_expired(now):
                continue
            self.set_cookie(c)

    except OSError:
        raise
    except Exception:
        _warn_unhandled_exception()
        raise LoadError("invalid Netscape format cookies file %r: %r" %
                        (filename, line))
2063
def save(self, filename=None, ignore_discard=False, ignore_expires=False):
    """Write the jar to *filename* in Mozilla/Netscape cookies.txt format.

    Uses ``self.filename`` when *filename* is None.  ``self.header`` is
    written first, then one line per cookie with seven tab-separated
    fields: domain, leading-dot flag, path, secure flag, expires, name,
    value.  Cookies flagged discard / already expired are skipped unless
    the corresponding ignore_* argument is true.  Cookie files hold
    credentials, so a newly created file is requested with owner-only
    permissions (0o600).

    Raises ValueError if no filename is available.
    """
    # Imported here because this module does not otherwise need os.
    import os

    if filename is None:
        if self.filename is not None:
            filename = self.filename
        else:
            raise ValueError(MISSING_FILENAME_TEXT)

    with os.fdopen(
        os.open(filename, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o600),
        "w",
    ) as f:
        f.write(self.header)
        now = time.time()
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            secure = "TRUE" if cookie.secure else "FALSE"
            initial_dot = "TRUE" if cookie.domain.startswith(".") else "FALSE"
            expires = str(cookie.expires) if cookie.expires is not None else ""
            if cookie.value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name = ""
                value = cookie.name
            else:
                name = cookie.name
                value = cookie.value
            f.write(
                "\t".join([cookie.domain, initial_dot, cookie.path,
                           secure, expires, name, value]) +
                "\n")
2097 "\n")