blob: 265ccf99f285374b64724c399ebdcd992c023dcd [file] [log] [blame]
r"""HTTP cookie handling for web clients.

This module has (now fairly distant) origins in Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
distributed with the Python standard library, but are available from
http://wwwsearch.sf.net/):

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

"""
27
Thomas Wouters477c8d52006-05-27 19:21:47 +000028__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
29 'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
30
Jeremy Hylton1afc1692008-06-18 20:49:58 +000031import copy
Victor Stinner628225c2011-03-21 02:38:51 +010032import datetime
Jeremy Hylton1afc1692008-06-18 20:49:58 +000033import re
34import time
35import urllib.parse, urllib.request
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000036try:
37 import threading as _threading
Brett Cannoncd171c82013-07-04 17:43:24 -040038except ImportError:
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000039 import dummy_threading as _threading
Georg Brandl24420152008-05-26 16:32:26 +000040import http.client # only for the default HTTP port
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000041from calendar import timegm
42
Thomas Wouters477c8d52006-05-27 19:21:47 +000043debug = False # set to True to enable debugging via the logging module
44logger = None
45
46def _debug(*args):
47 if not debug:
48 return
49 global logger
50 if not logger:
51 import logging
Georg Brandl24420152008-05-26 16:32:26 +000052 logger = logging.getLogger("http.cookiejar")
Thomas Wouters477c8d52006-05-27 19:21:47 +000053 return logger.debug(*args)
54
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000055
Georg Brandl24420152008-05-26 16:32:26 +000056DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000057MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
58 "instance initialised with one)")
59
Thomas Wouters477c8d52006-05-27 19:21:47 +000060def _warn_unhandled_exception():
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000061 # There are a few catch-all except: statements in this module, for
Thomas Wouters477c8d52006-05-27 19:21:47 +000062 # catching input that's bad in unexpected ways. Warn if any
63 # exceptions are caught there.
Jeremy Hylton7ecf3dc2008-05-10 20:38:40 +000064 import io, warnings, traceback
Guido van Rossum34d19282007-08-09 01:03:29 +000065 f = io.StringIO()
Andrew M. Kuchlingae40c2f2004-07-10 18:32:12 +000066 traceback.print_exc(None, f)
67 msg = f.getvalue()
Georg Brandl24420152008-05-26 16:32:26 +000068 warnings.warn("http.cookiejar bug!\n%s" % msg, stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000069
70
71# Date/time conversion
72# -----------------------------------------------------------------------------
73
74EPOCH_YEAR = 1970
75def _timegm(tt):
76 year, month, mday, hour, min, sec = tt[:6]
77 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
78 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
79 return timegm(tt)
80 else:
81 return None
82
# Abbreviated English day and month names used when formatting and parsing
# cookie dates.  DAYS is indexed by datetime.weekday() (Monday == 0).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copy of MONTHS for case-insensitive month lookups.  Built with
# a comprehension so no stray loop variable is left in the module namespace.
MONTHS_LOWER = [month_name.lower() for month_name in MONTHS]
88
def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    # Use timezone-aware constructors: datetime.utcnow() and
    # datetime.utcfromtimestamp() are deprecated.  The date and time fields
    # read below are identical either way.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000107
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    # Use timezone-aware constructors: datetime.utcnow() and
    # datetime.utcfromtimestamp() are deprecated.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    # The weekday is followed by a comma, as the documented format requires
    # (the previous format string omitted it).
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1],
        dt.year, dt.hour, dt.minute, dt.second)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000126
127
# Timezone names treated as equivalent to UTC (only the keys are used).
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)
def offset_from_tz_string(tz):
    """Return the UTC offset in seconds described by the string tz.

    Names listed in UTC_ZONES give 0.  Numeric zones such as "-0800",
    "+0100" or "+01:30" give the signed offset.  Anything else gives None.
    """
    if tz in UTC_ZONES:
        return 0
    m = TIMEZONE_RE.search(tz)
    if not m:
        return None
    sign, hours, minutes = m.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds = seconds + 60 * int(minutes)
    return -seconds if sign == '-' else seconds
144
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date components to seconds since the epoch.

    The components are the strings (or None for absent clock parts and
    timezone) captured by the date regexps.  mon may be a month name or a
    number.  Returns None when the year exceeds datetime.MAXYEAR, the month
    is unrecognised, the date is out of the range _timegm accepts, or the
    timezone string is unknown.
    """
    yr = int(yr)
    if yr > datetime.MAXYEAR:
        return None

    # Translate the month to a number; month numbers start with 1 (January).
    try:
        mon = MONTHS_LOWER.index(mon.lower()) + 1
    except ValueError:
        # Not a known month name -- maybe it's already a number.
        try:
            month_number = int(mon)
        except ValueError:
            return None
        if not 1 <= month_number <= 12:
            return None
        mon = month_number

    # Missing clock elements default to zero.
    day = int(day)
    hr = 0 if hr is None else int(hr)
    min = 0 if min is None else int(min)
    sec = 0 if sec is None else int(sec)

    if yr < 1000:
        # Find the "obvious" year: place a short year in the current
        # century, then shift by a century if that lands more than 50
        # years away from the current year.
        current_year = time.localtime(time.time())[0]
        years_into_century = current_year % 100
        distance = years_into_century - yr
        yr = yr + current_year - years_into_century
        if distance > 50:
            yr = yr + 100
        elif distance < -50:
            yr = yr - 100

    # Convert UTC time tuple to seconds since epoch (not timezone-adjusted).
    t = _timegm((yr, mon, day, hr, min, sec, tz))
    if t is None:
        return None

    # Adjust using the timezone string to get absolute time since epoch.
    zone = "UTC" if tz is None else tz
    offset = offset_from_tz_string(zone.upper())
    if offset is None:
        return None
    return t - offset
200
# Both halves of the pattern are raw strings, so the regex escapes are never
# interpreted as (invalid) Python string escapes.
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X | re.ASCII)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # The strict pattern accepts any capitalised three-letter word
            # (e.g. "Foo") in the month position; report that as an
            # unrecognised date instead of letting ValueError escape.
            return None
        # Seconds are converted with int() so the result is an integer, as
        # documented and as the loose path below already returns.
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, count=1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
278
# Raw string, so the regex escapes are never interpreted as (invalid) Python
# string escapes.
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X | re.ASCII)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    Fractional seconds are accepted and truncated to whole seconds.

    """
    # clean up
    text = text.lstrip()

    # loose regexp parse
    m = ISO_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string.
    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    yr, mon, day, hr, min, sec, tz, _ = m.groups()

    if sec is not None:
        # The pattern allows a fractional part ("29.5"), which the integer
        # conversion in _str2time would reject with ValueError; keep only
        # the whole seconds.
        sec = sec.partition(".")[0]

    return _str2time(day, mon, yr, hr, min, sec, tz)
323
324
325# Header parsing
326# -----------------------------------------------------------------------------
327
def unmatched(match):
    """Return unmatched part of re.Match object."""
    start, end = match.span(0)
    return match.string[:start]+match.string[end:]

HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, str)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                # Consume the token, then look for an optional "=value".
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    # Drop the backslash from each quoted-pair.
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk (raw pattern: "\s" must reach the regex engine,
                # not be treated as a Python string escape)
                non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result
421
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Pairs within one inner list are joined with "; " and the inner
    lists with ", ".  A value of None yields the bare key.  Any other value
    is wrapped in double quotes (with " and \\ escaped) unless it consists
    solely of word characters.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859-1")]])
    'text/plain; charset="iso-8859-1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859-1")]])
    'text/plain, charset="iso-8859-1"'

    """
    def render(key, value):
        # A lone token has no "=value" part.
        if value is None:
            return key
        if not re.search(r"^\w+$", value):
            escaped = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)  # escape " and \
            value = '"%s"' % escaped
        return "%s=%s" % (key, value)

    headers = []
    for pairs in lists:
        rendered = [render(k, v) for k, v in pairs]
        # Inner lists without any pairs contribute nothing.
        if rendered:
            headers.append("; ".join(rendered))
    return ", ".join(headers)
447
def strip_quotes(text):
    """Return text without one leading and one trailing double quote.

    Each end is handled independently, so an unbalanced quote is removed too.
    """
    begin = 1 if text.startswith('"') else 0
    text = text[begin:]
    # Test the tail only after the head is gone, so a lone '"' is not
    # counted twice.
    return text[:-1] if text.endswith('"') else text
454
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    Each header yields a list of (key, value) pairs.  The first pair is the
    cookie's own name and value, kept as written.  For the following pairs,
    known attribute names are lower-cased, "version" values have their quotes
    stripped, and "expires" values are converted to seconds since the epoch
    (None if invalid).  A value is None when the parameter had no "=".  A
    ("version", "0") pair is appended when the header set no version.
    Headers whose first parameter has an empty name are dropped.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False

        # XXX: The following does not strictly adhere to RFCs in that empty
        # names and values are legal (the former will only appear once and will
        # be overwritten if multiple occurrences are present). This is
        # mostly to deal with backwards compatibility.
        for index, param in enumerate(ns_header.split(';')):
            key, sep, val = param.strip().partition('=')
            key = key.strip()

            if not key:
                if index == 0:
                    # No cookie name at all: give up on this header.
                    break
                continue

            # allow for a distinction between present and empty and missing
            # altogether
            val = val.strip() if sep else None

            is_attribute = index != 0
            if is_attribute:
                lowered = key.lower()
                if lowered in known_attrs:
                    key = lowered

                if key == "version":
                    # This is an RFC 2109 cookie.
                    if val is not None:
                        val = strip_quotes(val)
                    version_set = True
                elif key == "expires":
                    # convert expires date to seconds since epoch
                    if val is not None:
                        val = http2time(strip_quotes(val))  # None if invalid
            pairs.append((key, val))

        if not pairs:
            continue
        if not version_set:
            pairs.append(("version", "0"))
        result.append(pairs)

    return result
521
522
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        return False
    if text == "":
        return False
    if text[0] == "." or text[-1] == ".":
        return False
    return True

def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # "A has the form NB" means B is a suffix of A.  Merely finding B
    # somewhere inside A is not enough: "www.acme.com.evil.org" must not
    # match ".acme.com".  N is non-empty because A != B at this point.
    if not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
577
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  The only thing rejected is text that
    IPV4_RE matches, i.e. text ending in a dot followed by digits.

    """
    return not IPV4_RE.search(text)
587
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  If either argument fails liberal_is_HDN, only exact
    equality matches.  Otherwise a B with a leading dot matches any A that
    ends with it, and a B without one must equal A.

    """
    A = A.lower()
    B = B.lower()
    if not liberal_is_HDN(A) or not liberal_is_HDN(B):
        # At least one side is not a domain name: equal IP addresses only.
        return A == B
    if B.startswith("."):
        return A.endswith(B)
    return A == B
607
cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the network-location part of the request's full
    URL, falling back to the request's "Host" header when that is empty.
    Any trailing ":port" is removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urllib.parse.urlparse(request.get_full_url())[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", netloc, count=1).lower()
624
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective name
    is the request-host with ".local" appended when the request-host
    contains no dot and is not matched by IPV4_RE.

    """
    req_host = request_host(request)
    dotless = "." not in req_host
    if dotless and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
635
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965.

    The path of the request's full URL is passed through escape_path and
    given a leading "/" if it lacks one.
    """
    split_url = urllib.parse.urlsplit(request.get_full_url())
    escaped = escape_path(split_url.path)
    if escaped.startswith("/"):
        return escaped
    # fix bad RFC 2396 absoluteURI
    return "/" + escaped
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000645
def request_port(request):
    """Return the port of the request's host, as a string.

    The port is whatever follows the first ":" in request.host.  When there
    is no ":", DEFAULT_HTTP_PORT is returned.  A port that is not an integer
    is logged via _debug and None is returned.
    """
    _, colon, port = request.host.partition(':')
    if not colon:
        return DEFAULT_HTTP_PORT
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
659
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the %XX escape captured by match with upper-case hex digits."""
    hex_digits = match.group(1)
    return "%%%s" % hex_digits.upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = urllib.parse.quote(path, HTTP_PATH_SAFE)
    # "%" is in the safe set, so escapes already present survive quoting
    # untouched; normalise their hex digits to upper case.
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
679
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split at the first dot: `label` is A (unused), `remainder` is B.
    label, dot, remainder = h.partition(".")
    if dot and is_HDN(h) and ("." in remainder or remainder == "local"):
        return "." + remainder
    return h
714
def is_third_party(request):
    """Return True if the request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    origin_reach = reach(request.origin_req_host)
    return not domain_match(request_host(request), origin_reach)
730
731
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes on the instance.

        version is converted with int() and expires with int(float()) unless
        they are None.  domain is lower-cased.  rest is a mapping of
        non-standard cookie-attributes; a shallow copy of it is kept.

        Raises ValueError if port is None while port_specified is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(float(expires))
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        # Copy, so later changes to the caller's mapping don't leak in.
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return whether the non-standard cookie-attribute name is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard cookie-attribute name, or default."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the non-standard cookie-attribute name to value."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        now defaults to the current time.  Cookies whose expires is None
        never expire.
        """
        if now is None:
            now = time.time()
        return self.expires is not None and self.expires <= now

    def __str__(self):
        port_part = "" if self.port is None else ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        attribute_names = (
            "version", "name", "value",
            "port", "port_specified",
            "domain", "domain_specified", "domain_initial_dot",
            "path", "path_specified",
            "secure", "expires", "discard", "comment", "comment_url",
        )
        args = ["%s=%s" % (attr_name, repr(getattr(self, attr_name)))
                for attr_name in attribute_names]
        # rest is stored under a private name, so it is listed separately.
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "%s(%s)" % (self.__class__.__name__, ", ".join(args))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000828
829
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    set_ok and return_ok must be supplied by subclasses; the two
    *_return_ok hooks default to allowing everything.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation always returns True.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation always returns True.
        """
        return True
861
862
class DefaultCookiePolicy(CookiePolicy):
    """Implements the standard rules for accepting and returning cookies.

    Acceptance is decided by set_ok(), which runs each set_ok_* check in
    turn.  Returning is decided by return_ok() (the return_ok_* checks)
    together with the domain_return_ok() and path_return_ok() pre-filters.
    Optional block and allow lists restrict the cookie domains that are
    accepted and returned.

    The Domain* class attributes are bit flags for the strict_ns_domain
    constructor argument.

    """

    DomainStrictNoDots = 1
    DomainStrictNonDomain = 2
    DomainRFC2965Match = 4

    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain

    def __init__(self,
                 blocked_domains=None, allowed_domains=None,
                 netscape=True, rfc2965=False,
                 rfc2109_as_netscape=None,
                 hide_cookie2=False,
                 strict_domain=False,
                 strict_rfc2965_unverifiable=True,
                 strict_ns_unverifiable=False,
                 strict_ns_domain=DomainLiberal,
                 strict_ns_set_initial_dollar=False,
                 strict_ns_set_path=False,
                 ):
        """Constructor arguments should be passed as keyword arguments only."""
        self.netscape = netscape
        self.rfc2965 = rfc2965
        self.rfc2109_as_netscape = rfc2109_as_netscape
        self.hide_cookie2 = hide_cookie2
        self.strict_domain = strict_domain
        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
        self.strict_ns_unverifiable = strict_ns_unverifiable
        self.strict_ns_domain = strict_ns_domain
        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
        self.strict_ns_set_path = strict_ns_set_path

        # Both lists are stored as tuples; None for the allow list means
        # "no allow-list restriction".
        if blocked_domains is not None:
            self._blocked_domains = tuple(blocked_domains)
        else:
            self._blocked_domains = ()

        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    @staticmethod
    def _with_leading_dot(domain):
        """Return domain with a leading dot added if it is non-empty and
        does not already start with one."""
        if domain and not domain.startswith("."):
            return "." + domain
        return domain

    def blocked_domains(self):
        """Return the sequence of blocked domains (as a tuple)."""
        return self._blocked_domains

    def set_blocked_domains(self, blocked_domains):
        """Set the sequence of blocked domains."""
        self._blocked_domains = tuple(blocked_domains)

    def is_blocked(self, domain):
        """Return True if domain matches any entry of the block list."""
        return any(user_domain_match(domain, blocked_domain)
                   for blocked_domain in self._blocked_domains)

    def allowed_domains(self):
        """Return None, or the sequence of allowed domains (as a tuple)."""
        return self._allowed_domains

    def set_allowed_domains(self, allowed_domains):
        """Set the sequence of allowed domains, or None."""
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def is_not_allowed(self, domain):
        """Return True if an allow list is set and domain matches none of
        its entries; always False when no allow list is set."""
        if self._allowed_domains is None:
            return False
        return not any(user_domain_match(domain, allowed_domain)
                       for allowed_domain in self._allowed_domains)

    def set_ok(self, cookie, request):
        """
        If you override .set_ok(), be sure to call this method.  If it returns
        false, so should your subclass (assuming your subclass wants to be more
        strict about which cookies to accept).

        """
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        assert cookie.name is not None

        # Checks are looked up by name so that subclasses can override
        # individual set_ok_* methods.
        for n in "version", "verifiability", "name", "path", "domain", "port":
            fn_name = "set_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False

        return True

    def set_ok_version(self, cookie, request):
        """Reject cookies without a version, or of a switched-off protocol."""
        if cookie.version is None:
            # Version is always set to 0 by parse_ns_headers if it's a Netscape
            # cookie, so this must be an invalid RFC 2965 cookie.
            _debug(" Set-Cookie2 without version attribute (%s=%s)",
                   cookie.name, cookie.value)
            return False
        if cookie.version > 0 and not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
        return True

    def set_ok_verifiability(self, cookie, request):
        """Reject third-party cookies set during unverifiable transactions,
        when the matching strict_*_unverifiable option is on."""
        if request.unverifiable and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug(" third-party RFC 2965 cookie during "
                       "unverifiable transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug(" third-party Netscape cookie during "
                       "unverifiable transaction")
                return False
        return True

    def set_ok_name(self, cookie, request):
        """Optionally reject version-0 cookies whose name starts with '$'."""
        # Try and stop servers setting V0 cookies designed to hack other
        # servers that know both V0 and V1 protocols.
        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
            cookie.name.startswith("$")):
            _debug(" illegal name (starts with '$'): '%s'", cookie.name)
            return False
        return True

    def set_ok_path(self, cookie, request):
        """Reject an explicit Path that is not a prefix of the request path.

        Applies to cookies with version > 0, and to version-0 cookies
        only when strict_ns_set_path is true.

        """
        if cookie.path_specified:
            req_path = request_path(request)
            if ((cookie.version > 0 or
                 (cookie.version == 0 and self.strict_ns_set_path)) and
                not req_path.startswith(cookie.path)):
                _debug(" path attribute %s is not a prefix of request "
                       "path %s", cookie.path, req_path)
                return False
        return True

    def set_ok_domain(self, cookie, request):
        """Check the cookie domain against the block/allow lists and, for
        an explicitly specified Domain, against the request host."""
        if self.is_blocked(cookie.domain):
            _debug(" domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug(" domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                # XXX This should probably be compared with the Konqueror
                # (kcookiejar.cpp) and Mozilla implementations, but it's a
                # losing battle.
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
                       "info", "jobs", "mobi", "museum", "name", "pro",
                       "travel", "eu") and len(tld) == 2:
                        # domain like .co.uk
                        _debug(" country-code second level domain %s", domain)
                        return False
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                _debug(" non-local domain %s contains no embedded dot",
                       domain)
                return False
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    _debug(" effective request-host %s (even with added "
                           "initial dot) does not end with %s",
                           erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    _debug(" effective request-host %s does not domain-match "
                           "%s", erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    _debug(" host prefix %s for domain %s contains a dot",
                           host_prefix, domain)
                    return False
        return True

    def set_ok_port(self, cookie, request):
        """For an explicit Port list, require every entry to be numeric and
        the request port to be one of them."""
        if cookie.port_specified:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            else:
                req_port = str(req_port)
            for p in cookie.port.split(","):
                try:
                    int(p)
                except ValueError:
                    _debug(" bad port %s (not numeric)", p)
                    return False
                if p == req_port:
                    break
            else:
                _debug(" request port (%s) not found in %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok(self, cookie, request):
        """
        If you override .return_ok(), be sure to call this method.  If it
        returns false, so should your subclass (assuming your subclass wants to
        be more strict about which cookies to return).

        """
        # Path has already been checked by .path_return_ok(), and domain
        # blocking done by .domain_return_ok().
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        for n in "version", "verifiability", "secure", "expires", "port", "domain":
            fn_name = "return_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False
        return True

    def return_ok_version(self, cookie, request):
        """Do not return cookies of a switched-off protocol."""
        if cookie.version > 0 and not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
        return True

    def return_ok_verifiability(self, cookie, request):
        """Do not return third-party cookies during unverifiable
        transactions, when the matching strict_*_unverifiable option is on."""
        if request.unverifiable and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug(" third-party RFC 2965 cookie during unverifiable "
                       "transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug(" third-party Netscape cookie during unverifiable "
                       "transaction")
                return False
        return True

    def return_ok_secure(self, cookie, request):
        """Return secure cookies only when the request type is "https"."""
        if cookie.secure and request.type != "https":
            _debug(" secure cookie with non-secure request")
            return False
        return True

    def return_ok_expires(self, cookie, request):
        """Do not return cookies that have expired as of self._now.

        self._now is not set by this class; the CookieJar methods in this
        file assign it on the policy before consulting it.

        """
        if cookie.is_expired(self._now):
            _debug(" cookie expired")
            return False
        return True

    def return_ok_port(self, cookie, request):
        """If the cookie has a port list, require the request port to be
        one of its entries."""
        if cookie.port:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            for p in cookie.port.split(","):
                if p == req_port:
                    break
            else:
                _debug(" request port %s does not match cookie port %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok_domain(self, cookie, request):
        """Check that the cookie's domain matches the request host."""
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        # Compare against the dotted form so that the suffix test below can
        # only match on a label boundary: without it a cookie for
        # "example.com" would also be returned to "badexample.com".
        dotdomain = self._with_leading_dot(domain)

        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
        if (cookie.version == 0 and
            (self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            _debug(" cookie with unspecified domain does not string-compare "
                   "equal to request domain")
            return False

        if cookie.version > 0 and not domain_match(erhn, domain):
            _debug(" effective request-host name %s does not domain-match "
                   "RFC 2965 cookie domain %s", erhn, domain)
            return False
        if cookie.version == 0 and not ("."+erhn).endswith(dotdomain):
            _debug(" request-host %s does not match Netscape cookie domain "
                   "%s", req_host, domain)
            return False
        return True

    def domain_return_ok(self, domain, request):
        """Cheap pre-filter: may cookies stored under domain be returned?

        Returns False if the request host cannot match the cookie domain,
        or if the domain is blocked / not allowed by the user lists.

        """
        # Liberal check of domain.  This is here as an optimization to avoid
        # having to load lots of MSIE cookie files unless necessary.
        req_host, erhn = eff_request_host(request)
        if not req_host.startswith("."):
            req_host = "."+req_host
        if not erhn.startswith("."):
            erhn = "."+erhn
        # Match on a label boundary only (see return_ok_domain): a bare
        # suffix test would let "example.com" match "badexample.com".
        dotdomain = self._with_leading_dot(domain)
        if not (req_host.endswith(dotdomain) or erhn.endswith(dotdomain)):
            #_debug(" request domain %s does not match cookie domain %s",
            #       req_host, domain)
            return False

        if self.is_blocked(domain):
            _debug(" domain %s is in user block-list", domain)
            return False
        if self.is_not_allowed(domain):
            _debug(" domain %s is not in user allow-list", domain)
            return False

        return True

    def path_return_ok(self, path, request):
        """Cheap pre-filter: does the request path fall under cookie path?

        The cookie path matches when it equals the request path, or when
        it is a prefix of it that ends on a "/" boundary.  A plain prefix
        test is not enough: "/foo" must not match "/foobar".

        """
        _debug("- checking cookie path=%s", path)
        req_path = request_path(request)
        pathlen = len(path)
        if req_path == path:
            return True
        elif (req_path.startswith(path) and
              (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")):
            return True

        _debug(" %s does not path-match %s", req_path, path)
        return False
1198
1199
def vals_sorted_by_key(adict):
    """Return an iterator over the values of adict, in sorted key order."""
    return map(adict.get, sorted(adict.keys()))
1203
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    A value is treated as a nested mapping when it has an ``items``
    attribute; such values are descended into rather than yielded.

    """
    for obj in vals_sorted_by_key(mapping):
        if hasattr(obj, "items"):
            yield from deepvalues(obj)
        else:
            yield obj
1218
1219
1220# Used as second parameter to dict.get() method, to distinguish absent
1221# dict key from one with a None value.
class Absent:
    """Sentinel passed as the default to dict.get(), so that a missing key
    can be told apart from a key whose value is None."""
1223
1224class CookieJar:
1225 """Collection of HTTP cookies.
1226
1227 You may not need to know about this class: try
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001228 urllib.request.build_opener(HTTPCookieProcessor).open(url).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001229 """
1230
1231 non_word_re = re.compile(r"\W")
1232 quote_re = re.compile(r"([\"\\])")
1233 strict_domain_re = re.compile(r"\.?[^.]*")
1234 domain_re = re.compile(r"[^.]*")
1235 dots_re = re.compile(r"^\.+")
1236
Antoine Pitroufd036452008-08-19 17:56:33 +00001237 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001238
def __init__(self, policy=None):
    """Create an empty jar.

    policy is the cookie policy to consult; when it is None a
    DefaultCookiePolicy instance is created and used.

    """
    self._policy = DefaultCookiePolicy() if policy is None else policy

    # Re-entrant lock taken by the methods that read or modify the jar.
    self._cookies_lock = _threading.RLock()
    # Nested dict: domain -> path -> name -> cookie.
    self._cookies = {}
1246
def set_policy(self, policy):
    """Replace the cookie policy object consulted by this jar."""
    self._policy = policy
1249
def _cookies_for_domain(self, domain, request):
    """Return the cookies stored under domain that the policy allows to be
    returned for request.

    The policy is consulted at three levels: once for the domain, once
    per stored path, and once per individual cookie.

    """
    if not self._policy.domain_return_ok(domain, request):
        return []
    _debug("Checking %s for cookies to return", domain)

    matched = []
    for path, cookies_by_name in self._cookies[domain].items():
        if not self._policy.path_return_ok(path, request):
            continue
        for cookie in cookies_by_name.values():
            if self._policy.return_ok(cookie, request):
                _debug(" it's a match")
                matched.append(cookie)
            else:
                _debug(" not returning cookie")
    return matched
1267
1268 def _cookies_for_request(self, request):
1269 """Return a list of cookies to be returned to server."""
1270 cookies = []
1271 for domain in self._cookies.keys():
1272 cookies.extend(self._cookies_for_domain(domain, request))
1273 return cookies
1274
1275 def _cookie_attrs(self, cookies):
1276 """Return a list of cookie-attributes to be returned to server.
1277
1278 like ['foo="bar"; $Path="/"', ...]
1279
1280 The $Version attribute is also added when appropriate (currently only
1281 once per request).
1282
1283 """
1284 # add cookies in order of most specific (ie. longest) path first
Raymond Hettinger70b64fc2008-01-30 20:15:17 +00001285 cookies.sort(key=lambda a: len(a.path), reverse=True)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001286
1287 version_set = False
1288
1289 attrs = []
1290 for cookie in cookies:
1291 # set version of Cookie header
1292 # XXX
1293 # What should it be if multiple matching Set-Cookie headers have
1294 # different versions themselves?
1295 # Answer: there is no answer; was supposed to be settled by
1296 # RFC 2965 errata, but that may never appear...
1297 version = cookie.version
1298 if not version_set:
1299 version_set = True
1300 if version > 0:
1301 attrs.append("$Version=%s" % version)
1302
1303 # quote cookie value if necessary
1304 # (not for Netscape protocol, which already has any quotes
1305 # intact, due to the poorly-specified Netscape Cookie: syntax)
1306 if ((cookie.value is not None) and
1307 self.non_word_re.search(cookie.value) and version > 0):
1308 value = self.quote_re.sub(r"\\\1", cookie.value)
1309 else:
1310 value = cookie.value
1311
1312 # add cookie-attributes to be returned in Cookie header
1313 if cookie.value is None:
1314 attrs.append(cookie.name)
1315 else:
1316 attrs.append("%s=%s" % (cookie.name, value))
1317 if version > 0:
1318 if cookie.path_specified:
1319 attrs.append('$Path="%s"' % cookie.path)
1320 if cookie.domain.startswith("."):
1321 domain = cookie.domain
1322 if (not cookie.domain_initial_dot and
1323 domain.startswith(".")):
1324 domain = domain[1:]
1325 attrs.append('$Domain="%s"' % domain)
1326 if cookie.port is not None:
1327 p = "$Port"
1328 if cookie.port_specified:
1329 p = p + ('="%s"' % cookie.port)
1330 attrs.append(p)
1331
1332 return attrs
1333
def add_cookie_header(self, request):
    """Add correct Cookie: header to request (urllib.request.Request object).

    The Cookie2 header is also added unless policy.hide_cookie2 is true.
    Headers already present on the request are left alone.  Expired
    cookies are cleared from the jar afterwards.

    """
    _debug("add_cookie_header")
    with self._cookies_lock:
        # Policy and jar share one timestamp for this operation.
        self._policy._now = self._now = int(time.time())

        cookies = self._cookies_for_request(request)

        attrs = self._cookie_attrs(cookies)
        if attrs and not request.has_header("Cookie"):
            request.add_unredirected_header("Cookie", "; ".join(attrs))

        # if necessary, advertise that we know RFC 2965
        if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
                not request.has_header("Cookie2")):
            if any(cookie.version != 1 for cookie in cookies):
                request.add_unredirected_header("Cookie2", '$Version="1"')

    self.clear_expired_cookies()
1366
1367 def _normalized_cookie_tuples(self, attrs_set):
1368 """Return list of tuples containing normalised cookie information.
1369
1370 attrs_set is the list of lists of key,value pairs extracted from
1371 the Set-Cookie or Set-Cookie2 headers.
1372
1373 Tuples are name, value, standard, rest, where name and value are the
1374 cookie name and value, standard is a dictionary containing the standard
1375 cookie-attributes (discard, secure, version, expires or max-age,
1376 domain, path and port) and rest is a dictionary containing the rest of
1377 the cookie-attributes.
1378
1379 """
1380 cookie_tuples = []
1381
1382 boolean_attrs = "discard", "secure"
1383 value_attrs = ("version",
1384 "expires", "max-age",
1385 "domain", "path", "port",
1386 "comment", "commenturl")
1387
1388 for cookie_attrs in attrs_set:
1389 name, value = cookie_attrs[0]
1390
1391 # Build dictionary of standard cookie-attributes (standard) and
1392 # dictionary of other cookie-attributes (rest).
1393
1394 # Note: expiry time is normalised to seconds since epoch. V0
1395 # cookies should have the Expires cookie-attribute, and V1 cookies
1396 # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1397 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1398 # accept either (but prefer Max-Age).
1399 max_age_set = False
1400
1401 bad_cookie = False
1402
1403 standard = {}
1404 rest = {}
1405 for k, v in cookie_attrs[1:]:
1406 lc = k.lower()
1407 # don't lose case distinction for unknown fields
1408 if lc in value_attrs or lc in boolean_attrs:
1409 k = lc
1410 if k in boolean_attrs and v is None:
1411 # boolean cookie-attribute is present, but has no value
1412 # (like "discard", rather than "port=80")
1413 v = True
1414 if k in standard:
1415 # only first value is significant
1416 continue
1417 if k == "domain":
1418 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001419 _debug(" missing value for domain attribute")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001420 bad_cookie = True
1421 break
1422 # RFC 2965 section 3.3.3
1423 v = v.lower()
1424 if k == "expires":
1425 if max_age_set:
1426 # Prefer max-age to expires (like Mozilla)
1427 continue
1428 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001429 _debug(" missing or invalid value for expires "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001430 "attribute: treating as session cookie")
1431 continue
1432 if k == "max-age":
1433 max_age_set = True
1434 try:
1435 v = int(v)
1436 except ValueError:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001437 _debug(" missing or invalid (non-numeric) value for "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001438 "max-age attribute")
1439 bad_cookie = True
1440 break
1441 # convert RFC 2965 Max-Age to seconds since epoch
1442 # XXX Strictly you're supposed to follow RFC 2616
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001443 # age-calculation rules. Remember that zero Max-Age
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001444 # is a request to discard (old and new) cookie, though.
1445 k = "expires"
1446 v = self._now + v
1447 if (k in value_attrs) or (k in boolean_attrs):
1448 if (v is None and
Raymond Hettingerdbecd932005-02-06 06:57:08 +00001449 k not in ("port", "comment", "commenturl")):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001450 _debug(" missing value for %s attribute" % k)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001451 bad_cookie = True
1452 break
1453 standard[k] = v
1454 else:
1455 rest[k] = v
1456
1457 if bad_cookie:
1458 continue
1459
1460 cookie_tuples.append((name, value, standard, rest))
1461
1462 return cookie_tuples
1463
def _cookie_from_cookie_tuple(self, tup, request):
    """Build a Cookie from one normalised (name, value, standard, rest)
    tuple, filling in defaults from request.

    Returns None when the version attribute is not an integer, or when
    the expiry time is not later than self._now; in the latter case any
    stored cookie with the same domain, path and name is removed first.

    """
    # standard is dict of standard cookie-attributes, rest is dict of the
    # rest of them
    name, value, standard, rest = tup

    domain = standard.get("domain", Absent)
    path = standard.get("path", Absent)
    port = standard.get("port", Absent)
    expires = standard.get("expires", Absent)

    # The easy defaults.
    version = standard.get("version", None)
    if version is not None:
        try:
            version = int(version)
        except ValueError:
            return None  # invalid version, ignore cookie
    secure = standard.get("secure", False)
    # (discard is also set if expires is Absent)
    discard = standard.get("discard", False)
    comment = standard.get("comment", None)
    comment_url = standard.get("commenturl", None)

    # Path: an explicit non-empty value is escaped; otherwise it is
    # derived from the request path.
    path_specified = path is not Absent and path != ""
    if path_specified:
        path = escape_path(path)
    else:
        path = request_path(request)
        last_slash = path.rfind("/")
        if last_slash != -1:
            if version == 0:
                # Netscape spec parts company from reality here
                path = path[:last_slash]
            else:
                path = path[:last_slash+1]
        if len(path) == 0:
            path = "/"

    # Domain: remember whether the server sent an initial dot, then
    # default to the effective request host or add the dot ourselves.
    if domain is Absent:
        domain_specified = False
        domain_initial_dot = False
        req_host, erhn = eff_request_host(request)
        domain = erhn
    else:
        domain_specified = True
        domain_initial_dot = bool(domain.startswith("."))
        if not domain.startswith("."):
            domain = "."+domain

    # Port.
    port_specified = False
    if port is Absent:
        # No port attr present.  Cookie can be sent back on any port.
        port = None
    elif port is None:
        # Port attr present, but has no value: default to request port.
        # Cookie should then only be sent back on that port.
        port = request_port(request)
    else:
        port_specified = True
        port = re.sub(r"\s+", "", port)

    # Expiry and discard.
    if expires is Absent:
        expires = None
        discard = True
    elif expires <= self._now:
        # Expiry date in past is request to delete cookie.  This can't be
        # in DefaultCookiePolicy, because can't delete cookies there.
        try:
            self.clear(domain, path, name)
        except KeyError:
            pass
        _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
               domain, path, name)
        return None

    return Cookie(version,
                  name, value,
                  port, port_specified,
                  domain, domain_specified, domain_initial_dot,
                  path, path_specified,
                  secure,
                  expires,
                  discard,
                  comment,
                  comment_url,
                  rest)
1555
1556 def _cookies_from_attrs_set(self, attrs_set, request):
1557 cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1558
1559 cookies = []
1560 for tup in cookie_tuples:
1561 cookie = self._cookie_from_cookie_tuple(tup, request)
1562 if cookie: cookies.append(cookie)
1563 return cookies
1564
Neal Norwitz71dad722005-12-23 21:43:48 +00001565 def _process_rfc2109_cookies(self, cookies):
1566 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1567 if rfc2109_as_ns is None:
1568 rfc2109_as_ns = not self._policy.rfc2965
1569 for cookie in cookies:
1570 if cookie.version == 1:
1571 cookie.rfc2109 = True
Tim Peters536cf992005-12-25 23:18:31 +00001572 if rfc2109_as_ns:
Neal Norwitz71dad722005-12-23 21:43:48 +00001573 # treat 2109 cookies as Netscape cookies rather than
1574 # as RFC2965 cookies
1575 cookie.version = 0
1576
def make_cookies(self, response, request):
    """Return sequence of Cookie objects extracted from response object.

    Cookies are built from the Set-Cookie2 and Set-Cookie headers of
    response.info().  No policy acceptance check is done here, apart
    from honouring the policy's netscape / rfc2965 switches.

    """
    # get cookie-attributes for RFC 2965 and Netscape protocols
    headers = response.info()
    rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
    ns_hdrs = headers.get_all("Set-Cookie", [])

    rfc2965 = self._policy.rfc2965
    netscape = self._policy.netscape

    # Quick exit unless some header is present, and every header kind
    # that is missing or switched off is covered by the other protocol.
    relevant = ((rfc2965_hdrs or ns_hdrs) and
                (ns_hdrs or rfc2965) and
                (rfc2965_hdrs or netscape) and
                (netscape or rfc2965))
    if not relevant:
        return []  # no relevant cookie headers: quick exit

    try:
        cookies = self._cookies_from_attrs_set(
            split_header_words(rfc2965_hdrs), request)
    except Exception:
        _warn_unhandled_exception()
        cookies = []

    if ns_hdrs and netscape:
        try:
            # RFC 2109 and Netscape cookies
            ns_cookies = self._cookies_from_attrs_set(
                parse_ns_headers(ns_hdrs), request)
        except Exception:
            _warn_unhandled_exception()
            ns_cookies = []
        self._process_rfc2109_cookies(ns_cookies)

        # Look for Netscape cookies (from Set-Cookie headers) that match
        # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
        # For each match, keep the RFC 2965 cookie and ignore the Netscape
        # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
        # bundled in with the Netscape cookies for this purpose, which is
        # reasonable behaviour.
        if rfc2965:
            taken = {(cookie.domain, cookie.path, cookie.name)
                     for cookie in cookies}
            ns_cookies = [
                ns_cookie for ns_cookie in ns_cookies
                if (ns_cookie.domain, ns_cookie.path,
                    ns_cookie.name) not in taken]

        if ns_cookies:
            cookies.extend(ns_cookies)

    return cookies
1630
def set_cookie_if_ok(self, cookie, request):
    """Set a cookie if policy says it's OK to do so.

    The jar lock is held, and the shared timestamp on jar and policy is
    refreshed, while the policy is consulted.

    """
    with self._cookies_lock:
        self._policy._now = self._now = int(time.time())

        if not self._policy.set_ok(cookie, request):
            return
        self.set_cookie(cookie)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001643
def set_cookie(self, cookie):
    """Set a cookie, without checking whether or not it should be set.

    The cookie is filed under its domain, then path, then name; an
    existing cookie with the same three keys is replaced.

    """
    by_domain = self._cookies
    with self._cookies_lock:
        by_path = by_domain.setdefault(cookie.domain, {})
        by_name = by_path.setdefault(cookie.path, {})
        by_name[cookie.name] = cookie
1656
def extract_cookies(self, response, request):
    """Store every cookie from *response* that the policy allows for *request*."""
    _debug("extract_cookies: %s", response.info())
    with self._cookies_lock:
        # Policy and jar are stamped with the same integer timestamp before
        # any cookie is evaluated.
        current = int(time.time())
        self._policy._now = current
        self._now = current

        for candidate in self.make_cookies(response, request):
            if not self._policy.set_ok(candidate, request):
                continue
            _debug(" setting cookie: %s", candidate)
            self.set_cookie(candidate)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001670
def clear(self, domain=None, path=None, name=None):
    """Remove cookies from the jar.

    With no arguments every cookie is removed.  With only *domain*, all
    cookies for that domain are removed.  With *domain* and *path*, the
    cookies stored under that path of the domain are removed.  With all
    three arguments, the single cookie identified by them is removed.

    Raises ValueError if *path* is given without *domain*, or *name* is
    given without both *domain* and *path*.  Raises KeyError if no matching
    cookie exists.

    """
    if name is None and path is None:
        if domain is None:
            # Nothing specified: drop the whole store.
            self._cookies = {}
        else:
            del self._cookies[domain]
        return

    if name is None:
        # A path was supplied, so a domain is required.
        if domain is None:
            raise ValueError(
                "domain must be given to remove cookies by path")
        del self._cookies[domain][path]
        return

    # A name was supplied, so both domain and path are required.
    if domain is None or path is None:
        raise ValueError(
            "domain and path must be given to remove a cookie by name")
    del self._cookies[domain][path][name]
1697
def clear_session_cookies(self):
    """Remove every cookie whose ``discard`` flag is set.

    Note that the .save() method won't save session cookies anyway, unless
    you ask otherwise by passing a true ignore_discard argument.

    """
    with self._cookies_lock:
        for entry in self:
            if not entry.discard:
                continue
            self.clear(entry.domain, entry.path, entry.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001712
def clear_expired_cookies(self):
    """Remove every cookie that reports itself as expired right now.

    You probably don't need to call this method: expired cookies are never
    sent back to the server (provided you're using DefaultCookiePolicy),
    this method is called by CookieJar itself every so often, and the
    .save() method won't save expired cookies anyway (unless you ask
    otherwise by passing a true ignore_expires argument).

    """
    with self._cookies_lock:
        # One timestamp is taken up front and used for every cookie.
        reference = time.time()
        for entry in self:
            if not entry.is_expired(reference):
                continue
            self.clear(entry.domain, entry.path, entry.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001731
def __iter__(self):
    """Return the iterator that deepvalues() produces for the cookie store."""
    store = self._cookies
    return deepvalues(store)
1734
def __len__(self):
    """Return the number of cookies, counted by iterating over the jar."""
    return sum(1 for _ in self)
1740
def __repr__(self):
    """Return ``<ClassName[...]>`` listing the repr() of each cookie."""
    listing = ", ".join([repr(entry) for entry in self])
    return "<%s[%s]>" % (self.__class__.__name__, listing)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001745
def __str__(self):
    """Return ``<ClassName[...]>`` listing the str() of each cookie."""
    listing = ", ".join([str(entry) for entry in self])
    return "<%s[%s]>" % (self.__class__.__name__, listing)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001750
1751
# derives from OSError for backwards-compatibility with Python 2.4.0
class LoadError(OSError):
    """Raised when a cookies file is not in the expected format."""
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001754
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        Raises ValueError if filename is given but cannot be concatenated
        with a string (i.e. is not string-like).

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                # Duck-typed check: anything that supports "+ str" is accepted.
                filename + ""
            except Exception:
                # Deliberately not a bare "except:", so KeyboardInterrupt and
                # SystemExit are never converted into a ValueError.
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented here; this base class always raises
        NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  The file is opened in text mode and handed to
        self._really_load(), which does the actual parsing.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        If loading fails for any reason (LoadError, OSError or anything
        else), the previous cookies are put back and the exception is
        re-raised, so the object's state is not altered by a failed revert.

        Raises ValueError if no filename is given and self.filename is unset.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with self._cookies_lock:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except BaseException:
                # Roll back on every failure, not only OSError: load() can
                # also fail with e.g. a decoding error, and a failed revert
                # must not leave the jar emptied or half-loaded.
                self._cookies = old_state
                raise
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001811
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001812
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    The cookie is flattened into an ordered list of (key, value) pairs and
    joined with join_header_words().  Flag-style attributes are emitted as
    keys with a None value.

    Actually, the format is extended a bit -- see module docstring.

    """
    pairs = [
        (cookie.name, cookie.value),
        ("path", cookie.path),
        ("domain", cookie.domain),
    ]
    if cookie.port is not None:
        pairs.append(("port", cookie.port))

    # Boolean attributes: (attribute on the Cookie, key written to the file).
    flag_keys = (
        ("path_specified", "path_spec"),
        ("port_specified", "port_spec"),
        ("domain_initial_dot", "domain_dot"),
        ("secure", "secure"),
    )
    for attr, key in flag_keys:
        if getattr(cookie, attr):
            pairs.append((key, None))

    if cookie.expires:
        pairs.append(("expires", time2isoz(float(cookie.expires))))
    if cookie.discard:
        pairs.append(("discard", None))
    if cookie.comment:
        pairs.append(("comment", cookie.comment))
    if cookie.comment_url:
        pairs.append(("commenturl", cookie.comment_url))

    # Non-standard attributes follow in sorted key order, stringified.
    for extra in sorted(cookie._rest.keys()):
        pairs.append((extra, str(cookie._rest[extra])))

    pairs.append(("version", str(cookie.version)))

    return join_header_words([pairs])
1840
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        ignore_discard and ignore_expires: see docstring for FileCookieJar.save

        """
        now = time.time()
        r = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        # The extra empty element terminates the last header with a newline.
        return "\n".join(r+[""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to a file as a magic line plus "Set-Cookie3" lines.

        filename defaults to self.filename; ValueError is raised if neither
        is set.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename, "w") as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse "Set-Cookie3" lines from the open text file f into the jar.

        filename is used only in error messages.  Cookies flagged discard
        or already expired are skipped unless the matching ignore_* argument
        is true.

        Raises LoadError if the first line does not match self.magic_re or
        if any later line cannot be parsed; OSError is passed through
        unchanged.

        """
        magic = f.readline()
        # NOTE(review): magic_re is not defined in this class; presumably it
        # is inherited from a base class -- confirm.
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound before the try block so the error handler below can always
        # format it, even when the very first readline() in the loop raises.
        line = ""
        try:
            while True:
                line = f.readline()
                if line == "": break
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    standard = {}
                    rest = {}
                    for k in boolean_attrs:
                        standard[k] = False
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            # A bare flag (no value) means "true".
                            if v is None: v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    # No expiry (absent, or None from iso2time()) makes this
                    # a session cookie.
                    if expires is None:
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)
        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1959
1960
1961class MozillaCookieJar(FileCookieJar):
1962 """
1963
1964 WARNING: you may want to backup your browser's cookies file if you use
1965 this class to save cookies. I *think* it works, but there have been
1966 bugs in the past!
1967
1968 This class differs from CookieJar only in the format it uses to save and
1969 load cookies to and from a file. This class uses the Mozilla/Netscape
1970 `cookies.txt' format. lynx uses this file format, too.
1971
1972 Don't expect cookies saved while the browser is running to be noticed by
1973 the browser (in fact, Mozilla on unix will overwrite your saved cookies if
1974 you change them on disk while it's running; on Windows, you probably can't
1975 save at all while the browser is running).
1976
1977 Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
1978 Netscape cookies on saving.
1979
1980 In particular, the cookie version and port number information is lost,
1981 together with information about whether or not Path, Port and Discard were
1982 specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
1983 domain as set in the HTTP header started with a dot (yes, I'm aware some
1984 domains in Netscape files start with a dot and some don't -- trust me, you
1985 really don't want to know any more about this).
1986
1987 Note that though Mozilla and Netscape use the same format, they use
1988 slightly different headers. The class saves cookies using the Netscape
1989 header by default (Mozilla can cope with that).
1990
1991 """
Antoine Pitroufd036452008-08-19 17:56:33 +00001992 magic_re = re.compile("#( Netscape)? HTTP Cookie File")
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001993 header = """\
Georg Brandl87a15642010-07-31 22:11:11 +00001994# Netscape HTTP Cookie File
Benjamin Petersonccedc222013-12-18 15:35:18 -06001995# http://curl.haxx.se/rfc/cookie_spec.html
Georg Brandl87a15642010-07-31 22:11:11 +00001996# This is a generated file! Do not edit.
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001997
1998"""
1999
def _really_load(self, f, filename, ignore_discard, ignore_expires):
    """Parse a Netscape-format cookies file from the open text file f.

    After the magic first line, every line that is neither blank nor a
    comment must hold seven tab-separated fields: domain, domain_specified,
    path, secure, expires, name, value.  filename is used only in error
    messages.  Cookies flagged discard or already expired are skipped
    unless the matching ignore_* argument is true.

    Raises LoadError if the first line does not match self.magic_re or if
    any later line cannot be parsed; OSError is passed through unchanged.

    """
    now = time.time()

    magic = f.readline()
    if not self.magic_re.search(magic):
        raise LoadError(
            "%r does not look like a Netscape format cookies file" %
            filename)

    # Bound before the try block so the error handler below can always
    # format it, even when the very first readline() in the loop raises.
    line = ""
    try:
        while True:
            line = f.readline()
            if line == "": break

            # last field may be absent, so keep any trailing tab
            if line.endswith("\n"): line = line[:-1]

            # skip comments and blank lines XXX what is $ for?
            if (line.strip().startswith(("#", "$")) or
                line.strip() == ""):
                continue

            domain, domain_specified, path, secure, expires, name, value = \
                    line.split("\t")
            secure = (secure == "TRUE")
            domain_specified = (domain_specified == "TRUE")
            if name == "":
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name = value
                value = None

            initial_dot = domain.startswith(".")
            # NOTE(review): this check disappears under "python -O"; a
            # failure here is turned into LoadError by the handler below.
            assert domain_specified == initial_dot

            discard = False
            if expires == "":
                # No expiry recorded: treat as a session cookie.
                expires = None
                discard = True

            # assume path_specified is false
            c = Cookie(0, name, value,
                       None, False,
                       domain, domain_specified, initial_dot,
                       path, False,
                       secure,
                       expires,
                       discard,
                       None,
                       None,
                       {})
            if not ignore_discard and c.discard:
                continue
            if not ignore_expires and c.is_expired(now):
                continue
            self.set_cookie(c)

    except OSError:
        raise
    except Exception:
        _warn_unhandled_exception()
        raise LoadError("invalid Netscape format cookies file %r: %r" %
                        (filename, line))
2064
def save(self, filename=None, ignore_discard=False, ignore_expires=False):
    """Write the jar to a file: self.header, then one line per cookie.

    Each cookie line holds seven tab-separated fields: domain, initial-dot
    flag, path, secure flag, expires, name, value.  filename defaults to
    self.filename; ValueError is raised if neither is set.  Cookies flagged
    discard or already expired are skipped unless the matching ignore_*
    argument is true.

    """
    if filename is None:
        if self.filename is None:
            raise ValueError(MISSING_FILENAME_TEXT)
        filename = self.filename

    with open(filename, "w") as out:
        out.write(self.header)
        now = time.time()
        for entry in self:
            if entry.discard and not ignore_discard:
                continue
            if not ignore_expires and entry.is_expired(now):
                continue

            secure_flag = "TRUE" if entry.secure else "FALSE"
            dot_flag = "TRUE" if entry.domain.startswith(".") else "FALSE"
            expiry = "" if entry.expires is None else str(entry.expires)

            if entry.value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name_field, value_field = "", entry.name
            else:
                name_field, value_field = entry.name, entry.value

            fields = [entry.domain, dot_flag, entry.path,
                      secure_flag, expiry, name_field, value_field]
            out.write("\t".join(fields) + "\n")