blob: e9efab88df7661998e313f6aa42c44f3b37a91b4 [file] [log] [blame]
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001"""HTTP cookie handling for web clients.
2
3This module has (now fairly distant) origins in Gisle Aas' Perl module
4HTTP::Cookies, from the libwww-perl library.
5
6Docstrings, comments and debug strings in this code refer to the
7attributes of the HTTP cookie system as cookie-attributes, to distinguish
8them clearly from Python attributes.
9
Thomas Wouters477c8d52006-05-27 19:21:47 +000010Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
11distributed with the Python standard library, but are available from
12http://wwwsearch.sf.net/):
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000013
                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar
25
26"""
27
Thomas Wouters477c8d52006-05-27 19:21:47 +000028__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
29 'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
30
Jeremy Hylton1afc1692008-06-18 20:49:58 +000031import copy
32import re
33import time
34import urllib.parse, urllib.request
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000035try:
36 import threading as _threading
37except ImportError:
38 import dummy_threading as _threading
Georg Brandl24420152008-05-26 16:32:26 +000039import http.client # only for the default HTTP port
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000040from calendar import timegm
41
debug = False   # flip to True to route debug output through the logging module
logger = None   # looked up lazily by _debug() the first time it is needed


def _debug(*args):
    """Pass *args* to the module logger's debug() when debugging is enabled.

    Does nothing and returns None while the module-level ``debug`` flag is
    false.  The "http.cookiejar" logger is fetched on first use, so the
    ``logging`` module is only imported once debugging is actually wanted.
    """
    global logger
    if debug:
        if not logger:
            import logging
            logger = logging.getLogger("http.cookiejar")
        return logger.debug(*args)
    return None
53
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000054
# The standard HTTP port, rendered as a decimal string.
DEFAULT_HTTP_PORT = "%d" % http.client.HTTP_PORT
# Error text for operations that need a filename when none is available.
MISSING_FILENAME_TEXT = (
    "a filename was not supplied "
    "(nor was the CookieJar instance initialised with one)"
)
58
def _warn_unhandled_exception():
    """Issue a warning that carries the traceback of the current exception.

    Meant for the few catch-all ``except`` clauses in this module, which
    guard against input that is bad in unexpected ways: calling this there
    makes sure the swallowed exception is at least reported.
    """
    import traceback
    import warnings

    # stacklevel=2 attributes the warning to the caller of this helper.
    warnings.warn("http.cookiejar bug!\n%s" % traceback.format_exc(),
                  stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000068
69
70# Date/time conversion
71# -----------------------------------------------------------------------------
72
EPOCH_YEAR = 1970


def _timegm(tt):
    """Return seconds since the epoch for the UTC time tuple *tt*, or None.

    The first six items of *tt* (year, month, day, hour, minute, second) are
    range-checked before the tuple is handed to calendar.timegm(); None is
    returned if any of them is out of range or the year precedes EPOCH_YEAR.
    """
    year, month, mday, hour, minute, second = tt[:6]
    in_range = (
        year >= EPOCH_YEAR
        and 1 <= month <= 12
        and 1 <= mday <= 31
        # note: hour 24 and seconds 60-61 pass this check
        and 0 <= hour <= 24
        and 0 <= minute <= 59
        and 0 <= second <= 61
    )
    return timegm(tt) if in_range else None
81
# English day and month abbreviations used when formatting and parsing cookie
# dates.  DAYS starts with Monday; MONTHS starts with January.
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copy of MONTHS for case-insensitive lookups.  Built with a
# comprehension so that no loop variable is left behind in the module
# namespace.
MONTHS_LOWER = [name.lower() for name in MONTHS]
87
def time2isoz(t=None):
    """Format *t*, given in seconds since the epoch, as an ISO-style string.

    When *t* is omitted (or None), the current time is used.

    The result looks like "YYYY-MM-DD hh:mm:ssZ" and always denotes
    Universal Time (UTC, aka GMT).  For example:

    1994-11-24 08:49:37Z

    """
    when = time.time() if t is None else t
    # The first six fields of the broken-down time are, in order, exactly the
    # values the format string needs.
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % time.gmtime(when)[:6]
104
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec, wday = time.gmtime(t)[:7]
    # The comma after the weekday is part of the format documented above;
    # it used to be missing from the output.
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[wday], mday, MONTHS[mon-1], year, hour, minute, sec)
120
121
# Zone names that mean UTC.  Only membership is tested.
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric zone: optional sign, one or two hour digits, optional two-digit
# minutes, with an optional colon in between.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)


def offset_from_tz_string(tz):
    """Return the offset from UTC, in seconds, described by the string *tz*.

    *tz* is either one of the names in UTC_ZONES (offset 0) or a numeric
    zone such as "+0100" or "-08:00".  None is returned for anything else.
    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if match is None:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds += 60 * int(minutes)
    return -seconds if sign == '-' else seconds
138
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert date/time fields captured by the date regexps to a timestamp.

    day, mon and yr are strings; hr, min, sec and tz are strings or None.
    mon may be an English month abbreviation (any case) or a month number.
    A year below 1000 is taken to be abbreviated and is expanded to the
    century that puts it closest to the current year.  A missing timezone
    means UTC.

    Returns seconds since the epoch, or None if the month, the date/clock
    values or the timezone cannot be interpreted or are out of range.
    """
    # Imported locally so this function does not depend on a change to the
    # module's import block.
    from datetime import MAXYEAR

    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    if yr > MAXYEAR:
        # calendar.timegm() cannot handle years beyond datetime's range and
        # would raise; report "outside the representable range" instead.
        return None
    day = int(day)
    hr = int(hr)
    min = int(min)
    # ISO_DATE_RE also captures fractional seconds ("29.5"), which int()
    # alone rejects with ValueError; the fraction is dropped.
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
191
# Exactly "Wdy, DD Mon YYYY HH:MM:SS GMT".  Only the shape of the weekday and
# month is checked here, not whether they are real names.  Both halves of the
# pattern are raw strings so that "\d" is never read as a string escape.
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
# Leading weekday name (abbreviated or longer), optional comma, whitespace.
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
# Groups: day, month, year, hour, minute, second, timezone.
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X | re.ASCII)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of text is unrecognized, the time is
    outside the representable range, or the timezone string is not recognized.
    If the string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # STRICT_DATE_RE only checks the shape of the month, so a word
            # like "Foo" gets this far.  That is an unrecognized date, for
            # which the contract is to return None rather than raise.
            return None
        # The regexp guarantees two-digit seconds, so int() is safe and keeps
        # the result an integer, as documented.
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
269
# Groups: year, month, day, hour, minute, second, timezone, and a trailing
# piece of the timezone.  The pattern is a raw string so that the regex
# escapes (\d, \s, \., \/) are never read as string escapes.
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X | re.ASCII)
def iso2time(text):
    """
    Like http2time, but for dates written in one of the ISO 8601 styles:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # Leading whitespace is not part of the date.
    match = ISO_DATE_RE.search(text.lstrip())
    if match is None:
        return None  # bad format

    # XXX the last group is an extra bit of the timezone that is ignored
    # here: is this the right thing to do?
    yr, mon, day, hr, mins, sec, tz, _ = match.groups()
    return _str2time(day, mon, yr, hr, mins, sec, tz)
314
315
316# Header parsing
317# -----------------------------------------------------------------------------
318
def unmatched(match):
    """Return the subject string of *match* with the matched span cut out."""
    subject = match.string
    return subject[:match.start()] + subject[match.end():]
323
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
# Leading run of "=", ";" and whitespace that carries no information.
# Compiled once from a raw string instead of being rebuilt, from a non-raw
# literal, on every call.
_HEADER_JUNK_RE = re.compile(r"^[=\s;]*")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    header_values must be a sequence of strings, not a single string.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, str)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            # All the HEADER_*_RE patterns are anchored at the start of the
            # string, so dropping the match means keeping what follows it.
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = text[m.end():]
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = text[m.end():]
                    value = m.group(1)
                    # undo backslash escaping inside the quoted string
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = text[m.end():]
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = _HEADER_JUNK_RE.subn("", text)
                # Guards against looping forever on text that none of the
                # branches above can consume.
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result
412
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")


def join_header_words(lists):
    """Build a single header value from lists of (key, value) pairs.

    This is (almost) the inverse of split_header_words: each inner list
    becomes one header whose pairs are joined with "; ", and the headers are
    joined with ", ".  A value of None produces a bare key; any value that is
    not a plain word is quoted.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    def render(key, value):
        # A bare token has no "=value" part at all.
        if value is None:
            return key
        if not re.search(r"^\w+$", value):
            # escape " and \ before wrapping the value in quotes
            value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
        return "%s=%s" % (key, value)

    headers = []
    for pairs in lists:
        words = [render(k, v) for k, v in pairs]
        # A list without any pairs contributes nothing to the result.
        if words:
            headers.append("; ".join(words))
    return ", ".join(headers)
438
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    Each header yields a list of (key, value) pairs.  The first pair is the
    cookie's own name and value and is kept verbatim; for the remaining pairs
    the names of known cookie-attributes are lower-cased, a version value has
    surrounding double quotes removed, and an expires value is converted to
    seconds since the epoch (None if the date is invalid).  An attribute
    given without "=" has the value None.  A ("version", "0") pair is
    appended if the header carried no version attribute.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    def strip_quotes(text):
        # Drop one leading and one trailing double quote, if present.
        if text.startswith('"'): text = text[1:]
        if text.endswith('"'): text = text[:-1]
        return text

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    if v is not None:
                        v = strip_quotes(v)
                    version_set = True
                elif k == "expires" and v is not None:
                    # convert expires date to seconds since epoch; a bare
                    # "expires" without a value is left as None
                    v = http2time(strip_quotes(v))  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
490
491
# A trailing ".<digits>" component: text ending like this is treated as an
# IP address rather than a host domain name.
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        return False
    if text == "":
        return False
    if text[0] == "." or text[-1] == ".":
        return False
    return True

def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    if not A.endswith(B):
        # A does not have form NB.  B has to be a suffix of A; finding it
        # somewhere inside A (as in "www.acme.com.evil.org" against
        # ".acme.com") is not a match.  N cannot be empty here because the
        # A == B case has already been handled.
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
546
def liberal_is_HDN(text):
    """Return True if text looks at least roughly like a host domain name.

    Used when accepting/blocking domains.  The only thing rejected is text
    that IPV4_RE matches.

    """
    return not IPV4_RE.search(text)
556
def user_domain_match(A, B):
    """Return True if domain A matches the user-supplied pattern B.

    For blocking/accepting domains.  A and B may be host domain names or IP
    addresses; the comparison is case-insensitive.  A pattern starting with
    a dot matches every name that ends with it; anything else must be equal.

    """
    host = A.lower()
    pattern = B.lower()
    if liberal_is_HDN(host) and liberal_is_HDN(pattern):
        if pattern.startswith("."):
            return host.endswith(pattern)
        return host == pattern
    # At least one side is an IP address: only equal addresses match.
    return host == pattern
576
cut_port_re = re.compile(r":\d+$", re.ASCII)


def request_host(request):
    """Return the request-host of *request*, as defined by RFC 2965.

    Variation from RFC: the value is returned in lower case, for convenient
    comparison.

    """
    netloc = urllib.parse.urlparse(request.get_full_url()).netloc
    if netloc == "":
        # No host in the URL: fall back on the Host header, if there is one.
        netloc = request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", netloc, 1).lower()
593
def eff_request_host(request):
    """Return the pair (request-host, effective request-host name).

    Both are as defined by RFC 2965, except that both are lowercased.  The
    effective name is the request-host with ".local" appended when the
    request-host contains no dot and IPV4_RE does not match it.

    """
    req_host = request_host(request)
    dotless = req_host.find(".") == -1
    if dotless and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
604
def request_path(request):
    """Return the request-URI of *request*, as defined by RFC 2965."""
    parts = urllib.parse.urlparse(request.get_full_url())
    path = parts.path
    if parts.params:
        # keep the ";parameters" section as part of the path
        path = "%s;%s" % (path, parts.params)
    req_path = urllib.parse.urlunparse(
        ("", "", escape_path(path), "", parts.query, parts.fragment))
    if req_path.startswith("/"):
        return req_path
    # fix bad RFC 2396 absoluteURI
    return "/" + req_path
617
def request_port(request):
    """Return the port of *request* as a string.

    The port is whatever follows the first ":" of the request's host.
    DEFAULT_HTTP_PORT is returned when the host has no ":" at all, and None
    when the text after it is not a number.
    """
    host = request.get_host()
    _, colon, port = host.partition(':')
    if not colon:
        return DEFAULT_HTTP_PORT
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
631
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")


def uppercase_escaped_char(match):
    """Return the %XX escape captured by *match* with upper-case hex digits."""
    return "%" + match.group(1).upper()


def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = urllib.parse.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
651
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split at the first dot: the part before it is A, the rest is B.
    _, dot, b = h.partition(".")
    if dot and is_HDN(h) and (b.find(".") >= 0 or b == "local"):
        return "." + b
    return h
686
def is_third_party(request):
    """Return True if *request* is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(request_host(request), origin_reach)
702
703
class Cookie:
    """HTTP Cookie.

    Instances represent both Netscape and RFC 2965 cookies.

    The class is deliberately simple: it only holds attributes, and nothing
    stops you from building instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking cookies to see whether they should be accepted
    from and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        # version and expires are stored as integers whenever they are given
        version = int(version) if version is not None else None
        expires = int(expires) if expires is not None else None
        if port_specified is True and port is None:
            raise ValueError("if port is None, port_specified must be false")

        self.version, self.name, self.value = version, name, value
        self.port, self.port_specified = port, port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path, self.path_specified = path, path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment, self.comment_url = comment, comment_url
        self.rfc2109 = rfc2109

        # shallow copy, so later changes don't touch the caller's mapping
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return True if the nonstandard cookie-attribute *name* is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the nonstandard cookie-attribute *name*, or *default*."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the nonstandard cookie-attribute *name* to *value*."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is not after *now*.

        *now* defaults to the current time.  A cookie without an expiry time
        never counts as expired.
        """
        if now is None:
            now = time.time()
        return self.expires is not None and self.expires <= now

    def __str__(self):
        port_part = "" if self.port is None else ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        public_attrs = (
            "version", "name", "value",
            "port", "port_specified",
            "domain", "domain_specified", "domain_initial_dot",
            "path", "path_specified",
            "secure", "expires", "discard", "comment", "comment_url",
        )
        args = ["%s=%r" % (attr, getattr(self, attr)) for attr in public_attrs]
        # the constructor calls this argument "rest", not "_rest"
        args.append("rest=%r" % (self._rest,))
        args.append("rfc2109=%r" % (self.rfc2109,))
        return "Cookie(%s)" % ", ".join(args)
800
801
class CookiePolicy:
    """Decides which cookies are accepted from and returned to the server.

    A policy may also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        Must be provided by subclasses.

        """
        raise NotImplementedError

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        Must be provided by subclasses.
        """
        raise NotImplementedError

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation accepts every domain.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation accepts every path.
        """
        return True
833
834
835class DefaultCookiePolicy(CookiePolicy):
836 """Implements the standard rules for accepting and returning cookies."""
837
838 DomainStrictNoDots = 1
839 DomainStrictNonDomain = 2
840 DomainRFC2965Match = 4
841
842 DomainLiberal = 0
843 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
844
845 def __init__(self,
846 blocked_domains=None, allowed_domains=None,
847 netscape=True, rfc2965=False,
Neal Norwitz71dad722005-12-23 21:43:48 +0000848 rfc2109_as_netscape=None,
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000849 hide_cookie2=False,
850 strict_domain=False,
851 strict_rfc2965_unverifiable=True,
852 strict_ns_unverifiable=False,
853 strict_ns_domain=DomainLiberal,
854 strict_ns_set_initial_dollar=False,
855 strict_ns_set_path=False,
856 ):
857 """Constructor arguments should be passed as keyword arguments only."""
858 self.netscape = netscape
859 self.rfc2965 = rfc2965
Neal Norwitz71dad722005-12-23 21:43:48 +0000860 self.rfc2109_as_netscape = rfc2109_as_netscape
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000861 self.hide_cookie2 = hide_cookie2
862 self.strict_domain = strict_domain
863 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
864 self.strict_ns_unverifiable = strict_ns_unverifiable
865 self.strict_ns_domain = strict_ns_domain
866 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
867 self.strict_ns_set_path = strict_ns_set_path
868
869 if blocked_domains is not None:
870 self._blocked_domains = tuple(blocked_domains)
871 else:
872 self._blocked_domains = ()
873
874 if allowed_domains is not None:
875 allowed_domains = tuple(allowed_domains)
876 self._allowed_domains = allowed_domains
877
878 def blocked_domains(self):
879 """Return the sequence of blocked domains (as a tuple)."""
880 return self._blocked_domains
881 def set_blocked_domains(self, blocked_domains):
882 """Set the sequence of blocked domains."""
883 self._blocked_domains = tuple(blocked_domains)
884
885 def is_blocked(self, domain):
886 for blocked_domain in self._blocked_domains:
887 if user_domain_match(domain, blocked_domain):
888 return True
889 return False
890
891 def allowed_domains(self):
892 """Return None, or the sequence of allowed domains (as a tuple)."""
893 return self._allowed_domains
894 def set_allowed_domains(self, allowed_domains):
895 """Set the sequence of allowed domains, or None."""
896 if allowed_domains is not None:
897 allowed_domains = tuple(allowed_domains)
898 self._allowed_domains = allowed_domains
899
900 def is_not_allowed(self, domain):
901 if self._allowed_domains is None:
902 return False
903 for allowed_domain in self._allowed_domains:
904 if user_domain_match(domain, allowed_domain):
905 return False
906 return True
907
def set_ok(self, cookie, request):
    """
    Return True if every set_ok_* check accepts *cookie* for *request*.

    The version, verifiability, name, path, domain and port checks run in
    that order; the first one to fail ends the evaluation.

    If you override .set_ok(), be sure to call this method.  If it returns
    false, so should your subclass (assuming your subclass wants to be more
    strict about which cookies to accept).

    """
    _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    assert cookie.name is not None

    checks = ("version", "verifiability", "name", "path", "domain", "port")
    for suffix in checks:
        check = getattr(self, "set_ok_" + suffix)
        if not check(cookie, request):
            return False
    return True
926
def set_ok_version(self, cookie, request):
    """Return True if the cookie's protocol version is enabled by policy."""
    version = cookie.version
    if version is None:
        # Version is always set to 0 by parse_ns_headers if it's a Netscape
        # cookie, so this must be an invalid RFC 2965 cookie.
        _debug(" Set-Cookie2 without version attribute (%s=%s)",
               cookie.name, cookie.value)
        return False
    if version > 0:
        if not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
    elif version == 0:
        if not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
    return True
941
def set_ok_verifiability(self, cookie, request):
    """Reject third-party cookies on unverifiable requests when strict.

    Only applies when the request is unverifiable and third-party; the
    strict_rfc2965_unverifiable / strict_ns_unverifiable switches then
    decide for version > 0 and version 0 cookies respectively.
    """
    if not (request.is_unverifiable() and is_third_party(request)):
        return True
    version = cookie.version
    if version > 0 and self.strict_rfc2965_unverifiable:
        _debug(" third-party RFC 2965 cookie during unverifiable transaction")
        return False
    if version == 0 and self.strict_ns_unverifiable:
        _debug(" third-party Netscape cookie during unverifiable transaction")
        return False
    return True
953
def set_ok_name(self, cookie, request):
    """Reject version-0 cookies whose name starts with "$" when strict."""
    # Try and stop servers setting V0 cookies designed to hack other
    # servers that know both V0 and V1 protocols.
    suspicious = (cookie.version == 0 and
                  self.strict_ns_set_initial_dollar and
                  cookie.name.startswith("$"))
    if suspicious:
        _debug(" illegal name (starts with '$'): '%s'", cookie.name)
        return False
    return True
962
def set_ok_path(self, cookie, request):
    """Check an explicitly given cookie path against the request path.

    Cookies without a specified path always pass.  Otherwise, for
    version > 0 cookies (and version 0 ones when strict_ns_set_path is
    set) the cookie path must be a string prefix of the request path.
    """
    if not cookie.path_specified:
        return True
    req_path = request_path(request)
    version = cookie.version
    must_check = version > 0 or (version == 0 and self.strict_ns_set_path)
    if must_check and not req_path.startswith(cookie.path):
        _debug(" path attribute %s is not a prefix of request "
               "path %s", cookie.path, req_path)
        return False
    return True
973
def set_ok_domain(self, cookie, request):
    """Return True if the cookie's domain is acceptable for *request*.

    The user block-list and allow-list are consulted first.  The remaining
    checks only apply when the cookie carried an explicit domain:

    * with strict_domain set, reject domains of the form ".sld.tld" where
      tld has two letters and sld is one of a fixed list such as "co";
    * reject domains with no embedded dot, other than ".local";
    * for version 0 cookies, the effective request-host (with or without an
      added initial dot) must end with the domain;
    * for version > 0, or with the DomainRFC2965Match flag, the effective
      request-host must domain-match the domain;
    * for version > 0, or with the DomainStrictNoDots flag, the part of the
      request-host in front of the domain must not contain a dot, unless
      the request-host matches IPV4_RE.
    """
    if self.is_blocked(cookie.domain):
        _debug(" domain %s is in user block-list", cookie.domain)
        return False
    if self.is_not_allowed(cookie.domain):
        _debug(" domain %s is not in user allow-list", cookie.domain)
        return False
    if not cookie.domain_specified:
        return True

    req_host, erhn = eff_request_host(request)
    domain = cookie.domain
    if self.strict_domain and (domain.count(".") >= 2):
        # XXX This should probably be compared with the Konqueror
        # (kcookiejar.cpp) and Mozilla implementations, but it's a
        # losing battle.
        i = domain.rfind(".")
        j = domain.rfind(".", 0, i)
        if j == 0:  # domain like .foo.bar
            tld = domain[i+1:]
            sld = domain[j+1:i]
            if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
               "gov", "mil", "int", "aero", "biz", "cat", "coop",
               "info", "jobs", "mobi", "museum", "name", "pro",
               "travel", "eu") and len(tld) == 2:
                # domain like .co.uk
                _debug(" country-code second level domain %s", domain)
                return False
    if domain.startswith("."):
        undotted_domain = domain[1:]
    else:
        undotted_domain = domain
    embedded_dots = "." in undotted_domain
    if not embedded_dots and domain != ".local":
        _debug(" non-local domain %s contains no embedded dot",
               domain)
        return False
    if cookie.version == 0:
        if (not erhn.endswith(domain) and
            (not erhn.startswith(".") and
             not ("."+erhn).endswith(domain))):
            # Message fixed: it used to read "does not end end with".
            _debug(" effective request-host %s (even with added "
                   "initial dot) does not end with %s",
                   erhn, domain)
            return False
    if (cookie.version > 0 or
        (self.strict_ns_domain & self.DomainRFC2965Match)):
        if not domain_match(erhn, domain):
            _debug(" effective request-host %s does not domain-match "
                   "%s", erhn, domain)
            return False
    if (cookie.version > 0 or
        (self.strict_ns_domain & self.DomainStrictNoDots)):
        host_prefix = req_host[:-len(domain)]
        if ("." in host_prefix and
            not IPV4_RE.search(req_host)):
            _debug(" host prefix %s for domain %s contains a dot",
                   host_prefix, domain)
            return False
    return True
1032
def set_ok_port(self, cookie, request):
    """Check an explicit cookie port list against the request port.

    Cookies without a specified port always pass.  Otherwise the
    comma-separated list is scanned in order: a non-numeric entry rejects
    the cookie, and an entry equal to the request port (taken as "80" when
    request_port() returns None) accepts it.
    """
    if not cookie.port_specified:
        return True
    req_port = request_port(request)
    req_port = "80" if req_port is None else str(req_port)
    for candidate in cookie.port.split(","):
        try:
            int(candidate)
        except ValueError:
            _debug(" bad port %s (not numeric)", candidate)
            return False
        if candidate == req_port:
            return True
    _debug(" request port (%s) not found in %s",
           req_port, cookie.port)
    return False
1053
def return_ok(self, cookie, request):
    """
    Return True if every return_ok_* check allows sending *cookie*.

    The version, verifiability, secure, expires, port and domain checks run
    in that order; the first one to fail ends the evaluation.

    If you override .return_ok(), be sure to call this method.  If it
    returns false, so should your subclass (assuming your subclass wants to
    be more strict about which cookies to return).

    """
    # Path has already been checked by .path_return_ok(), and domain
    # blocking done by .domain_return_ok().
    _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    checks = ("version", "verifiability", "secure", "expires", "port",
              "domain")
    for suffix in checks:
        check = getattr(self, "return_ok_" + suffix)
        if not check(cookie, request):
            return False
    return True
1071
def return_ok_version(self, cookie, request):
    """Return True if the cookie's protocol version is enabled by policy."""
    version = cookie.version
    if version > 0:
        if not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
    elif version == 0:
        if not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
    return True
1080
def return_ok_verifiability(self, cookie, request):
    """Withhold third-party cookies on unverifiable requests when strict.

    Only applies when the request is unverifiable and third-party; the
    strict_rfc2965_unverifiable / strict_ns_unverifiable switches then
    decide for version > 0 and version 0 cookies respectively.
    """
    if not (request.is_unverifiable() and is_third_party(request)):
        return True
    version = cookie.version
    if version > 0 and self.strict_rfc2965_unverifiable:
        _debug(" third-party RFC 2965 cookie during unverifiable "
               "transaction")
        return False
    if version == 0 and self.strict_ns_unverifiable:
        _debug(" third-party Netscape cookie during unverifiable "
               "transaction")
        return False
    return True
1092
def return_ok_secure(self, cookie, request):
    """Return False for a secure cookie unless the request type is https."""
    insecure_request = cookie.secure and request.get_type() != "https"
    if insecure_request:
        _debug(" secure cookie with non-secure request")
        return False
    return True
1098
def return_ok_expires(self, cookie, request):
    """Return False if the cookie reports itself expired at self._now."""
    expired = cookie.is_expired(self._now)
    if expired:
        _debug(" cookie expired")
        return False
    return True
1104
def return_ok_port(self, cookie, request):
    """Return True if the cookie has no port list or it names the request port.

    The request port is taken as "80" when request_port() returns None.
    """
    if not cookie.port:
        return True
    req_port = request_port(request)
    if req_port is None:
        req_port = "80"
    if req_port not in cookie.port.split(","):
        _debug(" request port %s does not match cookie port %s",
               req_port, cookie.port)
        return False
    return True
1118
def return_ok_domain(self, cookie, request):
    """Return True if the cookie's domain matches the request host.

    Version > 0 cookies must domain-match the effective request-host.
    Version 0 cookies must be a suffix of the dotted effective
    request-host; with the DomainStrictNonDomain flag, a version 0 cookie
    without an explicit domain must additionally equal it exactly.
    """
    req_host, erhn = eff_request_host(request)
    domain = cookie.domain
    version = cookie.version
    netscape_cookie = (version == 0)

    # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
    if netscape_cookie and (self.strict_ns_domain & self.DomainStrictNonDomain):
        if not cookie.domain_specified and domain != erhn:
            _debug(" cookie with unspecified domain does not string-compare "
                   "equal to request domain")
            return False

    if version > 0 and not domain_match(erhn, domain):
        _debug(" effective request-host name %s does not domain-match "
               "RFC 2965 cookie domain %s", erhn, domain)
        return False
    if netscape_cookie and not ("." + erhn).endswith(domain):
        _debug(" request-host %s does not match Netscape cookie domain "
               "%s", req_host, domain)
        return False
    return True
1140
def domain_return_ok(self, domain, request):
    """Return False if no cookie stored under *domain* may go with *request*.

    This is a liberal pre-check.  It is here as an optimization to avoid
    having to load lots of MSIE cookie files unless necessary.  The request
    host and effective request-host must end with *domain*, and *domain*
    must pass the user block-list and allow-list.
    """
    req_host, erhn = eff_request_host(request)
    if not req_host.startswith("."):
        req_host = "."+req_host
    if not erhn.startswith("."):
        erhn = "."+erhn
    # Compare on a label boundary.  Without the added dot a cookie stored
    # for "example.com" would also be offered to "badexample.com", because
    # a plain suffix test matches in the middle of a host label.
    if domain and not domain.startswith("."):
        dotted_domain = "." + domain
    else:
        dotted_domain = domain
    if not (req_host.endswith(dotted_domain) or
            erhn.endswith(dotted_domain)):
        return False

    if self.is_blocked(domain):
        _debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        _debug(" domain %s is not in user allow-list", domain)
        return False

    return True
1162
def path_return_ok(self, path, request):
    """Return True if cookies stored under *path* may go with *request*.

    The request path matches when it equals *path*, or when *path* is a
    prefix of it that ends on a "/" boundary: either *path* itself ends
    with "/", or the next character of the request path is "/".
    """
    _debug("- checking cookie path=%s", path)
    req_path = request_path(request)
    if req_path == path:
        return True
    # A bare string-prefix test would let a cookie for "/foo" be sent to
    # "/foobar"; require the prefix to stop at a path separator.
    boundary = len(path)
    if (req_path.startswith(path) and
            (path.endswith("/") or req_path[boundary:boundary + 1] == "/")):
        return True

    _debug(" %s does not path-match %s", req_path, path)
    return False
1170
1171
def vals_sorted_by_key(adict):
    """Return a lazy iterator over adict's values, ordered by sorted key."""
    ordered_keys = sorted(adict.keys())
    lookup = adict.get
    return map(lookup, ordered_keys)
1175
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    A value counts as a nested mapping when it has an "items" attribute;
    such values are descended into instead of being yielded themselves.
    """
    for value in vals_sorted_by_key(mapping):
        try:
            value.items
        except AttributeError:
            nested = False
        else:
            nested = True
        if nested:
            for leaf in deepvalues(value):
                yield leaf
        else:
            yield value
1191
1192
# Used as second parameter to dict.get() method, to distinguish absent
# dict key from one with a None value.
class Absent:
    """Sentinel default for dict.get(): marks a key as missing, not None."""
1196
1197class CookieJar:
1198 """Collection of HTTP cookies.
1199
1200 You may not need to know about this class: try
Jeremy Hylton1afc1692008-06-18 20:49:58 +00001201 urllib.request.build_opener(HTTPCookieProcessor).open(url).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001202 """
1203
1204 non_word_re = re.compile(r"\W")
1205 quote_re = re.compile(r"([\"\\])")
1206 strict_domain_re = re.compile(r"\.?[^.]*")
1207 domain_re = re.compile(r"[^.]*")
1208 dots_re = re.compile(r"^\.+")
1209
Antoine Pitroufd036452008-08-19 17:56:33 +00001210 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001211
1212 def __init__(self, policy=None):
1213 if policy is None:
1214 policy = DefaultCookiePolicy()
1215 self._policy = policy
1216
1217 self._cookies_lock = _threading.RLock()
1218 self._cookies = {}
1219
1220 def set_policy(self, policy):
1221 self._policy = policy
1222
1223 def _cookies_for_domain(self, domain, request):
1224 cookies = []
1225 if not self._policy.domain_return_ok(domain, request):
1226 return []
Thomas Wouters477c8d52006-05-27 19:21:47 +00001227 _debug("Checking %s for cookies to return", domain)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001228 cookies_by_path = self._cookies[domain]
1229 for path in cookies_by_path.keys():
1230 if not self._policy.path_return_ok(path, request):
1231 continue
1232 cookies_by_name = cookies_by_path[path]
1233 for cookie in cookies_by_name.values():
1234 if not self._policy.return_ok(cookie, request):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001235 _debug(" not returning cookie")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001236 continue
Thomas Wouters477c8d52006-05-27 19:21:47 +00001237 _debug(" it's a match")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001238 cookies.append(cookie)
1239 return cookies
1240
1241 def _cookies_for_request(self, request):
1242 """Return a list of cookies to be returned to server."""
1243 cookies = []
1244 for domain in self._cookies.keys():
1245 cookies.extend(self._cookies_for_domain(domain, request))
1246 return cookies
1247
1248 def _cookie_attrs(self, cookies):
1249 """Return a list of cookie-attributes to be returned to server.
1250
1251 like ['foo="bar"; $Path="/"', ...]
1252
1253 The $Version attribute is also added when appropriate (currently only
1254 once per request).
1255
1256 """
1257 # add cookies in order of most specific (ie. longest) path first
Raymond Hettinger70b64fc2008-01-30 20:15:17 +00001258 cookies.sort(key=lambda a: len(a.path), reverse=True)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001259
1260 version_set = False
1261
1262 attrs = []
1263 for cookie in cookies:
1264 # set version of Cookie header
1265 # XXX
1266 # What should it be if multiple matching Set-Cookie headers have
1267 # different versions themselves?
1268 # Answer: there is no answer; was supposed to be settled by
1269 # RFC 2965 errata, but that may never appear...
1270 version = cookie.version
1271 if not version_set:
1272 version_set = True
1273 if version > 0:
1274 attrs.append("$Version=%s" % version)
1275
1276 # quote cookie value if necessary
1277 # (not for Netscape protocol, which already has any quotes
1278 # intact, due to the poorly-specified Netscape Cookie: syntax)
1279 if ((cookie.value is not None) and
1280 self.non_word_re.search(cookie.value) and version > 0):
1281 value = self.quote_re.sub(r"\\\1", cookie.value)
1282 else:
1283 value = cookie.value
1284
1285 # add cookie-attributes to be returned in Cookie header
1286 if cookie.value is None:
1287 attrs.append(cookie.name)
1288 else:
1289 attrs.append("%s=%s" % (cookie.name, value))
1290 if version > 0:
1291 if cookie.path_specified:
1292 attrs.append('$Path="%s"' % cookie.path)
1293 if cookie.domain.startswith("."):
1294 domain = cookie.domain
1295 if (not cookie.domain_initial_dot and
1296 domain.startswith(".")):
1297 domain = domain[1:]
1298 attrs.append('$Domain="%s"' % domain)
1299 if cookie.port is not None:
1300 p = "$Port"
1301 if cookie.port_specified:
1302 p = p + ('="%s"' % cookie.port)
1303 attrs.append(p)
1304
1305 return attrs
1306
1307 def add_cookie_header(self, request):
Georg Brandl029986a2008-06-23 11:44:14 +00001308 """Add correct Cookie: header to request (urllib.request.Request object).
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001309
1310 The Cookie2 header is also added unless policy.hide_cookie2 is true.
1311
1312 """
Thomas Wouters477c8d52006-05-27 19:21:47 +00001313 _debug("add_cookie_header")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001314 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001315 try:
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001316
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001317 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001318
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001319 cookies = self._cookies_for_request(request)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001320
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001321 attrs = self._cookie_attrs(cookies)
1322 if attrs:
1323 if not request.has_header("Cookie"):
1324 request.add_unredirected_header(
1325 "Cookie", "; ".join(attrs))
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001326
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001327 # if necessary, advertise that we know RFC 2965
1328 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1329 not request.has_header("Cookie2")):
1330 for cookie in cookies:
1331 if cookie.version != 1:
1332 request.add_unredirected_header("Cookie2", '$Version="1"')
1333 break
1334
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001335 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001336 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001337
1338 self.clear_expired_cookies()
1339
1340 def _normalized_cookie_tuples(self, attrs_set):
1341 """Return list of tuples containing normalised cookie information.
1342
1343 attrs_set is the list of lists of key,value pairs extracted from
1344 the Set-Cookie or Set-Cookie2 headers.
1345
1346 Tuples are name, value, standard, rest, where name and value are the
1347 cookie name and value, standard is a dictionary containing the standard
1348 cookie-attributes (discard, secure, version, expires or max-age,
1349 domain, path and port) and rest is a dictionary containing the rest of
1350 the cookie-attributes.
1351
1352 """
1353 cookie_tuples = []
1354
1355 boolean_attrs = "discard", "secure"
1356 value_attrs = ("version",
1357 "expires", "max-age",
1358 "domain", "path", "port",
1359 "comment", "commenturl")
1360
1361 for cookie_attrs in attrs_set:
1362 name, value = cookie_attrs[0]
1363
1364 # Build dictionary of standard cookie-attributes (standard) and
1365 # dictionary of other cookie-attributes (rest).
1366
1367 # Note: expiry time is normalised to seconds since epoch. V0
1368 # cookies should have the Expires cookie-attribute, and V1 cookies
1369 # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1370 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1371 # accept either (but prefer Max-Age).
1372 max_age_set = False
1373
1374 bad_cookie = False
1375
1376 standard = {}
1377 rest = {}
1378 for k, v in cookie_attrs[1:]:
1379 lc = k.lower()
1380 # don't lose case distinction for unknown fields
1381 if lc in value_attrs or lc in boolean_attrs:
1382 k = lc
1383 if k in boolean_attrs and v is None:
1384 # boolean cookie-attribute is present, but has no value
1385 # (like "discard", rather than "port=80")
1386 v = True
1387 if k in standard:
1388 # only first value is significant
1389 continue
1390 if k == "domain":
1391 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001392 _debug(" missing value for domain attribute")
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001393 bad_cookie = True
1394 break
1395 # RFC 2965 section 3.3.3
1396 v = v.lower()
1397 if k == "expires":
1398 if max_age_set:
1399 # Prefer max-age to expires (like Mozilla)
1400 continue
1401 if v is None:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001402 _debug(" missing or invalid value for expires "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001403 "attribute: treating as session cookie")
1404 continue
1405 if k == "max-age":
1406 max_age_set = True
1407 try:
1408 v = int(v)
1409 except ValueError:
Thomas Wouters477c8d52006-05-27 19:21:47 +00001410 _debug(" missing or invalid (non-numeric) value for "
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001411 "max-age attribute")
1412 bad_cookie = True
1413 break
1414 # convert RFC 2965 Max-Age to seconds since epoch
1415 # XXX Strictly you're supposed to follow RFC 2616
1416 # age-calculation rules. Remember that zero Max-Age is a
1417 # is a request to discard (old and new) cookie, though.
1418 k = "expires"
1419 v = self._now + v
1420 if (k in value_attrs) or (k in boolean_attrs):
1421 if (v is None and
Raymond Hettingerdbecd932005-02-06 06:57:08 +00001422 k not in ("port", "comment", "commenturl")):
Thomas Wouters477c8d52006-05-27 19:21:47 +00001423 _debug(" missing value for %s attribute" % k)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001424 bad_cookie = True
1425 break
1426 standard[k] = v
1427 else:
1428 rest[k] = v
1429
1430 if bad_cookie:
1431 continue
1432
1433 cookie_tuples.append((name, value, standard, rest))
1434
1435 return cookie_tuples
1436
1437 def _cookie_from_cookie_tuple(self, tup, request):
1438 # standard is dict of standard cookie-attributes, rest is dict of the
1439 # rest of them
1440 name, value, standard, rest = tup
1441
1442 domain = standard.get("domain", Absent)
1443 path = standard.get("path", Absent)
1444 port = standard.get("port", Absent)
1445 expires = standard.get("expires", Absent)
1446
1447 # set the easy defaults
1448 version = standard.get("version", None)
1449 if version is not None: version = int(version)
1450 secure = standard.get("secure", False)
1451 # (discard is also set if expires is Absent)
1452 discard = standard.get("discard", False)
1453 comment = standard.get("comment", None)
1454 comment_url = standard.get("commenturl", None)
1455
1456 # set default path
1457 if path is not Absent and path != "":
1458 path_specified = True
1459 path = escape_path(path)
1460 else:
1461 path_specified = False
1462 path = request_path(request)
1463 i = path.rfind("/")
1464 if i != -1:
1465 if version == 0:
1466 # Netscape spec parts company from reality here
1467 path = path[:i]
1468 else:
1469 path = path[:i+1]
1470 if len(path) == 0: path = "/"
1471
1472 # set default domain
1473 domain_specified = domain is not Absent
1474 # but first we have to remember whether it starts with a dot
1475 domain_initial_dot = False
1476 if domain_specified:
1477 domain_initial_dot = bool(domain.startswith("."))
1478 if domain is Absent:
1479 req_host, erhn = eff_request_host(request)
1480 domain = erhn
1481 elif not domain.startswith("."):
1482 domain = "."+domain
1483
1484 # set default port
1485 port_specified = False
1486 if port is not Absent:
1487 if port is None:
1488 # Port attr present, but has no value: default to request port.
1489 # Cookie should then only be sent back on that port.
1490 port = request_port(request)
1491 else:
1492 port_specified = True
1493 port = re.sub(r"\s+", "", port)
1494 else:
1495 # No port attr present. Cookie can be sent back on any port.
1496 port = None
1497
1498 # set default expires and discard
1499 if expires is Absent:
1500 expires = None
1501 discard = True
1502 elif expires <= self._now:
1503 # Expiry date in past is request to delete cookie. This can't be
1504 # in DefaultCookiePolicy, because can't delete cookies there.
1505 try:
1506 self.clear(domain, path, name)
1507 except KeyError:
1508 pass
Thomas Wouters477c8d52006-05-27 19:21:47 +00001509 _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1510 domain, path, name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001511 return None
1512
1513 return Cookie(version,
1514 name, value,
1515 port, port_specified,
1516 domain, domain_specified, domain_initial_dot,
1517 path, path_specified,
1518 secure,
1519 expires,
1520 discard,
1521 comment,
1522 comment_url,
1523 rest)
1524
1525 def _cookies_from_attrs_set(self, attrs_set, request):
1526 cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1527
1528 cookies = []
1529 for tup in cookie_tuples:
1530 cookie = self._cookie_from_cookie_tuple(tup, request)
1531 if cookie: cookies.append(cookie)
1532 return cookies
1533
Neal Norwitz71dad722005-12-23 21:43:48 +00001534 def _process_rfc2109_cookies(self, cookies):
1535 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1536 if rfc2109_as_ns is None:
1537 rfc2109_as_ns = not self._policy.rfc2965
1538 for cookie in cookies:
1539 if cookie.version == 1:
1540 cookie.rfc2109 = True
Tim Peters536cf992005-12-25 23:18:31 +00001541 if rfc2109_as_ns:
Neal Norwitz71dad722005-12-23 21:43:48 +00001542 # treat 2109 cookies as Netscape cookies rather than
1543 # as RFC2965 cookies
1544 cookie.version = 0
1545
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001546 def make_cookies(self, response, request):
1547 """Return sequence of Cookie objects extracted from response object."""
1548 # get cookie-attributes for RFC 2965 and Netscape protocols
1549 headers = response.info()
Barry Warsaw820c1202008-06-12 04:06:45 +00001550 rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
1551 ns_hdrs = headers.get_all("Set-Cookie", [])
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001552
1553 rfc2965 = self._policy.rfc2965
1554 netscape = self._policy.netscape
1555
1556 if ((not rfc2965_hdrs and not ns_hdrs) or
1557 (not ns_hdrs and not rfc2965) or
1558 (not rfc2965_hdrs and not netscape) or
1559 (not netscape and not rfc2965)):
1560 return [] # no relevant cookie headers: quick exit
1561
1562 try:
1563 cookies = self._cookies_from_attrs_set(
1564 split_header_words(rfc2965_hdrs), request)
Thomas Wouters477c8d52006-05-27 19:21:47 +00001565 except Exception:
1566 _warn_unhandled_exception()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001567 cookies = []
1568
1569 if ns_hdrs and netscape:
1570 try:
Neal Norwitz71dad722005-12-23 21:43:48 +00001571 # RFC 2109 and Netscape cookies
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001572 ns_cookies = self._cookies_from_attrs_set(
1573 parse_ns_headers(ns_hdrs), request)
Thomas Wouters477c8d52006-05-27 19:21:47 +00001574 except Exception:
1575 _warn_unhandled_exception()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001576 ns_cookies = []
Neal Norwitz71dad722005-12-23 21:43:48 +00001577 self._process_rfc2109_cookies(ns_cookies)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001578
1579 # Look for Netscape cookies (from Set-Cookie headers) that match
1580 # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1581 # For each match, keep the RFC 2965 cookie and ignore the Netscape
1582 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are
1583 # bundled in with the Netscape cookies for this purpose, which is
1584 # reasonable behaviour.
1585 if rfc2965:
1586 lookup = {}
1587 for cookie in cookies:
1588 lookup[(cookie.domain, cookie.path, cookie.name)] = None
1589
1590 def no_matching_rfc2965(ns_cookie, lookup=lookup):
1591 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1592 return key not in lookup
1593 ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1594
1595 if ns_cookies:
1596 cookies.extend(ns_cookies)
1597
1598 return cookies
1599
1600 def set_cookie_if_ok(self, cookie, request):
1601 """Set a cookie if policy says it's OK to do so."""
1602 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001603 try:
1604 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001605
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001606 if self._policy.set_ok(cookie, request):
1607 self.set_cookie(cookie)
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001608
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001609
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001610 finally:
1611 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001612
1613 def set_cookie(self, cookie):
1614 """Set a cookie, without checking whether or not it should be set."""
1615 c = self._cookies
1616 self._cookies_lock.acquire()
1617 try:
1618 if cookie.domain not in c: c[cookie.domain] = {}
1619 c2 = c[cookie.domain]
1620 if cookie.path not in c2: c2[cookie.path] = {}
1621 c3 = c2[cookie.path]
1622 c3[cookie.name] = cookie
1623 finally:
1624 self._cookies_lock.release()
1625
1626 def extract_cookies(self, response, request):
1627 """Extract cookies from response, where allowable given the request."""
Thomas Wouters477c8d52006-05-27 19:21:47 +00001628 _debug("extract_cookies: %s", response.info())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001629 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001630 try:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001631 self._policy._now = self._now = int(time.time())
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001632
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001633 for cookie in self.make_cookies(response, request):
1634 if self._policy.set_ok(cookie, request):
1635 _debug(" setting cookie: %s", cookie)
1636 self.set_cookie(cookie)
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001637 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001638 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001639
1640 def clear(self, domain=None, path=None, name=None):
1641 """Clear some cookies.
1642
1643 Invoking this method without arguments will clear all cookies. If
1644 given a single argument, only cookies belonging to that domain will be
1645 removed. If given two arguments, cookies belonging to the specified
1646 path within that domain are removed. If given three arguments, then
1647 the cookie with the specified name, path and domain is removed.
1648
1649 Raises KeyError if no matching cookie exists.
1650
1651 """
1652 if name is not None:
1653 if (domain is None) or (path is None):
1654 raise ValueError(
1655 "domain and path must be given to remove a cookie by name")
1656 del self._cookies[domain][path][name]
1657 elif path is not None:
1658 if domain is None:
1659 raise ValueError(
1660 "domain must be given to remove cookies by path")
1661 del self._cookies[domain][path]
1662 elif domain is not None:
1663 del self._cookies[domain]
1664 else:
1665 self._cookies = {}
1666
1667 def clear_session_cookies(self):
1668 """Discard all session cookies.
1669
1670 Note that the .save() method won't save session cookies anyway, unless
1671 you ask otherwise by passing a true ignore_discard argument.
1672
1673 """
1674 self._cookies_lock.acquire()
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001675 try:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001676 for cookie in self:
1677 if cookie.discard:
1678 self.clear(cookie.domain, cookie.path, cookie.name)
Thomas Wouters902d6eb2007-01-09 23:18:33 +00001679 finally:
Thomas Wouters9fe394c2007-02-05 01:24:16 +00001680 self._cookies_lock.release()
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001681
def clear_expired_cookies(self):
    """Discard all expired cookies.

    A cookie counts as expired when ``cookie.is_expired(now)`` is true,
    where ``now`` is ``time.time()`` sampled once for the whole pass.
    Each such cookie is removed through ``self.clear(domain, path, name)``
    while ``self._cookies_lock`` is held.

    You probably don't need to call this method: expired cookies are never
    sent back to the server (provided you're using DefaultCookiePolicy),
    this method is called by CookieJar itself every so often, and the
    .save() method won't save expired cookies anyway (unless you ask
    otherwise by passing a true ignore_expires argument).

    """
    with self._cookies_lock:
        now = time.time()
        # Take a snapshot before removing anything: clear() mutates the
        # very store that iterating over self walks through.
        expired = [cookie for cookie in self if cookie.is_expired(now)]
        for cookie in expired:
            self.clear(cookie.domain, cookie.path, cookie.name)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001700
def __iter__(self):
    """Return an iterator over the jar's contents.

    Delegates to deepvalues() applied to the nested self._cookies mapping.

    """
    stored = self._cookies
    return deepvalues(stored)
1703
def __len__(self):
    """Return number of contained cookies."""
    # Counted by walking the jar's own iterator; nothing is cached.
    return sum(1 for _ in self)
1709
def __repr__(self):
    """Return "<CLASS[...]>" listing the repr() of every contained cookie."""
    listing = ", ".join([repr(cookie) for cookie in self])
    return "<%s[%s]>" % (self.__class__, listing)
1714
def __str__(self):
    """Return "<CLASS[...]>" listing the str() of every contained cookie."""
    listing = ", ".join([str(cookie) for cookie in self])
    return "<%s[%s]>" % (self.__class__, listing)
1719
1720
# derives from IOError for backwards-compatibility with Python 2.4.0
class LoadError(IOError):
    """Raised when a cookie file cannot be parsed in the expected format."""
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001723
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    This class fixes the file handling shared by concrete jars: load()
    opens the file and hands it to self._really_load(), which subclasses
    provide together with save().

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename: default file used by load() and revert() when they are
            called without a filename; must be string-like or None.
        delayload: stored as a bool in self.delayload.
        policy: passed through to CookieJar.__init__.

        Raises ValueError if filename is given but is not string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            # Duck-type check: anything that can be concatenated with a str
            # is accepted.
            try:
                filename+""
            except Exception:
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented here; always raises NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  The opened file is passed to self._really_load() along with
        the filename and the two ignore_* flags, and is closed afterwards
        whether or not parsing succeeds.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        filename defaults to self.filename; ValueError is raised if neither
        is set.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        with self._cookies_lock:
            # Keep a deep copy so a failed load can be rolled back.
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                self._cookies = old_state
                raise
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001783
Georg Brandl7c9b61b2008-05-26 17:56:51 +00001784
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    Actually, the format is extended a bit: flag-like cookie-attributes
    are written as bare keys, any extra attributes held in cookie._rest
    follow in sorted key order, and the version comes last.

    """
    pairs = [(cookie.name, cookie.value),
             ("path", cookie.path),
             ("domain", cookie.domain)]
    if cookie.port is not None:
        pairs.append(("port", cookie.port))

    # Flags carry no value of their own, hence the None.
    flags = ((cookie.path_specified, "path_spec"),
             (cookie.port_specified, "port_spec"),
             (cookie.domain_initial_dot, "domain_dot"),
             (cookie.secure, "secure"))
    for is_set, key in flags:
        if is_set:
            pairs.append((key, None))

    if cookie.expires:
        pairs.append(("expires", time2isoz(float(cookie.expires))))
    if cookie.discard:
        pairs.append(("discard", None))
    if cookie.comment:
        pairs.append(("comment", cookie.comment))
    if cookie.comment_url:
        pairs.append(("commenturl", cookie.comment_url))

    for key in sorted(cookie._rest.keys()):
        pairs.append((key, str(cookie._rest[key])))

    pairs.append(("version", str(cookie.version)))

    return join_header_words([pairs])
1812
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of newline-separated "Set-Cookie3" headers.

        Every header line, including the last, ends with a newline; an
        empty jar yields a single newline.

        ignore_discard: when false, cookies whose discard attribute is set
            are left out.
        ignore_expires: when false, cookies that are expired at the time of
            the call are left out.

        """
        now = time.time()
        r = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        # The trailing empty item makes join() terminate the last line too.
        return "\n".join(r+[""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename as a "#LWP-Cookies-2.0" file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  ignore_discard and ignore_expires are passed on to
        as_lwp_str().

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename, "w") as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse "Set-Cookie3" lines from the open file f into this jar.

        f: file-like object providing readline(); it is not closed here.
        filename: used only in error messages.
        ignore_discard / ignore_expires: when false, cookies that are marked
            discard / already expired are skipped instead of stored.

        Raises LoadError if the first line does not match self.magic_re or
        if any later line cannot be parsed; IOError propagates unchanged.

        """
        # NOTE(review): magic_re is not defined in this class; presumably
        # it is inherited from a base class -- confirm.
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound up front so the error handler below can always format it,
        # even if the very first readline() in the loop fails.
        line = ""
        try:
            while True:
                line = f.readline()
                if line == "": break
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    standard = {}
                    rest = {}
                    for k in boolean_attrs:
                        standard[k] = False
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            # a bare key (no value) means the flag is set
                            if v is None: v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    if expires is None:
                        # no usable expiry: treat as a session cookie
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)

        except IOError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1935
1936
class MozillaCookieJar(FileCookieJar):
    """

    WARNING: you may want to backup your browser's cookies file if you use
    this class to save cookies.  I *think* it works, but there have been
    bugs in the past!

    This class differs from CookieJar only in the format it uses to save and
    load cookies to and from a file.  This class uses the Mozilla/Netscape
    `cookies.txt' format.  lynx uses this file format, too.

    Don't expect cookies saved while the browser is running to be noticed by
    the browser (in fact, Mozilla on unix will overwrite your saved cookies if
    you change them on disk while it's running; on Windows, you probably can't
    save at all while the browser is running).

    Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
    Netscape cookies on saving.

    In particular, the cookie version and port number information is lost,
    together with information about whether or not Path, Port and Discard were
    specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
    domain as set in the HTTP header started with a dot (yes, I'm aware some
    domains in Netscape files start with a dot and some don't -- trust me, you
    really don't want to know any more about this).

    Note that though Mozilla and Netscape use the same format, they use
    slightly different headers.  The class saves cookies using the Netscape
    header by default (Mozilla can cope with that).

    """
    # Searched for (not anchored) in the first line of a file being loaded.
    magic_re = re.compile("#( Netscape)? HTTP Cookie File")
    # Written verbatim at the top of every saved file.  The text starts at
    # column 0 so that the magic line begins with "#".
    header = """\
# Netscape HTTP Cookie File
# http://www.netscape.com/newsref/std/cookie_spec.html
# This is a generated file! Do not edit.

"""

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse Netscape-format cookie lines from the open file f.

        f: file-like object providing readline(); it is not closed here.
        filename: used only in error messages.
        ignore_discard / ignore_expires: when false, cookies that are marked
            discard / already expired are skipped instead of stored.

        Each data line must hold seven tab-separated fields: domain,
        domain-specified flag, path, secure flag, expires, name, value.

        Raises LoadError if the first line does not match magic_re or if
        any later line cannot be parsed; IOError propagates unchanged.

        """
        now = time.time()

        magic = f.readline()
        if not self.magic_re.search(magic):
            raise LoadError(
                "%r does not look like a Netscape format cookies file" %
                filename)

        # Bound up front so the error handler below can always format it,
        # even if the very first readline() in the loop fails.
        line = ""
        try:
            while True:
                line = f.readline()
                if line == "": break

                # last field may be absent, so keep any trailing tab
                if line.endswith("\n"): line = line[:-1]

                # skip comments and blank lines XXX what is $ for?
                if (line.strip().startswith(("#", "$")) or
                    line.strip() == ""):
                    continue

                domain, domain_specified, path, secure, expires, name, value = \
                        line.split("\t")
                secure = (secure == "TRUE")
                domain_specified = (domain_specified == "TRUE")
                if name == "":
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = value
                    value = None

                initial_dot = domain.startswith(".")
                # Explicit check rather than assert, so the validation is
                # not stripped under -O; the handler below turns it into
                # LoadError.
                if domain_specified != initial_dot:
                    raise ValueError(
                        "domain flag does not match leading dot of "
                        "domain %r" % domain)

                discard = False
                if expires == "":
                    # no expiry recorded: treat as a session cookie
                    expires = None
                    discard = True

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                if not ignore_discard and c.discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                self.set_cookie(c)

        except IOError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Netscape format cookies file %r: %r" %
                            (filename, line))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename in Netscape cookies.txt format.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  When ignore_discard / ignore_expires are false, cookies
        that are marked discard / already expired are not written.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename, "w") as f:
            f.write(self.header)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure: secure = "TRUE"
                else: secure = "FALSE"
                if cookie.domain.startswith("."): initial_dot = "TRUE"
                else: initial_dot = "FALSE"
                if cookie.expires is not None:
                    expires = str(cookie.expires)
                else:
                    expires = ""
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ""
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    "\t".join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value])+
                    "\n")