"""HTTP cookie handling for web clients.

This module has (now fairly distant) origins in Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

Class diagram (note that the classes which do not derive from
FileCookieJar are not distributed with the Python standard library, but
are available from http://wwwsearch.sf.net/):

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

"""

import sys, re, urlparse, copy, time, urllib, logging
from types import StringTypes
try:
    import threading as _threading
except ImportError:
    import dummy_threading as _threading
import httplib  # only for the default HTTP port
from calendar import timegm

debug = logging.getLogger("cookielib").debug

DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")

def reraise_unmasked_exceptions(unmasked=()):
    # There are a few catch-all except: statements in this module, for
    # catching input that's bad in unexpected ways.
    # This function re-raises some exceptions we don't want to trap.
    unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError)
    etype = sys.exc_info()[0]
    if issubclass(etype, unmasked):
        raise
    # swallowed an exception
    import warnings, traceback, StringIO
    f = StringIO.StringIO()
    traceback.print_exc(None, f)
    msg = f.getvalue()
    warnings.warn("cookielib bug!\n%s" % msg, stacklevel=2)


# Date/time conversion
# -----------------------------------------------------------------------------
61
62EPOCH_YEAR = 1970
63def _timegm(tt):
64 year, month, mday, hour, min, sec = tt[:6]
65 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
66 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
67 return timegm(tt)
68 else:
69 return None
70
71DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
72MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
73 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
74MONTHS_LOWER = []
75for month in MONTHS: MONTHS_LOWER.append(month.lower())
76
77def time2isoz(t=None):
78 """Return a string representing time in seconds since epoch, t.
79
80 If the function is called without an argument, it will use the current
81 time.
82
83 The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
84 representing Universal Time (UTC, aka GMT). An example of this format is:
85
86 1994-11-24 08:49:37Z
87
88 """
89 if t is None: t = time.time()
90 year, mon, mday, hour, min, sec = time.gmtime(t)[:6]
91 return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
92 year, mon, mday, hour, min, sec)
93
94def time2netscape(t=None):
95 """Return a string representing time in seconds since epoch, t.
96
97 If the function is called without an argument, it will use the current
98 time.
99
100 The format of the returned string is like this:
101
102 Wed, DD-Mon-YYYY HH:MM:SS GMT
103
104 """
105 if t is None: t = time.time()
106 year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7]
107 return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % (
108 DAYS[wday], mday, MONTHS[mon-1], year, hour, min, sec)
109
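# Illustrative examples for time2isoz() and time2netscape() above (not part
# of the original module).  Note that the time2netscape format string emits
# no comma after the weekday, unlike the "Wed, DD-Mon-YYYY" example in its
# docstring:
#
#     >>> time2isoz(0)
#     '1970-01-01 00:00:00Z'
#     >>> time2netscape(0)
#     'Thu 01-Jan-1970 00:00:00 GMT'
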
110
111UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}
112
113TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
114def offset_from_tz_string(tz):
115 offset = None
116 if tz in UTC_ZONES:
117 offset = 0
118 else:
119 m = TIMEZONE_RE.search(tz)
120 if m:
121 offset = 3600 * int(m.group(2))
122 if m.group(3):
123 offset = offset + 60 * int(m.group(3))
124 if m.group(1) == '-':
125 offset = -offset
126 return offset
127
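# Illustrative examples for offset_from_tz_string() above (not part of the
# original module): recognised UTC names give a zero offset, numeric zones
# give an offset in seconds, and anything else gives None.
#
#     >>> offset_from_tz_string("GMT")
#     0
#     >>> offset_from_tz_string("+0130")
#     5400
#     >>> print offset_from_tz_string("EST")
#     None
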
128def _str2time(day, mon, yr, hr, min, sec, tz):
129 # translate month name to number
130 # month numbers start with 1 (January)
131 try:
132 mon = MONTHS_LOWER.index(mon.lower())+1
133 except ValueError:
134 # maybe it's already a number
135 try:
136 imon = int(mon)
137 except ValueError:
138 return None
139 if 1 <= imon <= 12:
140 mon = imon
141 else:
142 return None
143
144 # make sure clock elements are defined
145 if hr is None: hr = 0
146 if min is None: min = 0
147 if sec is None: sec = 0
148
149 yr = int(yr)
150 day = int(day)
151 hr = int(hr)
152 min = int(min)
153 sec = int(sec)
154
155 if yr < 1000:
156 # find "obvious" year
157 cur_yr = time.localtime(time.time())[0]
158 m = cur_yr % 100
159 tmp = yr
160 yr = yr + cur_yr - m
161 m = m - tmp
162 if abs(m) > 50:
163 if m > 0: yr = yr + 100
164 else: yr = yr - 100
165
166 # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
167 t = _timegm((yr, mon, day, hr, min, sec, tz))
168
169 if t is not None:
170 # adjust time using timezone string, to get absolute time since epoch
171 if tz is None:
172 tz = "UTC"
173 tz = tz.upper()
174 offset = offset_from_tz_string(tz)
175 if offset is None:
176 return None
177 t = t - offset
178
179 return t
180
181STRICT_DATE_RE = re.compile(
182 r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
183 "(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
184WEEKDAY_RE = re.compile(
185 r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
186LOOSE_HTTP_DATE_RE = re.compile(
187 r"""^
188 (\d\d?) # day
189 (?:\s+|[-\/])
190 (\w+) # month
191 (?:\s+|[-\/])
192 (\d+) # year
193 (?:
194 (?:\s+|:) # separator before clock
195 (\d\d?):(\d\d) # hour:min
196 (?::(\d\d))? # optional seconds
197 )? # optional clock
198 \s*
199 ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
200 \s*
201 (?:\(\w+\))? # ASCII representation of timezone in parens.
202 \s*$""", re.X)
203def http2time(text):
204 """Returns time in seconds since epoch of time represented by a string.
205
206 Return value is an integer.
207
    None is returned if the format of the string is unrecognized, the time is outside
209 the representable range, or the timezone string is not recognized. If the
210 string contains no timezone, UTC is assumed.
211
212 The timezone in the string may be numerical (like "-0800" or "+0100") or a
213 string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the
214 timezone strings equivalent to UTC (zero offset) are known to the function.
215
216 The function loosely parses the following formats:
217
218 Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format
219 Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format
220 Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format
221 09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday)
222 08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday)
223 08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday)
224
225 The parser ignores leading and trailing whitespace. The time may be
226 absent.
227
228 If the year is given with only 2 digits, the function will select the
229 century that makes the year closest to the current date.
230
231 """
232 # fast exit for strictly conforming string
233 m = STRICT_DATE_RE.search(text)
234 if m:
235 g = m.groups()
236 mon = MONTHS_LOWER.index(g[1].lower()) + 1
237 tt = (int(g[2]), mon, int(g[0]),
238 int(g[3]), int(g[4]), float(g[5]))
239 return _timegm(tt)
240
241 # No, we need some messy parsing...
242
243 # clean up
244 text = text.lstrip()
245 text = WEEKDAY_RE.sub("", text, 1) # Useless weekday
246
247 # tz is time zone specifier string
248 day, mon, yr, hr, min, sec, tz = [None]*7
249
250 # loose regexp parse
251 m = LOOSE_HTTP_DATE_RE.search(text)
252 if m is not None:
253 day, mon, yr, hr, min, sec, tz = m.groups()
254 else:
255 return None # bad format
256
257 return _str2time(day, mon, yr, hr, min, sec, tz)
258
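# Illustrative examples for http2time() above (not part of the original
# module): the formats listed in the docstring describe the same instant, so
# they parse to the same epoch value (the strict path returns a float, the
# loose path an int, but the values compare equal).
#
#     >>> t = http2time("Wed, 09 Feb 1994 22:23:32 GMT")
#     >>> http2time("09 Feb 1994 22:23:32 GMT") == t
#     True
#     >>> http2time("09-Feb-1994 22:23:32 GMT") == t
#     True
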
259ISO_DATE_RE = re.compile(
260 """^
261 (\d{4}) # year
262 [-\/]?
263 (\d\d?) # numerical month
264 [-\/]?
265 (\d\d?) # day
266 (?:
267 (?:\s+|[-:Tt]) # separator before clock
268 (\d\d?):?(\d\d) # hour:min
269 (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional)
270 )? # optional clock
271 \s*
272 ([-+]?\d\d?:?(:?\d\d)?
273 |Z|z)? # timezone (Z is "zero meridian", i.e. GMT)
274 \s*$""", re.X)
275def iso2time(text):
276 """
277 As for http2time, but parses the ISO 8601 formats:
278
279 1994-02-03 14:15:29 -0100 -- ISO 8601 format
280 1994-02-03 14:15:29 -- zone is optional
281 1994-02-03 -- only date
282 1994-02-03T14:15:29 -- Use T as separator
283 19940203T141529Z -- ISO 8601 compact format
284 19940203 -- only date
285
286 """
287 # clean up
288 text = text.lstrip()
289
290 # tz is time zone specifier string
291 day, mon, yr, hr, min, sec, tz = [None]*7
292
293 # loose regexp parse
294 m = ISO_DATE_RE.search(text)
295 if m is not None:
296 # XXX there's an extra bit of the timezone I'm ignoring here: is
297 # this the right thing to do?
298 yr, mon, day, hr, min, sec, tz, _ = m.groups()
299 else:
300 return None # bad format
301
302 return _str2time(day, mon, yr, hr, min, sec, tz)
303
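# Illustrative example for iso2time() above (not part of the original
# module): the verbose and compact ISO 8601 spellings of one instant parse
# to the same epoch value.
#
#     >>> iso2time("1994-02-03 14:15:29") == iso2time("19940203T141529Z")
#     True
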
304
305# Header parsing
306# -----------------------------------------------------------------------------
307
308def unmatched(match):
309 """Return unmatched part of re.Match object."""
310 start, end = match.span(0)
311 return match.string[:start]+match.string[end:]
312
313HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)")
314HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
315HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)")
316HEADER_ESCAPE_RE = re.compile(r"\\(.)")
317def split_header_words(header_values):
318 r"""Parse header values into a list of lists containing key,value pairs.
319
320 The function knows how to deal with ",", ";" and "=" as well as quoted
321 values after "=". A list of space separated tokens are parsed as if they
322 were separated by ";".
323
324 If the header_values passed as argument contains multiple values, then they
325 are treated as if they were a single value separated by comma ",".
326
327 This means that this function is useful for parsing header fields that
328 follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
329 the requirement for tokens).
330
331 headers = #header
332 header = (token | parameter) *( [";"] (token | parameter))
333
334 token = 1*<any CHAR except CTLs or separators>
335 separators = "(" | ")" | "<" | ">" | "@"
336 | "," | ";" | ":" | "\" | <">
337 | "/" | "[" | "]" | "?" | "="
338 | "{" | "}" | SP | HT
339
340 quoted-string = ( <"> *(qdtext | quoted-pair ) <"> )
341 qdtext = <any TEXT except <">>
342 quoted-pair = "\" CHAR
343
344 parameter = attribute "=" value
345 attribute = token
346 value = token | quoted-string
347
348 Each header is represented by a list of key/value pairs. The value for a
349 simple token (not part of a parameter) is None. Syntactically incorrect
350 headers will not necessarily be parsed as you would want.
351
352 This is easier to describe with some examples:
353
354 >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
355 [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
356 >>> split_header_words(['text/html; charset="iso-8859-1"'])
357 [[('text/html', None), ('charset', 'iso-8859-1')]]
358 >>> split_header_words([r'Basic realm="\"foo\bar\""'])
359 [[('Basic', None), ('realm', '"foobar"')]]
360
361 """
362 assert type(header_values) not in StringTypes
363 result = []
364 for text in header_values:
365 orig_text = text
366 pairs = []
367 while text:
368 m = HEADER_TOKEN_RE.search(text)
369 if m:
370 text = unmatched(m)
371 name = m.group(1)
372 m = HEADER_QUOTED_VALUE_RE.search(text)
373 if m: # quoted value
374 text = unmatched(m)
375 value = m.group(1)
376 value = HEADER_ESCAPE_RE.sub(r"\1", value)
377 else:
378 m = HEADER_VALUE_RE.search(text)
379 if m: # unquoted value
380 text = unmatched(m)
381 value = m.group(1)
382 value = value.rstrip()
383 else:
384 # no value, a lone token
385 value = None
386 pairs.append((name, value))
387 elif text.lstrip().startswith(","):
388 # concatenated headers, as per RFC 2616 section 4.2
389 text = text.lstrip()[1:]
390 if pairs: result.append(pairs)
391 pairs = []
392 else:
393 # skip junk
394 non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
395 assert nr_junk_chars > 0, (
396 "split_header_words bug: '%s', '%s', %s" %
397 (orig_text, text, pairs))
398 text = non_junk
399 if pairs: result.append(pairs)
400 return result
401
402HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
403def join_header_words(lists):
404 """Do the inverse (almost) of the conversion done by split_header_words.
405
406 Takes a list of lists of (key, value) pairs and produces a single header
407 value. Attribute values are quoted if needed.
408
409 >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
410 'text/plain; charset="iso-8859/1"'
411 >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
412 'text/plain, charset="iso-8859/1"'
413
414 """
415 headers = []
416 for pairs in lists:
417 attr = []
418 for k, v in pairs:
419 if v is not None:
420 if not re.search(r"^\w+$", v):
421 v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v) # escape " and \
422 v = '"%s"' % v
423 k = "%s=%s" % (k, v)
424 attr.append(k)
425 if attr: headers.append("; ".join(attr))
426 return ", ".join(headers)
427
428def parse_ns_headers(ns_headers):
429 """Ad-hoc parser for Netscape protocol cookie-attributes.
430
431 The old Netscape cookie format for Set-Cookie can for instance contain
432 an unquoted "," in the expires field, so we have to use this ad-hoc
433 parser instead of split_header_words.
434
435 XXX This may not make the best possible effort to parse all the crap
436 that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient
437 parser is probably better, so could do worse than following that if
438 this ever gives any trouble.
439
440 Currently, this is also used for parsing RFC 2109 cookies.
441
442 """
443 known_attrs = ("expires", "domain", "path", "secure",
444 # RFC 2109 attrs (may turn up in Netscape cookies, too)
445 "port", "max-age")
446
447 result = []
448 for ns_header in ns_headers:
449 pairs = []
450 version_set = False
451 for param in re.split(r";\s*", ns_header):
452 param = param.rstrip()
453 if param == "": continue
454 if "=" not in param:
455 if param.lower() in known_attrs:
456 k, v = param, None
457 else:
458 # cookie with missing value
459 k, v = param, None
460 else:
461 k, v = re.split(r"\s*=\s*", param, 1)
462 k = k.lstrip()
463 if k is not None:
464 lc = k.lower()
465 if lc in known_attrs:
466 k = lc
467 if k == "version":
468 # This is an RFC 2109 cookie. Will be treated as RFC 2965
469 # cookie in rest of code.
470 # Probably it should be parsed with split_header_words, but
471 # that's too much hassle.
472 version_set = True
473 if k == "expires":
474 # convert expires date to seconds since epoch
475 if v.startswith('"'): v = v[1:]
476 if v.endswith('"'): v = v[:-1]
477 v = http2time(v) # None if invalid
478 pairs.append((k, v))
479
480 if pairs:
481 if not version_set:
482 pairs.append(("version", "0"))
483 result.append(pairs)
484
485 return result
486
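# Illustrative example for parse_ns_headers() above (not part of the original
# module): each Set-Cookie value is split on ";", known cookie-attribute
# names are lower-cased, and a version pair is appended when the header did
# not carry one.
#
#     >>> parse_ns_headers(['foo=bar; Path=/; secure'])
#     [[('foo', 'bar'), ('path', '/'), ('secure', None), ('version', '0')]]
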
487
488IPV4_RE = re.compile(r"\.\d+$")
489def is_HDN(text):
490 """Return True if text is a host domain name."""
491 # XXX
492 # This may well be wrong. Which RFC is HDN defined in, if any (for
493 # the purposes of RFC 2965)?
494 # For the current implementation, what about IPv6? Remember to look
    # at other uses of IPV4_RE also, if you change this.
496 if IPV4_RE.search(text):
497 return False
498 if text == "":
499 return False
500 if text[0] == "." or text[-1] == ".":
501 return False
502 return True
503
504def domain_match(A, B):
505 """Return True if domain A domain-matches domain B, according to RFC 2965.
506
507 A and B may be host domain names or IP addresses.
508
509 RFC 2965, section 1:
510
511 Host names can be specified either as an IP address or a HDN string.
512 Sometimes we compare one host name with another. (Such comparisons SHALL
513 be case-insensitive.) Host A's name domain-matches host B's if
514
515 * their host name strings string-compare equal; or
516
517 * A is a HDN string and has the form NB, where N is a non-empty
518 name string, B has the form .B', and B' is a HDN string. (So,
519 x.y.com domain-matches .Y.com but not Y.com.)
520
521 Note that domain-match is not a commutative operation: a.b.c.com
522 domain-matches .c.com, but not the reverse.
523
524 """
525 # Note that, if A or B are IP addresses, the only relevant part of the
526 # definition of the domain-match algorithm is the direct string-compare.
527 A = A.lower()
528 B = B.lower()
529 if A == B:
530 return True
531 if not is_HDN(A):
532 return False
533 i = A.rfind(B)
534 if i == -1 or i == 0:
535 # A does not have form NB, or N is the empty string
536 return False
537 if not B.startswith("."):
538 return False
539 if not is_HDN(B[1:]):
540 return False
541 return True
542
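# Illustrative examples for domain_match() above (not part of the original
# module): matching is suffix-based and not commutative, as the RFC 2965
# rules in the docstring describe.
#
#     >>> domain_match("www.example.com", ".example.com")
#     True
#     >>> domain_match("example.com", ".example.com")
#     False
#     >>> domain_match(".example.com", "www.example.com")
#     False
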
543def liberal_is_HDN(text):
544 """Return True if text is a sort-of-like a host domain name.
545
546 For accepting/blocking domains.
547
548 """
549 if IPV4_RE.search(text):
550 return False
551 return True
552
553def user_domain_match(A, B):
554 """For blocking/accepting domains.
555
556 A and B may be host domain names or IP addresses.
557
558 """
559 A = A.lower()
560 B = B.lower()
561 if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
562 if A == B:
563 # equal IP addresses
564 return True
565 return False
566 initial_dot = B.startswith(".")
567 if initial_dot and A.endswith(B):
568 return True
569 if not initial_dot and A == B:
570 return True
571 return False
572
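# Illustrative examples for user_domain_match() above (not part of the
# original module): the block/allow lists use a looser rule than
# domain_match() -- a leading dot matches any subdomain, and IP addresses
# only match exactly.
#
#     >>> user_domain_match("www.example.com", ".example.com")
#     True
#     >>> user_domain_match("192.168.1.1", "192.168.1.1")
#     True
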
573cut_port_re = re.compile(r":\d+$")
574def request_host(request):
575 """Return request-host, as defined by RFC 2965.
576
577 Variation from RFC: returned value is lowercased, for convenient
578 comparison.
579
580 """
581 url = request.get_full_url()
582 host = urlparse.urlparse(url)[1]
583 if host == "":
584 host = request.get_header("Host", "")
585
586 # remove port, if present
587 host = cut_port_re.sub("", host, 1)
588 return host.lower()
589
590def eff_request_host(request):
591 """Return a tuple (request-host, effective request-host name).
592
593 As defined by RFC 2965, except both are lowercased.
594
595 """
596 erhn = req_host = request_host(request)
597 if req_host.find(".") == -1 and not IPV4_RE.search(req_host):
598 erhn = req_host + ".local"
599 return req_host, erhn
600
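# Illustrative examples for request_host() and eff_request_host() above (not
# part of the original module; they assume a urllib2.Request, but any object
# with the same get_full_url()/get_header() accessors will do):
#
#     >>> import urllib2
#     >>> request_host(urllib2.Request("http://www.example.com:8080/page.html"))
#     'www.example.com'
#     >>> eff_request_host(urllib2.Request("http://localhost/"))
#     ('localhost', 'localhost.local')
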
601def request_path(request):
602 """request-URI, as defined by RFC 2965."""
603 url = request.get_full_url()
604 #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(url)
605 #req_path = escape_path("".join(urlparse.urlparse(url)[2:]))
606 path, parameters, query, frag = urlparse.urlparse(url)[2:]
607 if parameters:
608 path = "%s;%s" % (path, parameters)
609 path = escape_path(path)
610 req_path = urlparse.urlunparse(("", "", path, "", query, frag))
611 if not req_path.startswith("/"):
612 # fix bad RFC 2396 absoluteURI
613 req_path = "/"+req_path
614 return req_path
615
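# Illustrative example for request_path() above (not part of the original
# module; again assuming a urllib2.Request):
#
#     >>> request_path(urllib2.Request("http://www.example.com/a/b.html?q=1"))
#     '/a/b.html?q=1'
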
616def request_port(request):
617 host = request.get_host()
618 i = host.find(':')
619 if i >= 0:
620 port = host[i+1:]
621 try:
622 int(port)
623 except ValueError:
624 debug("nonnumeric port: '%s'", port)
625 return None
626 else:
627 port = DEFAULT_HTTP_PORT
628 return port
629
630# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
631# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
632HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
633ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
634def uppercase_escaped_char(match):
635 return "%%%s" % match.group(1).upper()
636def escape_path(path):
637 """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
638 # There's no knowing what character encoding was used to create URLs
639 # containing %-escapes, but since we have to pick one to escape invalid
640 # path characters, we pick UTF-8, as recommended in the HTML 4.0
641 # specification:
642 # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
643 # And here, kind of: draft-fielding-uri-rfc2396bis-03
644 # (And in draft IRI specification: draft-duerst-iri-05)
645 # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, unicode):
        path = path.encode("utf-8")
648 path = urllib.quote(path, HTTP_PATH_SAFE)
649 path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path)
650 return path
651
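# Illustrative example for escape_path() above (not part of the original
# module): invalid characters are %-escaped (as UTF-8) and existing escapes
# are normalised to upper case.
#
#     >>> escape_path("/foo bar/%7e")
#     '/foo%20bar/%7E'
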
652def reach(h):
653 """Return reach of host h, as defined by RFC 2965, section 1.
654
655 The reach R of a host name H is defined as follows:
656
657 * If
658
659 - H is the host domain name of a host; and,
660
661 - H has the form A.B; and
662
663 - A has no embedded (that is, interior) dots; and
664
665 - B has at least one embedded dot, or B is the string "local".
666 then the reach of H is .B.
667
668 * Otherwise, the reach of H is H.
669
670 >>> reach("www.acme.com")
671 '.acme.com'
672 >>> reach("acme.com")
673 'acme.com'
674 >>> reach("acme.local")
675 '.local'
676
677 """
678 i = h.find(".")
679 if i >= 0:
680 #a = h[:i] # this line is only here to show what a is
681 b = h[i+1:]
682 i = b.find(".")
683 if is_HDN(h) and (i >= 0 or b == "local"):
684 return "."+b
685 return h
686
687def is_third_party(request):
688 """
689
690 RFC 2965, section 3.3.6:
691
692 An unverifiable transaction is to a third-party host if its request-
693 host U does not domain-match the reach R of the request-host O in the
694 origin transaction.
695
696 """
697 req_host = request_host(request)
698 if not domain_match(req_host, reach(request.get_origin_req_host())):
699 return True
700 else:
701 return False
702
703
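# Illustrative example for is_third_party() above (not part of the original
# module; it assumes the urllib2.Request constructor's origin_req_host
# argument, which was added to urllib2 alongside this module):
#
#     >>> import urllib2
#     >>> req = urllib2.Request("http://ads.example.net/img",
#     ...                       origin_req_host="www.example.com")
#     >>> is_third_party(req)
#     True
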
704class Cookie:
705 """HTTP Cookie.
706
707 This class represents both Netscape and RFC 2965 cookies.
708
709 This is deliberately a very simple class. It just holds attributes. It's
710 possible to construct Cookie instances that don't comply with the cookie
711 standards. CookieJar.make_cookies is the factory function for Cookie
712 objects -- it deals with cookie parsing, supplying defaults, and
713 normalising to the representation used in this class. CookiePolicy is
714 responsible for checking them to see whether they should be accepted from
715 and returned to the server.
716
717 Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.
719
720 """
721
722 def __init__(self, version, name, value,
723 port, port_specified,
724 domain, domain_specified, domain_initial_dot,
725 path, path_specified,
726 secure,
727 expires,
728 discard,
729 comment,
730 comment_url,
731 rest):
732
733 if version is not None: version = int(version)
734 if expires is not None: expires = int(expires)
735 if port is None and port_specified is True:
736 raise ValueError("if port is None, port_specified must be false")
737
738 self.version = version
739 self.name = name
740 self.value = value
741 self.port = port
742 self.port_specified = port_specified
743 # normalise case, as per RFC 2965 section 3.3.3
744 self.domain = domain.lower()
745 self.domain_specified = domain_specified
746 # Sigh. We need to know whether the domain given in the
747 # cookie-attribute had an initial dot, in order to follow RFC 2965
748 # (as clarified in draft errata). Needed for the returned $Domain
749 # value.
750 self.domain_initial_dot = domain_initial_dot
751 self.path = path
752 self.path_specified = path_specified
753 self.secure = secure
754 self.expires = expires
755 self.discard = discard
756 self.comment = comment
757 self.comment_url = comment_url
758
759 self._rest = copy.copy(rest)
760
761 def has_nonstandard_attr(self, name):
762 return name in self._rest
763 def get_nonstandard_attr(self, name, default=None):
764 return self._rest.get(name, default)
765 def set_nonstandard_attr(self, name, value):
766 self._rest[name] = value
767
768 def is_expired(self, now=None):
769 if now is None: now = time.time()
770 if (self.expires is not None) and (self.expires <= now):
771 return True
772 return False
773
774 def __str__(self):
775 if self.port is None: p = ""
776 else: p = ":"+self.port
777 limit = self.domain + p + self.path
778 if self.value is not None:
779 namevalue = "%s=%s" % (self.name, self.value)
780 else:
781 namevalue = self.name
782 return "<Cookie %s for %s>" % (namevalue, limit)
783
784 def __repr__(self):
785 args = []
786 for name in ["version", "name", "value",
787 "port", "port_specified",
788 "domain", "domain_specified", "domain_initial_dot",
789 "path", "path_specified",
790 "secure", "expires", "discard", "comment", "comment_url",
791 ]:
792 attr = getattr(self, name)
793 args.append("%s=%s" % (name, repr(attr)))
794 args.append("rest=%s" % repr(self._rest))
795 return "Cookie(%s)" % ", ".join(args)
796
797
798class CookiePolicy:
799 """Defines which cookies get accepted from and returned to server.
800
801 May also modify cookies, though this is probably a bad idea.
802
803 The subclass DefaultCookiePolicy defines the standard rules for Netscape
804 and RFC 2965 cookies -- override that if you want a customised policy.
805
806 """
807 def set_ok(self, cookie, request):
808 """Return true if (and only if) cookie should be accepted from server.
809
810 Currently, pre-expired cookies never get this far -- the CookieJar
811 class deletes such cookies itself.
812
813 """
814 raise NotImplementedError()
815
816 def return_ok(self, cookie, request):
817 """Return true if (and only if) cookie should be returned to server."""
818 raise NotImplementedError()
819
820 def domain_return_ok(self, domain, request):
821 """Return false if cookies should not be returned, given cookie domain.
822 """
823 return True
824
825 def path_return_ok(self, path, request):
826 """Return false if cookies should not be returned, given cookie path.
827 """
828 return True
829
830
831class DefaultCookiePolicy(CookiePolicy):
832 """Implements the standard rules for accepting and returning cookies."""
833
834 DomainStrictNoDots = 1
835 DomainStrictNonDomain = 2
836 DomainRFC2965Match = 4
837
838 DomainLiberal = 0
839 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
840
841 def __init__(self,
842 blocked_domains=None, allowed_domains=None,
843 netscape=True, rfc2965=False,
844 hide_cookie2=False,
845 strict_domain=False,
846 strict_rfc2965_unverifiable=True,
847 strict_ns_unverifiable=False,
848 strict_ns_domain=DomainLiberal,
849 strict_ns_set_initial_dollar=False,
850 strict_ns_set_path=False,
851 ):
852 """Constructor arguments should be passed as keyword arguments only."""
853 self.netscape = netscape
854 self.rfc2965 = rfc2965
855 self.hide_cookie2 = hide_cookie2
856 self.strict_domain = strict_domain
857 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
858 self.strict_ns_unverifiable = strict_ns_unverifiable
859 self.strict_ns_domain = strict_ns_domain
860 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
861 self.strict_ns_set_path = strict_ns_set_path
862
863 if blocked_domains is not None:
864 self._blocked_domains = tuple(blocked_domains)
865 else:
866 self._blocked_domains = ()
867
868 if allowed_domains is not None:
869 allowed_domains = tuple(allowed_domains)
870 self._allowed_domains = allowed_domains
871
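    # Illustrative usage sketch (not part of the original module): a policy
    # that refuses cookies from a couple of ad hosts and is strict about
    # third-party Netscape cookies, attached to a CookieJar.
    #
    #     policy = DefaultCookiePolicy(
    #         blocked_domains=["ads.example.com", ".doubleclick.net"],
    #         strict_ns_unverifiable=True)
    #     jar = CookieJar(policy)
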
872 def blocked_domains(self):
873 """Return the sequence of blocked domains (as a tuple)."""
874 return self._blocked_domains
875 def set_blocked_domains(self, blocked_domains):
876 """Set the sequence of blocked domains."""
877 self._blocked_domains = tuple(blocked_domains)
878
879 def is_blocked(self, domain):
880 for blocked_domain in self._blocked_domains:
881 if user_domain_match(domain, blocked_domain):
882 return True
883 return False
884
885 def allowed_domains(self):
886 """Return None, or the sequence of allowed domains (as a tuple)."""
887 return self._allowed_domains
888 def set_allowed_domains(self, allowed_domains):
889 """Set the sequence of allowed domains, or None."""
890 if allowed_domains is not None:
891 allowed_domains = tuple(allowed_domains)
892 self._allowed_domains = allowed_domains
893
894 def is_not_allowed(self, domain):
895 if self._allowed_domains is None:
896 return False
897 for allowed_domain in self._allowed_domains:
898 if user_domain_match(domain, allowed_domain):
899 return False
900 return True
901
902 def set_ok(self, cookie, request):
903 """
904 If you override .set_ok(), be sure to call this method. If it returns
905 false, so should your subclass (assuming your subclass wants to be more
906 strict about which cookies to accept).
907
908 """
909 debug(" - checking cookie %s=%s", cookie.name, cookie.value)
910
911 assert cookie.name is not None
912
913 for n in "version", "verifiability", "name", "path", "domain", "port":
914 fn_name = "set_ok_"+n
915 fn = getattr(self, fn_name)
916 if not fn(cookie, request):
917 return False
918
919 return True
920
921 def set_ok_version(self, cookie, request):
922 if cookie.version is None:
923 # Version is always set to 0 by parse_ns_headers if it's a Netscape
924 # cookie, so this must be an invalid RFC 2965 cookie.
925 debug(" Set-Cookie2 without version attribute (%s=%s)",
926 cookie.name, cookie.value)
927 return False
928 if cookie.version > 0 and not self.rfc2965:
929 debug(" RFC 2965 cookies are switched off")
930 return False
931 elif cookie.version == 0 and not self.netscape:
932 debug(" Netscape cookies are switched off")
933 return False
934 return True
935
936 def set_ok_verifiability(self, cookie, request):
937 if request.is_unverifiable() and is_third_party(request):
938 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
939 debug(" third-party RFC 2965 cookie during "
940 "unverifiable transaction")
941 return False
942 elif cookie.version == 0 and self.strict_ns_unverifiable:
943 debug(" third-party Netscape cookie during "
944 "unverifiable transaction")
945 return False
946 return True
947
948 def set_ok_name(self, cookie, request):
949 # Try and stop servers setting V0 cookies designed to hack other
950 # servers that know both V0 and V1 protocols.
951 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
952 cookie.name.startswith("$")):
953 debug(" illegal name (starts with '$'): '%s'", cookie.name)
954 return False
955 return True
956
957 def set_ok_path(self, cookie, request):
958 if cookie.path_specified:
959 req_path = request_path(request)
960 if ((cookie.version > 0 or
961 (cookie.version == 0 and self.strict_ns_set_path)) and
962 not req_path.startswith(cookie.path)):
963 debug(" path attribute %s is not a prefix of request "
964 "path %s", cookie.path, req_path)
965 return False
966 return True
967
968 def set_ok_domain(self, cookie, request):
969 if self.is_blocked(cookie.domain):
970 debug(" domain %s is in user block-list", cookie.domain)
971 return False
972 if self.is_not_allowed(cookie.domain):
973 debug(" domain %s is not in user allow-list", cookie.domain)
974 return False
975 if cookie.domain_specified:
976 req_host, erhn = eff_request_host(request)
977 domain = cookie.domain
978 if self.strict_domain and (domain.count(".") >= 2):
979 i = domain.rfind(".")
980 j = domain.rfind(".", 0, i)
981 if j == 0: # domain like .foo.bar
982 tld = domain[i+1:]
983 sld = domain[j+1:i]
984 if (sld.lower() in [
985 "co", "ac",
986 "com", "edu", "org", "net", "gov", "mil", "int"] and
987 len(tld) == 2):
988 # domain like .co.uk
989 debug(" country-code second level domain %s", domain)
990 return False
991 if domain.startswith("."):
992 undotted_domain = domain[1:]
993 else:
994 undotted_domain = domain
995 embedded_dots = (undotted_domain.find(".") >= 0)
996 if not embedded_dots and domain != ".local":
997 debug(" non-local domain %s contains no embedded dot",
998 domain)
999 return False
1000 if cookie.version == 0:
1001 if (not erhn.endswith(domain) and
1002 (not erhn.startswith(".") and
1003 not ("."+erhn).endswith(domain))):
1004 debug(" effective request-host %s (even with added "
                          "initial dot) does not end with %s",
1006 erhn, domain)
1007 return False
1008 if (cookie.version > 0 or
1009 (self.strict_ns_domain & self.DomainRFC2965Match)):
1010 if not domain_match(erhn, domain):
1011 debug(" effective request-host %s does not domain-match "
1012 "%s", erhn, domain)
1013 return False
1014 if (cookie.version > 0 or
1015 (self.strict_ns_domain & self.DomainStrictNoDots)):
1016 host_prefix = req_host[:-len(domain)]
1017 if (host_prefix.find(".") >= 0 and
1018 not IPV4_RE.search(req_host)):
1019 debug(" host prefix %s for domain %s contains a dot",
1020 host_prefix, domain)
1021 return False
1022 return True
1023
1024 def set_ok_port(self, cookie, request):
1025 if cookie.port_specified:
1026 req_port = request_port(request)
1027 if req_port is None:
1028 req_port = "80"
1029 else:
1030 req_port = str(req_port)
1031 for p in cookie.port.split(","):
1032 try:
1033 int(p)
1034 except ValueError:
1035 debug(" bad port %s (not numeric)", p)
1036 return False
1037 if p == req_port:
1038 break
1039 else:
1040 debug(" request port (%s) not found in %s",
1041 req_port, cookie.port)
1042 return False
1043 return True
1044
1045 def return_ok(self, cookie, request):
1046 """
1047 If you override .return_ok(), be sure to call this method. If it
1048 returns false, so should your subclass (assuming your subclass wants to
1049 be more strict about which cookies to return).
1050
1051 """
1052 # Path has already been checked by .path_return_ok(), and domain
1053 # blocking done by .domain_return_ok().
1054 debug(" - checking cookie %s=%s", cookie.name, cookie.value)
1055
1056 for n in "version", "verifiability", "secure", "expires", "port", "domain":
1057 fn_name = "return_ok_"+n
1058 fn = getattr(self, fn_name)
1059 if not fn(cookie, request):
1060 return False
1061 return True
1062
1063 def return_ok_version(self, cookie, request):
1064 if cookie.version > 0 and not self.rfc2965:
1065 debug(" RFC 2965 cookies are switched off")
1066 return False
1067 elif cookie.version == 0 and not self.netscape:
1068 debug(" Netscape cookies are switched off")
1069 return False
1070 return True
1071
1072 def return_ok_verifiability(self, cookie, request):
1073 if request.is_unverifiable() and is_third_party(request):
1074 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
1075 debug(" third-party RFC 2965 cookie during unverifiable "
1076 "transaction")
1077 return False
1078 elif cookie.version == 0 and self.strict_ns_unverifiable:
1079 debug(" third-party Netscape cookie during unverifiable "
1080 "transaction")
1081 return False
1082 return True
1083
1084 def return_ok_secure(self, cookie, request):
1085 if cookie.secure and request.get_type() != "https":
1086 debug(" secure cookie with non-secure request")
1087 return False
1088 return True
1089
1090 def return_ok_expires(self, cookie, request):
1091 if cookie.is_expired(self._now):
1092 debug(" cookie expired")
1093 return False
1094 return True
1095
1096 def return_ok_port(self, cookie, request):
1097 if cookie.port:
1098 req_port = request_port(request)
1099 if req_port is None:
1100 req_port = "80"
1101 for p in cookie.port.split(","):
1102 if p == req_port:
1103 break
1104 else:
1105 debug(" request port %s does not match cookie port %s",
1106 req_port, cookie.port)
1107 return False
1108 return True
1109
1110 def return_ok_domain(self, cookie, request):
1111 req_host, erhn = eff_request_host(request)
1112 domain = cookie.domain
1113
1114 # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
1115 if (cookie.version == 0 and
1116 (self.strict_ns_domain & self.DomainStrictNonDomain) and
1117 not cookie.domain_specified and domain != erhn):
1118 debug(" cookie with unspecified domain does not string-compare "
1119 "equal to request domain")
1120 return False
1121
1122 if cookie.version > 0 and not domain_match(erhn, domain):
1123 debug(" effective request-host name %s does not domain-match "
1124 "RFC 2965 cookie domain %s", erhn, domain)
1125 return False
1126 if cookie.version == 0 and not ("."+erhn).endswith(domain):
1127 debug(" request-host %s does not match Netscape cookie domain "
1128 "%s", req_host, domain)
1129 return False
1130 return True
1131
    def domain_return_ok(self, domain, request):
        # Liberal check of domain.  This is here as an optimization to avoid
        # having to load lots of MSIE cookie files unless necessary.
        req_host, erhn = eff_request_host(request)
        if not req_host.startswith("."):
            req_host = "."+req_host
        if not erhn.startswith("."):
            erhn = "."+erhn
        if not (req_host.endswith(domain) or erhn.endswith(domain)):
            #debug("   request domain %s does not match cookie domain %s",
            #      req_host, domain)
            return False
1144
1145 if self.is_blocked(domain):
1146 debug(" domain %s is in user block-list", domain)
1147 return False
1148 if self.is_not_allowed(domain):
1149 debug(" domain %s is not in user allow-list", domain)
1150 return False
1151
1152 return True
1153
1154 def path_return_ok(self, path, request):
1155 debug("- checking cookie path=%s", path)
1156 req_path = request_path(request)
1157 if not req_path.startswith(path):
1158 debug(" %s does not path-match %s", req_path, path)
1159 return False
1160 return True
1161
1162
1163def vals_sorted_by_key(adict):
1164 keys = adict.keys()
1165 keys.sort()
1166 return map(adict.get, keys)
1167
1168def deepvalues(mapping):
1169 """Iterates over nested mapping, depth-first, in sorted order by key."""
1170 values = vals_sorted_by_key(mapping)
1171 for obj in values:
1172 mapping = False
1173 try:
1174 obj.items
1175 except AttributeError:
1176 pass
1177 else:
1178 mapping = True
1179 for subobj in deepvalues(obj):
1180 yield subobj
1181 if not mapping:
1182 yield obj
1183
1184
1185# Used as second parameter to dict.get() method, to distinguish absent
1186# dict key from one with a None value.
1187class Absent: pass
1188
1189class CookieJar:
1190 """Collection of HTTP cookies.
1191
1192 You may not need to know about this class: try
1193 urllib2.build_opener(HTTPCookieProcessor).open(url).
1194
1195 """
1196
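    # Illustrative usage sketch (not part of the original module), expanding
    # on the hint in the docstring above; it assumes the Python 2.x urllib2
    # module, which gained cookie support together with cookielib:
    #
    #     import cookielib, urllib2
    #     jar = cookielib.CookieJar()
    #     opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    #     response = opener.open("http://example.com/")
    #     # jar now holds any cookies the server set, and will add them to
    #     # later requests made through this opener.
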
1197 non_word_re = re.compile(r"\W")
1198 quote_re = re.compile(r"([\"\\])")
1199 strict_domain_re = re.compile(r"\.?[^.]*")
1200 domain_re = re.compile(r"[^.]*")
1201 dots_re = re.compile(r"^\.+")
1202
1203 magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
1204
1205 def __init__(self, policy=None):
1206 if policy is None:
1207 policy = DefaultCookiePolicy()
1208 self._policy = policy
1209
1210 self._cookies_lock = _threading.RLock()
1211 self._cookies = {}
1212
1213 def set_policy(self, policy):
1214 self._policy = policy
1215
1216 def _cookies_for_domain(self, domain, request):
1217 cookies = []
1218 if not self._policy.domain_return_ok(domain, request):
1219 return []
1220 debug("Checking %s for cookies to return", domain)
1221 cookies_by_path = self._cookies[domain]
1222 for path in cookies_by_path.keys():
1223 if not self._policy.path_return_ok(path, request):
1224 continue
1225 cookies_by_name = cookies_by_path[path]
1226 for cookie in cookies_by_name.values():
1227 if not self._policy.return_ok(cookie, request):
1228 debug(" not returning cookie")
1229 continue
1230 debug(" it's a match")
1231 cookies.append(cookie)
1232 return cookies
1233
1234 def _cookies_for_request(self, request):
1235 """Return a list of cookies to be returned to server."""
1236 cookies = []
1237 for domain in self._cookies.keys():
1238 cookies.extend(self._cookies_for_domain(domain, request))
1239 return cookies
1240
1241 def _cookie_attrs(self, cookies):
1242 """Return a list of cookie-attributes to be returned to server.
1243
1244 like ['foo="bar"; $Path="/"', ...]
1245
1246 The $Version attribute is also added when appropriate (currently only
1247 once per request).
1248
1249 """
1250 # add cookies in order of most specific (ie. longest) path first
1251 def decreasing_size(a, b): return cmp(len(b.path), len(a.path))
1252 cookies.sort(decreasing_size)
1253
1254 version_set = False
1255
1256 attrs = []
1257 for cookie in cookies:
1258 # set version of Cookie header
1259 # XXX
1260 # What should it be if multiple matching Set-Cookie headers have
1261 # different versions themselves?
1262 # Answer: there is no answer; was supposed to be settled by
1263 # RFC 2965 errata, but that may never appear...
1264 version = cookie.version
1265 if not version_set:
1266 version_set = True
1267 if version > 0:
1268 attrs.append("$Version=%s" % version)
1269
1270 # quote cookie value if necessary
1271 # (not for Netscape protocol, which already has any quotes
1272 # intact, due to the poorly-specified Netscape Cookie: syntax)
1273 if ((cookie.value is not None) and
1274 self.non_word_re.search(cookie.value) and version > 0):
1275 value = self.quote_re.sub(r"\\\1", cookie.value)
1276 else:
1277 value = cookie.value
1278
1279 # add cookie-attributes to be returned in Cookie header
1280 if cookie.value is None:
1281 attrs.append(cookie.name)
1282 else:
1283 attrs.append("%s=%s" % (cookie.name, value))
1284 if version > 0:
1285 if cookie.path_specified:
1286 attrs.append('$Path="%s"' % cookie.path)
1287 if cookie.domain.startswith("."):
1288 domain = cookie.domain
1289 if (not cookie.domain_initial_dot and
1290 domain.startswith(".")):
1291 domain = domain[1:]
1292 attrs.append('$Domain="%s"' % domain)
1293 if cookie.port is not None:
1294 p = "$Port"
1295 if cookie.port_specified:
1296 p = p + ('="%s"' % cookie.port)
1297 attrs.append(p)
1298
1299 return attrs
1300
1301 def add_cookie_header(self, request):
1302 """Add correct Cookie: header to request (urllib2.Request object).
1303
1304 The Cookie2 header is also added unless policy.hide_cookie2 is true.
1305
1306 """
1307 debug("add_cookie_header")
1308 self._cookies_lock.acquire()
1309
1310 self._policy._now = self._now = int(time.time())
1311
1312 req_host, erhn = eff_request_host(request)
1313 strict_non_domain = (
1314 self._policy.strict_ns_domain & self._policy.DomainStrictNonDomain)
1315
1316 cookies = self._cookies_for_request(request)
1317
1318 attrs = self._cookie_attrs(cookies)
1319 if attrs:
1320 if not request.has_header("Cookie"):
1321 request.add_unredirected_header(
1322 "Cookie", "; ".join(attrs))
1323
1324 # if necessary, advertise that we know RFC 2965
1325 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1326 not request.has_header("Cookie2")):
1327 for cookie in cookies:
1328 if cookie.version != 1:
1329 request.add_unredirected_header("Cookie2", '$Version="1"')
1330 break
1331
1332 self._cookies_lock.release()
1333
1334 self.clear_expired_cookies()
1335
1336 def _normalized_cookie_tuples(self, attrs_set):
1337 """Return list of tuples containing normalised cookie information.
1338
1339 attrs_set is the list of lists of key,value pairs extracted from
1340 the Set-Cookie or Set-Cookie2 headers.
1341
1342 Tuples are name, value, standard, rest, where name and value are the
1343 cookie name and value, standard is a dictionary containing the standard
1344 cookie-attributes (discard, secure, version, expires or max-age,
1345 domain, path and port) and rest is a dictionary containing the rest of
1346 the cookie-attributes.
1347
1348 """
1349 cookie_tuples = []
1350
1351 boolean_attrs = "discard", "secure"
1352 value_attrs = ("version",
1353 "expires", "max-age",
1354 "domain", "path", "port",
1355 "comment", "commenturl")
1356
1357 for cookie_attrs in attrs_set:
1358 name, value = cookie_attrs[0]
1359
1360 # Build dictionary of standard cookie-attributes (standard) and
1361 # dictionary of other cookie-attributes (rest).
1362
1363 # Note: expiry time is normalised to seconds since epoch. V0
1364 # cookies should have the Expires cookie-attribute, and V1 cookies
1365 # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1366 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1367 # accept either (but prefer Max-Age).
1368 max_age_set = False
1369
1370 bad_cookie = False
1371
1372 standard = {}
1373 rest = {}
1374 for k, v in cookie_attrs[1:]:
1375 lc = k.lower()
1376 # don't lose case distinction for unknown fields
1377 if lc in value_attrs or lc in boolean_attrs:
1378 k = lc
1379 if k in boolean_attrs and v is None:
1380 # boolean cookie-attribute is present, but has no value
1381 # (like "discard", rather than "port=80")
1382 v = True
1383 if k in standard:
1384 # only first value is significant
1385 continue
1386 if k == "domain":
1387 if v is None:
1388 debug(" missing value for domain attribute")
1389 bad_cookie = True
1390 break
1391 # RFC 2965 section 3.3.3
1392 v = v.lower()
1393 if k == "expires":
1394 if max_age_set:
1395 # Prefer max-age to expires (like Mozilla)
1396 continue
1397 if v is None:
1398 debug(" missing or invalid value for expires "
1399 "attribute: treating as session cookie")
1400 continue
1401 if k == "max-age":
1402 max_age_set = True
1403 try:
1404 v = int(v)
1405 except ValueError:
1406 debug(" missing or invalid (non-numeric) value for "
1407 "max-age attribute")
1408 bad_cookie = True
1409 break
1410 # convert RFC 2965 Max-Age to seconds since epoch
1411 # XXX Strictly you're supposed to follow RFC 2616
                    #   age-calculation rules.  Remember that zero Max-Age is a
                    #   request to discard (old and new) cookie, though.
1414 k = "expires"
1415 v = self._now + v
1416 if (k in value_attrs) or (k in boolean_attrs):
1417 if (v is None and
1418 k not in ["port", "comment", "commenturl"]):
1419 debug(" missing value for %s attribute" % k)
1420 bad_cookie = True
1421 break
1422 standard[k] = v
1423 else:
1424 rest[k] = v
1425
1426 if bad_cookie:
1427 continue
1428
1429 cookie_tuples.append((name, value, standard, rest))
1430
1431 return cookie_tuples
1432
1433 def _cookie_from_cookie_tuple(self, tup, request):
1434 # standard is dict of standard cookie-attributes, rest is dict of the
1435 # rest of them
1436 name, value, standard, rest = tup
1437
1438 domain = standard.get("domain", Absent)
1439 path = standard.get("path", Absent)
1440 port = standard.get("port", Absent)
1441 expires = standard.get("expires", Absent)
1442
1443 # set the easy defaults
1444 version = standard.get("version", None)
1445 if version is not None: version = int(version)
1446 secure = standard.get("secure", False)
1447 # (discard is also set if expires is Absent)
1448 discard = standard.get("discard", False)
1449 comment = standard.get("comment", None)
1450 comment_url = standard.get("commenturl", None)
1451
1452 # set default path
1453 if path is not Absent and path != "":
1454 path_specified = True
1455 path = escape_path(path)
1456 else:
1457 path_specified = False
1458 path = request_path(request)
1459 i = path.rfind("/")
1460 if i != -1:
1461 if version == 0:
1462 # Netscape spec parts company from reality here
1463 path = path[:i]
1464 else:
1465 path = path[:i+1]
1466 if len(path) == 0: path = "/"
1467
1468 # set default domain
1469 domain_specified = domain is not Absent
1470 # but first we have to remember whether it starts with a dot
1471 domain_initial_dot = False
1472 if domain_specified:
1473 domain_initial_dot = bool(domain.startswith("."))
1474 if domain is Absent:
1475 req_host, erhn = eff_request_host(request)
1476 domain = erhn
1477 elif not domain.startswith("."):
1478 domain = "."+domain
1479
1480 # set default port
1481 port_specified = False
1482 if port is not Absent:
1483 if port is None:
1484 # Port attr present, but has no value: default to request port.
1485 # Cookie should then only be sent back on that port.
1486 port = request_port(request)
1487 else:
1488 port_specified = True
1489 port = re.sub(r"\s+", "", port)
1490 else:
1491 # No port attr present. Cookie can be sent back on any port.
1492 port = None
1493
1494 # set default expires and discard
1495 if expires is Absent:
1496 expires = None
1497 discard = True
1498 elif expires <= self._now:
1499 # Expiry date in past is request to delete cookie. This can't be
1500 # in DefaultCookiePolicy, because can't delete cookies there.
1501 try:
1502 self.clear(domain, path, name)
1503 except KeyError:
1504 pass
1505 debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1506 domain, path, name)
1507 return None
1508
1509 return Cookie(version,
1510 name, value,
1511 port, port_specified,
1512 domain, domain_specified, domain_initial_dot,
1513 path, path_specified,
1514 secure,
1515 expires,
1516 discard,
1517 comment,
1518 comment_url,
1519 rest)
1520
1521 def _cookies_from_attrs_set(self, attrs_set, request):
1522 cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1523
1524 cookies = []
1525 for tup in cookie_tuples:
1526 cookie = self._cookie_from_cookie_tuple(tup, request)
1527 if cookie: cookies.append(cookie)
1528 return cookies
1529
1530 def make_cookies(self, response, request):
1531 """Return sequence of Cookie objects extracted from response object."""
1532 # get cookie-attributes for RFC 2965 and Netscape protocols
1533 headers = response.info()
1534 rfc2965_hdrs = headers.getheaders("Set-Cookie2")
1535 ns_hdrs = headers.getheaders("Set-Cookie")
1536
1537 rfc2965 = self._policy.rfc2965
1538 netscape = self._policy.netscape
1539
1540 if ((not rfc2965_hdrs and not ns_hdrs) or
1541 (not ns_hdrs and not rfc2965) or
1542 (not rfc2965_hdrs and not netscape) or
1543 (not netscape and not rfc2965)):
1544 return [] # no relevant cookie headers: quick exit
1545
1546 try:
1547 cookies = self._cookies_from_attrs_set(
1548 split_header_words(rfc2965_hdrs), request)
1549 except:
1550 reraise_unmasked_exceptions()
1551 cookies = []
1552
1553 if ns_hdrs and netscape:
1554 try:
1555 ns_cookies = self._cookies_from_attrs_set(
1556 parse_ns_headers(ns_hdrs), request)
1557 except:
1558 reraise_unmasked_exceptions()
1559 ns_cookies = []
1560
1561 # Look for Netscape cookies (from Set-Cookie headers) that match
1562 # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1563 # For each match, keep the RFC 2965 cookie and ignore the Netscape
1564 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are
1565 # bundled in with the Netscape cookies for this purpose, which is
1566 # reasonable behaviour.
1567 if rfc2965:
1568 lookup = {}
1569 for cookie in cookies:
1570 lookup[(cookie.domain, cookie.path, cookie.name)] = None
1571
1572 def no_matching_rfc2965(ns_cookie, lookup=lookup):
1573 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1574 return key not in lookup
1575 ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1576
1577 if ns_cookies:
1578 cookies.extend(ns_cookies)
1579
1580 return cookies
1581
1582 def set_cookie_if_ok(self, cookie, request):
1583 """Set a cookie if policy says it's OK to do so."""
1584 self._cookies_lock.acquire()
1585 self._policy._now = self._now = int(time.time())
1586
1587 if self._policy.set_ok(cookie, request):
1588 self.set_cookie(cookie)
1589
1590 self._cookies_lock.release()
1591
1592 def set_cookie(self, cookie):
1593 """Set a cookie, without checking whether or not it should be set."""
1594 c = self._cookies
1595 self._cookies_lock.acquire()
1596 try:
1597 if cookie.domain not in c: c[cookie.domain] = {}
1598 c2 = c[cookie.domain]
1599 if cookie.path not in c2: c2[cookie.path] = {}
1600 c3 = c2[cookie.path]
1601 c3[cookie.name] = cookie
1602 finally:
1603 self._cookies_lock.release()
1604
1605 def extract_cookies(self, response, request):
1606 """Extract cookies from response, where allowable given the request."""
1607 debug("extract_cookies: %s", response.info())
1608 self._cookies_lock.acquire()
1609 self._policy._now = self._now = int(time.time())
1610
1611 for cookie in self.make_cookies(response, request):
1612 if self._policy.set_ok(cookie, request):
1613 debug(" setting cookie: %s", cookie)
1614 self.set_cookie(cookie)
1615 self._cookies_lock.release()
1616
1617 def clear(self, domain=None, path=None, name=None):
1618 """Clear some cookies.
1619
1620 Invoking this method without arguments will clear all cookies. If
1621 given a single argument, only cookies belonging to that domain will be
1622 removed. If given two arguments, cookies belonging to the specified
1623 path within that domain are removed. If given three arguments, then
1624 the cookie with the specified name, path and domain is removed.
1625
1626 Raises KeyError if no matching cookie exists.
1627
1628 """
1629 if name is not None:
1630 if (domain is None) or (path is None):
1631 raise ValueError(
1632 "domain and path must be given to remove a cookie by name")
1633 del self._cookies[domain][path][name]
1634 elif path is not None:
1635 if domain is None:
1636 raise ValueError(
1637 "domain must be given to remove cookies by path")
1638 del self._cookies[domain][path]
1639 elif domain is not None:
1640 del self._cookies[domain]
1641 else:
1642 self._cookies = {}
1643
1644 def clear_session_cookies(self):
1645 """Discard all session cookies.
1646
1647 Note that the .save() method won't save session cookies anyway, unless
1648 you ask otherwise by passing a true ignore_discard argument.
1649
1650 """
1651 self._cookies_lock.acquire()
1652 for cookie in self:
1653 if cookie.discard:
1654 self.clear(cookie.domain, cookie.path, cookie.name)
1655 self._cookies_lock.release()
1656
1657 def clear_expired_cookies(self):
1658 """Discard all expired cookies.
1659
1660 You probably don't need to call this method: expired cookies are never
1661 sent back to the server (provided you're using DefaultCookiePolicy),
1662 this method is called by CookieJar itself every so often, and the
1663 .save() method won't save expired cookies anyway (unless you ask
1664 otherwise by passing a true ignore_expires argument).
1665
1666 """
1667 self._cookies_lock.acquire()
1668 now = time.time()
1669 for cookie in self:
1670 if cookie.is_expired(now):
1671 self.clear(cookie.domain, cookie.path, cookie.name)
1672 self._cookies_lock.release()
1673
1674 def __iter__(self):
1675 return deepvalues(self._cookies)
1676
1677 def __len__(self):
1678 """Return number of contained cookies."""
1679 i = 0
1680 for cookie in self: i = i + 1
1681 return i
1682
1683 def __repr__(self):
1684 r = []
1685 for cookie in self: r.append(repr(cookie))
1686 return "<%s[%s]>" % (self.__class__, ", ".join(r))
1687
1688 def __str__(self):
1689 r = []
1690 for cookie in self: r.append(str(cookie))
1691 return "<%s[%s]>" % (self.__class__, ", ".join(r))
1692
1693
1694class LoadError(Exception): pass
1695
1696class FileCookieJar(CookieJar):
1697 """CookieJar that can be loaded from and saved to a file."""
1698
1699 def __init__(self, filename=None, delayload=False, policy=None):
1700 """
1701 Cookies are NOT loaded from the named file until either the .load() or
1702 .revert() method is called.
1703
1704 """
1705 CookieJar.__init__(self, policy)
1706 if filename is not None:
1707 try:
1708 filename+""
1709 except:
1710 raise ValueError("filename must be string-like")
1711 self.filename = filename
1712 self.delayload = bool(delayload)
1713
1714 def save(self, filename=None, ignore_discard=False, ignore_expires=False):
1715 """Save cookies to a file."""
1716 raise NotImplementedError()
1717
1718 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1719 """Load cookies from a file."""
1720 if filename is None:
1721 if self.filename is not None: filename = self.filename
1722 else: raise ValueError(MISSING_FILENAME_TEXT)
1723
1724 f = open(filename)
1725 try:
1726 self._really_load(f, filename, ignore_discard, ignore_expires)
1727 finally:
1728 f.close()
1729
1730 def revert(self, filename=None,
1731 ignore_discard=False, ignore_expires=False):
1732 """Clear all cookies and reload cookies from a saved file.
1733
1734 Raises LoadError (or IOError) if reversion is not successful; the
1735 object's state will not be altered if this happens.
1736
1737 """
1738 if filename is None:
1739 if self.filename is not None: filename = self.filename
1740 else: raise ValueError(MISSING_FILENAME_TEXT)
1741
1742 self._cookies_lock.acquire()
1743
1744 old_state = copy.deepcopy(self._cookies)
1745 self._cookies = {}
1746 try:
1747 self.load(filename, ignore_discard, ignore_expires)
1748 except (LoadError, IOError):
1749 self._cookies = old_state
1750 raise
1751
1752 self._cookies_lock.release()
1753
1754from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
1755from _MozillaCookieJar import MozillaCookieJar
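
# Illustrative usage sketch (not part of the original module): the concrete
# FileCookieJar subclasses imported above persist cookies between runs.
#
#     jar = LWPCookieJar("cookies.lwp")
#     try:
#         jar.load(ignore_discard=True)   # raises IOError if no file yet
#     except IOError:
#         pass
#     # ... use the jar (e.g. via urllib2.HTTPCookieProcessor(jar)) ...
#     jar.save(ignore_discard=True)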