blob: a125b219867528f9b753129742cdff5d2892c001 [file] [log] [blame]
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001"""HTTP cookie handling for web clients.
2
3This module has (now fairly distant) origins in Gisle Aas' Perl module
4HTTP::Cookies, from the libwww-perl library.
5
6Docstrings, comments and debug strings in this code refer to the
7attributes of the HTTP cookie system as cookie-attributes, to distinguish
8them clearly from Python attributes.
9
10Class diagram (note that the classes which do not derive from
11FileCookieJar are not distributed with the Python standard library, but
12are available from http://wwwsearch.sf.net/):
13
                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar
25
26"""
27
Neal Norwitz2fa0b9d2004-10-17 16:23:52 +000028import sys, re, urlparse, copy, time, urllib, logging
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000029from types import StringTypes
30try:
31 import threading as _threading
32except ImportError:
33 import dummy_threading as _threading
34import httplib # only for the default HTTP port
35from calendar import timegm
36
# Module-level debug hook: messages go to the "cookielib" logger at DEBUG level.
debug = logging.getLogger("cookielib").debug

# The default HTTP port, held as a string.
DEFAULT_HTTP_PORT = "%d" % httplib.HTTP_PORT
# Error text for operations that need a filename but were not given one.
MISSING_FILENAME_TEXT = (
    "a filename was not supplied "
    "(nor was the CookieJar instance initialised with one)")
42
def reraise_unmasked_exceptions(unmasked=()):
    """Re-raise the exception currently being handled if it must not be masked.

    Intended to be called from inside a catch-all ``except:`` clause.
    KeyboardInterrupt, SystemExit, MemoryError and any exception types given
    in the tuple `unmasked` are re-raised; anything else is swallowed and
    reported through the warnings module together with its traceback.

    """
    never_masked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError)
    current_type = sys.exc_info()[0]
    if issubclass(current_type, never_masked):
        raise
    # The exception is being swallowed: surface it as a warning so that the
    # problem is not lost entirely.
    import warnings, traceback
    warnings.warn("cookielib bug!\n%s" % traceback.format_exc(), stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000057
58
59# Date/time conversion
60# -----------------------------------------------------------------------------
61
62EPOCH_YEAR = 1970
63def _timegm(tt):
64 year, month, mday, hour, min, sec = tt[:6]
65 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
66 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
67 return timegm(tt)
68 else:
69 return None
70
# Abbreviated English day and month names, as used in HTTP dates.
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lowercased month names, for case-insensitive lookup by index.
MONTHS_LOWER = [month.lower() for month in MONTHS]
76
def time2isoz(t=None):
    """Format t (seconds since the epoch) as an ISO-style UTC timestamp.

    When t is omitted, the current time is used.

    The result looks like "YYYY-MM-DD hh:mm:ssZ" and is always expressed in
    Universal Time (UTC, aka GMT), for example:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    # First six struct_time fields: year, month, mday, hour, min, sec.
    fields = tuple(time.gmtime(t)[:6])
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % fields
93
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec, wday = time.gmtime(t)[:7]
    # The weekday is followed by a comma, as the documented format requires
    # (the previous format string omitted it).
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[wday], mday, MONTHS[mon-1], year, hour, minute, sec)
109
110
# Timezone names that denote a zero offset from UTC (only the keys matter).
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Optional sign, one or two hour digits, optional colon, optional minutes.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
def offset_from_tz_string(tz):
    """Return the UTC offset, in seconds, described by timezone string tz.

    Names listed in UTC_ZONES give 0.  Numeric zones such as "-0800",
    "+01:00" or "5" are converted to seconds.  Anything else gives None.

    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if not match:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds = seconds + 60 * int(minutes)
    if sign == '-':
        seconds = -seconds
    return seconds
127
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date/time fields to seconds since the epoch.

    All arguments are strings as captured by the date regular expressions;
    hr, min, sec and tz may be None.  mon may be a month name (matched
    case-insensitively against MONTHS_LOWER) or a month number.  Years below
    1000 are treated as abbreviated and moved into the century that puts
    them closest to the current year.  A missing tz is taken to be UTC.

    Returns None if the month or timezone is not recognised or if _timegm
    rejects the resulting time tuple.

    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    # ISO_DATE_RE lets fractional seconds through (e.g. "29.5"); plain int()
    # would raise ValueError on those, so go via float and truncate.
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
180
# Exact "Wdy, DD Mon YYYY HH:MM:SS GMT" form; groups are day, month name,
# year, hour, minute, second.
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
# Leading weekday name (abbreviated or spelled out) plus optional comma.
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.IGNORECASE)
# Tolerant date pattern; groups are day, month, year, hour, minute, second
# and timezone (the clock and the timezone are optional).
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.VERBOSE)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # STRICT_DATE_RE only checks the shape of the month ("Foo" gets
            # through); an unknown month is an unrecognised date, not an
            # error to propagate.
            return None
        # Seconds are converted with int() so that the result is an
        # integer, as documented.
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
258
# ISO 8601-like date with optional clock and timezone.  Groups: year, month,
# day, hour, minute, second (possibly fractional), timezone, and a trailing
# part of the timezone.
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.VERBOSE)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # Leading whitespace is ignored; the pattern itself allows trailing
    # whitespace.
    match = ISO_DATE_RE.search(text.lstrip())
    if match is None:
        return None  # bad format

    # XXX the last group is an extra bit of the timezone which is ignored
    # here: is this the right thing to do?
    yr, mon, day, hr, min, sec, tz, _ = match.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
303
304
305# Header parsing
306# -----------------------------------------------------------------------------
307
def unmatched(match):
    """Return the searched string with the matched span cut out."""
    begin, finish = match.span(0)
    source = match.string
    return source[:begin] + source[finish:]
312
# Patterns used by split_header_words.  Each is anchored at the start of the
# text that remains to be parsed.
# A token: a run of characters other than "=", whitespace, ";" and ",".
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
# "=" followed by a double-quoted value, which may contain backslash escapes.
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
# "=" followed by an unquoted (possibly empty) value.
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
# A backslash escape; group 1 is the escaped character.
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    header_values is a sequence of header value strings (not a single
    string).  ",", ";" and "=" are understood, as are quoted values after
    "=".  Space separated tokens are treated as if separated by ";", and
    several header values are treated like one value joined with ",".

    The syntax handled is (BNF as from the HTTP/1.1 specification, but with
    the requirement for tokens relaxed):

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Every header becomes a list of (key, value) pairs; a bare token that is
    not part of a parameter gets the value None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    Examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert type(header_values) not in StringTypes
    parsed = []
    for header in header_values:
        remaining = header
        current = []
        while remaining:
            token = HEADER_TOKEN_RE.search(remaining)
            if token:
                remaining = unmatched(token)
                name = token.group(1)
                quoted = HEADER_QUOTED_VALUE_RE.search(remaining)
                if quoted:
                    # quoted value: drop the backslashes of escaped chars
                    remaining = unmatched(quoted)
                    value = HEADER_ESCAPE_RE.sub(r"\1", quoted.group(1))
                else:
                    bare = HEADER_VALUE_RE.search(remaining)
                    if bare:
                        # unquoted value
                        remaining = unmatched(bare)
                        value = bare.group(1).rstrip()
                    else:
                        # no value, a lone token
                        value = None
                current.append((name, value))
                continue

            stripped = remaining.lstrip()
            if stripped.startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                remaining = stripped[1:]
                if current:
                    parsed.append(current)
                current = []
                continue

            # skip junk
            non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", remaining)
            assert nr_junk_chars > 0, (
                "split_header_words bug: '%s', '%s', %s" %
                (header, remaining, current))
            remaining = non_junk
        if current:
            parsed.append(current)
    return parsed
401
# Characters that need a backslash inside a quoted value: '"' and '\'.
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    `lists` is a list of lists of (key, value) pairs; the result is a single
    header value string.  Pairs within a list are joined with "; ", lists
    with ", ".  A value of None yields a bare key.  Values that are not made
    up purely of word characters are quoted, with '"' and '\\' escaped.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    header_strings = []
    for pairs in lists:
        rendered = []
        for key, value in pairs:
            if value is None:
                rendered.append(key)
                continue
            if not re.search(r"^\w+$", value):
                # escape " and \, then wrap in quotes
                value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
            rendered.append("%s=%s" % (key, value))
        if rendered:
            header_strings.append("; ".join(rendered))
    return ", ".join(header_strings)
427
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    ns_headers is a sequence of header value strings.  The result is a list
    with one list of (key, value) pairs per non-empty header.  Names of
    known cookie-attributes are lowercased; a parameter without "=" gets the
    value None; an "expires" value is converted with http2time (None if the
    date is invalid); and ("version", "0") is appended when the header has
    no version attribute.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for param in re.split(r";\s*", ns_header):
            param = param.rstrip()
            if param == "":
                continue
            if "=" in param:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            else:
                # attribute (or cookie) with missing value
                k, v = param, None
            lc = k.lower()
            if lc in known_attrs:
                k = lc
            if k == "version":
                # This is an RFC 2109 cookie.  Will be treated as RFC 2965
                # cookie in rest of code.
                # Probably it should be parsed with split_header_words, but
                # that's too much hassle.
                version_set = True
            if k == "expires" and v is not None:
                # convert expires date to seconds since epoch; a bare
                # "expires" with no value is left as None
                if v.startswith('"'): v = v[1:]
                if v.endswith('"'): v = v[:-1]
                v = http2time(v)  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
486
487
# Matches text ending in a dot followed by digits, as IPv4 addresses do.
IPV4_RE = re.compile(r"\.\d+$")
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if text == "" or IPV4_RE.search(text):
        return False
    # A name may neither begin nor end with a dot.
    return not (text[0] == "." or text[-1] == ".")
503
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # A must have the form NB: B has to be a suffix of A.  (Merely finding B
    # somewhere inside A is not enough -- "a.y.com.evil" must not match
    # ".y.com".)  N is non-empty because A != B at this point.
    if not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
542
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    Used when accepting/blocking domains: the only thing rejected is text
    that matches IPV4_RE.

    """
    return not IPV4_RE.search(text)
552
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  When both look like domain names and B starts with a
    dot, A matches if it ends with B; in every other case A and B must be
    equal.

    """
    A = A.lower()
    B = B.lower()
    if liberal_is_HDN(A) and liberal_is_HDN(B):
        if B.startswith("."):
            return A.endswith(B)
        return A == B
    # at least one is an IP address: only equal addresses match
    return A == B
572
# A trailing ":<digits>" port specification.
cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the request's full URL, falling back to its
    "Host" header when the URL has no network location.  Any port is
    removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urlparse.urlparse(request.get_full_url())[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", netloc, 1).lower()
589
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective name
    is the request-host with ".local" appended when the request-host
    contains no dot and does not match IPV4_RE.

    """
    req_host = request_host(request)
    if "." in req_host or IPV4_RE.search(req_host):
        return req_host, req_host
    return req_host, req_host + ".local"
600
def request_path(request):
    """request-URI, as defined by RFC 2965.

    Built from the path, parameters, query and fragment of the request's
    full URL.  The path (including parameters) is passed through
    escape_path, and the result always starts with "/".

    """
    parsed = urlparse.urlparse(request.get_full_url())
    path, parameters, query, frag = parsed[2:]
    if parameters:
        path = "%s;%s" % (path, parameters)
    req_path = urlparse.urlunparse(
        ("", "", escape_path(path), "", query, frag))
    if req_path.startswith("/"):
        return req_path
    # fix bad RFC 2396 absoluteURI
    return "/" + req_path
615
def request_port(request):
    """Return the port of the request's host, as a string.

    DEFAULT_HTTP_PORT is returned when the host has no ":port" part, and
    None (after a debug message) when the port is not numeric.

    """
    host = request.get_host()
    colon = host.find(':')
    if colon < 0:
        return DEFAULT_HTTP_PORT
    port = host[colon+1:]
    try:
        int(port)
    except ValueError:
        debug("nonnumeric port: '%s'", port)
        return None
    return port
629
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# A %-escape; group 1 is the pair of hex digits.
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the %-escape matched by ESCAPED_CHAR_RE with uppercase hex."""
    return "%" + match.group(1).upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, unicode):
        path = path.encode("utf-8")
    quoted = urllib.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
651
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    first_dot = h.find(".")
    if first_dot >= 0:
        # h has the form A.B, where A is everything before the first dot
        b = h[first_dot+1:]
        if is_HDN(h) and ("." in b or b == "local"):
            return "." + b
    return h
686
def is_third_party(request):
    """Return True if the request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(req_host, origin_reach)
702
703
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest):
        """Store the cookie-attributes on the instance.

        version and expires are converted to int unless they are None, the
        domain is lowercased, and `rest` (the mapping of nonstandard
        cookie-attributes) is shallow-copied.  ValueError is raised if port
        is None while port_specified is True.

        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value

        self.port = port
        self.port_specified = port_specified

        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot

        self.path = path
        self.path_specified = path_specified

        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return true if the nonstandard cookie-attribute `name` is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the nonstandard cookie-attribute `name`, or `default`."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the nonstandard cookie-attribute `name` to `value`."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        `now` defaults to the current time; a cookie whose expires is None
        never expires.

        """
        if now is None:
            now = time.time()
        return (self.expires is not None) and (self.expires <= now)

    def __str__(self):
        port_part = ""
        if self.port is not None:
            port_part = ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        attr_names = ("version", "name", "value",
                      "port", "port_specified",
                      "domain", "domain_specified", "domain_initial_dot",
                      "path", "path_specified",
                      "secure", "expires", "discard", "comment",
                      "comment_url")
        args = ["%s=%s" % (attr_name, repr(getattr(self, attr_name)))
                for attr_name in attr_names]
        args.append("rest=%s" % repr(self._rest))
        return "Cookie(%s)" % ", ".join(args)
796
797
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    set_ok and return_ok must be provided by subclasses (here they raise
    NotImplementedError); domain_return_ok and path_return_ok default to
    returning True.

    """
    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation always returns True.

        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation always returns True.

        """
        return True
829
830
831class DefaultCookiePolicy(CookiePolicy):
832 """Implements the standard rules for accepting and returning cookies."""
833
    # Bit flags that can be combined into the strict_ns_domain setting.
    # set_ok_domain tests DomainStrictNoDots and DomainRFC2965Match.
    DomainStrictNoDots = 1
    DomainStrictNonDomain = 2
    DomainRFC2965Match = 4

    # Predefined combinations: no flags set, and both "strict" flags set.
    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
840
841 def __init__(self,
842 blocked_domains=None, allowed_domains=None,
843 netscape=True, rfc2965=False,
844 hide_cookie2=False,
845 strict_domain=False,
846 strict_rfc2965_unverifiable=True,
847 strict_ns_unverifiable=False,
848 strict_ns_domain=DomainLiberal,
849 strict_ns_set_initial_dollar=False,
850 strict_ns_set_path=False,
851 ):
852 """Constructor arguments should be passed as keyword arguments only."""
853 self.netscape = netscape
854 self.rfc2965 = rfc2965
855 self.hide_cookie2 = hide_cookie2
856 self.strict_domain = strict_domain
857 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
858 self.strict_ns_unverifiable = strict_ns_unverifiable
859 self.strict_ns_domain = strict_ns_domain
860 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
861 self.strict_ns_set_path = strict_ns_set_path
862
863 if blocked_domains is not None:
864 self._blocked_domains = tuple(blocked_domains)
865 else:
866 self._blocked_domains = ()
867
868 if allowed_domains is not None:
869 allowed_domains = tuple(allowed_domains)
870 self._allowed_domains = allowed_domains
871
    def blocked_domains(self):
        """Return the sequence of blocked domains (as a tuple)."""
        # The tuple is stored by __init__ / set_blocked_domains.
        return self._blocked_domains
    def set_blocked_domains(self, blocked_domains):
        """Set the sequence of blocked domains."""
        # Stored as a tuple, whatever kind of iterable was passed in.
        self._blocked_domains = tuple(blocked_domains)
878
879 def is_blocked(self, domain):
880 for blocked_domain in self._blocked_domains:
881 if user_domain_match(domain, blocked_domain):
882 return True
883 return False
884
    def allowed_domains(self):
        """Return None, or the sequence of allowed domains (as a tuple)."""
        # The value is stored by __init__ / set_allowed_domains.
        return self._allowed_domains
888 def set_allowed_domains(self, allowed_domains):
889 """Set the sequence of allowed domains, or None."""
890 if allowed_domains is not None:
891 allowed_domains = tuple(allowed_domains)
892 self._allowed_domains = allowed_domains
893
894 def is_not_allowed(self, domain):
895 if self._allowed_domains is None:
896 return False
897 for allowed_domain in self._allowed_domains:
898 if user_domain_match(domain, allowed_domain):
899 return False
900 return True
901
902 def set_ok(self, cookie, request):
903 """
904 If you override .set_ok(), be sure to call this method. If it returns
905 false, so should your subclass (assuming your subclass wants to be more
906 strict about which cookies to accept).
907
908 """
909 debug(" - checking cookie %s=%s", cookie.name, cookie.value)
910
911 assert cookie.name is not None
912
913 for n in "version", "verifiability", "name", "path", "domain", "port":
914 fn_name = "set_ok_"+n
915 fn = getattr(self, fn_name)
916 if not fn(cookie, request):
917 return False
918
919 return True
920
921 def set_ok_version(self, cookie, request):
922 if cookie.version is None:
923 # Version is always set to 0 by parse_ns_headers if it's a Netscape
924 # cookie, so this must be an invalid RFC 2965 cookie.
925 debug(" Set-Cookie2 without version attribute (%s=%s)",
926 cookie.name, cookie.value)
927 return False
928 if cookie.version > 0 and not self.rfc2965:
929 debug(" RFC 2965 cookies are switched off")
930 return False
931 elif cookie.version == 0 and not self.netscape:
932 debug(" Netscape cookies are switched off")
933 return False
934 return True
935
936 def set_ok_verifiability(self, cookie, request):
937 if request.is_unverifiable() and is_third_party(request):
938 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
939 debug(" third-party RFC 2965 cookie during "
940 "unverifiable transaction")
941 return False
942 elif cookie.version == 0 and self.strict_ns_unverifiable:
943 debug(" third-party Netscape cookie during "
944 "unverifiable transaction")
945 return False
946 return True
947
948 def set_ok_name(self, cookie, request):
949 # Try and stop servers setting V0 cookies designed to hack other
950 # servers that know both V0 and V1 protocols.
951 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
952 cookie.name.startswith("$")):
953 debug(" illegal name (starts with '$'): '%s'", cookie.name)
954 return False
955 return True
956
957 def set_ok_path(self, cookie, request):
958 if cookie.path_specified:
959 req_path = request_path(request)
960 if ((cookie.version > 0 or
961 (cookie.version == 0 and self.strict_ns_set_path)) and
962 not req_path.startswith(cookie.path)):
963 debug(" path attribute %s is not a prefix of request "
964 "path %s", cookie.path, req_path)
965 return False
966 return True
967
def set_ok_domain(self, cookie, request):
    """Return false if the cookie's domain is not acceptable.

    The user block-list and allow-list are consulted first.  Cookies
    without an explicitly specified domain pass at that point.  For the
    rest, the checks are, in order:

    - with self.strict_domain, refuse domains like .co.uk (a short list
      of generic second-level names under a two-letter top-level name);
    - refuse domains with no embedded dot, other than ".local";
    - for version 0 cookies, the effective request-host (optionally with
      a dot prepended) must end with the domain;
    - for version > 0 cookies, or with the DomainRFC2965Match bit set in
      self.strict_ns_domain, the effective request-host must domain-match;
    - for version > 0 cookies, or with the DomainStrictNoDots bit set,
      the part of the request host in front of the domain must not
      contain a dot (unless the host matches IPV4_RE).

    """
    if self.is_blocked(cookie.domain):
        debug(" domain %s is in user block-list", cookie.domain)
        return False
    if self.is_not_allowed(cookie.domain):
        debug(" domain %s is not in user allow-list", cookie.domain)
        return False
    if not cookie.domain_specified:
        # nothing more to check when the domain was defaulted
        return True

    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    if self.strict_domain and domain.count(".") >= 2:
        last_dot = domain.rfind(".")
        prev_dot = domain.rfind(".", 0, last_dot)
        if prev_dot == 0:  # domain like .foo.bar
            top_level = domain[last_dot+1:]
            second_level = domain[prev_dot+1:last_dot]
            generic_names = [
                "co", "ac",
                "com", "edu", "org", "net", "gov", "mil", "int"]
            if (second_level.lower() in generic_names and
                len(top_level) == 2):
                # domain like .co.uk
                debug(" country-code second level domain %s", domain)
                return False

    if domain.startswith("."):
        bare_domain = domain[1:]
    else:
        bare_domain = domain
    if "." not in bare_domain and domain != ".local":
        debug(" non-local domain %s contains no embedded dot",
              domain)
        return False

    if cookie.version == 0:
        if (not erhn.endswith(domain) and
            not erhn.startswith(".") and
            not ("."+erhn).endswith(domain)):
            debug(" effective request-host %s (even with added "
                  "initial dot) does not end end with %s",
                  erhn, domain)
            return False

    if (cookie.version > 0 or
        (self.strict_ns_domain & self.DomainRFC2965Match)):
        if not domain_match(erhn, domain):
            debug(" effective request-host %s does not domain-match "
                  "%s", erhn, domain)
            return False

    if (cookie.version > 0 or
        (self.strict_ns_domain & self.DomainStrictNoDots)):
        host_prefix = req_host[:-len(domain)]
        if "." in host_prefix and not IPV4_RE.search(req_host):
            debug(" host prefix %s for domain %s contains a dot",
                  host_prefix, domain)
            return False

    return True
1023
def set_ok_port(self, cookie, request):
    """Return false if the cookie's explicit port list is not acceptable.

    Only cookies with an explicitly specified port are checked.  Every
    entry examined in the comma-separated list must be numeric, and the
    request port ("80" if the request has none) must be one of the
    entries.

    """
    if not cookie.port_specified:
        return True

    req_port = request_port(request)
    if req_port is None:
        req_port = "80"
    else:
        req_port = str(req_port)

    found = False
    for candidate in cookie.port.split(","):
        try:
            int(candidate)
        except ValueError:
            debug(" bad port %s (not numeric)", candidate)
            return False
        if candidate == req_port:
            found = True
            break

    if not found:
        debug(" request port (%s) not found in %s",
              req_port, cookie.port)
        return False
    return True
1044
def return_ok(self, cookie, request):
    """
    If you override .return_ok(), be sure to call this method.  If it
    returns false, so should your subclass (assuming your subclass wants to
    be more strict about which cookies to return).

    Runs the return_ok_<check> methods in a fixed order and stops at the
    first one that returns false.

    """
    # Path has already been checked by .path_return_ok(), and domain
    # blocking done by .domain_return_ok().
    debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    check_names = ("version", "verifiability", "secure", "expires",
                   "port", "domain")
    for check_name in check_names:
        check = getattr(self, "return_ok_"+check_name)
        if not check(cookie, request):
            return False
    return True
1062
def return_ok_version(self, cookie, request):
    """Return false if the cookie's protocol version is switched off.

    Version > 0 cookies need self.rfc2965 to be true; version 0 cookies
    need self.netscape to be true.  The request argument is not used.

    """
    version = cookie.version
    if version > 0:
        if not self.rfc2965:
            debug(" RFC 2965 cookies are switched off")
            return False
    elif version == 0:
        if not self.netscape:
            debug(" Netscape cookies are switched off")
            return False
    return True
1071
def return_ok_verifiability(self, cookie, request):
    """Return false for third-party cookies on unverifiable requests.

    The restriction applies only when the request is both unverifiable
    and third-party, and only when the strictness switch for the
    cookie's version (self.strict_rfc2965_unverifiable for version > 0,
    self.strict_ns_unverifiable for version 0) is true.

    """
    if not request.is_unverifiable():
        return True
    if not is_third_party(request):
        return True

    if cookie.version > 0:
        if self.strict_rfc2965_unverifiable:
            debug(" third-party RFC 2965 cookie during unverifiable "
                  "transaction")
            return False
    elif cookie.version == 0:
        if self.strict_ns_unverifiable:
            debug(" third-party Netscape cookie during unverifiable "
                  "transaction")
            return False
    return True
1083
def return_ok_secure(self, cookie, request):
    """Return false if a secure cookie would go out on a non-https request."""
    if not cookie.secure:
        return True
    if request.get_type() == "https":
        return True
    debug(" secure cookie with non-secure request")
    return False
1089
def return_ok_expires(self, cookie, request):
    """Return false if the cookie has expired as of self._now."""
    expired = cookie.is_expired(self._now)
    if not expired:
        return True
    debug(" cookie expired")
    return False
1095
def return_ok_port(self, cookie, request):
    """Return false if the request port is not in the cookie's port list.

    Cookies with no port restriction (false cookie.port) always pass.
    A request without a port is treated as port "80".

    """
    if not cookie.port:
        return True

    req_port = request_port(request)
    if req_port is None:
        req_port = "80"

    if req_port not in cookie.port.split(","):
        debug(" request port %s does not match cookie port %s",
              req_port, cookie.port)
        return False
    return True
1109
def return_ok_domain(self, cookie, request):
    """Return false if the cookie's domain does not match the request host.

    Version 0 cookies must have a domain that "." + effective
    request-host ends with; additionally, when the DomainStrictNonDomain
    bit of self.strict_ns_domain is set, a cookie whose domain was not
    explicitly specified must equal the effective request-host exactly.
    Version > 0 cookies must domain-match the effective request-host.

    """
    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    if cookie.version == 0:
        # strict check of non-domain cookies: Mozilla does this, MSIE5
        # doesn't
        if ((self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            debug(" cookie with unspecified domain does not string-compare "
                  "equal to request domain")
            return False
        if not ("."+erhn).endswith(domain):
            debug(" request-host %s does not match Netscape cookie domain "
                  "%s", req_host, domain)
            return False
    elif cookie.version > 0:
        if not domain_match(erhn, domain):
            debug(" effective request-host name %s does not domain-match "
                  "RFC 2965 cookie domain %s", erhn, domain)
            return False
    return True
1131
def domain_return_ok(self, domain, request):
    """Return false if no cookie stored under domain should be returned.

    This is a liberal pre-check, here as an optimization to avoid
    having to load lots of MSIE cookie files unless necessary.  The
    domain passes if either the request host or the effective
    request-host, with a leading dot added when missing, ends with it,
    and the domain is neither in the user block-list nor missing from
    the user allow-list.

    """
    req_host, erhn = eff_request_host(request)
    # Normalise both names to carry a leading dot.  The names are always
    # bound here, including when the host already starts with "." (which
    # previously left the dotted variants undefined).
    if not req_host.startswith("."):
        req_host = "."+req_host
    if not erhn.startswith("."):
        erhn = "."+erhn
    if not (req_host.endswith(domain) or erhn.endswith(domain)):
        # deliberately not logged: this is hit for every stored domain
        return False

    if self.is_blocked(domain):
        debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        debug(" domain %s is not in user allow-list", domain)
        return False

    return True
1154
def path_return_ok(self, path, request):
    """Return false unless the request path starts with the cookie path."""
    debug("- checking cookie path=%s", path)
    req_path = request_path(request)
    if req_path.startswith(path):
        return True
    debug(" %s does not path-match %s", req_path, path)
    return False
1162
1163
def vals_sorted_by_key(adict):
    """Return a list of adict's values, ordered by their sorted keys."""
    ordered_keys = list(adict.keys())
    ordered_keys.sort()
    return [adict.get(key) for key in ordered_keys]
1168
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    Any value that has an .items attribute is treated as a nested mapping
    and descended into; every other value is yielded as a leaf.

    """
    ordered_keys = list(mapping.keys())
    ordered_keys.sort()
    # Take the values up front, so that the caller may delete entries
    # from the mapping while consuming the generator.
    snapshot = [mapping.get(key) for key in ordered_keys]
    for item in snapshot:
        try:
            item.items
        except AttributeError:
            is_leaf = True
        else:
            is_leaf = False

        if is_leaf:
            yield item
        else:
            for leaf in deepvalues(item):
                yield leaf
1184
1185
# Sentinel passed as the second argument to dict.get(), so that a key
# which is missing can be told apart from a key whose value is None.
class Absent:
    pass
1189
class CookieJar:
    """Collection of HTTP cookies.

    You may not need to know about this class: try
    urllib2.build_opener(HTTPCookieProcessor).open(url).

    Cookies are kept in self._cookies, a dictionary nested three deep:
    domain -> path -> name -> cookie object.  The methods that take
    self._cookies_lock always release it again, including when an
    exception propagates out of them.

    """

    non_word_re = re.compile(r"\W")
    quote_re = re.compile(r"([\"\\])")
    strict_domain_re = re.compile(r"\.?[^.]*")
    domain_re = re.compile(r"[^.]*")
    dots_re = re.compile(r"^\.+")

    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"

    def __init__(self, policy=None):
        """Create an empty jar.

        policy decides which cookies are accepted and returned; a
        DefaultCookiePolicy instance is created when it is None.

        """
        if policy is None:
            policy = DefaultCookiePolicy()
        self._policy = policy

        self._cookies_lock = _threading.RLock()
        self._cookies = {}

    def set_policy(self, policy):
        """Replace the policy object used by this jar."""
        self._policy = policy

    def _cookies_for_domain(self, domain, request):
        """Return the stored cookies for domain that may go with request.

        The policy's domain_return_ok() is asked first, then
        path_return_ok() for each stored path, then return_ok() for each
        cookie under an accepted path.

        """
        cookies = []
        if not self._policy.domain_return_ok(domain, request):
            return []
        debug("Checking %s for cookies to return", domain)
        cookies_by_path = self._cookies[domain]
        for path in cookies_by_path.keys():
            if not self._policy.path_return_ok(path, request):
                continue
            cookies_by_name = cookies_by_path[path]
            for cookie in cookies_by_name.values():
                if not self._policy.return_ok(cookie, request):
                    debug(" not returning cookie")
                    continue
                debug(" it's a match")
                cookies.append(cookie)
        return cookies

    def _cookies_for_request(self, request):
        """Return a list of cookies to be returned to server."""
        cookies = []
        for domain in self._cookies.keys():
            cookies.extend(self._cookies_for_domain(domain, request))
        return cookies

    def _cookie_attrs(self, cookies):
        """Return a list of cookie-attributes to be returned to server.

        like ['foo="bar"; $Path="/"', ...]

        The $Version attribute is also added when appropriate (currently only
        once per request).

        Note that the cookies list is sorted in place.

        """
        # Add cookies in order of most specific (ie. longest) path first.
        # The sort is stable, so cookies with equally long paths keep
        # their relative order.
        cookies.sort(key=lambda cookie: len(cookie.path), reverse=True)

        version_set = False

        attrs = []
        for cookie in cookies:
            # set version of Cookie header
            # XXX
            # What should it be if multiple matching Set-Cookie headers have
            #  different versions themselves?
            # Answer: there is no answer; was supposed to be settled by
            #  RFC 2965 errata, but that may never appear...
            version = cookie.version
            if not version_set:
                version_set = True
                if version > 0:
                    attrs.append("$Version=%s" % version)

            # quote cookie value if necessary
            # (not for Netscape protocol, which already has any quotes
            #  intact, due to the poorly-specified Netscape Cookie: syntax)
            if ((cookie.value is not None) and
                self.non_word_re.search(cookie.value) and version > 0):
                value = self.quote_re.sub(r"\\\1", cookie.value)
            else:
                value = cookie.value

            # add cookie-attributes to be returned in Cookie header
            if cookie.value is None:
                attrs.append(cookie.name)
            else:
                attrs.append("%s=%s" % (cookie.name, value))
            if version > 0:
                if cookie.path_specified:
                    attrs.append('$Path="%s"' % cookie.path)
                if cookie.domain.startswith("."):
                    domain = cookie.domain
                    if (not cookie.domain_initial_dot and
                        domain.startswith(".")):
                        domain = domain[1:]
                    attrs.append('$Domain="%s"' % domain)
                if cookie.port is not None:
                    p = "$Port"
                    if cookie.port_specified:
                        p = p + ('="%s"' % cookie.port)
                    attrs.append(p)

        return attrs

    def add_cookie_header(self, request):
        """Add correct Cookie: header to request (urllib2.Request object).

        The Cookie2 header is also added unless policy.hide_cookie2 is true.

        Neither header is overwritten if the request already has it.
        Expired cookies are cleared from the jar afterwards.

        """
        debug("add_cookie_header")
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            cookies = self._cookies_for_request(request)

            attrs = self._cookie_attrs(cookies)
            if attrs:
                if not request.has_header("Cookie"):
                    request.add_unredirected_header(
                        "Cookie", "; ".join(attrs))

            # if necessary, advertise that we know RFC 2965
            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
                not request.has_header("Cookie2")):
                for cookie in cookies:
                    if cookie.version != 1:
                        request.add_unredirected_header(
                            "Cookie2", '$Version="1"')
                        break
        finally:
            # release even if the policy or the request object raised
            self._cookies_lock.release()

        self.clear_expired_cookies()

    def _normalized_cookie_tuples(self, attrs_set):
        """Return list of tuples containing normalised cookie information.

        attrs_set is the list of lists of key,value pairs extracted from
        the Set-Cookie or Set-Cookie2 headers.

        Tuples are name, value, standard, rest, where name and value are the
        cookie name and value, standard is a dictionary containing the standard
        cookie-attributes (discard, secure, version, expires or max-age,
        domain, path and port) and rest is a dictionary containing the rest of
        the cookie-attributes.

        A cookie with an unusable standard cookie-attribute (for example
        a domain or max-age with no value) is left out of the result.

        """
        cookie_tuples = []

        boolean_attrs = "discard", "secure"
        value_attrs = ("version",
                       "expires", "max-age",
                       "domain", "path", "port",
                       "comment", "commenturl")

        for cookie_attrs in attrs_set:
            name, value = cookie_attrs[0]

            # Build dictionary of standard cookie-attributes (standard) and
            # dictionary of other cookie-attributes (rest).

            # Note: expiry time is normalised to seconds since epoch.  V0
            # cookies should have the Expires cookie-attribute, and V1 cookies
            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
            # accept either (but prefer Max-Age).
            max_age_set = False

            bad_cookie = False

            standard = {}
            rest = {}
            for k, v in cookie_attrs[1:]:
                lc = k.lower()
                # don't lose case distinction for unknown fields
                if lc in value_attrs or lc in boolean_attrs:
                    k = lc
                if k in boolean_attrs and v is None:
                    # boolean cookie-attribute is present, but has no value
                    # (like "discard", rather than "port=80")
                    v = True
                if k in standard:
                    # only first value is significant
                    continue
                if k == "domain":
                    if v is None:
                        debug(" missing value for domain attribute")
                        bad_cookie = True
                        break
                    # RFC 2965 section 3.3.3
                    v = v.lower()
                if k == "expires":
                    if max_age_set:
                        # Prefer max-age to expires (like Mozilla)
                        continue
                    if v is None:
                        debug(" missing or invalid value for expires "
                              "attribute: treating as session cookie")
                        continue
                if k == "max-age":
                    max_age_set = True
                    try:
                        v = int(v)
                    except (ValueError, TypeError):
                        # TypeError covers a Max-Age with no value at
                        # all (v is None); only this cookie is dropped.
                        debug(" missing or invalid (non-numeric) value for "
                              "max-age attribute")
                        bad_cookie = True
                        break
                    # convert RFC 2965 Max-Age to seconds since epoch
                    # XXX Strictly you're supposed to follow RFC 2616
                    #   age-calculation rules.  Remember that zero Max-Age
                    #   is a request to discard (old and new) cookie, though.
                    k = "expires"
                    v = self._now + v
                if (k in value_attrs) or (k in boolean_attrs):
                    if (v is None and
                        k not in ["port", "comment", "commenturl"]):
                        debug(" missing value for %s attribute" % k)
                        bad_cookie = True
                        break
                    standard[k] = v
                else:
                    rest[k] = v

            if bad_cookie:
                continue

            cookie_tuples.append((name, value, standard, rest))

        return cookie_tuples

    def _cookie_from_cookie_tuple(self, tup, request):
        """Return a Cookie built from one normalised tuple, or None.

        tup is (name, value, standard, rest) as produced by
        ._normalized_cookie_tuples().  Missing path, domain and port
        cookie-attributes are defaulted from request.  None is returned
        (after removing any stored cookie with the same domain, path and
        name) when the expiry time is not after self._now.

        """
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set the easy defaults
        version = standard.get("version", None)
        if version is not None:
            version = int(version)
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            path_specified = False
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i+1]
            if len(path) == 0:
                path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            req_host, erhn = eff_request_host(request)
            domain = erhn
        elif not domain.startswith("."):
            domain = "."+domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present.  Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            expires = None
            discard = True
        elif expires <= self._now:
            # Expiry date in past is request to delete cookie.  This can't be
            # in DefaultCookiePolicy, because can't delete cookies there.
            try:
                self.clear(domain, path, name)
            except KeyError:
                pass
            debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                  domain, path, name)
            return None

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)

    def _cookies_from_attrs_set(self, attrs_set, request):
        """Return the Cookie objects described by attrs_set.

        Entries that are dropped during normalisation, or that turn out
        to be requests to expire a cookie, produce no Cookie.

        """
        cookie_tuples = self._normalized_cookie_tuples(attrs_set)

        cookies = []
        for tup in cookie_tuples:
            cookie = self._cookie_from_cookie_tuple(tup, request)
            if cookie:
                cookies.append(cookie)
        return cookies

    def make_cookies(self, response, request):
        """Return sequence of Cookie objects extracted from response object."""
        # get cookie-attributes for RFC 2965 and Netscape protocols
        headers = response.info()
        rfc2965_hdrs = headers.getheaders("Set-Cookie2")
        ns_hdrs = headers.getheaders("Set-Cookie")

        rfc2965 = self._policy.rfc2965
        netscape = self._policy.netscape

        if ((not rfc2965_hdrs and not ns_hdrs) or
            (not ns_hdrs and not rfc2965) or
            (not rfc2965_hdrs and not netscape) or
            (not netscape and not rfc2965)):
            return []  # no relevant cookie headers: quick exit

        # The catch-all handlers below are deliberate: header input can be
        # bad in unexpected ways, and reraise_unmasked_exceptions() lets
        # through the exceptions that must not be trapped.
        try:
            cookies = self._cookies_from_attrs_set(
                split_header_words(rfc2965_hdrs), request)
        except:
            reraise_unmasked_exceptions()
            cookies = []

        if ns_hdrs and netscape:
            try:
                ns_cookies = self._cookies_from_attrs_set(
                    parse_ns_headers(ns_hdrs), request)
            except:
                reraise_unmasked_exceptions()
                ns_cookies = []

            # Look for Netscape cookies (from Set-Cookie headers) that match
            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
            # For each match, keep the RFC 2965 cookie and ignore the Netscape
            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
            # bundled in with the Netscape cookies for this purpose, which is
            # reasonable behaviour.
            if rfc2965:
                lookup = {}
                for cookie in cookies:
                    lookup[(cookie.domain, cookie.path, cookie.name)] = None

                ns_cookies = [
                    ns_cookie for ns_cookie in ns_cookies
                    if (ns_cookie.domain, ns_cookie.path,
                        ns_cookie.name) not in lookup]

            if ns_cookies:
                cookies.extend(ns_cookies)

        return cookies

    def set_cookie_if_ok(self, cookie, request):
        """Set a cookie if policy says it's OK to do so."""
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            if self._policy.set_ok(cookie, request):
                self.set_cookie(cookie)
        finally:
            self._cookies_lock.release()

    def set_cookie(self, cookie):
        """Set a cookie, without checking whether or not it should be set."""
        self._cookies_lock.acquire()
        try:
            by_domain = self._cookies
            if cookie.domain not in by_domain:
                by_domain[cookie.domain] = {}
            by_path = by_domain[cookie.domain]
            if cookie.path not in by_path:
                by_path[cookie.path] = {}
            by_name = by_path[cookie.path]
            by_name[cookie.name] = cookie
        finally:
            self._cookies_lock.release()

    def extract_cookies(self, response, request):
        """Extract cookies from response, where allowable given the request."""
        debug("extract_cookies: %s", response.info())
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            for cookie in self.make_cookies(response, request):
                if self._policy.set_ok(cookie, request):
                    debug(" setting cookie: %s", cookie)
                    self.set_cookie(cookie)
        finally:
            self._cookies_lock.release()

    def clear(self, domain=None, path=None, name=None):
        """Clear some cookies.

        Invoking this method without arguments will clear all cookies.  If
        given a single argument, only cookies belonging to that domain will be
        removed.  If given two arguments, cookies belonging to the specified
        path within that domain are removed.  If given three arguments, then
        the cookie with the specified name, path and domain is removed.

        Raises KeyError if no matching cookie exists.  Raises ValueError
        if name is given without domain and path, or path without domain.

        """
        if name is not None:
            if (domain is None) or (path is None):
                raise ValueError(
                    "domain and path must be given to remove a cookie by name")
            del self._cookies[domain][path][name]
        elif path is not None:
            if domain is None:
                raise ValueError(
                    "domain must be given to remove cookies by path")
            del self._cookies[domain][path]
        elif domain is not None:
            del self._cookies[domain]
        else:
            self._cookies = {}

    def clear_session_cookies(self):
        """Discard all session cookies.

        Note that the .save() method won't save session cookies anyway, unless
        you ask otherwise by passing a true ignore_discard argument.

        """
        self._cookies_lock.acquire()
        try:
            for cookie in self:
                if cookie.discard:
                    self.clear(cookie.domain, cookie.path, cookie.name)
        finally:
            self._cookies_lock.release()

    def clear_expired_cookies(self):
        """Discard all expired cookies.

        You probably don't need to call this method: expired cookies are never
        sent back to the server (provided you're using DefaultCookiePolicy),
        this method is called by CookieJar itself every so often, and the
        .save() method won't save expired cookies anyway (unless you ask
        otherwise by passing a true ignore_expires argument).

        """
        self._cookies_lock.acquire()
        try:
            now = time.time()
            for cookie in self:
                if cookie.is_expired(now):
                    self.clear(cookie.domain, cookie.path, cookie.name)
        finally:
            self._cookies_lock.release()

    def __iter__(self):
        """Iterate over the contained cookies."""
        return deepvalues(self._cookies)

    def __len__(self):
        """Return number of contained cookies."""
        i = 0
        for cookie in self:
            i = i + 1
        return i

    def __repr__(self):
        """Return the class followed by the repr() of every cookie."""
        r = []
        for cookie in self:
            r.append(repr(cookie))
        return "<%s[%s]>" % (self.__class__, ", ".join(r))

    def __str__(self):
        """Return the class followed by the str() of every cookie."""
        r = []
        for cookie in self:
            r.append(str(cookie))
        return "<%s[%s]>" % (self.__class__, ", ".join(r))
1693
1694
class LoadError(Exception):
    """Error raised when loading cookies fails; see FileCookieJar.revert()."""
1696
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        Raises ValueError if filename is given but is not string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            # Duck-type check: anything that can be concatenated with a
            # string is accepted.
            try:
                filename+""
            except:
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented here: this method always raises
        NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        self.filename is used when filename is None; ValueError is raised
        if neither is set.  The file is opened here, handed to
        ._really_load() for parsing, and always closed again.

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename)
        try:
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        self._cookies_lock.acquire()
        try:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                self._cookies = old_state
                raise
        finally:
            # The lock used to stay held whenever load() raised, which
            # would block every other thread using this jar.
            self._cookies_lock.release()
1754
1755from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
1756from _MozillaCookieJar import MozillaCookieJar