blob: 5732125d70ded12c3ddc31e9a29a42208c5f8b94 [file] [log] [blame]
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001"""HTTP cookie handling for web clients.
2
3This module has (now fairly distant) origins in Gisle Aas' Perl module
4HTTP::Cookies, from the libwww-perl library.
5
6Docstrings, comments and debug strings in this code refer to the
7attributes of the HTTP cookie system as cookie-attributes, to distinguish
8them clearly from Python attributes.
9
10Class diagram (note that the classes which do not derive from
11FileCookieJar are not distributed with the Python standard library, but
12are available from http://wwwsearch.sf.net/):
13
14 CookieJar____
15 / \ \
16 FileCookieJar \ \
17 / | \ \ \
18 MozillaCookieJar | LWPCookieJar \ \
19 | | \
20 | ---MSIEBase | \
21 | / | | \
22 | / MSIEDBCookieJar BSDDBCookieJar
23 |/
24 MSIECookieJar
25
26"""
27
Neal Norwitz2fa0b9d2004-10-17 16:23:52 +000028import sys, re, urlparse, copy, time, urllib, logging
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000029try:
30 import threading as _threading
31except ImportError:
32 import dummy_threading as _threading
33import httplib # only for the default HTTP port
34from calendar import timegm
35
# Module-wide debug hook: messages go to the "cookielib" logger at DEBUG level.
debug = logging.getLogger("cookielib").debug

# The default HTTP port, kept as a string.
DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
MISSING_FILENAME_TEXT = (
    "a filename was not supplied "
    "(nor was the CookieJar instance initialised with one)")
41
def reraise_unmasked_exceptions(unmasked=()):
    """Re-raise the exception being handled if it must not be swallowed.

    Meant to be called from inside one of this module's catch-all except:
    clauses, which exist to survive input that is bad in unexpected ways.

    KeyboardInterrupt, SystemExit, MemoryError and every class listed in
    the tuple `unmasked` are re-raised.  Any other exception is swallowed,
    after its traceback has been reported through warnings.warn().

    """
    never_masked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError)
    exc_type = sys.exc_info()[0]
    if issubclass(exc_type, never_masked):
        raise
    # swallowed an exception: report it rather than hiding it completely
    import warnings, traceback, StringIO
    buf = StringIO.StringIO()
    traceback.print_exc(None, buf)
    warnings.warn("cookielib bug!\n%s" % buf.getvalue(), stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000056
57
58# Date/time conversion
59# -----------------------------------------------------------------------------
60
EPOCH_YEAR = 1970
def _timegm(tt):
    """Return seconds since the epoch for UTC time tuple tt, or None.

    Only the first six items of tt (year, month, day, hour, minute, second)
    are range-checked; None is returned if any of them is out of range or
    the year is before EPOCH_YEAR.

    """
    year, month, mday, hour, minute, sec = tt[:6]
    in_range = (year >= EPOCH_YEAR and
                1 <= month <= 12 and
                1 <= mday <= 31 and
                0 <= hour <= 24 and
                0 <= minute <= 59 and
                0 <= sec <= 61)
    if not in_range:
        return None
    return timegm(tt)
69
# Abbreviated English day and month names, indexed as time.gmtime() reports
# them (Monday is weekday 0; month number minus one).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copy of MONTHS, for case-insensitive month-name lookup.
MONTHS_LOWER = [month.lower() for month in MONTHS]
75
def time2isoz(t=None):
    """Format t (seconds since the epoch) as "YYYY-MM-DD hh:mm:ssZ".

    The current time is used if t is omitted or None.

    The result is expressed in Universal Time (UTC, aka GMT), for example:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    # The first six gmtime() fields are exactly year .. second, in order.
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % time.gmtime(t)[:6]
92
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec, wday = time.gmtime(t)[:7]
    # The comma after the weekday belongs to the documented format (see the
    # docstring above); it used to be missing from the format string.
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[wday], mday, MONTHS[mon-1], year, hour, minute, sec)
108
109
# Timezone names that mean "zero offset from UTC" (only the keys matter).
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric timezone: optional sign, one or two hour digits, optional minutes.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
def offset_from_tz_string(tz):
    """Return the UTC offset in seconds named by tz, or None if unrecognised.

    tz is either one of the names in UTC_ZONES (offset 0) or a numeric
    offset such as "-0800", "+01:00" or "5".

    """
    if tz in UTC_ZONES:
        return 0
    m = TIMEZONE_RE.search(tz)
    if not m:
        return None
    sign, hours, minutes = m.groups()
    offset = 3600 * int(hours)
    if minutes:
        offset += 60 * int(minutes)
    if sign == '-':
        offset = -offset
    return offset
126
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date fields to seconds since the epoch, or None.

    All arguments are strings as captured by the date regular expressions;
    hr, min, sec and tz may be None.  mon is an English month abbreviation
    (any case) or a month number.  A missing tz means UTC.

    None is returned if the month is not recognised, a field is out of
    range (see _timegm), or the timezone string is not understood.

    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    # ISO_DATE_RE lets the seconds carry a fraction ("29.5"), which int()
    # alone rejects with ValueError; go through float() and truncate.
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year: pick the century that puts the two-digit
        # year closest to the current year
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
179
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # Shaped like a month ("Jxx") but not one: that is a bad date,
            # which this function reports as None rather than by raising.
            return None
        # int (not float) seconds, so the result is the documented integer
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
257
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # leading whitespace is not part of the date
    m = ISO_DATE_RE.search(text.lstrip())
    if m is None:
        return None  # bad format

    # XXX the last group is an extra bit of the timezone that is ignored
    # here: is this the right thing to do?
    yr, mon, day, hr, minute, sec, tz, _ = m.groups()
    return _str2time(day, mon, yr, hr, minute, sec, tz)
302
303
304# Header parsing
305# -----------------------------------------------------------------------------
306
def unmatched(match):
    """Return the searched string with the text matched by `match` removed."""
    text = match.string
    return text[:match.start(0)] + text[match.end(0):]
311
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, basestring)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            token = HEADER_TOKEN_RE.search(text)
            if token:
                # a name, possibly followed by "=value"
                text = unmatched(token)
                name = token.group(1)
                value = None  # stays None for a lone token
                quoted = HEADER_QUOTED_VALUE_RE.search(text)
                if quoted:
                    text = unmatched(quoted)
                    # drop the backslash of every quoted-pair
                    value = HEADER_ESCAPE_RE.sub(r"\1", quoted.group(1))
                else:
                    plain = HEADER_VALUE_RE.search(text)
                    if plain:
                        text = unmatched(plain)
                        value = plain.group(1).rstrip()
                pairs.append((name, value))
                continue

            stripped = text.lstrip()
            if stripped.startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = stripped[1:]
                if pairs: result.append(pairs)
                pairs = []
                continue

            # skip junk
            non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
            assert nr_junk_chars > 0, (
                "split_header_words bug: '%s', '%s', %s" %
                (orig_text, text, pairs))
            text = non_junk
        if pairs: result.append(pairs)
    return result
400
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        words = []
        for key, value in pairs:
            if value is None:
                # a lone token has no "=value" part
                words.append(key)
                continue
            if not re.search(r"^\w+$", value):
                # not a plain word: escape " and \, then quote it
                value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
            words.append("%s=%s" % (key, value))
        if words:
            headers.append("; ".join(words))
    return ", ".join(headers)
426
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    Returns one list of (key, value) pairs per non-empty header.  The value
    of an attribute given without "=" is None; the value of "expires" is
    converted with http2time (so it is None if the date is invalid); and
    ("version", "0") is appended when the header has no version attribute.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            # the first parameter is the cookie's own name=value: leave it
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    version_set = True
                # A bare "expires" (no "=") has no value to convert; without
                # this guard v.startswith raised AttributeError on None.
                if k == "expires" and v is not None:
                    # convert expires date to seconds since epoch
                    if v.startswith('"'): v = v[1:]
                    if v.endswith('"'): v = v[:-1]
                    v = http2time(v)  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
478
479
# Matches text whose last dotted component is all digits (IPv4-looking).
IPV4_RE = re.compile(r"\.\d+$")
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if text == "":
        return False
    if IPV4_RE.search(text):
        return False
    # a host domain name neither starts nor ends with a dot
    return not (text[0] == "." or text[-1] == ".")
495
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # "A has the form NB" means that A *ends* with B.  Merely containing B
    # (which is all that rfind() established) is not enough: otherwise
    # x.y.com.evil.net would domain-match .y.com.  N is known to be
    # non-empty here, because A != B.
    if not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
534
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  Anything that does not look like an
    IPv4 address (per IPV4_RE) qualifies.

    """
    return not IPV4_RE.search(text)
544
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  If B starts with a dot, A matches when it ends with
    B; otherwise (and whenever either looks like an IP address) the two
    must be equal.

    """
    A = A.lower()
    B = B.lower()
    if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
        # IP addresses only ever match exactly
        return A == B
    if B.startswith("."):
        return A.endswith(B)
    return A == B
564
cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    host = urlparse.urlparse(request.get_full_url())[1]
    if host == "":
        # no network location in the URL: fall back on the Host header
        host = request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", host, 1).lower()
581
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        # a dotless host name gets ".local" appended
        return req_host, req_host + ".local"
    return req_host, req_host
592
def request_path(request):
    """request-URI, as defined by RFC 2965."""
    parsed = urlparse.urlparse(request.get_full_url())
    # drop scheme and netloc; keep the rest of the parsed URL
    path, parameters, query, frag = parsed[2:]
    if parameters:
        path = "%s;%s" % (path, parameters)
    req_path = urlparse.urlunparse(
        ("", "", escape_path(path), "", query, frag))
    if req_path.startswith("/"):
        return req_path
    # fix bad RFC 2396 absoluteURI
    return "/"+req_path
607
def request_port(request):
    """Return the port of the request's host, as a string.

    The port is whatever follows the first ":" in request.get_host();
    DEFAULT_HTTP_PORT is returned if there is no ":", and None (after a
    debug message) if the port is not numeric.

    """
    host = request.get_host()
    colon = host.find(':')
    if colon < 0:
        return DEFAULT_HTTP_PORT
    port = host[colon+1:]
    try:
        int(port)
    except ValueError:
        debug("nonnumeric port: '%s'", port)
        return None
    return port
621
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# A %-escape: percent sign followed by two hex digits (captured).
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the %-escape matched by ESCAPED_CHAR_RE with upper-case hex."""
    return "%" + match.group(1).upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, unicode):
        path = path.encode("utf-8")
    quoted = urllib.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
643
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    first_dot = h.find(".")
    if first_dot < 0:
        # not of the form A.B
        return h
    b = h[first_dot+1:]  # everything after A and its dot
    if is_HDN(h) and ("." in b or b == "local"):
        return "."+b
    return h
678
def is_third_party(request):
    """Return True if request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(req_host, origin_reach)
694
695
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes.

        version and expires are converted to int unless they are None;
        domain is lower-cased; rest (the non-standard cookie-attributes)
        is shallow-copied.  Raises ValueError if port is None while
        port_specified is True.

        """
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return true if the non-standard cookie-attribute name is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return non-standard cookie-attribute name, or default if unset."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set non-standard cookie-attribute name to value."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        now defaults to the current time; a cookie whose expires is None
        never expires.

        """
        if now is None:
            now = time.time()
        return (self.expires is not None) and (self.expires <= now)

    def __str__(self):
        if self.port is None:
            port_part = ""
        else:
            port_part = ":"+self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        attr_names = ("version", "name", "value",
                      "port", "port_specified",
                      "domain", "domain_specified", "domain_initial_dot",
                      "path", "path_specified",
                      "secure", "expires", "discard", "comment", "comment_url",
                      )
        args = ["%s=%s" % (attr_name, repr(getattr(self, attr_name)))
                for attr_name in attr_names]
        # these two are not stored under their constructor argument names
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "Cookie(%s)" % ", ".join(args)
792
793
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation always returns True.

        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation always returns True.

        """
        return True
825
826
827class DefaultCookiePolicy(CookiePolicy):
828 """Implements the standard rules for accepting and returning cookies."""
829
830 DomainStrictNoDots = 1
831 DomainStrictNonDomain = 2
832 DomainRFC2965Match = 4
833
834 DomainLiberal = 0
835 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
836
837 def __init__(self,
838 blocked_domains=None, allowed_domains=None,
839 netscape=True, rfc2965=False,
Neal Norwitz71dad722005-12-23 21:43:48 +0000840 rfc2109_as_netscape=None,
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000841 hide_cookie2=False,
842 strict_domain=False,
843 strict_rfc2965_unverifiable=True,
844 strict_ns_unverifiable=False,
845 strict_ns_domain=DomainLiberal,
846 strict_ns_set_initial_dollar=False,
847 strict_ns_set_path=False,
848 ):
849 """Constructor arguments should be passed as keyword arguments only."""
850 self.netscape = netscape
851 self.rfc2965 = rfc2965
Neal Norwitz71dad722005-12-23 21:43:48 +0000852 self.rfc2109_as_netscape = rfc2109_as_netscape
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000853 self.hide_cookie2 = hide_cookie2
854 self.strict_domain = strict_domain
855 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
856 self.strict_ns_unverifiable = strict_ns_unverifiable
857 self.strict_ns_domain = strict_ns_domain
858 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
859 self.strict_ns_set_path = strict_ns_set_path
860
861 if blocked_domains is not None:
862 self._blocked_domains = tuple(blocked_domains)
863 else:
864 self._blocked_domains = ()
865
866 if allowed_domains is not None:
867 allowed_domains = tuple(allowed_domains)
868 self._allowed_domains = allowed_domains
869
870 def blocked_domains(self):
871 """Return the sequence of blocked domains (as a tuple)."""
872 return self._blocked_domains
873 def set_blocked_domains(self, blocked_domains):
874 """Set the sequence of blocked domains."""
875 self._blocked_domains = tuple(blocked_domains)
876
877 def is_blocked(self, domain):
878 for blocked_domain in self._blocked_domains:
879 if user_domain_match(domain, blocked_domain):
880 return True
881 return False
882
883 def allowed_domains(self):
884 """Return None, or the sequence of allowed domains (as a tuple)."""
885 return self._allowed_domains
886 def set_allowed_domains(self, allowed_domains):
887 """Set the sequence of allowed domains, or None."""
888 if allowed_domains is not None:
889 allowed_domains = tuple(allowed_domains)
890 self._allowed_domains = allowed_domains
891
892 def is_not_allowed(self, domain):
893 if self._allowed_domains is None:
894 return False
895 for allowed_domain in self._allowed_domains:
896 if user_domain_match(domain, allowed_domain):
897 return False
898 return True
899
900 def set_ok(self, cookie, request):
901 """
902 If you override .set_ok(), be sure to call this method. If it returns
903 false, so should your subclass (assuming your subclass wants to be more
904 strict about which cookies to accept).
905
906 """
907 debug(" - checking cookie %s=%s", cookie.name, cookie.value)
908
909 assert cookie.name is not None
910
911 for n in "version", "verifiability", "name", "path", "domain", "port":
912 fn_name = "set_ok_"+n
913 fn = getattr(self, fn_name)
914 if not fn(cookie, request):
915 return False
916
917 return True
918
919 def set_ok_version(self, cookie, request):
920 if cookie.version is None:
921 # Version is always set to 0 by parse_ns_headers if it's a Netscape
922 # cookie, so this must be an invalid RFC 2965 cookie.
923 debug(" Set-Cookie2 without version attribute (%s=%s)",
924 cookie.name, cookie.value)
925 return False
926 if cookie.version > 0 and not self.rfc2965:
927 debug(" RFC 2965 cookies are switched off")
928 return False
929 elif cookie.version == 0 and not self.netscape:
930 debug(" Netscape cookies are switched off")
931 return False
932 return True
933
934 def set_ok_verifiability(self, cookie, request):
935 if request.is_unverifiable() and is_third_party(request):
936 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
937 debug(" third-party RFC 2965 cookie during "
938 "unverifiable transaction")
939 return False
940 elif cookie.version == 0 and self.strict_ns_unverifiable:
941 debug(" third-party Netscape cookie during "
942 "unverifiable transaction")
943 return False
944 return True
945
946 def set_ok_name(self, cookie, request):
947 # Try and stop servers setting V0 cookies designed to hack other
948 # servers that know both V0 and V1 protocols.
949 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
950 cookie.name.startswith("$")):
951 debug(" illegal name (starts with '$'): '%s'", cookie.name)
952 return False
953 return True
954
def set_ok_path(self, cookie, request):
    """Return False if an explicit cookie path is not a prefix of the
    request path.

    Always enforced for version > 0 cookies; for version-0 cookies only
    when strict_ns_set_path is true.  Cookies without an explicit path
    always pass.

    """
    if not cookie.path_specified:
        return True
    req_path = request_path(request)
    must_prefix = (cookie.version > 0 or
                   (cookie.version == 0 and self.strict_ns_set_path))
    if must_prefix and not req_path.startswith(cookie.path):
        debug(" path attribute %s is not a prefix of request "
              "path %s", cookie.path, req_path)
        return False
    return True
965
def set_ok_domain(self, cookie, request):
    """Return True if the cookie's domain is acceptable for setting.

    The user block-list and allow-list are consulted first.  The
    remaining checks apply only when the cookie carried an explicit
    domain attribute; each failing check logs a debug message and
    returns False.

    """
    if self.is_blocked(cookie.domain):
        debug(" domain %s is in user block-list", cookie.domain)
        return False
    if self.is_not_allowed(cookie.domain):
        debug(" domain %s is not in user allow-list", cookie.domain)
        return False
    if cookie.domain_specified:
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain
        if self.strict_domain and (domain.count(".") >= 2):
            # XXX This should probably be compared with the Konqueror
            # (kcookiejar.cpp) and Mozilla implementations, but it's a
            # losing battle.
            # i is the position of the last dot, j of the one before it.
            i = domain.rfind(".")
            j = domain.rfind(".", 0, i)
            if j == 0: # domain like .foo.bar
                tld = domain[i+1:]
                sld = domain[j+1:i]
                # Reject a well-known second-level label combined with a
                # two-letter top-level label.
                if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                   "gov", "mil", "int", "aero", "biz", "cat", "coop",
                   "info", "jobs", "mobi", "museum", "name", "pro",
                   "travel", "eu") and len(tld) == 2:
                    # domain like .co.uk
                    debug(" country-code second level domain %s", domain)
                    return False
        # Ignore one leading dot when looking for embedded dots.
        if domain.startswith("."):
            undotted_domain = domain[1:]
        else:
            undotted_domain = domain
        embedded_dots = (undotted_domain.find(".") >= 0)
        if not embedded_dots and domain != ".local":
            debug(" non-local domain %s contains no embedded dot",
                  domain)
            return False
        if cookie.version == 0:
            # Netscape cookies: plain suffix comparison, also tried with
            # a dot prepended to the effective request-host.
            if (not erhn.endswith(domain) and
                (not erhn.startswith(".") and
                 not ("."+erhn).endswith(domain))):
                debug(" effective request-host %s (even with added "
                      "initial dot) does not end end with %s",
                      erhn, domain)
                return False
        if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainRFC2965Match)):
            if not domain_match(erhn, domain):
                debug(" effective request-host %s does not domain-match "
                      "%s", erhn, domain)
                return False
        if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainStrictNoDots)):
            # What is left of the request host after dropping its last
            # len(domain) characters.
            host_prefix = req_host[:-len(domain)]
            if (host_prefix.find(".") >= 0 and
                not IPV4_RE.search(req_host)):
                debug(" host prefix %s for domain %s contains a dot",
                      host_prefix, domain)
                return False
    return True
1024
def set_ok_port(self, cookie, request):
    """Return True if the request port is listed in an explicit Port
    cookie-attribute.

    Cookies without an explicit port list always pass.  A non-numeric
    entry met before a matching one causes rejection.

    """
    if not cookie.port_specified:
        return True

    req_port = request_port(request)
    if req_port is None:
        req_port = "80"
    else:
        req_port = str(req_port)

    found = False
    for candidate in cookie.port.split(","):
        try:
            int(candidate)
        except ValueError:
            debug(" bad port %s (not numeric)", candidate)
            return False
        if candidate == req_port:
            found = True
            break
    if not found:
        debug(" request port (%s) not found in %s",
              req_port, cookie.port)
        return False
    return True
1045
def return_ok(self, cookie, request):
    """
    If you override .return_ok(), be sure to call this method.  If it
    returns false, so should your subclass (assuming your subclass wants to
    be more strict about which cookies to return).

    Runs the return_ok_<aspect> checks in a fixed order and stops at the
    first one that fails.

    """
    # Path has already been checked by .path_return_ok(), and domain
    # blocking done by .domain_return_ok().
    debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    aspects = ("version", "verifiability", "secure", "expires", "port",
               "domain")
    for aspect in aspects:
        check = getattr(self, "return_ok_" + aspect)
        if not check(cookie, request):
            return False
    return True
1063
def return_ok_version(self, cookie, request):
    """Return False if the protocol matching the cookie's version
    (RFC 2965 for version > 0, Netscape for version 0) is switched off."""
    ver = cookie.version
    if ver > 0:
        if not self.rfc2965:
            debug(" RFC 2965 cookies are switched off")
            return False
    elif ver == 0:
        if not self.netscape:
            debug(" Netscape cookies are switched off")
            return False
    return True
1072
def return_ok_verifiability(self, cookie, request):
    """Return False for third-party cookies in unverifiable transactions,
    subject to the strictness flag matching the cookie's version."""
    if not (request.is_unverifiable() and is_third_party(request)):
        return True
    if cookie.version > 0:
        if self.strict_rfc2965_unverifiable:
            debug(" third-party RFC 2965 cookie during unverifiable "
                  "transaction")
            return False
    elif cookie.version == 0:
        if self.strict_ns_unverifiable:
            debug(" third-party Netscape cookie during unverifiable "
                  "transaction")
            return False
    return True
1084
def return_ok_secure(self, cookie, request):
    """Return False if a secure cookie would go out on a request whose
    type is not "https"."""
    if not cookie.secure:
        return True
    if request.get_type() == "https":
        return True
    debug(" secure cookie with non-secure request")
    return False
1090
def return_ok_expires(self, cookie, request):
    """Return False if the cookie reports itself expired at self._now."""
    expired = cookie.is_expired(self._now)
    if expired:
        debug(" cookie expired")
    return not expired
1096
def return_ok_port(self, cookie, request):
    """Return False if the cookie is port-restricted and the request port
    is not among its comma-separated ports.

    Cookies with a false port value always pass.

    """
    if not cookie.port:
        return True
    req_port = request_port(request)
    if req_port is None:
        req_port = "80"
    for allowed in cookie.port.split(","):
        if allowed == req_port:
            return True
    debug(" request port %s does not match cookie port %s",
          req_port, cookie.port)
    return False
1110
def return_ok_domain(self, cookie, request):
    """Return True if the cookie's domain matches the request host.

    Version-0 cookies are compared by suffix against the dot-prefixed
    effective request-host (plus an optional exact-match check for
    cookies set without a domain attribute); version > 0 cookies must
    pass domain_match().

    """
    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    if cookie.version == 0:
        # strict check of non-domain cookies: Mozilla does this, MSIE5
        # doesn't
        if ((self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            debug(" cookie with unspecified domain does not string-compare "
                  "equal to request domain")
            return False
        if not ("."+erhn).endswith(domain):
            debug(" request-host %s does not match Netscape cookie domain "
                  "%s", req_host, domain)
            return False
    elif cookie.version > 0:
        if not domain_match(erhn, domain):
            debug(" effective request-host name %s does not domain-match "
                  "RFC 2965 cookie domain %s", erhn, domain)
            return False
    return True
1132
def domain_return_ok(self, domain, request):
    """Cheap pre-check: could cookies stored under domain apply to request?

    Returns False when neither the dot-prefixed request host nor the
    dot-prefixed effective request-host ends with domain, or when the
    domain is blocked / not allowed by the user lists.

    """
    # Liberal check of domain.  This is here as an optimization to avoid
    # having to load lots of MSIE cookie files unless necessary.
    req_host, erhn = eff_request_host(request)
    dotted_host = req_host
    if not dotted_host.startswith("."):
        dotted_host = "."+dotted_host
    dotted_erhn = erhn
    if not dotted_erhn.startswith("."):
        dotted_erhn = "."+dotted_erhn
    if not dotted_host.endswith(domain) and not dotted_erhn.endswith(domain):
        return False

    if self.is_blocked(domain):
        debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        debug(" domain %s is not in user allow-list", domain)
        return False

    return True
1154
def path_return_ok(self, path, request):
    """Return True if the request path starts with the given cookie path."""
    debug("- checking cookie path=%s", path)
    req_path = request_path(request)
    if req_path.startswith(path):
        return True
    debug(" %s does not path-match %s", req_path, path)
    return False
1162
1163
def vals_sorted_by_key(adict):
    """Return a list of adict's values, ordered by ascending key.

    Uses sorted() and a list comprehension, so it does not depend on
    dict.keys() returning a sortable list or on map() returning a list.

    """
    return [adict.get(key) for key in sorted(adict.keys())]
1168
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    A value counts as a nested mapping if it has an "items" attribute; such
    values are descended into rather than yielded themselves.

    """
    for item in vals_sorted_by_key(mapping):
        try:
            item.items
        except AttributeError:
            is_nested = False
        else:
            is_nested = True
        if is_nested:
            for leaf in deepvalues(item):
                yield leaf
        else:
            yield item
1184
1185
# Used as second parameter to dict.get() method, to distinguish absent
# dict key from one with a None value.
class Absent:
    """Sentinel meaning "key not present"; used by identity, never
    instantiated in this module's visible code."""
1189
class CookieJar:
    """Collection of HTTP cookies.

    You may not need to know about this class: try
    urllib2.build_opener(HTTPCookieProcessor).open(url).

    Cookies are stored in a nested dict, self._cookies, keyed by domain,
    then path, then name.  Methods that take self._cookies_lock release it
    in a finally clause, so an exception raised by the policy or while
    parsing cannot leave the jar locked.

    """

    non_word_re = re.compile(r"\W")
    quote_re = re.compile(r"([\"\\])")
    strict_domain_re = re.compile(r"\.?[^.]*")
    domain_re = re.compile(r"[^.]*")
    dots_re = re.compile(r"^\.+")

    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"

    def __init__(self, policy=None):
        """Create an empty jar; policy defaults to DefaultCookiePolicy()."""
        if policy is None:
            policy = DefaultCookiePolicy()
        self._policy = policy

        self._cookies_lock = _threading.RLock()
        self._cookies = {}

    def set_policy(self, policy):
        """Replace the policy object consulted by this jar."""
        self._policy = policy

    def _cookies_for_domain(self, domain, request):
        """Return the cookies stored under domain that the policy allows
        to be returned for request."""
        cookies = []
        if not self._policy.domain_return_ok(domain, request):
            return []
        debug("Checking %s for cookies to return", domain)
        cookies_by_path = self._cookies[domain]
        for path in cookies_by_path.keys():
            if not self._policy.path_return_ok(path, request):
                continue
            cookies_by_name = cookies_by_path[path]
            for cookie in cookies_by_name.values():
                if not self._policy.return_ok(cookie, request):
                    debug(" not returning cookie")
                    continue
                debug(" it's a match")
                cookies.append(cookie)
        return cookies

    def _cookies_for_request(self, request):
        """Return a list of cookies to be returned to server."""
        cookies = []
        for domain in self._cookies.keys():
            cookies.extend(self._cookies_for_domain(domain, request))
        return cookies

    def _cookie_attrs(self, cookies):
        """Return a list of cookie-attributes to be returned to server.

        like ['foo="bar"; $Path="/"', ...]

        The $Version attribute is also added when appropriate (currently only
        once per request).

        Note that the cookies list is sorted in place.

        """
        # add cookies in order of most specific (ie. longest) path first;
        # the sort is stable, so equal-length paths keep their order
        cookies.sort(key=lambda c: len(c.path), reverse=True)

        version_set = False

        attrs = []
        for cookie in cookies:
            # set version of Cookie header
            # XXX
            # What should it be if multiple matching Set-Cookie headers have
            #  different versions themselves?
            # Answer: there is no answer; was supposed to be settled by
            #  RFC 2965 errata, but that may never appear...
            version = cookie.version
            if not version_set:
                version_set = True
                if version > 0:
                    attrs.append("$Version=%s" % version)

            # quote cookie value if necessary
            # (not for Netscape protocol, which already has any quotes
            #  intact, due to the poorly-specified Netscape Cookie: syntax)
            if ((cookie.value is not None) and
                self.non_word_re.search(cookie.value) and version > 0):
                value = self.quote_re.sub(r"\\\1", cookie.value)
            else:
                value = cookie.value

            # add cookie-attributes to be returned in Cookie header
            if cookie.value is None:
                attrs.append(cookie.name)
            else:
                attrs.append("%s=%s" % (cookie.name, value))
            if version > 0:
                if cookie.path_specified:
                    attrs.append('$Path="%s"' % cookie.path)
                if cookie.domain.startswith("."):
                    domain = cookie.domain
                    if (not cookie.domain_initial_dot and
                        domain.startswith(".")):
                        domain = domain[1:]
                    attrs.append('$Domain="%s"' % domain)
                if cookie.port is not None:
                    p = "$Port"
                    if cookie.port_specified:
                        p = p + ('="%s"' % cookie.port)
                    attrs.append(p)

        return attrs

    def add_cookie_header(self, request):
        """Add correct Cookie: header to request (urllib2.Request object).

        The Cookie2 header is also added unless policy.hide_cookie2 is true.

        """
        debug("add_cookie_header")
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            cookies = self._cookies_for_request(request)

            attrs = self._cookie_attrs(cookies)
            if attrs:
                if not request.has_header("Cookie"):
                    request.add_unredirected_header(
                        "Cookie", "; ".join(attrs))

            # if necessary, advertise that we know RFC 2965
            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
                not request.has_header("Cookie2")):
                for cookie in cookies:
                    if cookie.version != 1:
                        request.add_unredirected_header(
                            "Cookie2", '$Version="1"')
                        break
        finally:
            # release even if the policy or the request object raised
            self._cookies_lock.release()

        self.clear_expired_cookies()

    def _normalized_cookie_tuples(self, attrs_set):
        """Return list of tuples containing normalised cookie information.

        attrs_set is the list of lists of key,value pairs extracted from
        the Set-Cookie or Set-Cookie2 headers.

        Tuples are name, value, standard, rest, where name and value are the
        cookie name and value, standard is a dictionary containing the standard
        cookie-attributes (discard, secure, version, expires or max-age,
        domain, path and port) and rest is a dictionary containing the rest of
        the cookie-attributes.

        Cookies with a missing domain value, a missing or non-numeric
        max-age, or a missing value for another value-bearing standard
        attribute are dropped.

        """
        cookie_tuples = []

        boolean_attrs = "discard", "secure"
        value_attrs = ("version",
                       "expires", "max-age",
                       "domain", "path", "port",
                       "comment", "commenturl")

        for cookie_attrs in attrs_set:
            name, value = cookie_attrs[0]

            # Build dictionary of standard cookie-attributes (standard) and
            # dictionary of other cookie-attributes (rest).

            # Note: expiry time is normalised to seconds since epoch.  V0
            # cookies should have the Expires cookie-attribute, and V1 cookies
            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
            # accept either (but prefer Max-Age).
            max_age_set = False

            bad_cookie = False

            standard = {}
            rest = {}
            for k, v in cookie_attrs[1:]:
                lc = k.lower()
                # don't lose case distinction for unknown fields
                if lc in value_attrs or lc in boolean_attrs:
                    k = lc
                if k in boolean_attrs and v is None:
                    # boolean cookie-attribute is present, but has no value
                    # (like "discard", rather than "port=80")
                    v = True
                if k in standard:
                    # only first value is significant
                    continue
                if k == "domain":
                    if v is None:
                        debug(" missing value for domain attribute")
                        bad_cookie = True
                        break
                    # RFC 2965 section 3.3.3
                    v = v.lower()
                if k == "expires":
                    if max_age_set:
                        # Prefer max-age to expires (like Mozilla)
                        continue
                    if v is None:
                        debug(" missing or invalid value for expires "
                              "attribute: treating as session cookie")
                        continue
                if k == "max-age":
                    max_age_set = True
                    try:
                        v = int(v)
                    except (ValueError, TypeError):
                        # TypeError covers a missing value (v is None)
                        debug(" missing or invalid (non-numeric) value for "
                              "max-age attribute")
                        bad_cookie = True
                        break
                    # convert RFC 2965 Max-Age to seconds since epoch
                    # XXX Strictly you're supposed to follow RFC 2616
                    #   age-calculation rules.  Remember that zero Max-Age
                    #   is a request to discard (old and new) cookie, though.
                    k = "expires"
                    v = self._now + v
                if (k in value_attrs) or (k in boolean_attrs):
                    if (v is None and
                        k not in ("port", "comment", "commenturl")):
                        debug(" missing value for %s attribute" % k)
                        bad_cookie = True
                        break
                    standard[k] = v
                else:
                    rest[k] = v

            if bad_cookie:
                continue

            cookie_tuples.append((name, value, standard, rest))

        return cookie_tuples

    def _cookie_from_cookie_tuple(self, tup, request):
        """Build a Cookie from one normalised tuple, filling in defaults
        from request.

        Returns None, after trying to clear any stored cookie with the same
        domain, path and name, when the expiry time is not in the future.

        """
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set the easy defaults
        version = standard.get("version", None)
        if version is not None:
            version = int(version)
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            path_specified = False
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i+1]
            if len(path) == 0:
                path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            req_host, erhn = eff_request_host(request)
            domain = erhn
        elif not domain.startswith("."):
            domain = "."+domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present.  Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            expires = None
            discard = True
        elif expires <= self._now:
            # Expiry date in past is request to delete cookie.  This can't be
            # in DefaultCookiePolicy, because can't delete cookies there.
            try:
                self.clear(domain, path, name)
            except KeyError:
                pass
            debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                  domain, path, name)
            return None

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)

    def _cookies_from_attrs_set(self, attrs_set, request):
        """Return the Cookie objects built from attrs_set, skipping entries
        for which no cookie was produced."""
        cookie_tuples = self._normalized_cookie_tuples(attrs_set)

        cookies = []
        for tup in cookie_tuples:
            cookie = self._cookie_from_cookie_tuple(tup, request)
            if cookie:
                cookies.append(cookie)
        return cookies

    def _process_rfc2109_cookies(self, cookies):
        """Flag version-1 cookies as RFC 2109 and optionally downgrade them
        to version 0.

        The downgrade is controlled by the policy's rfc2109_as_netscape
        attribute; when that is missing or None it defaults to
        "not policy.rfc2965".

        """
        rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
        if rfc2109_as_ns is None:
            rfc2109_as_ns = not self._policy.rfc2965
        for cookie in cookies:
            if cookie.version == 1:
                cookie.rfc2109 = True
                if rfc2109_as_ns:
                    # treat 2109 cookies as Netscape cookies rather than
                    # as RFC2965 cookies
                    cookie.version = 0

    def make_cookies(self, response, request):
        """Return sequence of Cookie objects extracted from response object."""
        # get cookie-attributes for RFC 2965 and Netscape protocols
        headers = response.info()
        rfc2965_hdrs = headers.getheaders("Set-Cookie2")
        ns_hdrs = headers.getheaders("Set-Cookie")

        rfc2965 = self._policy.rfc2965
        netscape = self._policy.netscape

        if ((not rfc2965_hdrs and not ns_hdrs) or
            (not ns_hdrs and not rfc2965) or
            (not rfc2965_hdrs and not netscape) or
            (not netscape and not rfc2965)):
            return []  # no relevant cookie headers: quick exit

        # The catch-all handlers below are deliberate:
        # reraise_unmasked_exceptions() re-raises the exceptions that must
        # not be trapped and reports the rest.
        try:
            cookies = self._cookies_from_attrs_set(
                split_header_words(rfc2965_hdrs), request)
        except:
            reraise_unmasked_exceptions()
            cookies = []

        if ns_hdrs and netscape:
            try:
                # RFC 2109 and Netscape cookies
                ns_cookies = self._cookies_from_attrs_set(
                    parse_ns_headers(ns_hdrs), request)
            except:
                reraise_unmasked_exceptions()
                ns_cookies = []
            self._process_rfc2109_cookies(ns_cookies)

            # Look for Netscape cookies (from Set-Cookie headers) that match
            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
            # For each match, keep the RFC 2965 cookie and ignore the Netscape
            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
            # bundled in with the Netscape cookies for this purpose, which is
            # reasonable behaviour.
            if rfc2965:
                lookup = {}
                for cookie in cookies:
                    lookup[(cookie.domain, cookie.path, cookie.name)] = None

                ns_cookies = [
                    ns_cookie for ns_cookie in ns_cookies
                    if (ns_cookie.domain, ns_cookie.path,
                        ns_cookie.name) not in lookup]

            if ns_cookies:
                cookies.extend(ns_cookies)

        return cookies

    def set_cookie_if_ok(self, cookie, request):
        """Set a cookie if policy says it's OK to do so."""
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            if self._policy.set_ok(cookie, request):
                self.set_cookie(cookie)
        finally:
            self._cookies_lock.release()

    def set_cookie(self, cookie):
        """Set a cookie, without checking whether or not it should be set."""
        c = self._cookies
        self._cookies_lock.acquire()
        try:
            if cookie.domain not in c:
                c[cookie.domain] = {}
            c2 = c[cookie.domain]
            if cookie.path not in c2:
                c2[cookie.path] = {}
            c3 = c2[cookie.path]
            c3[cookie.name] = cookie
        finally:
            self._cookies_lock.release()

    def extract_cookies(self, response, request):
        """Extract cookies from response, where allowable given the request."""
        debug("extract_cookies: %s", response.info())
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            for cookie in self.make_cookies(response, request):
                if self._policy.set_ok(cookie, request):
                    debug(" setting cookie: %s", cookie)
                    self.set_cookie(cookie)
        finally:
            self._cookies_lock.release()

    def clear(self, domain=None, path=None, name=None):
        """Clear some cookies.

        Invoking this method without arguments will clear all cookies.  If
        given a single argument, only cookies belonging to that domain will be
        removed.  If given two arguments, cookies belonging to the specified
        path within that domain are removed.  If given three arguments, then
        the cookie with the specified name, path and domain is removed.

        Raises KeyError if no matching cookie exists.
        Raises ValueError if name is given without domain and path, or path
        without domain.

        """
        if name is not None:
            if (domain is None) or (path is None):
                raise ValueError(
                    "domain and path must be given to remove a cookie by name")
            del self._cookies[domain][path][name]
        elif path is not None:
            if domain is None:
                raise ValueError(
                    "domain must be given to remove cookies by path")
            del self._cookies[domain][path]
        elif domain is not None:
            del self._cookies[domain]
        else:
            self._cookies = {}

    def clear_session_cookies(self):
        """Discard all session cookies.

        Note that the .save() method won't save session cookies anyway, unless
        you ask otherwise by passing a true ignore_discard argument.

        """
        self._cookies_lock.acquire()
        try:
            for cookie in self:
                if cookie.discard:
                    self.clear(cookie.domain, cookie.path, cookie.name)
        finally:
            self._cookies_lock.release()

    def clear_expired_cookies(self):
        """Discard all expired cookies.

        You probably don't need to call this method: expired cookies are never
        sent back to the server (provided you're using DefaultCookiePolicy),
        this method is called by CookieJar itself every so often, and the
        .save() method won't save expired cookies anyway (unless you ask
        otherwise by passing a true ignore_expires argument).

        """
        self._cookies_lock.acquire()
        try:
            now = time.time()
            for cookie in self:
                if cookie.is_expired(now):
                    self.clear(cookie.domain, cookie.path, cookie.name)
        finally:
            self._cookies_lock.release()

    def __iter__(self):
        """Iterate over all stored cookies (see deepvalues())."""
        return deepvalues(self._cookies)

    def __len__(self):
        """Return number of contained cookies."""
        i = 0
        for cookie in self:
            i = i + 1
        return i

    def __repr__(self):
        r = []
        for cookie in self:
            r.append(repr(cookie))
        return "<%s[%s]>" % (self.__class__, ", ".join(r))

    def __str__(self):
        r = []
        for cookie in self:
            r.append(str(cookie))
        return "<%s[%s]>" % (self.__class__, ", ".join(r))
1703
1704
# derives from IOError for backwards-compatibility with Python 2.4.0
class LoadError(IOError):
    """Raised when cookies cannot be loaded from a file; see the
    FileCookieJar.revert() docstring."""
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001707
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        Raises ValueError if filename is given but cannot be concatenated
        with a string.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            # duck-type check: anything that supports + "" is accepted
            try:
                filename+""
            except:
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file."""
        # not implemented in this class
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  The open file is handed to self._really_load(), which is
        not defined in this class (presumably supplied by subclasses), and
        is always closed afterwards.

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename)
        try:
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        self._cookies_lock.acquire()
        try:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                # put the previous cookies back before propagating
                self._cookies = old_state
                raise
        finally:
            # release even when load() fails, so the jar stays usable
            self._cookies_lock.release()
1765
1766from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
1767from _MozillaCookieJar import MozillaCookieJar