blob: 656ae398fd36c2ffce61a06aa150bfece52749fa [file] [log] [blame]
"""HTTP cookie handling for web clients.

This module has (now fairly distant) origins in Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

Class diagram (note that the classes which do not derive from
FileCookieJar are not distributed with the Python standard library, but
are available from http://wwwsearch.sf.net/):

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

"""
27
Neal Norwitz2fa0b9d2004-10-17 16:23:52 +000028import sys, re, urlparse, copy, time, urllib, logging
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000029try:
30 import threading as _threading
31except ImportError:
32 import dummy_threading as _threading
33import httplib # only for the default HTTP port
34from calendar import timegm
35
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000036debug = logging.getLogger("cookielib").debug
37
38DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
39MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
40 "instance initialised with one)")
41
def reraise_unmasked_exceptions(unmasked=()):
    """Re-raise the exception being handled unless it is safe to mask it.

    Meant to be called from inside a catch-all ``except:`` clause.  There
    are a few such clauses in this module, for catching input that's bad in
    unexpected ways; this function re-raises the exceptions we never want
    to trap and reports anything else as a warning instead.

    unmasked: tuple of extra exception classes that must propagate.
        KeyboardInterrupt, SystemExit and MemoryError always propagate.

    """
    unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError)
    etype = sys.exc_info()[0]
    if issubclass(etype, unmasked):
        raise
    # Swallowed an exception: surface its traceback as a warning that is
    # attributed to our caller (stacklevel=2).
    import traceback
    import warnings
    warnings.warn("cookielib bug!\n%s" % traceback.format_exc(), stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000056
57
58# Date/time conversion
59# -----------------------------------------------------------------------------
60
61EPOCH_YEAR = 1970
62def _timegm(tt):
63 year, month, mday, hour, min, sec = tt[:6]
64 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
65 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
66 return timegm(tt)
67 else:
68 return None
69
# Abbreviated English day and month names.  DAYS is indexed by the weekday
# number from time.gmtime() (Monday first); MONTHS by month number - 1.
DAYS = [
    "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun",
]
MONTHS = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]
# Lowercased copy of MONTHS, for case-insensitive month-name lookup.
MONTHS_LOWER = []
for month in MONTHS:
    MONTHS_LOWER.append(month.lower())
75
def time2isoz(t=None):
    """Format t (seconds since the epoch) as an ISO-style UTC timestamp.

    When t is omitted or None, the current time is used.

    The result looks like "YYYY-MM-DD hh:mm:ssZ" and always denotes
    Universal Time (UTC, aka GMT), for example:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    date_and_clock = tuple(time.gmtime(t)[:6])
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % date_and_clock
92
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec, wday = time.gmtime(t)[:7]
    # The comma after the weekday is part of the documented format; it was
    # missing from the format string.
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[wday], mday, MONTHS[mon-1], year, hour, minute, sec)
108
109
# Zone names that denote a zero offset from UTC (only the keys matter).
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric zone: optional sign, 1-2 digit hours, optional ":" and minutes.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
def offset_from_tz_string(tz):
    """Return the offset from UTC, in seconds, named by the string tz.

    tz is either a key of UTC_ZONES (offset 0) or a numeric zone such as
    "-0800", "+01:00" or "5".  None is returned for anything else.

    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if not match:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds = seconds + 60 * int(minutes)
    if sign == '-':
        seconds = -seconds
    return seconds
126
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date/time fields to seconds since the epoch.

    The fields are strings as captured by the date regexps in this module;
    hr, min, sec and tz may be None.  mon is a month abbreviation or a
    month number.  Seconds may carry a fractional part, which is dropped.

    Returns None if the month or timezone is not recognised, or if
    _timegm rejects the resulting time tuple.

    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    # ISO_DATE_RE lets seconds through with a fractional part ("29.5");
    # plain int() would raise ValueError on that, so go via float.
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year: pick the century that puts the year closest
        # to the current one
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
179
# Exactly the preferred HTTP date format: "Wed, 09 Feb 1994 22:23:32 GMT".
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
# Leading weekday name (abbreviated or full), stripped before loose parsing.
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # STRICT_DATE_RE only checks the *shape* of the month ("Foo"
            # matches); an unknown name is an unrecognised date, not a crash.
            return None
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), float(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
257
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # clean up
    text = text.lstrip()

    # loose regexp parse
    m = ISO_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string.
    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    yr, mon, day, hr, min, sec, tz, _ = m.groups()

    return _str2time(day, mon, yr, hr, min, sec, tz)
302
303
304# Header parsing
305# -----------------------------------------------------------------------------
306
def unmatched(match):
    """Return the searched string with the matched span cut out of it."""
    subject = match.string
    begin, finish = match.span(0)
    head = subject[:begin]
    tail = subject[finish:]
    return head + tail
311
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
# Leading separators ("=", ";", whitespace) that belong to no token.
_HEADER_JUNK_RE = re.compile(r"^[=\s;]*")
try:
    _HEADER_STRING_TYPES = basestring
except NameError:
    # interpreters without basestring have a single text type
    _HEADER_STRING_TYPES = str
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    # A bare string would be iterated character by character; callers must
    # pass a sequence of header strings.
    assert not isinstance(header_values, _HEADER_STRING_TYPES)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                # Every pattern used here is anchored at "^", so the
                # unconsumed remainder is simply what follows the match.
                text = text[m.end():]
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = text[m.end():]
                    value = HEADER_ESCAPE_RE.sub(r"\1", m.group(1))
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = text[m.end():]
                        value = m.group(1).rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = _HEADER_JUNK_RE.subn("", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result
400
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    lists is a list of lists of (key, value) pairs; the result is a single
    header value.  Pairs within one inner list are joined with "; " and the
    inner lists with ", ".  A value of None produces a bare key, and any
    value that is not purely word characters is quoted.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    def render(key, value):
        # a token without a value stands for itself
        if value is None:
            return key
        if not re.search(r"^\w+$", value):
            # escape " and \ and wrap in double quotes
            value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
        return "%s=%s" % (key, value)

    joined = []
    for pairs in lists:
        rendered = [render(key, value) for key, value in pairs]
        if rendered:
            joined.append("; ".join(rendered))
    return ", ".join(joined)
426
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    Returns one list of (key, value) pairs per non-empty header.  The first
    pair is the cookie's name and value, left untouched.  In the following
    pairs, known attribute names are lowercased, an "expires" value is
    converted with http2time (None if invalid or missing), and a
    ("version", "0") pair is appended when no "version" attribute was seen.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.  Will be treated as RFC 2965
                    # cookie in rest of code.
                    # Probably it should be parsed with split_header_words, but
                    # that's too much hassle.
                    version_set = True
                if k == "expires" and v is not None:
                    # convert expires date to seconds since epoch; a bare
                    # "expires" with no "=" has no value to convert
                    if v.startswith('"'): v = v[1:]
                    if v.endswith('"'): v = v[:-1]
                    v = http2time(v)  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
481
482
# Matches text ending in ".<digits>", which is taken to be an IPv4 address.
IPV4_RE = re.compile(r"\.\d+$")
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    looks_like_ip = IPV4_RE.search(text)
    if looks_like_ip or text == "":
        return False
    # a name may neither start nor end with a dot
    return not (text[0] == "." or text[-1] == ".")
498
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not A.endswith(B):
        # A does not have form NB: B must be a *suffix* of A, not merely
        # occur somewhere inside it ("x.y.com.evil.org" vs ".y.com").  N is
        # necessarily non-empty here because A != B.
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(A):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
537
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    Anything that IPV4_RE does not flag as an IP address qualifies.

    For accepting/blocking domains.

    """
    return not IPV4_RE.search(text)
547
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  If B starts with a dot, A matches when it ends with
    B; otherwise (and whenever either side looks like an IP address) A
    matches only when it equals B.

    """
    host = A.lower()
    pattern = B.lower()
    if not liberal_is_HDN(host) or not liberal_is_HDN(pattern):
        # at least one side is an IP address: only equal addresses match
        return host == pattern
    if pattern.startswith("."):
        return host.endswith(pattern)
    return host == pattern
567
# Trailing ":<digits>" port suffix of a network location.
cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the request's full URL, falling back to its
    "Host" header when the URL has no network location.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    full_url = request.get_full_url()
    netloc = urlparse.urlparse(full_url)[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    without_port = cut_port_re.sub("", netloc, 1)
    return without_port.lower()
584
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective name
    is the request-host with ".local" appended when the host contains no
    dot and is not matched by IPV4_RE; otherwise the two are the same.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
595
def request_path(request):
    """request-URI, as defined by RFC 2965."""
    url_parts = urlparse.urlparse(request.get_full_url())
    path, parameters, query, frag = url_parts[2:]
    if parameters:
        # fold the ";parameters" part back into the path before escaping
        path = "%s;%s" % (path, parameters)
    escaped = escape_path(path)
    req_path = urlparse.urlunparse(("", "", escaped, "", query, frag))
    if req_path.startswith("/"):
        return req_path
    # fix bad RFC 2396 absoluteURI
    return "/"+req_path
610
def request_port(request):
    """Return the port of the request's host, as a string.

    The port is whatever follows the first ":" in request.get_host();
    DEFAULT_HTTP_PORT is returned when there is no ":".  A port that is
    not numeric is logged and None is returned.

    """
    host = request.get_host()
    colon = host.find(':')
    if colon < 0:
        return DEFAULT_HTTP_PORT
    port = host[colon+1:]
    try:
        int(port)
    except ValueError:
        debug("nonnumeric port: '%s'", port)
        return None
    return port
624
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# A %-escape: "%" followed by two hexadecimal digits.
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the matched %-escape with its hex digits uppercased."""
    hex_digits = match.group(1)
    return "%%%s" % hex_digits.upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, unicode):
        path = path.encode("utf-8")
    quoted = urllib.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
646
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    first_dot = h.find(".")
    if first_dot < 0:
        # h does not have the form A.B
        return h
    # everything after the first dot is B (what precedes it is A)
    b = h[first_dot+1:]
    b_has_dot = b.find(".") >= 0
    if is_HDN(h) and (b_has_dot or b == "local"):
        return "."+b
    return h
681
def is_third_party(request):
    """Return True if request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(req_host, origin_reach)
697
698
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than"Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest):
        """Store the cookie-attributes on the instance.

        version and expires are coerced to int unless they are None, the
        domain is lowercased, and rest (the non-standard cookie-attributes)
        is shallow-copied.  ValueError is raised if port is None while
        port_specified is True.

        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value

        self.port = port
        self.port_specified = port_specified

        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot

        self.path = path
        self.path_specified = path_specified

        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url

        # private copy, so later changes don't touch the caller's mapping
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return whether the non-standard cookie-attribute name is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return non-standard cookie-attribute name, or default if unset."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set non-standard cookie-attribute name to value."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        now defaults to the current time; a cookie whose expires is None
        never expires.

        """
        if now is None:
            now = time.time()
        if self.expires is None:
            return False
        return self.expires <= now

    def __str__(self):
        port_part = ""
        if self.port is not None:
            port_part = ":"+self.port
        limit = self.domain + port_part + self.path
        namevalue = self.name
        if self.value is not None:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        attr_names = (
            "version", "name", "value",
            "port", "port_specified",
            "domain", "domain_specified", "domain_initial_dot",
            "path", "path_specified",
            "secure", "expires", "discard", "comment", "comment_url",
            )
        args = ["%s=%s" % (attr_name, repr(getattr(self, attr_name)))
                for attr_name in attr_names]
        args.append("rest=%s" % repr(self._rest))
        return "Cookie(%s)" % ", ".join(args)
791
792
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation allows every domain.

        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation allows every path.

        """
        return True
824
825
826class DefaultCookiePolicy(CookiePolicy):
827 """Implements the standard rules for accepting and returning cookies."""
828
829 DomainStrictNoDots = 1
830 DomainStrictNonDomain = 2
831 DomainRFC2965Match = 4
832
833 DomainLiberal = 0
834 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
835
836 def __init__(self,
837 blocked_domains=None, allowed_domains=None,
838 netscape=True, rfc2965=False,
839 hide_cookie2=False,
840 strict_domain=False,
841 strict_rfc2965_unverifiable=True,
842 strict_ns_unverifiable=False,
843 strict_ns_domain=DomainLiberal,
844 strict_ns_set_initial_dollar=False,
845 strict_ns_set_path=False,
846 ):
847 """Constructor arguments should be passed as keyword arguments only."""
848 self.netscape = netscape
849 self.rfc2965 = rfc2965
850 self.hide_cookie2 = hide_cookie2
851 self.strict_domain = strict_domain
852 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
853 self.strict_ns_unverifiable = strict_ns_unverifiable
854 self.strict_ns_domain = strict_ns_domain
855 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
856 self.strict_ns_set_path = strict_ns_set_path
857
858 if blocked_domains is not None:
859 self._blocked_domains = tuple(blocked_domains)
860 else:
861 self._blocked_domains = ()
862
863 if allowed_domains is not None:
864 allowed_domains = tuple(allowed_domains)
865 self._allowed_domains = allowed_domains
866
867 def blocked_domains(self):
868 """Return the sequence of blocked domains (as a tuple)."""
869 return self._blocked_domains
870 def set_blocked_domains(self, blocked_domains):
871 """Set the sequence of blocked domains."""
872 self._blocked_domains = tuple(blocked_domains)
873
874 def is_blocked(self, domain):
875 for blocked_domain in self._blocked_domains:
876 if user_domain_match(domain, blocked_domain):
877 return True
878 return False
879
880 def allowed_domains(self):
881 """Return None, or the sequence of allowed domains (as a tuple)."""
882 return self._allowed_domains
883 def set_allowed_domains(self, allowed_domains):
884 """Set the sequence of allowed domains, or None."""
885 if allowed_domains is not None:
886 allowed_domains = tuple(allowed_domains)
887 self._allowed_domains = allowed_domains
888
889 def is_not_allowed(self, domain):
890 if self._allowed_domains is None:
891 return False
892 for allowed_domain in self._allowed_domains:
893 if user_domain_match(domain, allowed_domain):
894 return False
895 return True
896
897 def set_ok(self, cookie, request):
898 """
899 If you override .set_ok(), be sure to call this method. If it returns
900 false, so should your subclass (assuming your subclass wants to be more
901 strict about which cookies to accept).
902
903 """
904 debug(" - checking cookie %s=%s", cookie.name, cookie.value)
905
906 assert cookie.name is not None
907
908 for n in "version", "verifiability", "name", "path", "domain", "port":
909 fn_name = "set_ok_"+n
910 fn = getattr(self, fn_name)
911 if not fn(cookie, request):
912 return False
913
914 return True
915
916 def set_ok_version(self, cookie, request):
917 if cookie.version is None:
918 # Version is always set to 0 by parse_ns_headers if it's a Netscape
919 # cookie, so this must be an invalid RFC 2965 cookie.
920 debug(" Set-Cookie2 without version attribute (%s=%s)",
921 cookie.name, cookie.value)
922 return False
923 if cookie.version > 0 and not self.rfc2965:
924 debug(" RFC 2965 cookies are switched off")
925 return False
926 elif cookie.version == 0 and not self.netscape:
927 debug(" Netscape cookies are switched off")
928 return False
929 return True
930
931 def set_ok_verifiability(self, cookie, request):
932 if request.is_unverifiable() and is_third_party(request):
933 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
934 debug(" third-party RFC 2965 cookie during "
935 "unverifiable transaction")
936 return False
937 elif cookie.version == 0 and self.strict_ns_unverifiable:
938 debug(" third-party Netscape cookie during "
939 "unverifiable transaction")
940 return False
941 return True
942
943 def set_ok_name(self, cookie, request):
944 # Try and stop servers setting V0 cookies designed to hack other
945 # servers that know both V0 and V1 protocols.
946 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
947 cookie.name.startswith("$")):
948 debug(" illegal name (starts with '$'): '%s'", cookie.name)
949 return False
950 return True
951
952 def set_ok_path(self, cookie, request):
953 if cookie.path_specified:
954 req_path = request_path(request)
955 if ((cookie.version > 0 or
956 (cookie.version == 0 and self.strict_ns_set_path)) and
957 not req_path.startswith(cookie.path)):
958 debug(" path attribute %s is not a prefix of request "
959 "path %s", cookie.path, req_path)
960 return False
961 return True
962
def set_ok_domain(self, cookie, request):
    """Return true if the cookie's domain is acceptable for this request.

    The user block-list and allow-list are consulted first.  When the
    cookie's domain was given explicitly (domain_specified), it is then
    checked against the request host; which checks run depends on the
    cookie version and on the strict_domain and strict_ns_domain settings
    of this policy.

    """
    if self.is_blocked(cookie.domain):
        debug(" domain %s is in user block-list", cookie.domain)
        return False
    if self.is_not_allowed(cookie.domain):
        debug(" domain %s is not in user allow-list", cookie.domain)
        return False
    if not cookie.domain_specified:
        # None of the remaining checks apply to a defaulted domain.
        return True

    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    if self.strict_domain and domain.count(".") >= 2:
        last_dot = domain.rfind(".")
        prev_dot = domain.rfind(".", 0, last_dot)
        if prev_dot == 0:  # domain like .foo.bar
            tld = domain[last_dot + 1:]
            sld = domain[prev_dot + 1:last_dot]
            generic_slds = ("co", "ac",
                            "com", "edu", "org", "net", "gov", "mil", "int")
            if sld.lower() in generic_slds and len(tld) == 2:
                # domain like .co.uk
                debug(" country-code second level domain %s", domain)
                return False

    # Ignore one leading dot when looking for an embedded dot.
    if domain.startswith("."):
        undotted = domain[1:]
    else:
        undotted = domain
    if "." not in undotted and domain != ".local":
        debug(" non-local domain %s contains no embedded dot",
              domain)
        return False

    if cookie.version == 0:
        acceptable = (erhn.endswith(domain) or
                      erhn.startswith(".") or
                      ("." + erhn).endswith(domain))
        if not acceptable:
            debug(" effective request-host %s (even with added "
                  "initial dot) does not end end with %s",
                  erhn, domain)
            return False

    want_rfc2965_match = (cookie.version > 0 or
                          (self.strict_ns_domain & self.DomainRFC2965Match))
    if want_rfc2965_match and not domain_match(erhn, domain):
        debug(" effective request-host %s does not domain-match "
              "%s", erhn, domain)
        return False

    want_no_dots = (cookie.version > 0 or
                    (self.strict_ns_domain & self.DomainStrictNoDots))
    if want_no_dots:
        # The part of the request host left after removing len(domain)
        # trailing characters.
        host_prefix = req_host[:-len(domain)]
        if "." in host_prefix and not IPV4_RE.search(req_host):
            debug(" host prefix %s for domain %s contains a dot",
                  host_prefix, domain)
            return False
    return True
1018
def set_ok_port(self, cookie, request):
    """Return false if an explicit cookie port list excludes the request port.

    Only cookies with port_specified set are checked.  The cookie's port
    value is split on commas; a non-numeric entry rejects the cookie, as
    does a list that does not contain the request port (taken to be "80"
    when request_port() returns None).

    """
    if not cookie.port_specified:
        return True

    req_port = request_port(request)
    if req_port is None:
        req_port = "80"
    else:
        req_port = str(req_port)

    matched = False
    for candidate in cookie.port.split(","):
        try:
            int(candidate)
        except ValueError:
            debug(" bad port %s (not numeric)", candidate)
            return False
        if candidate == req_port:
            matched = True
            break
    if not matched:
        debug(" request port (%s) not found in %s",
              req_port, cookie.port)
        return False
    return True
1039
def return_ok(self, cookie, request):
    """
    Return true if every return_ok_* check accepts the cookie.

    If you override .return_ok(), be sure to call this method.  If it
    returns false, so should your subclass (assuming your subclass wants to
    be more strict about which cookies to return).

    """
    # Path has already been checked by .path_return_ok(), and domain
    # blocking done by .domain_return_ok().
    debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    checks = ("version", "verifiability", "secure", "expires", "port",
              "domain")
    for suffix in checks:
        # Look the check up on self so that overridden checks are used.
        check = getattr(self, "return_ok_" + suffix)
        if not check(cookie, request):
            return False
    return True
1057
def return_ok_version(self, cookie, request):
    """Return false if the cookie's protocol version is switched off.

    Cookies with version > 0 need self.rfc2965 to be true; version 0
    cookies need self.netscape to be true.

    """
    if cookie.version > 0:
        if not self.rfc2965:
            debug(" RFC 2965 cookies are switched off")
            return False
    elif cookie.version == 0 and not self.netscape:
        debug(" Netscape cookies are switched off")
        return False
    return True
1066
def return_ok_verifiability(self, cookie, request):
    """Return false for a third-party cookie in an unverifiable transaction.

    The cookie is only withheld when the request is both unverifiable and
    third-party, and the strictness flag matching the cookie's version
    (strict_rfc2965_unverifiable for version > 0, strict_ns_unverifiable
    for version 0) is set on this policy.

    """
    # Nothing to enforce unless the request is unverifiable AND third-party.
    if not request.is_unverifiable():
        return True
    if not is_third_party(request):
        return True

    if cookie.version > 0:
        if self.strict_rfc2965_unverifiable:
            debug(" third-party RFC 2965 cookie during unverifiable "
                  "transaction")
            return False
    elif cookie.version == 0 and self.strict_ns_unverifiable:
        debug(" third-party Netscape cookie during unverifiable "
              "transaction")
        return False
    return True
1078
def return_ok_secure(self, cookie, request):
    """Return false for a secure cookie unless the request type is "https"."""
    if not cookie.secure:
        return True
    if request.get_type() == "https":
        return True
    debug(" secure cookie with non-secure request")
    return False
1084
def return_ok_expires(self, cookie, request):
    """Return false if the cookie reports itself expired at time self._now."""
    expired = cookie.is_expired(self._now)
    if not expired:
        return True
    debug(" cookie expired")
    return False
1090
def return_ok_port(self, cookie, request):
    """Return false if the cookie has a port list that excludes the request port.

    A cookie with a false port value is always accepted.  Otherwise the
    request port (taken to be "80" when request_port() returns None) must
    equal one of the comma-separated entries of cookie.port.

    """
    if not cookie.port:
        return True

    req_port = request_port(request)
    if req_port is None:
        req_port = "80"
    if req_port in cookie.port.split(","):
        return True
    debug(" request port %s does not match cookie port %s",
          req_port, cookie.port)
    return False
1104
def return_ok_domain(self, cookie, request):
    """Return false if the cookie's domain does not match the request host.

    Version 0 cookies must have a domain that ("." + effective request
    host) ends with; if DomainStrictNonDomain is set in strict_ns_domain,
    a version 0 cookie whose domain was not explicitly specified must
    additionally equal the effective request host exactly.  Cookies with
    version > 0 must pass domain_match().

    """
    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    if cookie.version == 0:
        # strict check of non-domain cookies: Mozilla does this, MSIE5
        # doesn't
        if ((self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            debug(" cookie with unspecified domain does not string-compare "
                  "equal to request domain")
            return False
        if not ("." + erhn).endswith(domain):
            debug(" request-host %s does not match Netscape cookie domain "
                  "%s", req_host, domain)
            return False
    elif cookie.version > 0:
        if not domain_match(erhn, domain):
            debug(" effective request-host name %s does not domain-match "
                  "RFC 2965 cookie domain %s", erhn, domain)
            return False
    return True
1126
def domain_return_ok(self, domain, request):
    """Return false if no cookie stored under domain could be returned.

    This is a liberal check, here as an optimization to avoid having to
    load lots of MSIE cookie files unless necessary: the request host or
    the effective request host (each given a leading dot if it lacks one)
    must end with domain, and domain must pass the user block-list and
    allow-list.

    """
    req_host, erhn = eff_request_host(request)

    dotted = []
    for host in (req_host, erhn):
        if not host.startswith("."):
            host = "." + host
        dotted.append(host)
    dotted_req_host, dotted_erhn = dotted

    if (not dotted_req_host.endswith(domain) and
        not dotted_erhn.endswith(domain)):
        return False

    if self.is_blocked(domain):
        debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        debug(" domain %s is not in user allow-list", domain)
        return False

    return True
1148
def path_return_ok(self, path, request):
    """Return true if the request path starts with the given cookie path."""
    debug("- checking cookie path=%s", path)
    req_path = request_path(request)
    if req_path.startswith(path):
        return True
    debug(" %s does not path-match %s", req_path, path)
    return False
1156
1157
def vals_sorted_by_key(adict):
    """Return a list of the values of adict, ordered by sorted key.

    Only adict.keys() and adict.get() are used.  sorted() is applied to
    the keys rather than calling .sort() on them, so keys() may return any
    iterable (not just a list), and the result is always a new list.

    """
    return [adict.get(key) for key in sorted(adict.keys())]
1162
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    A value is treated as a nested mapping if it has an .items attribute;
    such values are descended into rather than yielded themselves.

    """
    for value in vals_sorted_by_key(mapping):
        try:
            value.items
        except AttributeError:
            is_nested = False
        else:
            is_nested = True

        if is_nested:
            for leaf in deepvalues(value):
                yield leaf
        else:
            yield value
1178
1179
# Used as second parameter to dict.get() method, to distinguish absent
# dict key from one with a None value.
class Absent:
    """Marker meaning "key not present" (as opposed to a None value)."""
1183
class CookieJar:
    """Collection of HTTP cookies.

    You may not need to know about this class: try
    urllib2.build_opener(HTTPCookieProcessor).open(url).

    Cookies are stored in self._cookies, a nested dictionary keyed by
    domain, then path, then cookie name.  Every method here that acquires
    self._cookies_lock releases it in a ``finally`` clause, so an exception
    raised while the lock is held cannot leave the jar locked.

    """

    non_word_re = re.compile(r"\W")
    quote_re = re.compile(r"([\"\\])")
    strict_domain_re = re.compile(r"\.?[^.]*")
    domain_re = re.compile(r"[^.]*")
    dots_re = re.compile(r"^\.+")

    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"

    def __init__(self, policy=None):
        """Create an empty jar; policy defaults to a DefaultCookiePolicy."""
        if policy is None:
            policy = DefaultCookiePolicy()
        self._policy = policy

        self._cookies_lock = _threading.RLock()
        self._cookies = {}

    def set_policy(self, policy):
        """Replace the policy object consulted by this jar."""
        self._policy = policy

    def _cookies_for_domain(self, domain, request):
        """Return the cookies stored under domain that the policy allows.

        The policy's domain_return_ok(), path_return_ok() and return_ok()
        are consulted, in that order of nesting.

        """
        cookies = []
        if not self._policy.domain_return_ok(domain, request):
            return []
        debug("Checking %s for cookies to return", domain)
        cookies_by_path = self._cookies[domain]
        for path in cookies_by_path.keys():
            if not self._policy.path_return_ok(path, request):
                continue
            cookies_by_name = cookies_by_path[path]
            for cookie in cookies_by_name.values():
                if not self._policy.return_ok(cookie, request):
                    debug(" not returning cookie")
                    continue
                debug(" it's a match")
                cookies.append(cookie)
        return cookies

    def _cookies_for_request(self, request):
        """Return a list of cookies to be returned to server."""
        cookies = []
        for domain in self._cookies.keys():
            cookies.extend(self._cookies_for_domain(domain, request))
        return cookies

    def _cookie_attrs(self, cookies):
        """Return a list of cookie-attributes to be returned to server.

        like ['foo="bar"; $Path="/"', ...]

        The $Version attribute is also added when appropriate (currently only
        once per request).

        Note that the cookies list passed in is sorted in place.

        """
        # Add cookies in order of most specific (ie. longest) path first.
        # The sort is stable, so cookies with equally long paths keep their
        # relative order.
        cookies.sort(key=lambda cookie: len(cookie.path), reverse=True)

        version_set = False

        attrs = []
        for cookie in cookies:
            # set version of Cookie header
            # XXX
            # What should it be if multiple matching Set-Cookie headers have
            #  different versions themselves?
            # Answer: there is no answer; was supposed to be settled by
            #  RFC 2965 errata, but that may never appear...
            version = cookie.version
            if not version_set:
                version_set = True
                if version > 0:
                    attrs.append("$Version=%s" % version)

            # quote cookie value if necessary
            # (not for Netscape protocol, which already has any quotes
            #  intact, due to the poorly-specified Netscape Cookie: syntax)
            if ((cookie.value is not None) and
                self.non_word_re.search(cookie.value) and version > 0):
                value = self.quote_re.sub(r"\\\1", cookie.value)
            else:
                value = cookie.value

            # add cookie-attributes to be returned in Cookie header
            if cookie.value is None:
                attrs.append(cookie.name)
            else:
                attrs.append("%s=%s" % (cookie.name, value))
            if version > 0:
                if cookie.path_specified:
                    attrs.append('$Path="%s"' % cookie.path)
                if cookie.domain.startswith("."):
                    domain = cookie.domain
                    if (not cookie.domain_initial_dot and
                        domain.startswith(".")):
                        # The stored dot was added by us, not sent by the
                        # server: strip it again.
                        domain = domain[1:]
                    attrs.append('$Domain="%s"' % domain)
                if cookie.port is not None:
                    p = "$Port"
                    if cookie.port_specified:
                        p = p + ('="%s"' % cookie.port)
                    attrs.append(p)

        return attrs

    def add_cookie_header(self, request):
        """Add correct Cookie: header to request (urllib2.Request object).

        The Cookie2 header is also added unless policy.hide_cookie2 is true.

        Expired cookies are cleared from the jar afterwards.

        """
        debug("add_cookie_header")
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            cookies = self._cookies_for_request(request)

            attrs = self._cookie_attrs(cookies)
            if attrs:
                if not request.has_header("Cookie"):
                    request.add_unredirected_header(
                        "Cookie", "; ".join(attrs))

            # if necessary, advertise that we know RFC 2965
            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
                not request.has_header("Cookie2")):
                for cookie in cookies:
                    if cookie.version != 1:
                        request.add_unredirected_header(
                            "Cookie2", '$Version="1"')
                        break
        finally:
            # Release even if the policy or the request object raised.
            self._cookies_lock.release()

        self.clear_expired_cookies()

    def _normalized_cookie_tuples(self, attrs_set):
        """Return list of tuples containing normalised cookie information.

        attrs_set is the list of lists of key,value pairs extracted from
        the Set-Cookie or Set-Cookie2 headers.

        Tuples are name, value, standard, rest, where name and value are the
        cookie name and value, standard is a dictionary containing the standard
        cookie-attributes (discard, secure, version, expires or max-age,
        domain, path and port) and rest is a dictionary containing the rest of
        the cookie-attributes.

        Cookies with a missing value for a standard attribute that requires
        one, or with a missing or non-numeric max-age, are left out of the
        result.

        """
        cookie_tuples = []

        boolean_attrs = "discard", "secure"
        value_attrs = ("version",
                       "expires", "max-age",
                       "domain", "path", "port",
                       "comment", "commenturl")

        for cookie_attrs in attrs_set:
            name, value = cookie_attrs[0]

            # Build dictionary of standard cookie-attributes (standard) and
            # dictionary of other cookie-attributes (rest).

            # Note: expiry time is normalised to seconds since epoch.  V0
            # cookies should have the Expires cookie-attribute, and V1 cookies
            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
            # accept either (but prefer Max-Age).
            max_age_set = False

            bad_cookie = False

            standard = {}
            rest = {}
            for k, v in cookie_attrs[1:]:
                lc = k.lower()
                # don't lose case distinction for unknown fields
                if lc in value_attrs or lc in boolean_attrs:
                    k = lc
                if k in boolean_attrs and v is None:
                    # boolean cookie-attribute is present, but has no value
                    # (like "discard", rather than "port=80")
                    v = True
                if k in standard:
                    # only first value is significant
                    continue
                if k == "domain":
                    if v is None:
                        debug(" missing value for domain attribute")
                        bad_cookie = True
                        break
                    # RFC 2965 section 3.3.3
                    v = v.lower()
                if k == "expires":
                    if max_age_set:
                        # Prefer max-age to expires (like Mozilla)
                        continue
                    if v is None:
                        debug(" missing or invalid value for expires "
                              "attribute: treating as session cookie")
                        continue
                if k == "max-age":
                    max_age_set = True
                    try:
                        v = int(v)
                    except (ValueError, TypeError):
                        # TypeError covers a max-age attribute with no value
                        # at all (v is None).
                        debug(" missing or invalid (non-numeric) value for "
                              "max-age attribute")
                        bad_cookie = True
                        break
                    # convert RFC 2965 Max-Age to seconds since epoch
                    # XXX Strictly you're supposed to follow RFC 2616
                    #   age-calculation rules.  Remember that zero Max-Age is
                    #   a request to discard (old and new) cookie, though.
                    k = "expires"
                    v = self._now + v
                if (k in value_attrs) or (k in boolean_attrs):
                    if (v is None and
                        k not in ("port", "comment", "commenturl")):
                        debug(" missing value for %s attribute", k)
                        bad_cookie = True
                        break
                    standard[k] = v
                else:
                    rest[k] = v

            if bad_cookie:
                continue

            cookie_tuples.append((name, value, standard, rest))

        return cookie_tuples

    def _cookie_from_cookie_tuple(self, tup, request):
        """Return a Cookie built from a normalised tuple, or None.

        tup is a (name, value, standard, rest) tuple as produced by
        _normalized_cookie_tuples().  Missing path, domain and port
        cookie-attributes are defaulted from request.

        If the expiry time is not later than self._now, any cookie stored
        under the same domain, path and name is cleared and None is
        returned.

        """
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set the easy defaults
        version = standard.get("version", None)
        if version is not None: version = int(version)
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            path_specified = False
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i+1]
            if len(path) == 0: path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            req_host, erhn = eff_request_host(request)
            domain = erhn
        elif not domain.startswith("."):
            domain = "."+domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present.  Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            expires = None
            discard = True
        elif expires <= self._now:
            # Expiry date in past is request to delete cookie.  This can't be
            # in DefaultCookiePolicy, because can't delete cookies there.
            try:
                self.clear(domain, path, name)
            except KeyError:
                pass
            debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                  domain, path, name)
            return None

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)

    def _cookies_from_attrs_set(self, attrs_set, request):
        """Return the Cookie objects described by attrs_set.

        Entries for which _cookie_from_cookie_tuple() returns a false
        value are dropped.

        """
        cookie_tuples = self._normalized_cookie_tuples(attrs_set)

        cookies = []
        for tup in cookie_tuples:
            cookie = self._cookie_from_cookie_tuple(tup, request)
            if cookie: cookies.append(cookie)
        return cookies

    def make_cookies(self, response, request):
        """Return sequence of Cookie objects extracted from response object."""
        # get cookie-attributes for RFC 2965 and Netscape protocols
        headers = response.info()
        rfc2965_hdrs = headers.getheaders("Set-Cookie2")
        ns_hdrs = headers.getheaders("Set-Cookie")

        rfc2965 = self._policy.rfc2965
        netscape = self._policy.netscape

        if ((not rfc2965_hdrs and not ns_hdrs) or
            (not ns_hdrs and not rfc2965) or
            (not rfc2965_hdrs and not netscape) or
            (not netscape and not rfc2965)):
            return []  # no relevant cookie headers: quick exit

        # The catch-all handlers below are deliberate: they guard against
        # input that is bad in unexpected ways, and
        # reraise_unmasked_exceptions() re-raises the exceptions that must
        # not be trapped.
        try:
            cookies = self._cookies_from_attrs_set(
                split_header_words(rfc2965_hdrs), request)
        except:
            reraise_unmasked_exceptions()
            cookies = []

        if ns_hdrs and netscape:
            try:
                ns_cookies = self._cookies_from_attrs_set(
                    parse_ns_headers(ns_hdrs), request)
            except:
                reraise_unmasked_exceptions()
                ns_cookies = []

            # Look for Netscape cookies (from Set-Cookie headers) that match
            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
            # For each match, keep the RFC 2965 cookie and ignore the Netscape
            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
            # bundled in with the Netscape cookies for this purpose, which is
            # reasonable behaviour.
            if rfc2965:
                lookup = {}
                for cookie in cookies:
                    lookup[(cookie.domain, cookie.path, cookie.name)] = None

                ns_cookies = [
                    ns_cookie for ns_cookie in ns_cookies
                    if (ns_cookie.domain, ns_cookie.path,
                        ns_cookie.name) not in lookup]

            if ns_cookies:
                cookies.extend(ns_cookies)

        return cookies

    def set_cookie_if_ok(self, cookie, request):
        """Set a cookie if policy says it's OK to do so."""
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            if self._policy.set_ok(cookie, request):
                self.set_cookie(cookie)
        finally:
            # Release even if the policy raised.
            self._cookies_lock.release()

    def set_cookie(self, cookie):
        """Set a cookie, without checking whether or not it should be set."""
        c = self._cookies
        self._cookies_lock.acquire()
        try:
            if cookie.domain not in c: c[cookie.domain] = {}
            c2 = c[cookie.domain]
            if cookie.path not in c2: c2[cookie.path] = {}
            c3 = c2[cookie.path]
            c3[cookie.name] = cookie
        finally:
            self._cookies_lock.release()

    def extract_cookies(self, response, request):
        """Extract cookies from response, where allowable given the request."""
        debug("extract_cookies: %s", response.info())
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            for cookie in self.make_cookies(response, request):
                if self._policy.set_ok(cookie, request):
                    debug(" setting cookie: %s", cookie)
                    self.set_cookie(cookie)
        finally:
            # Release even if header parsing or the policy raised.
            self._cookies_lock.release()

    def clear(self, domain=None, path=None, name=None):
        """Clear some cookies.

        Invoking this method without arguments will clear all cookies.  If
        given a single argument, only cookies belonging to that domain will be
        removed.  If given two arguments, cookies belonging to the specified
        path within that domain are removed.  If given three arguments, then
        the cookie with the specified name, path and domain is removed.

        Raises KeyError if no matching cookie exists.

        Raises ValueError if name is given without domain and path, or path
        is given without domain.

        """
        if name is not None:
            if (domain is None) or (path is None):
                raise ValueError(
                    "domain and path must be given to remove a cookie by name")
            del self._cookies[domain][path][name]
        elif path is not None:
            if domain is None:
                raise ValueError(
                    "domain must be given to remove cookies by path")
            del self._cookies[domain][path]
        elif domain is not None:
            del self._cookies[domain]
        else:
            self._cookies = {}

    def clear_session_cookies(self):
        """Discard all session cookies.

        Note that the .save() method won't save session cookies anyway, unless
        you ask otherwise by passing a true ignore_discard argument.

        """
        self._cookies_lock.acquire()
        try:
            for cookie in self:
                if cookie.discard:
                    self.clear(cookie.domain, cookie.path, cookie.name)
        finally:
            self._cookies_lock.release()

    def clear_expired_cookies(self):
        """Discard all expired cookies.

        You probably don't need to call this method: expired cookies are never
        sent back to the server (provided you're using DefaultCookiePolicy),
        this method is called by CookieJar itself every so often, and the
        .save() method won't save expired cookies anyway (unless you ask
        otherwise by passing a true ignore_expires argument).

        """
        self._cookies_lock.acquire()
        try:
            now = time.time()
            for cookie in self:
                if cookie.is_expired(now):
                    self.clear(cookie.domain, cookie.path, cookie.name)
        finally:
            self._cookies_lock.release()

    def __iter__(self):
        """Iterate over the stored cookies (see deepvalues())."""
        return deepvalues(self._cookies)

    def __len__(self):
        """Return number of contained cookies."""
        i = 0
        for cookie in self: i = i + 1
        return i

    def __repr__(self):
        r = []
        for cookie in self: r.append(repr(cookie))
        return "<%s[%s]>" % (self.__class__, ", ".join(r))

    def __str__(self):
        r = []
        for cookie in self: r.append(str(cookie))
        return "<%s[%s]>" % (self.__class__, ", ".join(r))
1683
1684
# derives from IOError for backwards-compatibility with Python 2.4.0
class LoadError(IOError):
    """Error raised when cookies cannot be loaded from a file."""
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001687
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        Raises ValueError if filename is given but cannot be concatenated
        with a string.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            # Duck-type check: anything that supports + "" is accepted.
            try:
                filename+""
            except:
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented here: always raises NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  The opened file is handed to self._really_load(), which is
        not defined in this class, and is closed afterwards whether or not
        loading succeeded.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename)
        try:
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        self._cookies_lock.acquire()
        try:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                # Put the previous cookies back before propagating.
                self._cookies = old_state
                raise
        finally:
            # Must run on the failure path too, otherwise a failed revert
            # would leave the jar locked.
            self._cookies_lock.release()
1745
1746from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
1747from _MozillaCookieJar import MozillaCookieJar