"""HTTP cookie handling for web clients.

This module has (now fairly distant) origins in Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

Class diagram (note that the classes which do not derive from
FileCookieJar are not distributed with the Python standard library, but
are available from http://wwwsearch.sf.net/):

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

"""

import sys, re, urlparse, copy, time, urllib, logging
try:
    import threading as _threading
except ImportError:
    import dummy_threading as _threading
import httplib  # only for the default HTTP port
from calendar import timegm

debug = logging.getLogger("cookielib").debug

DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")

def reraise_unmasked_exceptions(unmasked=()):
    # There are a few catch-all except: statements in this module, for
    # catching input that's bad in unexpected ways.
    # This function re-raises some exceptions we don't want to trap.
    unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError)
    etype = sys.exc_info()[0]
    if issubclass(etype, unmasked):
        raise
    # swallowed an exception
    import warnings, traceback, StringIO
    f = StringIO.StringIO()
    traceback.print_exc(None, f)
    msg = f.getvalue()
    warnings.warn("cookielib bug!\n%s" % msg, stacklevel=2)

57
58# Date/time conversion
59# -----------------------------------------------------------------------------
60
61EPOCH_YEAR = 1970
62def _timegm(tt):
63 year, month, mday, hour, min, sec = tt[:6]
64 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
65 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
66 return timegm(tt)
67 else:
68 return None
69
70DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
71MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
72 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
73MONTHS_LOWER = []
74for month in MONTHS: MONTHS_LOWER.append(month.lower())
75
76def time2isoz(t=None):
77 """Return a string representing time in seconds since epoch, t.
78
79 If the function is called without an argument, it will use the current
80 time.
81
82 The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
83 representing Universal Time (UTC, aka GMT). An example of this format is:
84
85 1994-11-24 08:49:37Z
86
87 """
88 if t is None: t = time.time()
89 year, mon, mday, hour, min, sec = time.gmtime(t)[:6]
90 return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
91 year, mon, mday, hour, min, sec)
92
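# A minimal usage sketch for time2isoz (illustrative only):
#
#     time2isoz(0)  ->  '1970-01-01 00:00:00Z'
#     time2isoz()   ->  the current UTC time in the same format
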
93def time2netscape(t=None):
94 """Return a string representing time in seconds since epoch, t.
95
96 If the function is called without an argument, it will use the current
97 time.
98
99 The format of the returned string is like this:
100
101 Wed, DD-Mon-YYYY HH:MM:SS GMT
102
103 """
104 if t is None: t = time.time()
105 year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7]
106 return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % (
107 DAYS[wday], mday, MONTHS[mon-1], year, hour, min, sec)
108
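# A minimal usage sketch for time2netscape (illustrative only); note that the
# format string above does not put a comma after the weekday:
#
#     time2netscape(0)  ->  'Thu 01-Jan-1970 00:00:00 GMT'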
109
110UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}
111
112TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
113def offset_from_tz_string(tz):
114 offset = None
115 if tz in UTC_ZONES:
116 offset = 0
117 else:
118 m = TIMEZONE_RE.search(tz)
119 if m:
120 offset = 3600 * int(m.group(2))
121 if m.group(3):
122 offset = offset + 60 * int(m.group(3))
123 if m.group(1) == '-':
124 offset = -offset
125 return offset
126
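# A minimal usage sketch for offset_from_tz_string (illustrative only); the
# return value is an offset in seconds, or None for an unrecognised name:
#
#     offset_from_tz_string("GMT")    ->  0
#     offset_from_tz_string("-0500")  ->  -18000
#     offset_from_tz_string("EST")    ->  None
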
127def _str2time(day, mon, yr, hr, min, sec, tz):
128 # translate month name to number
129 # month numbers start with 1 (January)
130 try:
131 mon = MONTHS_LOWER.index(mon.lower())+1
132 except ValueError:
133 # maybe it's already a number
134 try:
135 imon = int(mon)
136 except ValueError:
137 return None
138 if 1 <= imon <= 12:
139 mon = imon
140 else:
141 return None
142
143 # make sure clock elements are defined
144 if hr is None: hr = 0
145 if min is None: min = 0
146 if sec is None: sec = 0
147
148 yr = int(yr)
149 day = int(day)
150 hr = int(hr)
151 min = int(min)
152 sec = int(sec)
153
154 if yr < 1000:
155 # find "obvious" year
156 cur_yr = time.localtime(time.time())[0]
157 m = cur_yr % 100
158 tmp = yr
159 yr = yr + cur_yr - m
160 m = m - tmp
161 if abs(m) > 50:
162 if m > 0: yr = yr + 100
163 else: yr = yr - 100
164
165 # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
166 t = _timegm((yr, mon, day, hr, min, sec, tz))
167
168 if t is not None:
169 # adjust time using timezone string, to get absolute time since epoch
170 if tz is None:
171 tz = "UTC"
172 tz = tz.upper()
173 offset = offset_from_tz_string(tz)
174 if offset is None:
175 return None
176 t = t - offset
177
178 return t
179
180STRICT_DATE_RE = re.compile(
181 r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
182 "(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
183WEEKDAY_RE = re.compile(
184 r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
185LOOSE_HTTP_DATE_RE = re.compile(
186 r"""^
187 (\d\d?) # day
188 (?:\s+|[-\/])
189 (\w+) # month
190 (?:\s+|[-\/])
191 (\d+) # year
192 (?:
193 (?:\s+|:) # separator before clock
194 (\d\d?):(\d\d) # hour:min
195 (?::(\d\d))? # optional seconds
196 )? # optional clock
197 \s*
198 ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
199 \s*
200 (?:\(\w+\))? # ASCII representation of timezone in parens.
201 \s*$""", re.X)
202def http2time(text):
203 """Returns time in seconds since epoch of time represented by a string.
204
205 Return value is an integer.
206
    None is returned if the format of the string is unrecognized, the time is outside
208 the representable range, or the timezone string is not recognized. If the
209 string contains no timezone, UTC is assumed.
210
211 The timezone in the string may be numerical (like "-0800" or "+0100") or a
212 string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the
213 timezone strings equivalent to UTC (zero offset) are known to the function.
214
215 The function loosely parses the following formats:
216
217 Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format
218 Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format
219 Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format
220 09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday)
221 08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday)
222 08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday)
223
224 The parser ignores leading and trailing whitespace. The time may be
225 absent.
226
227 If the year is given with only 2 digits, the function will select the
228 century that makes the year closest to the current date.
229
230 """
231 # fast exit for strictly conforming string
232 m = STRICT_DATE_RE.search(text)
233 if m:
234 g = m.groups()
235 mon = MONTHS_LOWER.index(g[1].lower()) + 1
236 tt = (int(g[2]), mon, int(g[0]),
237 int(g[3]), int(g[4]), float(g[5]))
238 return _timegm(tt)
239
240 # No, we need some messy parsing...
241
242 # clean up
243 text = text.lstrip()
244 text = WEEKDAY_RE.sub("", text, 1) # Useless weekday
245
246 # tz is time zone specifier string
247 day, mon, yr, hr, min, sec, tz = [None]*7
248
249 # loose regexp parse
250 m = LOOSE_HTTP_DATE_RE.search(text)
251 if m is not None:
252 day, mon, yr, hr, min, sec, tz = m.groups()
253 else:
254 return None # bad format
255
256 return _str2time(day, mon, yr, hr, min, sec, tz)
257
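# A minimal usage sketch for http2time (illustrative only); the return value
# is in seconds since the epoch, and a missing timezone is taken to be UTC:
#
#     http2time("Wed, 09 Feb 1994 22:23:32 GMT")  ->  760832612
#     http2time("09-Feb-1994 22:23:32 GMT")       ->  760832612
#     http2time("not a date")                     ->  None
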
258ISO_DATE_RE = re.compile(
259 """^
260 (\d{4}) # year
261 [-\/]?
262 (\d\d?) # numerical month
263 [-\/]?
264 (\d\d?) # day
265 (?:
266 (?:\s+|[-:Tt]) # separator before clock
267 (\d\d?):?(\d\d) # hour:min
268 (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional)
269 )? # optional clock
270 \s*
271 ([-+]?\d\d?:?(:?\d\d)?
272 |Z|z)? # timezone (Z is "zero meridian", i.e. GMT)
273 \s*$""", re.X)
274def iso2time(text):
275 """
276 As for http2time, but parses the ISO 8601 formats:
277
278 1994-02-03 14:15:29 -0100 -- ISO 8601 format
279 1994-02-03 14:15:29 -- zone is optional
280 1994-02-03 -- only date
281 1994-02-03T14:15:29 -- Use T as separator
282 19940203T141529Z -- ISO 8601 compact format
283 19940203 -- only date
284
285 """
286 # clean up
287 text = text.lstrip()
288
289 # tz is time zone specifier string
290 day, mon, yr, hr, min, sec, tz = [None]*7
291
292 # loose regexp parse
293 m = ISO_DATE_RE.search(text)
294 if m is not None:
295 # XXX there's an extra bit of the timezone I'm ignoring here: is
296 # this the right thing to do?
297 yr, mon, day, hr, min, sec, tz, _ = m.groups()
298 else:
299 return None # bad format
300
301 return _str2time(day, mon, yr, hr, min, sec, tz)
302
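# A minimal usage sketch for iso2time (illustrative only):
#
#     iso2time("1994-02-03 14:15:29")  ->  760284929  (no zone, so UTC assumed)
#     iso2time("19940203T141529Z")     ->  760284929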
303
304# Header parsing
305# -----------------------------------------------------------------------------
306
307def unmatched(match):
308 """Return unmatched part of re.Match object."""
309 start, end = match.span(0)
310 return match.string[:start]+match.string[end:]
311
312HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)")
313HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
314HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)")
315HEADER_ESCAPE_RE = re.compile(r"\\(.)")
316def split_header_words(header_values):
317 r"""Parse header values into a list of lists containing key,value pairs.
318
319 The function knows how to deal with ",", ";" and "=" as well as quoted
320 values after "=". A list of space separated tokens are parsed as if they
321 were separated by ";".
322
323 If the header_values passed as argument contains multiple values, then they
324 are treated as if they were a single value separated by comma ",".
325
326 This means that this function is useful for parsing header fields that
327 follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
328 the requirement for tokens).
329
330 headers = #header
331 header = (token | parameter) *( [";"] (token | parameter))
332
333 token = 1*<any CHAR except CTLs or separators>
334 separators = "(" | ")" | "<" | ">" | "@"
335 | "," | ";" | ":" | "\" | <">
336 | "/" | "[" | "]" | "?" | "="
337 | "{" | "}" | SP | HT
338
339 quoted-string = ( <"> *(qdtext | quoted-pair ) <"> )
340 qdtext = <any TEXT except <">>
341 quoted-pair = "\" CHAR
342
343 parameter = attribute "=" value
344 attribute = token
345 value = token | quoted-string
346
347 Each header is represented by a list of key/value pairs. The value for a
348 simple token (not part of a parameter) is None. Syntactically incorrect
349 headers will not necessarily be parsed as you would want.
350
351 This is easier to describe with some examples:
352
353 >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
354 [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
355 >>> split_header_words(['text/html; charset="iso-8859-1"'])
356 [[('text/html', None), ('charset', 'iso-8859-1')]]
357 >>> split_header_words([r'Basic realm="\"foo\bar\""'])
358 [[('Basic', None), ('realm', '"foobar"')]]
359
360 """
    assert not isinstance(header_values, basestring)
    result = []
363 for text in header_values:
364 orig_text = text
365 pairs = []
366 while text:
367 m = HEADER_TOKEN_RE.search(text)
368 if m:
369 text = unmatched(m)
370 name = m.group(1)
371 m = HEADER_QUOTED_VALUE_RE.search(text)
372 if m: # quoted value
373 text = unmatched(m)
374 value = m.group(1)
375 value = HEADER_ESCAPE_RE.sub(r"\1", value)
376 else:
377 m = HEADER_VALUE_RE.search(text)
378 if m: # unquoted value
379 text = unmatched(m)
380 value = m.group(1)
381 value = value.rstrip()
382 else:
383 # no value, a lone token
384 value = None
385 pairs.append((name, value))
386 elif text.lstrip().startswith(","):
387 # concatenated headers, as per RFC 2616 section 4.2
388 text = text.lstrip()[1:]
389 if pairs: result.append(pairs)
390 pairs = []
391 else:
392 # skip junk
393 non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
394 assert nr_junk_chars > 0, (
395 "split_header_words bug: '%s', '%s', %s" %
396 (orig_text, text, pairs))
397 text = non_junk
398 if pairs: result.append(pairs)
399 return result
400
401HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
402def join_header_words(lists):
403 """Do the inverse (almost) of the conversion done by split_header_words.
404
405 Takes a list of lists of (key, value) pairs and produces a single header
406 value. Attribute values are quoted if needed.
407
408 >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
409 'text/plain; charset="iso-8859/1"'
410 >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
411 'text/plain, charset="iso-8859/1"'
412
413 """
414 headers = []
415 for pairs in lists:
416 attr = []
417 for k, v in pairs:
418 if v is not None:
419 if not re.search(r"^\w+$", v):
420 v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v) # escape " and \
421 v = '"%s"' % v
422 k = "%s=%s" % (k, v)
423 attr.append(k)
424 if attr: headers.append("; ".join(attr))
425 return ", ".join(headers)
426
427def parse_ns_headers(ns_headers):
428 """Ad-hoc parser for Netscape protocol cookie-attributes.
429
430 The old Netscape cookie format for Set-Cookie can for instance contain
431 an unquoted "," in the expires field, so we have to use this ad-hoc
432 parser instead of split_header_words.
433
434 XXX This may not make the best possible effort to parse all the crap
435 that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient
436 parser is probably better, so could do worse than following that if
437 this ever gives any trouble.
438
439 Currently, this is also used for parsing RFC 2109 cookies.
440
441 """
442 known_attrs = ("expires", "domain", "path", "secure",
443 # RFC 2109 attrs (may turn up in Netscape cookies, too)
444 "port", "max-age")
445
446 result = []
447 for ns_header in ns_headers:
448 pairs = []
449 version_set = False
450 for param in re.split(r";\s*", ns_header):
451 param = param.rstrip()
452 if param == "": continue
453 if "=" not in param:
454 if param.lower() in known_attrs:
455 k, v = param, None
456 else:
457 # cookie with missing value
458 k, v = param, None
459 else:
460 k, v = re.split(r"\s*=\s*", param, 1)
461 k = k.lstrip()
462 if k is not None:
463 lc = k.lower()
464 if lc in known_attrs:
465 k = lc
466 if k == "version":
467 # This is an RFC 2109 cookie. Will be treated as RFC 2965
468 # cookie in rest of code.
469 # Probably it should be parsed with split_header_words, but
470 # that's too much hassle.
471 version_set = True
472 if k == "expires":
473 # convert expires date to seconds since epoch
474 if v.startswith('"'): v = v[1:]
475 if v.endswith('"'): v = v[:-1]
476 v = http2time(v) # None if invalid
477 pairs.append((k, v))
478
479 if pairs:
480 if not version_set:
481 pairs.append(("version", "0"))
482 result.append(pairs)
483
484 return result
485
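# A minimal usage sketch for parse_ns_headers (illustrative only); the expires
# date is converted to seconds since the epoch, and a version attribute is
# always supplied:
#
#     parse_ns_headers(['foo=bar; path=/; expires=Wed, 09-Feb-1994 22:23:32 GMT'])
#     ->  [[('foo', 'bar'), ('path', '/'), ('expires', 760832612),
#           ('version', '0')]]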
486
487IPV4_RE = re.compile(r"\.\d+$")
488def is_HDN(text):
489 """Return True if text is a host domain name."""
490 # XXX
491 # This may well be wrong. Which RFC is HDN defined in, if any (for
492 # the purposes of RFC 2965)?
493 # For the current implementation, what about IPv6? Remember to look
494 # at other uses of IPV4_RE also, if change this.
495 if IPV4_RE.search(text):
496 return False
497 if text == "":
498 return False
499 if text[0] == "." or text[-1] == ".":
500 return False
501 return True
502
503def domain_match(A, B):
504 """Return True if domain A domain-matches domain B, according to RFC 2965.
505
506 A and B may be host domain names or IP addresses.
507
508 RFC 2965, section 1:
509
510 Host names can be specified either as an IP address or a HDN string.
511 Sometimes we compare one host name with another. (Such comparisons SHALL
512 be case-insensitive.) Host A's name domain-matches host B's if
513
514 * their host name strings string-compare equal; or
515
516 * A is a HDN string and has the form NB, where N is a non-empty
517 name string, B has the form .B', and B' is a HDN string. (So,
518 x.y.com domain-matches .Y.com but not Y.com.)
519
520 Note that domain-match is not a commutative operation: a.b.c.com
521 domain-matches .c.com, but not the reverse.
522
523 """
524 # Note that, if A or B are IP addresses, the only relevant part of the
525 # definition of the domain-match algorithm is the direct string-compare.
526 A = A.lower()
527 B = B.lower()
528 if A == B:
529 return True
530 if not is_HDN(A):
531 return False
532 i = A.rfind(B)
533 if i == -1 or i == 0:
534 # A does not have form NB, or N is the empty string
535 return False
536 if not B.startswith("."):
537 return False
538 if not is_HDN(B[1:]):
539 return False
540 return True
541
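# A minimal usage sketch for domain_match (illustrative only):
#
#     domain_match("x.y.com", ".y.com")           ->  True
#     domain_match("x.y.com", "y.com")            ->  False
#     domain_match("y.com", ".y.com")             ->  False
#     domain_match("192.168.1.1", "192.168.1.1")  ->  True  (IP addresses only
#                                                            match exactly)
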
542def liberal_is_HDN(text):
543 """Return True if text is a sort-of-like a host domain name.
544
545 For accepting/blocking domains.
546
547 """
548 if IPV4_RE.search(text):
549 return False
550 return True
551
552def user_domain_match(A, B):
553 """For blocking/accepting domains.
554
555 A and B may be host domain names or IP addresses.
556
557 """
558 A = A.lower()
559 B = B.lower()
560 if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
561 if A == B:
562 # equal IP addresses
563 return True
564 return False
565 initial_dot = B.startswith(".")
566 if initial_dot and A.endswith(B):
567 return True
568 if not initial_dot and A == B:
569 return True
570 return False
571
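# A minimal usage sketch for user_domain_match (illustrative only); this is
# the looser matching used for the block/allow lists:
#
#     user_domain_match("www.example.com", ".example.com")  ->  True
#     user_domain_match("www.example.com", "example.com")   ->  False
#     user_domain_match("example.com", "example.com")       ->  True
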
572cut_port_re = re.compile(r":\d+$")
573def request_host(request):
574 """Return request-host, as defined by RFC 2965.
575
576 Variation from RFC: returned value is lowercased, for convenient
577 comparison.
578
579 """
580 url = request.get_full_url()
581 host = urlparse.urlparse(url)[1]
582 if host == "":
583 host = request.get_header("Host", "")
584
585 # remove port, if present
586 host = cut_port_re.sub("", host, 1)
587 return host.lower()
588
589def eff_request_host(request):
590 """Return a tuple (request-host, effective request-host name).
591
592 As defined by RFC 2965, except both are lowercased.
593
594 """
595 erhn = req_host = request_host(request)
596 if req_host.find(".") == -1 and not IPV4_RE.search(req_host):
597 erhn = req_host + ".local"
598 return req_host, erhn
599
600def request_path(request):
601 """request-URI, as defined by RFC 2965."""
602 url = request.get_full_url()
603 #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(url)
604 #req_path = escape_path("".join(urlparse.urlparse(url)[2:]))
605 path, parameters, query, frag = urlparse.urlparse(url)[2:]
606 if parameters:
607 path = "%s;%s" % (path, parameters)
608 path = escape_path(path)
609 req_path = urlparse.urlunparse(("", "", path, "", query, frag))
610 if not req_path.startswith("/"):
611 # fix bad RFC 2396 absoluteURI
612 req_path = "/"+req_path
613 return req_path
614
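# A minimal usage sketch for request_path (illustrative only; the URL is made
# up), assuming a urllib2.Request as the request object:
#
#     req = urllib2.Request("http://www.example.com/docs/page.html?q=1")
#     request_path(req)  ->  '/docs/page.html?q=1'
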
615def request_port(request):
616 host = request.get_host()
617 i = host.find(':')
618 if i >= 0:
619 port = host[i+1:]
620 try:
621 int(port)
622 except ValueError:
623 debug("nonnumeric port: '%s'", port)
624 return None
625 else:
626 port = DEFAULT_HTTP_PORT
627 return port
628
629# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
630# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
631HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
632ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
633def uppercase_escaped_char(match):
634 return "%%%s" % match.group(1).upper()
635def escape_path(path):
636 """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
637 # There's no knowing what character encoding was used to create URLs
638 # containing %-escapes, but since we have to pick one to escape invalid
639 # path characters, we pick UTF-8, as recommended in the HTML 4.0
640 # specification:
641 # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
642 # And here, kind of: draft-fielding-uri-rfc2396bis-03
643 # (And in draft IRI specification: draft-duerst-iri-05)
644 # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, unicode):
        path = path.encode("utf-8")
647 path = urllib.quote(path, HTTP_PATH_SAFE)
648 path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path)
649 return path
650
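# A minimal usage sketch for escape_path (illustrative only):
#
#     escape_path("/foo bar/")  ->  '/foo%20bar/'
#     escape_path("/foo%aa/")   ->  '/foo%AA/'   (existing escapes are
#                                                 uppercased, not re-escaped)
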
651def reach(h):
652 """Return reach of host h, as defined by RFC 2965, section 1.
653
654 The reach R of a host name H is defined as follows:
655
656 * If
657
658 - H is the host domain name of a host; and,
659
660 - H has the form A.B; and
661
662 - A has no embedded (that is, interior) dots; and
663
664 - B has at least one embedded dot, or B is the string "local".
665 then the reach of H is .B.
666
667 * Otherwise, the reach of H is H.
668
669 >>> reach("www.acme.com")
670 '.acme.com'
671 >>> reach("acme.com")
672 'acme.com'
673 >>> reach("acme.local")
674 '.local'
675
676 """
677 i = h.find(".")
678 if i >= 0:
679 #a = h[:i] # this line is only here to show what a is
680 b = h[i+1:]
681 i = b.find(".")
682 if is_HDN(h) and (i >= 0 or b == "local"):
683 return "."+b
684 return h
685
686def is_third_party(request):
687 """
688
689 RFC 2965, section 3.3.6:
690
691 An unverifiable transaction is to a third-party host if its request-
692 host U does not domain-match the reach R of the request-host O in the
693 origin transaction.
694
695 """
696 req_host = request_host(request)
697 if not domain_match(req_host, reach(request.get_origin_req_host())):
698 return True
699 else:
700 return False
701
702
703class Cookie:
704 """HTTP Cookie.
705
706 This class represents both Netscape and RFC 2965 cookies.
707
708 This is deliberately a very simple class. It just holds attributes. It's
709 possible to construct Cookie instances that don't comply with the cookie
710 standards. CookieJar.make_cookies is the factory function for Cookie
711 objects -- it deals with cookie parsing, supplying defaults, and
712 normalising to the representation used in this class. CookiePolicy is
713 responsible for checking them to see whether they should be accepted from
714 and returned to the server.
715
716 Note that the port may be present in the headers, but unspecified ("Port"
717 rather than"Port=80", for example); if this is the case, port is None.
718
719 """
720
721 def __init__(self, version, name, value,
722 port, port_specified,
723 domain, domain_specified, domain_initial_dot,
724 path, path_specified,
725 secure,
726 expires,
727 discard,
728 comment,
729 comment_url,
730 rest):
731
732 if version is not None: version = int(version)
733 if expires is not None: expires = int(expires)
734 if port is None and port_specified is True:
735 raise ValueError("if port is None, port_specified must be false")
736
737 self.version = version
738 self.name = name
739 self.value = value
740 self.port = port
741 self.port_specified = port_specified
742 # normalise case, as per RFC 2965 section 3.3.3
743 self.domain = domain.lower()
744 self.domain_specified = domain_specified
745 # Sigh. We need to know whether the domain given in the
746 # cookie-attribute had an initial dot, in order to follow RFC 2965
747 # (as clarified in draft errata). Needed for the returned $Domain
748 # value.
749 self.domain_initial_dot = domain_initial_dot
750 self.path = path
751 self.path_specified = path_specified
752 self.secure = secure
753 self.expires = expires
754 self.discard = discard
755 self.comment = comment
756 self.comment_url = comment_url
757
758 self._rest = copy.copy(rest)
759
760 def has_nonstandard_attr(self, name):
761 return name in self._rest
762 def get_nonstandard_attr(self, name, default=None):
763 return self._rest.get(name, default)
764 def set_nonstandard_attr(self, name, value):
765 self._rest[name] = value
766
767 def is_expired(self, now=None):
768 if now is None: now = time.time()
769 if (self.expires is not None) and (self.expires <= now):
770 return True
771 return False
772
773 def __str__(self):
774 if self.port is None: p = ""
775 else: p = ":"+self.port
776 limit = self.domain + p + self.path
777 if self.value is not None:
778 namevalue = "%s=%s" % (self.name, self.value)
779 else:
780 namevalue = self.name
781 return "<Cookie %s for %s>" % (namevalue, limit)
782
783 def __repr__(self):
784 args = []
        for name in ("version", "name", "value",
                     "port", "port_specified",
                     "domain", "domain_specified", "domain_initial_dot",
                     "path", "path_specified",
                     "secure", "expires", "discard", "comment", "comment_url",
                     ):
            attr = getattr(self, name)
792 args.append("%s=%s" % (name, repr(attr)))
793 args.append("rest=%s" % repr(self._rest))
794 return "Cookie(%s)" % ", ".join(args)
795
796
797class CookiePolicy:
798 """Defines which cookies get accepted from and returned to server.
799
800 May also modify cookies, though this is probably a bad idea.
801
802 The subclass DefaultCookiePolicy defines the standard rules for Netscape
803 and RFC 2965 cookies -- override that if you want a customised policy.
804
805 """
806 def set_ok(self, cookie, request):
807 """Return true if (and only if) cookie should be accepted from server.
808
809 Currently, pre-expired cookies never get this far -- the CookieJar
810 class deletes such cookies itself.
811
812 """
813 raise NotImplementedError()
814
815 def return_ok(self, cookie, request):
816 """Return true if (and only if) cookie should be returned to server."""
817 raise NotImplementedError()
818
819 def domain_return_ok(self, domain, request):
820 """Return false if cookies should not be returned, given cookie domain.
821 """
822 return True
823
824 def path_return_ok(self, path, request):
825 """Return false if cookies should not be returned, given cookie path.
826 """
827 return True
828
829
830class DefaultCookiePolicy(CookiePolicy):
831 """Implements the standard rules for accepting and returning cookies."""
832
833 DomainStrictNoDots = 1
834 DomainStrictNonDomain = 2
835 DomainRFC2965Match = 4
836
837 DomainLiberal = 0
838 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
839
840 def __init__(self,
841 blocked_domains=None, allowed_domains=None,
842 netscape=True, rfc2965=False,
843 hide_cookie2=False,
844 strict_domain=False,
845 strict_rfc2965_unverifiable=True,
846 strict_ns_unverifiable=False,
847 strict_ns_domain=DomainLiberal,
848 strict_ns_set_initial_dollar=False,
849 strict_ns_set_path=False,
850 ):
851 """Constructor arguments should be passed as keyword arguments only."""
852 self.netscape = netscape
853 self.rfc2965 = rfc2965
854 self.hide_cookie2 = hide_cookie2
855 self.strict_domain = strict_domain
856 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
857 self.strict_ns_unverifiable = strict_ns_unverifiable
858 self.strict_ns_domain = strict_ns_domain
859 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
860 self.strict_ns_set_path = strict_ns_set_path
861
862 if blocked_domains is not None:
863 self._blocked_domains = tuple(blocked_domains)
864 else:
865 self._blocked_domains = ()
866
867 if allowed_domains is not None:
868 allowed_domains = tuple(allowed_domains)
869 self._allowed_domains = allowed_domains
870
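    # A minimal construction sketch (illustrative only; the blocked domain is
    # made up):
    #
    #     policy = DefaultCookiePolicy(
    #         rfc2965=True,
    #         blocked_domains=[".ads.example.com"],
    #         strict_ns_domain=DefaultCookiePolicy.DomainStrict)
    #     jar = CookieJar(policy)
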
871 def blocked_domains(self):
872 """Return the sequence of blocked domains (as a tuple)."""
873 return self._blocked_domains
874 def set_blocked_domains(self, blocked_domains):
875 """Set the sequence of blocked domains."""
876 self._blocked_domains = tuple(blocked_domains)
877
878 def is_blocked(self, domain):
879 for blocked_domain in self._blocked_domains:
880 if user_domain_match(domain, blocked_domain):
881 return True
882 return False
883
884 def allowed_domains(self):
885 """Return None, or the sequence of allowed domains (as a tuple)."""
886 return self._allowed_domains
887 def set_allowed_domains(self, allowed_domains):
888 """Set the sequence of allowed domains, or None."""
889 if allowed_domains is not None:
890 allowed_domains = tuple(allowed_domains)
891 self._allowed_domains = allowed_domains
892
893 def is_not_allowed(self, domain):
894 if self._allowed_domains is None:
895 return False
896 for allowed_domain in self._allowed_domains:
897 if user_domain_match(domain, allowed_domain):
898 return False
899 return True
900
901 def set_ok(self, cookie, request):
902 """
903 If you override .set_ok(), be sure to call this method. If it returns
904 false, so should your subclass (assuming your subclass wants to be more
905 strict about which cookies to accept).
906
907 """
908 debug(" - checking cookie %s=%s", cookie.name, cookie.value)
909
910 assert cookie.name is not None
911
912 for n in "version", "verifiability", "name", "path", "domain", "port":
913 fn_name = "set_ok_"+n
914 fn = getattr(self, fn_name)
915 if not fn(cookie, request):
916 return False
917
918 return True
919
920 def set_ok_version(self, cookie, request):
921 if cookie.version is None:
922 # Version is always set to 0 by parse_ns_headers if it's a Netscape
923 # cookie, so this must be an invalid RFC 2965 cookie.
924 debug(" Set-Cookie2 without version attribute (%s=%s)",
925 cookie.name, cookie.value)
926 return False
927 if cookie.version > 0 and not self.rfc2965:
928 debug(" RFC 2965 cookies are switched off")
929 return False
930 elif cookie.version == 0 and not self.netscape:
931 debug(" Netscape cookies are switched off")
932 return False
933 return True
934
935 def set_ok_verifiability(self, cookie, request):
936 if request.is_unverifiable() and is_third_party(request):
937 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
938 debug(" third-party RFC 2965 cookie during "
939 "unverifiable transaction")
940 return False
941 elif cookie.version == 0 and self.strict_ns_unverifiable:
942 debug(" third-party Netscape cookie during "
943 "unverifiable transaction")
944 return False
945 return True
946
947 def set_ok_name(self, cookie, request):
948 # Try and stop servers setting V0 cookies designed to hack other
949 # servers that know both V0 and V1 protocols.
950 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
951 cookie.name.startswith("$")):
952 debug(" illegal name (starts with '$'): '%s'", cookie.name)
953 return False
954 return True
955
956 def set_ok_path(self, cookie, request):
957 if cookie.path_specified:
958 req_path = request_path(request)
959 if ((cookie.version > 0 or
960 (cookie.version == 0 and self.strict_ns_set_path)) and
961 not req_path.startswith(cookie.path)):
962 debug(" path attribute %s is not a prefix of request "
963 "path %s", cookie.path, req_path)
964 return False
965 return True
966
967 def set_ok_domain(self, cookie, request):
968 if self.is_blocked(cookie.domain):
969 debug(" domain %s is in user block-list", cookie.domain)
970 return False
971 if self.is_not_allowed(cookie.domain):
972 debug(" domain %s is not in user allow-list", cookie.domain)
973 return False
974 if cookie.domain_specified:
975 req_host, erhn = eff_request_host(request)
976 domain = cookie.domain
977 if self.strict_domain and (domain.count(".") >= 2):
978 i = domain.rfind(".")
979 j = domain.rfind(".", 0, i)
980 if j == 0: # domain like .foo.bar
981 tld = domain[i+1:]
982 sld = domain[j+1:i]
                    if (sld.lower() in (
                        "co", "ac",
                        "com", "edu", "org", "net", "gov", "mil", "int") and
                        len(tld) == 2):
                        # domain like .co.uk
                        debug(" country-code second level domain %s", domain)
                        return False
990 if domain.startswith("."):
991 undotted_domain = domain[1:]
992 else:
993 undotted_domain = domain
994 embedded_dots = (undotted_domain.find(".") >= 0)
995 if not embedded_dots and domain != ".local":
996 debug(" non-local domain %s contains no embedded dot",
997 domain)
998 return False
999 if cookie.version == 0:
1000 if (not erhn.endswith(domain) and
1001 (not erhn.startswith(".") and
1002 not ("."+erhn).endswith(domain))):
1003 debug(" effective request-host %s (even with added "
1004 "initial dot) does not end end with %s",
1005 erhn, domain)
1006 return False
1007 if (cookie.version > 0 or
1008 (self.strict_ns_domain & self.DomainRFC2965Match)):
1009 if not domain_match(erhn, domain):
1010 debug(" effective request-host %s does not domain-match "
1011 "%s", erhn, domain)
1012 return False
1013 if (cookie.version > 0 or
1014 (self.strict_ns_domain & self.DomainStrictNoDots)):
1015 host_prefix = req_host[:-len(domain)]
1016 if (host_prefix.find(".") >= 0 and
1017 not IPV4_RE.search(req_host)):
1018 debug(" host prefix %s for domain %s contains a dot",
1019 host_prefix, domain)
1020 return False
1021 return True
1022
1023 def set_ok_port(self, cookie, request):
1024 if cookie.port_specified:
1025 req_port = request_port(request)
1026 if req_port is None:
1027 req_port = "80"
1028 else:
1029 req_port = str(req_port)
1030 for p in cookie.port.split(","):
1031 try:
1032 int(p)
1033 except ValueError:
1034 debug(" bad port %s (not numeric)", p)
1035 return False
1036 if p == req_port:
1037 break
1038 else:
1039 debug(" request port (%s) not found in %s",
1040 req_port, cookie.port)
1041 return False
1042 return True
1043
1044 def return_ok(self, cookie, request):
1045 """
1046 If you override .return_ok(), be sure to call this method. If it
1047 returns false, so should your subclass (assuming your subclass wants to
1048 be more strict about which cookies to return).
1049
1050 """
1051 # Path has already been checked by .path_return_ok(), and domain
1052 # blocking done by .domain_return_ok().
1053 debug(" - checking cookie %s=%s", cookie.name, cookie.value)
1054
1055 for n in "version", "verifiability", "secure", "expires", "port", "domain":
1056 fn_name = "return_ok_"+n
1057 fn = getattr(self, fn_name)
1058 if not fn(cookie, request):
1059 return False
1060 return True
1061
1062 def return_ok_version(self, cookie, request):
1063 if cookie.version > 0 and not self.rfc2965:
1064 debug(" RFC 2965 cookies are switched off")
1065 return False
1066 elif cookie.version == 0 and not self.netscape:
1067 debug(" Netscape cookies are switched off")
1068 return False
1069 return True
1070
1071 def return_ok_verifiability(self, cookie, request):
1072 if request.is_unverifiable() and is_third_party(request):
1073 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
1074 debug(" third-party RFC 2965 cookie during unverifiable "
1075 "transaction")
1076 return False
1077 elif cookie.version == 0 and self.strict_ns_unverifiable:
1078 debug(" third-party Netscape cookie during unverifiable "
1079 "transaction")
1080 return False
1081 return True
1082
1083 def return_ok_secure(self, cookie, request):
1084 if cookie.secure and request.get_type() != "https":
1085 debug(" secure cookie with non-secure request")
1086 return False
1087 return True
1088
1089 def return_ok_expires(self, cookie, request):
1090 if cookie.is_expired(self._now):
1091 debug(" cookie expired")
1092 return False
1093 return True
1094
1095 def return_ok_port(self, cookie, request):
1096 if cookie.port:
1097 req_port = request_port(request)
1098 if req_port is None:
1099 req_port = "80"
1100 for p in cookie.port.split(","):
1101 if p == req_port:
1102 break
1103 else:
1104 debug(" request port %s does not match cookie port %s",
1105 req_port, cookie.port)
1106 return False
1107 return True
1108
1109 def return_ok_domain(self, cookie, request):
1110 req_host, erhn = eff_request_host(request)
1111 domain = cookie.domain
1112
1113 # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
1114 if (cookie.version == 0 and
1115 (self.strict_ns_domain & self.DomainStrictNonDomain) and
1116 not cookie.domain_specified and domain != erhn):
1117 debug(" cookie with unspecified domain does not string-compare "
1118 "equal to request domain")
1119 return False
1120
1121 if cookie.version > 0 and not domain_match(erhn, domain):
1122 debug(" effective request-host name %s does not domain-match "
1123 "RFC 2965 cookie domain %s", erhn, domain)
1124 return False
1125 if cookie.version == 0 and not ("."+erhn).endswith(domain):
1126 debug(" request-host %s does not match Netscape cookie domain "
1127 "%s", req_host, domain)
1128 return False
1129 return True
1130
1131 def domain_return_ok(self, domain, request):
        # Liberal check of the domain.  This is here as an optimization to
        # avoid having to load lots of MSIE cookie files unless necessary.
1134 req_host, erhn = eff_request_host(request)
        if not req_host.startswith("."):
            req_host = "."+req_host
        if not erhn.startswith("."):
            erhn = "."+erhn
        if not (req_host.endswith(domain) or erhn.endswith(domain)):
            #debug(" request domain %s does not match cookie domain %s",
            #      req_host, domain)
            return False
1143
1144 if self.is_blocked(domain):
1145 debug(" domain %s is in user block-list", domain)
1146 return False
1147 if self.is_not_allowed(domain):
1148 debug(" domain %s is not in user allow-list", domain)
1149 return False
1150
1151 return True
1152
1153 def path_return_ok(self, path, request):
1154 debug("- checking cookie path=%s", path)
1155 req_path = request_path(request)
1156 if not req_path.startswith(path):
1157 debug(" %s does not path-match %s", req_path, path)
1158 return False
1159 return True
1160
1161
1162def vals_sorted_by_key(adict):
1163 keys = adict.keys()
1164 keys.sort()
1165 return map(adict.get, keys)
1166
1167def deepvalues(mapping):
1168 """Iterates over nested mapping, depth-first, in sorted order by key."""
1169 values = vals_sorted_by_key(mapping)
1170 for obj in values:
1171 mapping = False
1172 try:
1173 obj.items
1174 except AttributeError:
1175 pass
1176 else:
1177 mapping = True
1178 for subobj in deepvalues(obj):
1179 yield subobj
1180 if not mapping:
1181 yield obj
1182
1183
1184# Used as second parameter to dict.get() method, to distinguish absent
1185# dict key from one with a None value.
1186class Absent: pass
1187
1188class CookieJar:
1189 """Collection of HTTP cookies.
1190
1191 You may not need to know about this class: try
1192 urllib2.build_opener(HTTPCookieProcessor).open(url).
1193
1194 """
1195
1196 non_word_re = re.compile(r"\W")
1197 quote_re = re.compile(r"([\"\\])")
1198 strict_domain_re = re.compile(r"\.?[^.]*")
1199 domain_re = re.compile(r"[^.]*")
1200 dots_re = re.compile(r"^\.+")
1201
1202 magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
1203
1204 def __init__(self, policy=None):
1205 if policy is None:
1206 policy = DefaultCookiePolicy()
1207 self._policy = policy
1208
1209 self._cookies_lock = _threading.RLock()
1210 self._cookies = {}
1211
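    # A typical usage sketch with urllib2 (illustrative only; the URL is made
    # up):
    #
    #     import urllib2
    #     jar = CookieJar()
    #     opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    #     response = opener.open("http://www.example.com/")
    #     # cookies from the response are now stored in jar, and will be
    #     # added to later requests made through this opener
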
1212 def set_policy(self, policy):
1213 self._policy = policy
1214
1215 def _cookies_for_domain(self, domain, request):
1216 cookies = []
1217 if not self._policy.domain_return_ok(domain, request):
1218 return []
1219 debug("Checking %s for cookies to return", domain)
1220 cookies_by_path = self._cookies[domain]
1221 for path in cookies_by_path.keys():
1222 if not self._policy.path_return_ok(path, request):
1223 continue
1224 cookies_by_name = cookies_by_path[path]
1225 for cookie in cookies_by_name.values():
1226 if not self._policy.return_ok(cookie, request):
1227 debug(" not returning cookie")
1228 continue
1229 debug(" it's a match")
1230 cookies.append(cookie)
1231 return cookies
1232
1233 def _cookies_for_request(self, request):
1234 """Return a list of cookies to be returned to server."""
1235 cookies = []
1236 for domain in self._cookies.keys():
1237 cookies.extend(self._cookies_for_domain(domain, request))
1238 return cookies
1239
1240 def _cookie_attrs(self, cookies):
1241 """Return a list of cookie-attributes to be returned to server.
1242
1243 like ['foo="bar"; $Path="/"', ...]
1244
1245 The $Version attribute is also added when appropriate (currently only
1246 once per request).
1247
1248 """
1249 # add cookies in order of most specific (ie. longest) path first
1250 def decreasing_size(a, b): return cmp(len(b.path), len(a.path))
1251 cookies.sort(decreasing_size)
1252
1253 version_set = False
1254
1255 attrs = []
1256 for cookie in cookies:
1257 # set version of Cookie header
1258 # XXX
1259 # What should it be if multiple matching Set-Cookie headers have
1260 # different versions themselves?
1261 # Answer: there is no answer; was supposed to be settled by
1262 # RFC 2965 errata, but that may never appear...
1263 version = cookie.version
1264 if not version_set:
1265 version_set = True
1266 if version > 0:
1267 attrs.append("$Version=%s" % version)
1268
1269 # quote cookie value if necessary
1270 # (not for Netscape protocol, which already has any quotes
1271 # intact, due to the poorly-specified Netscape Cookie: syntax)
1272 if ((cookie.value is not None) and
1273 self.non_word_re.search(cookie.value) and version > 0):
1274 value = self.quote_re.sub(r"\\\1", cookie.value)
1275 else:
1276 value = cookie.value
1277
1278 # add cookie-attributes to be returned in Cookie header
1279 if cookie.value is None:
1280 attrs.append(cookie.name)
1281 else:
1282 attrs.append("%s=%s" % (cookie.name, value))
1283 if version > 0:
1284 if cookie.path_specified:
1285 attrs.append('$Path="%s"' % cookie.path)
1286 if cookie.domain.startswith("."):
1287 domain = cookie.domain
1288 if (not cookie.domain_initial_dot and
1289 domain.startswith(".")):
1290 domain = domain[1:]
1291 attrs.append('$Domain="%s"' % domain)
1292 if cookie.port is not None:
1293 p = "$Port"
1294 if cookie.port_specified:
1295 p = p + ('="%s"' % cookie.port)
1296 attrs.append(p)
1297
1298 return attrs
1299
1300 def add_cookie_header(self, request):
1301 """Add correct Cookie: header to request (urllib2.Request object).
1302
1303 The Cookie2 header is also added unless policy.hide_cookie2 is true.
1304
1305 """
1306 debug("add_cookie_header")
1307 self._cookies_lock.acquire()
1308
1309 self._policy._now = self._now = int(time.time())
1310
1311 req_host, erhn = eff_request_host(request)
1312 strict_non_domain = (
1313 self._policy.strict_ns_domain & self._policy.DomainStrictNonDomain)
1314
1315 cookies = self._cookies_for_request(request)
1316
1317 attrs = self._cookie_attrs(cookies)
1318 if attrs:
1319 if not request.has_header("Cookie"):
1320 request.add_unredirected_header(
1321 "Cookie", "; ".join(attrs))
1322
1323 # if necessary, advertise that we know RFC 2965
1324 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1325 not request.has_header("Cookie2")):
1326 for cookie in cookies:
1327 if cookie.version != 1:
1328 request.add_unredirected_header("Cookie2", '$Version="1"')
1329 break
1330
1331 self._cookies_lock.release()
1332
1333 self.clear_expired_cookies()
1334
1335 def _normalized_cookie_tuples(self, attrs_set):
1336 """Return list of tuples containing normalised cookie information.
1337
1338 attrs_set is the list of lists of key,value pairs extracted from
1339 the Set-Cookie or Set-Cookie2 headers.
1340
1341 Tuples are name, value, standard, rest, where name and value are the
1342 cookie name and value, standard is a dictionary containing the standard
1343 cookie-attributes (discard, secure, version, expires or max-age,
1344 domain, path and port) and rest is a dictionary containing the rest of
1345 the cookie-attributes.
1346
1347 """
1348 cookie_tuples = []
1349
1350 boolean_attrs = "discard", "secure"
1351 value_attrs = ("version",
1352 "expires", "max-age",
1353 "domain", "path", "port",
1354 "comment", "commenturl")
1355
1356 for cookie_attrs in attrs_set:
1357 name, value = cookie_attrs[0]
1358
1359 # Build dictionary of standard cookie-attributes (standard) and
1360 # dictionary of other cookie-attributes (rest).
1361
1362 # Note: expiry time is normalised to seconds since epoch. V0
1363 # cookies should have the Expires cookie-attribute, and V1 cookies
1364 # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1365 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1366 # accept either (but prefer Max-Age).
1367 max_age_set = False
1368
1369 bad_cookie = False
1370
1371 standard = {}
1372 rest = {}
1373 for k, v in cookie_attrs[1:]:
1374 lc = k.lower()
1375 # don't lose case distinction for unknown fields
1376 if lc in value_attrs or lc in boolean_attrs:
1377 k = lc
1378 if k in boolean_attrs and v is None:
1379 # boolean cookie-attribute is present, but has no value
1380 # (like "discard", rather than "port=80")
1381 v = True
1382 if k in standard:
1383 # only first value is significant
1384 continue
1385 if k == "domain":
1386 if v is None:
1387 debug(" missing value for domain attribute")
1388 bad_cookie = True
1389 break
1390 # RFC 2965 section 3.3.3
1391 v = v.lower()
1392 if k == "expires":
1393 if max_age_set:
1394 # Prefer max-age to expires (like Mozilla)
1395 continue
1396 if v is None:
1397 debug(" missing or invalid value for expires "
1398 "attribute: treating as session cookie")
1399 continue
1400 if k == "max-age":
1401 max_age_set = True
1402 try:
1403 v = int(v)
1404 except ValueError:
1405 debug(" missing or invalid (non-numeric) value for "
1406 "max-age attribute")
1407 bad_cookie = True
1408 break
1409 # convert RFC 2965 Max-Age to seconds since epoch
1410 # XXX Strictly you're supposed to follow RFC 2616
                    # age-calculation rules.  Remember that zero Max-Age is a
                    # request to discard (old and new) cookie, though.
1413 k = "expires"
1414 v = self._now + v
1415 if (k in value_attrs) or (k in boolean_attrs):
1416 if (v is None and
                        k not in ("port", "comment", "commenturl")):
                        debug(" missing value for %s attribute" % k)
1419 bad_cookie = True
1420 break
1421 standard[k] = v
1422 else:
1423 rest[k] = v
1424
1425 if bad_cookie:
1426 continue
1427
1428 cookie_tuples.append((name, value, standard, rest))
1429
1430 return cookie_tuples
1431
1432 def _cookie_from_cookie_tuple(self, tup, request):
1433 # standard is dict of standard cookie-attributes, rest is dict of the
1434 # rest of them
1435 name, value, standard, rest = tup
1436
1437 domain = standard.get("domain", Absent)
1438 path = standard.get("path", Absent)
1439 port = standard.get("port", Absent)
1440 expires = standard.get("expires", Absent)
1441
1442 # set the easy defaults
1443 version = standard.get("version", None)
1444 if version is not None: version = int(version)
1445 secure = standard.get("secure", False)
1446 # (discard is also set if expires is Absent)
1447 discard = standard.get("discard", False)
1448 comment = standard.get("comment", None)
1449 comment_url = standard.get("commenturl", None)
1450
1451 # set default path
1452 if path is not Absent and path != "":
1453 path_specified = True
1454 path = escape_path(path)
1455 else:
1456 path_specified = False
1457 path = request_path(request)
1458 i = path.rfind("/")
1459 if i != -1:
1460 if version == 0:
1461 # Netscape spec parts company from reality here
1462 path = path[:i]
1463 else:
1464 path = path[:i+1]
1465 if len(path) == 0: path = "/"
1466
1467 # set default domain
1468 domain_specified = domain is not Absent
1469 # but first we have to remember whether it starts with a dot
1470 domain_initial_dot = False
1471 if domain_specified:
1472 domain_initial_dot = bool(domain.startswith("."))
1473 if domain is Absent:
1474 req_host, erhn = eff_request_host(request)
1475 domain = erhn
1476 elif not domain.startswith("."):
1477 domain = "."+domain
1478
1479 # set default port
1480 port_specified = False
1481 if port is not Absent:
1482 if port is None:
1483 # Port attr present, but has no value: default to request port.
1484 # Cookie should then only be sent back on that port.
1485 port = request_port(request)
1486 else:
1487 port_specified = True
1488 port = re.sub(r"\s+", "", port)
1489 else:
1490 # No port attr present. Cookie can be sent back on any port.
1491 port = None
1492
1493 # set default expires and discard
1494 if expires is Absent:
1495 expires = None
1496 discard = True
1497 elif expires <= self._now:
1498 # Expiry date in past is request to delete cookie. This can't be
1499 # in DefaultCookiePolicy, because can't delete cookies there.
1500 try:
1501 self.clear(domain, path, name)
1502 except KeyError:
1503 pass
1504 debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1505 domain, path, name)
1506 return None
1507
1508 return Cookie(version,
1509 name, value,
1510 port, port_specified,
1511 domain, domain_specified, domain_initial_dot,
1512 path, path_specified,
1513 secure,
1514 expires,
1515 discard,
1516 comment,
1517 comment_url,
1518 rest)
1519
1520 def _cookies_from_attrs_set(self, attrs_set, request):
1521 cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1522
1523 cookies = []
1524 for tup in cookie_tuples:
1525 cookie = self._cookie_from_cookie_tuple(tup, request)
1526 if cookie: cookies.append(cookie)
1527 return cookies
1528
1529 def make_cookies(self, response, request):
1530 """Return sequence of Cookie objects extracted from response object."""
1531 # get cookie-attributes for RFC 2965 and Netscape protocols
1532 headers = response.info()
1533 rfc2965_hdrs = headers.getheaders("Set-Cookie2")
1534 ns_hdrs = headers.getheaders("Set-Cookie")
1535
1536 rfc2965 = self._policy.rfc2965
1537 netscape = self._policy.netscape
1538
1539 if ((not rfc2965_hdrs and not ns_hdrs) or
1540 (not ns_hdrs and not rfc2965) or
1541 (not rfc2965_hdrs and not netscape) or
1542 (not netscape and not rfc2965)):
1543 return [] # no relevant cookie headers: quick exit
1544
1545 try:
1546 cookies = self._cookies_from_attrs_set(
1547 split_header_words(rfc2965_hdrs), request)
1548 except:
1549 reraise_unmasked_exceptions()
1550 cookies = []
1551
1552 if ns_hdrs and netscape:
1553 try:
1554 ns_cookies = self._cookies_from_attrs_set(
1555 parse_ns_headers(ns_hdrs), request)
1556 except:
1557 reraise_unmasked_exceptions()
1558 ns_cookies = []
1559
1560 # Look for Netscape cookies (from Set-Cookie headers) that match
1561 # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1562 # For each match, keep the RFC 2965 cookie and ignore the Netscape
1563 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are
1564 # bundled in with the Netscape cookies for this purpose, which is
1565 # reasonable behaviour.
1566 if rfc2965:
1567 lookup = {}
1568 for cookie in cookies:
1569 lookup[(cookie.domain, cookie.path, cookie.name)] = None
1570
1571 def no_matching_rfc2965(ns_cookie, lookup=lookup):
1572 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1573 return key not in lookup
1574 ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1575
1576 if ns_cookies:
1577 cookies.extend(ns_cookies)
1578
1579 return cookies
1580
1581 def set_cookie_if_ok(self, cookie, request):
1582 """Set a cookie if policy says it's OK to do so."""
1583 self._cookies_lock.acquire()
1584 self._policy._now = self._now = int(time.time())
1585
1586 if self._policy.set_ok(cookie, request):
1587 self.set_cookie(cookie)
1588
1589 self._cookies_lock.release()
1590
1591 def set_cookie(self, cookie):
1592 """Set a cookie, without checking whether or not it should be set."""
1593 c = self._cookies
1594 self._cookies_lock.acquire()
1595 try:
1596 if cookie.domain not in c: c[cookie.domain] = {}
1597 c2 = c[cookie.domain]
1598 if cookie.path not in c2: c2[cookie.path] = {}
1599 c3 = c2[cookie.path]
1600 c3[cookie.name] = cookie
1601 finally:
1602 self._cookies_lock.release()
1603
1604 def extract_cookies(self, response, request):
1605 """Extract cookies from response, where allowable given the request."""
1606 debug("extract_cookies: %s", response.info())
1607 self._cookies_lock.acquire()
1608 self._policy._now = self._now = int(time.time())
1609
1610 for cookie in self.make_cookies(response, request):
1611 if self._policy.set_ok(cookie, request):
1612 debug(" setting cookie: %s", cookie)
1613 self.set_cookie(cookie)
1614 self._cookies_lock.release()
1615
1616 def clear(self, domain=None, path=None, name=None):
1617 """Clear some cookies.
1618
1619 Invoking this method without arguments will clear all cookies. If
1620 given a single argument, only cookies belonging to that domain will be
1621 removed. If given two arguments, cookies belonging to the specified
1622 path within that domain are removed. If given three arguments, then
1623 the cookie with the specified name, path and domain is removed.
1624
1625 Raises KeyError if no matching cookie exists.
1626
1627 """
1628 if name is not None:
1629 if (domain is None) or (path is None):
1630 raise ValueError(
1631 "domain and path must be given to remove a cookie by name")
1632 del self._cookies[domain][path][name]
1633 elif path is not None:
1634 if domain is None:
1635 raise ValueError(
1636 "domain must be given to remove cookies by path")
1637 del self._cookies[domain][path]
1638 elif domain is not None:
1639 del self._cookies[domain]
1640 else:
1641 self._cookies = {}
1642
1643 def clear_session_cookies(self):
1644 """Discard all session cookies.
1645
1646 Note that the .save() method won't save session cookies anyway, unless
1647 you ask otherwise by passing a true ignore_discard argument.
1648
1649 """
1650 self._cookies_lock.acquire()
1651 for cookie in self:
1652 if cookie.discard:
1653 self.clear(cookie.domain, cookie.path, cookie.name)
1654 self._cookies_lock.release()
1655
1656 def clear_expired_cookies(self):
1657 """Discard all expired cookies.
1658
1659 You probably don't need to call this method: expired cookies are never
1660 sent back to the server (provided you're using DefaultCookiePolicy),
1661 this method is called by CookieJar itself every so often, and the
1662 .save() method won't save expired cookies anyway (unless you ask
1663 otherwise by passing a true ignore_expires argument).
1664
1665 """
1666 self._cookies_lock.acquire()
1667 now = time.time()
1668 for cookie in self:
1669 if cookie.is_expired(now):
1670 self.clear(cookie.domain, cookie.path, cookie.name)
1671 self._cookies_lock.release()
1672
1673 def __iter__(self):
1674 return deepvalues(self._cookies)
1675
1676 def __len__(self):
1677 """Return number of contained cookies."""
1678 i = 0
1679 for cookie in self: i = i + 1
1680 return i
1681
1682 def __repr__(self):
1683 r = []
1684 for cookie in self: r.append(repr(cookie))
1685 return "<%s[%s]>" % (self.__class__, ", ".join(r))
1686
1687 def __str__(self):
1688 r = []
1689 for cookie in self: r.append(str(cookie))
1690 return "<%s[%s]>" % (self.__class__, ", ".join(r))
1691
1692
1693class LoadError(Exception): pass
1694
1695class FileCookieJar(CookieJar):
1696 """CookieJar that can be loaded from and saved to a file."""
1697
1698 def __init__(self, filename=None, delayload=False, policy=None):
1699 """
1700 Cookies are NOT loaded from the named file until either the .load() or
1701 .revert() method is called.
1702
1703 """
1704 CookieJar.__init__(self, policy)
1705 if filename is not None:
1706 try:
1707 filename+""
1708 except:
1709 raise ValueError("filename must be string-like")
1710 self.filename = filename
1711 self.delayload = bool(delayload)
1712
1713 def save(self, filename=None, ignore_discard=False, ignore_expires=False):
1714 """Save cookies to a file."""
1715 raise NotImplementedError()
1716
1717 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1718 """Load cookies from a file."""
1719 if filename is None:
1720 if self.filename is not None: filename = self.filename
1721 else: raise ValueError(MISSING_FILENAME_TEXT)
1722
1723 f = open(filename)
1724 try:
1725 self._really_load(f, filename, ignore_discard, ignore_expires)
1726 finally:
1727 f.close()
1728
1729 def revert(self, filename=None,
1730 ignore_discard=False, ignore_expires=False):
1731 """Clear all cookies and reload cookies from a saved file.
1732
1733 Raises LoadError (or IOError) if reversion is not successful; the
1734 object's state will not be altered if this happens.
1735
1736 """
1737 if filename is None:
1738 if self.filename is not None: filename = self.filename
1739 else: raise ValueError(MISSING_FILENAME_TEXT)
1740
1741 self._cookies_lock.acquire()
1742
1743 old_state = copy.deepcopy(self._cookies)
1744 self._cookies = {}
1745 try:
1746 self.load(filename, ignore_discard, ignore_expires)
1747 except (LoadError, IOError):
1748 self._cookies = old_state
1749 raise
1750
1751 self._cookies_lock.release()
1752
1753from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
1754from _MozillaCookieJar import MozillaCookieJar
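# A minimal save/load sketch with one of the FileCookieJar subclasses
# (illustrative only; the filename is made up):
#
#     jar = MozillaCookieJar("cookies.txt")
#     # ... collect cookies via urllib2 as above, then:
#     jar.save(ignore_discard=True)
#     # later, in another process:
#     jar = MozillaCookieJar()
#     jar.load("cookies.txt", ignore_discard=True)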