r"""HTTP cookie handling for web clients.

This module has (now fairly distant) origins in Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

Class diagram (note that the classes which do not derive from
FileCookieJar are not distributed with the Python standard library, but
are available from http://wwwsearch.sf.net/):

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

"""
27
import copy
import httplib  # only for the default HTTP port
import logging
import re
import sys
import time
import urllib
import urlparse
from calendar import timegm

try:
    import threading as _threading
except ImportError:
    import dummy_threading as _threading
35
# All diagnostic output of this module goes through the "cookielib" logger.
debug = logging.getLogger("cookielib").debug

# Port (as a string) used when a request's host names no explicit port.
DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
41
def reraise_unmasked_exceptions(unmasked=()):
    """Re-raise the exception being handled unless it may be swallowed.

    There are a few catch-all except: statements in this module, for
    catching input that's bad in unexpected ways.  This function is meant
    to be called from inside such a handler.  It re-raises the exceptions
    we don't want to trap: those listed in unmasked, plus
    KeyboardInterrupt, SystemExit and MemoryError.  Any other exception is
    swallowed, and a warning containing its formatted traceback is issued.

    unmasked -- sequence of extra exception classes to re-raise

    """
    import traceback
    import warnings

    # tuple() lets callers pass any sequence of classes, not only a tuple.
    unmasked = tuple(unmasked) + (KeyboardInterrupt, SystemExit, MemoryError)
    etype = sys.exc_info()[0]
    if issubclass(etype, unmasked):
        raise
    # swallowed an exception
    msg = traceback.format_exc()
    warnings.warn("cookielib bug!\n%s" % msg, stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000056
57
58# Date/time conversion
59# -----------------------------------------------------------------------------
60
61EPOCH_YEAR = 1970
62def _timegm(tt):
63 year, month, mday, hour, min, sec = tt[:6]
64 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
65 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
66 return timegm(tt)
67 else:
68 return None
69
# Weekday and month abbreviations, indexed like time.gmtime() fields
# (weekday 0 is Monday; month index is month number minus one).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copy of MONTHS, used for case-insensitive month lookup.
MONTHS_LOWER = [month.lower() for month in MONTHS]
75
def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec = time.gmtime(t)[:6]
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        year, mon, mday, hour, minute, sec)
92
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec, wday = time.gmtime(t)[:7]
    # The comma after the weekday is part of the documented format above.
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[wday], mday, MONTHS[mon-1], year, hour, minute, sec)
108
109
# Timezone names that are equivalent to UTC (only the keys are used).
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric offset: optional sign, one or two hour digits, optional minutes.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")


def offset_from_tz_string(tz):
    """Return the UTC offset in seconds described by tz, or None.

    tz is either one of the names in UTC_ZONES (offset 0) or a numeric
    offset such as "-0800", "+01:30" or "5".  None is returned for
    anything else.

    """
    if tz in UTC_ZONES:
        return 0
    m = TIMEZONE_RE.search(tz)
    if m is None:
        return None
    sign, hours, minutes = m.groups()
    offset = 3600 * int(hours)
    if minutes:
        offset = offset + 60 * int(minutes)
    if sign == '-':
        offset = -offset
    return offset
126
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date fields (strings) to seconds since the epoch.

    mon may be a month name (matched case-insensitively against
    MONTHS_LOWER) or a month number.  hr, min and sec default to 0 when
    None.  A year below 1000 is mapped to the century that puts it closest
    to the current year.  tz is a timezone string understood by
    offset_from_tz_string, or None for UTC.

    Returns None if the month or timezone is not recognised, or if
    _timegm rejects the resulting time tuple.

    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower()) + 1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if not 1 <= imon <= 12:
            return None
        mon = imon

    # make sure clock elements are defined
    if hr is None:
        hr = 0
    if min is None:
        min = 0
    if sec is None:
        sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    sec = int(sec)

    if yr < 1000:
        # find "obvious" year
        cur_yr = time.localtime(time.time())[0]
        cur_in_century = cur_yr % 100
        distance = cur_in_century - yr
        yr = yr + cur_yr - cur_in_century
        if abs(distance) > 50:
            # more than half a century away: the neighbouring century is
            # closer to the current year
            if distance > 0:
                yr = yr + 100
            else:
                yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))
    if t is None:
        return None

    # adjust time using timezone string, to get absolute time since epoch
    if tz is None:
        tz = "UTC"
    offset = offset_from_tz_string(tz.upper())
    if offset is None:
        return None
    return t - offset
179
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)


def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of text is unrecognized, the time is
    outside the representable range, or the timezone string is not
    recognized.  If the string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # Month-shaped but not a month (e.g. "Foo"): don't raise, let
            # the loose parser below reject the string by returning None.
            pass
        else:
            # int (not float) seconds: the return value is documented to
            # be an integer, as on the loose path.
            tt = (int(g[2]), mon, int(g[0]),
                  int(g[3]), int(g[4]), int(g[5]))
            return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, minute, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, minute, sec, tz)
257
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X)


def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    Fractional seconds are accepted and truncated.

    """
    # clean up
    text = text.lstrip()

    # loose regexp parse
    m = ISO_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    yr, mon, day, hr, minute, sec, tz, _ = m.groups()

    if sec is not None:
        # The pattern allows "29.5", but _str2time converts seconds with
        # int(); keep only the whole-second part so that does not raise.
        sec = sec.split(".")[0]

    return _str2time(day, mon, yr, hr, minute, sec, tz)
302
303
304# Header parsing
305# -----------------------------------------------------------------------------
306
def unmatched(match):
    """Return unmatched part of re.Match object.

    That is, the searched string with the span of the whole match cut out.

    """
    text = match.string
    start, end = match.span(0)
    return text[:start] + text[end:]
311
HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
# Leading run of "=", whitespace and ";" that is skipped as junk.
HEADER_JUNK_RE = re.compile(r"^[=\s;]*")


def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    # A bare string would be iterated character by character.
    assert not isinstance(header_values, basestring)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                # a token, possibly followed by "=value"
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    # undo backslash-escaping inside the quoted string
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs:
                    result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = HEADER_JUNK_RE.subn("", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs:
            result.append(pairs)
    return result
400
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
# Values made only of word characters are emitted without quotes.
HEADER_JOIN_TOKEN_RE = re.compile(r"^\w+$")


def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        attr = []
        for k, v in pairs:
            if v is not None:
                if not HEADER_JOIN_TOKEN_RE.search(v):
                    v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v)  # escape " and \
                    v = '"%s"' % v
                k = "%s=%s" % (k, v)
            attr.append(k)
        # empty inner lists contribute nothing to the output
        if attr:
            headers.append("; ".join(attr))
    return ", ".join(headers)
426
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    Returns one list of (key, value) pairs per non-empty header.  Known
    attribute names (other than the leading name=value pair) are
    lower-cased, the value of expires is converted with http2time, and
    ("version", "0") is appended when no version attribute was seen.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "":
                continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            # the first parameter is the cookie's own name=value pair
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    version_set = True
                # A bare "expires" with no value has v None: leave it as
                # None rather than calling string methods on it.
                if k == "expires" and v is not None:
                    # convert expires date to seconds since epoch
                    if v.startswith('"'):
                        v = v[1:]
                    if v.endswith('"'):
                        v = v[:-1]
                    v = http2time(v)  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
478
479
# Matches text whose last dot-separated component is all digits; used as
# the module's test for "looks like an IPv4 address".
IPV4_RE = re.compile(r"\.\d+$")


def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        return False
    if text == "":
        return False
    if text.startswith(".") or text.endswith("."):
        return False
    return True
495
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # A must have the form NB, i.e. B must be a suffix of A -- it is not
    # enough for B to occur somewhere inside A (www.acme.com.evil.org must
    # not match .acme.com).  A != B here, so a suffix match also guarantees
    # that N is non-empty.
    if not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
534
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  Anything that does not look like an
    IPv4 address (per IPV4_RE) is accepted.

    """
    if IPV4_RE.search(text):
        return False
    return True
544
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  If B starts with a dot, A matches when it ends with
    B; otherwise (and whenever either side looks like an IP address) the
    two must be equal.

    """
    A = A.lower()
    B = B.lower()
    if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
        # IP addresses only ever match by equality
        return A == B
    if B.startswith("."):
        return A.endswith(B)
    return A == B
564
# Trailing ":<digits>" port suffix of a host string.
cut_port_re = re.compile(r":\d+$")


def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    The host is taken from the request's full URL, falling back to its
    Host header when the URL has no network location.

    """
    url = request.get_full_url()
    host = urlparse.urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = cut_port_re.sub("", host, 1)
    return host.lower()
581
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.

    """
    erhn = req_host = request_host(request)
    # a dotless name that is not an IP address gets ".local" appended
    if req_host.find(".") == -1 and not IPV4_RE.search(req_host):
        erhn = req_host + ".local"
    return req_host, erhn
592
def request_path(request):
    """request-URI, as defined by RFC 2965.

    Built from the path, parameters, query and fragment of the request's
    full URL; the path (with parameters) is passed through escape_path.

    """
    url = request.get_full_url()
    path, parameters, query, frag = urlparse.urlparse(url)[2:]
    if parameters:
        path = "%s;%s" % (path, parameters)
    path = escape_path(path)
    req_path = urlparse.urlunparse(("", "", path, "", query, frag))
    if not req_path.startswith("/"):
        # fix bad RFC 2396 absoluteURI
        req_path = "/" + req_path
    return req_path
607
def request_port(request):
    """Return the port of the request's host, as a string.

    DEFAULT_HTTP_PORT is returned when the host contains no ":".  If the
    text after the first ":" is not a number, the fact is logged and None
    is returned.

    """
    host = request.get_host()
    i = host.find(':')
    if i < 0:
        return DEFAULT_HTTP_PORT
    port = host[i+1:]
    try:
        int(port)
    except ValueError:
        debug("nonnumeric port: '%s'", port)
        return None
    return port
621
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")


def uppercase_escaped_char(match):
    """Return the matched %xx escape with its hex digits upper-cased."""
    return "%%%s" % match.group(1).upper()


def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, unicode):
        path = path.encode("utf-8")
    path = urllib.quote(path, HTTP_PATH_SAFE)
    path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path)
    return path
643
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    first_dot = h.find(".")
    if first_dot >= 0:
        # h has the form a.b, where a is everything before the first dot
        b = h[first_dot+1:]
        b_has_dot = b.find(".") >= 0
        if is_HDN(h) and (b_has_dot or b == "local"):
            return "." + b
    return h
678
def is_third_party(request):
    """Return True if request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    origin_reach = reach(request.get_origin_req_host())
    if domain_match(req_host, origin_reach):
        return False
    return True
694
695
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes on the instance.

        version and expires are converted with int() unless None, domain
        is lower-cased, and rest (the non-standard cookie-attributes) is
        shallow-copied.  Raises ValueError if port is None while
        port_specified is True.

        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return true if the non-standard cookie-attribute name is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard cookie-attribute name, or default."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the non-standard cookie-attribute name to value."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        now defaults to the current time; a cookie whose expires is None
        never expires.

        """
        if now is None:
            now = time.time()
        if (self.expires is not None) and (self.expires <= now):
            return True
        return False

    def __str__(self):
        if self.port is None:
            p = ""
        else:
            p = ":" + self.port
        limit = self.domain + p + self.path
        if self.value is not None:
            namevalue = "%s=%s" % (self.name, self.value)
        else:
            namevalue = self.name
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        args = []
        for name in ("version", "name", "value",
                     "port", "port_specified",
                     "domain", "domain_specified", "domain_initial_dot",
                     "path", "path_specified",
                     "secure", "expires", "discard", "comment", "comment_url",
                     ):
            attr = getattr(self, name)
            args.append("%s=%s" % (name, repr(attr)))
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "Cookie(%s)" % ", ".join(args)
792
793
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """
    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server."""
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.
        """
        return True
825
826
827class DefaultCookiePolicy(CookiePolicy):
828 """Implements the standard rules for accepting and returning cookies."""
829
830 DomainStrictNoDots = 1
831 DomainStrictNonDomain = 2
832 DomainRFC2965Match = 4
833
834 DomainLiberal = 0
835 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
836
837 def __init__(self,
838 blocked_domains=None, allowed_domains=None,
839 netscape=True, rfc2965=False,
Neal Norwitz71dad722005-12-23 21:43:48 +0000840 rfc2109_as_netscape=None,
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000841 hide_cookie2=False,
842 strict_domain=False,
843 strict_rfc2965_unverifiable=True,
844 strict_ns_unverifiable=False,
845 strict_ns_domain=DomainLiberal,
846 strict_ns_set_initial_dollar=False,
847 strict_ns_set_path=False,
848 ):
849 """Constructor arguments should be passed as keyword arguments only."""
850 self.netscape = netscape
851 self.rfc2965 = rfc2965
Neal Norwitz71dad722005-12-23 21:43:48 +0000852 self.rfc2109_as_netscape = rfc2109_as_netscape
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000853 self.hide_cookie2 = hide_cookie2
854 self.strict_domain = strict_domain
855 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
856 self.strict_ns_unverifiable = strict_ns_unverifiable
857 self.strict_ns_domain = strict_ns_domain
858 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
859 self.strict_ns_set_path = strict_ns_set_path
860
861 if blocked_domains is not None:
862 self._blocked_domains = tuple(blocked_domains)
863 else:
864 self._blocked_domains = ()
865
866 if allowed_domains is not None:
867 allowed_domains = tuple(allowed_domains)
868 self._allowed_domains = allowed_domains
869
870 def blocked_domains(self):
871 """Return the sequence of blocked domains (as a tuple)."""
872 return self._blocked_domains
873 def set_blocked_domains(self, blocked_domains):
874 """Set the sequence of blocked domains."""
875 self._blocked_domains = tuple(blocked_domains)
876
877 def is_blocked(self, domain):
878 for blocked_domain in self._blocked_domains:
879 if user_domain_match(domain, blocked_domain):
880 return True
881 return False
882
883 def allowed_domains(self):
884 """Return None, or the sequence of allowed domains (as a tuple)."""
885 return self._allowed_domains
886 def set_allowed_domains(self, allowed_domains):
887 """Set the sequence of allowed domains, or None."""
888 if allowed_domains is not None:
889 allowed_domains = tuple(allowed_domains)
890 self._allowed_domains = allowed_domains
891
892 def is_not_allowed(self, domain):
893 if self._allowed_domains is None:
894 return False
895 for allowed_domain in self._allowed_domains:
896 if user_domain_match(domain, allowed_domain):
897 return False
898 return True
899
900 def set_ok(self, cookie, request):
901 """
902 If you override .set_ok(), be sure to call this method. If it returns
903 false, so should your subclass (assuming your subclass wants to be more
904 strict about which cookies to accept).
905
906 """
907 debug(" - checking cookie %s=%s", cookie.name, cookie.value)
908
909 assert cookie.name is not None
910
911 for n in "version", "verifiability", "name", "path", "domain", "port":
912 fn_name = "set_ok_"+n
913 fn = getattr(self, fn_name)
914 if not fn(cookie, request):
915 return False
916
917 return True
918
919 def set_ok_version(self, cookie, request):
920 if cookie.version is None:
921 # Version is always set to 0 by parse_ns_headers if it's a Netscape
922 # cookie, so this must be an invalid RFC 2965 cookie.
923 debug(" Set-Cookie2 without version attribute (%s=%s)",
924 cookie.name, cookie.value)
925 return False
926 if cookie.version > 0 and not self.rfc2965:
927 debug(" RFC 2965 cookies are switched off")
928 return False
929 elif cookie.version == 0 and not self.netscape:
930 debug(" Netscape cookies are switched off")
931 return False
932 return True
933
934 def set_ok_verifiability(self, cookie, request):
935 if request.is_unverifiable() and is_third_party(request):
936 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
937 debug(" third-party RFC 2965 cookie during "
938 "unverifiable transaction")
939 return False
940 elif cookie.version == 0 and self.strict_ns_unverifiable:
941 debug(" third-party Netscape cookie during "
942 "unverifiable transaction")
943 return False
944 return True
945
946 def set_ok_name(self, cookie, request):
947 # Try and stop servers setting V0 cookies designed to hack other
948 # servers that know both V0 and V1 protocols.
949 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
950 cookie.name.startswith("$")):
951 debug(" illegal name (starts with '$'): '%s'", cookie.name)
952 return False
953 return True
954
955 def set_ok_path(self, cookie, request):
956 if cookie.path_specified:
957 req_path = request_path(request)
958 if ((cookie.version > 0 or
959 (cookie.version == 0 and self.strict_ns_set_path)) and
960 not req_path.startswith(cookie.path)):
961 debug(" path attribute %s is not a prefix of request "
962 "path %s", cookie.path, req_path)
963 return False
964 return True
965
    def set_ok_domain(self, cookie, request):
        """Return True if the cookie's domain is acceptable for this request.

        The user block-list and allow-list are always consulted.  The
        remaining checks only run when the cookie carried an explicit domain
        attribute (cookie.domain_specified); each failing check logs a debug
        message and returns False.

        """
        if self.is_blocked(cookie.domain):
            debug(" domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            debug(" domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            # Optional strictness: refuse domains of the form
            # .<well-known second-level label>.<two-letter TLD>.
            if self.strict_domain and (domain.count(".") >= 2):
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    if (sld.lower() in (
                        "co", "ac",
                        "com", "edu", "org", "net", "gov", "mil", "int") and
                        len(tld) == 2):
                        # domain like .co.uk
                        debug(" country-code second level domain %s", domain)
                        return False
            # Ignoring one leading dot, the domain must contain a dot
            # (".local" is the single exception).
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                debug(" non-local domain %s contains no embedded dot",
                      domain)
                return False
            # Netscape cookies: the effective request-host, with or without
            # an added leading dot, must end with the cookie domain.
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    debug(" effective request-host %s (even with added "
                          "initial dot) does not end end with %s",
                          erhn, domain)
                    return False
            # domain_match() is applied to every RFC 2965 cookie, and to
            # Netscape cookies when the DomainRFC2965Match flag is set.
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    debug(" effective request-host %s does not domain-match "
                          "%s", erhn, domain)
                    return False
            # The part of the request host left after removing len(domain)
            # trailing characters must not contain a dot, unless the request
            # host matches IPV4_RE.
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    debug(" host prefix %s for domain %s contains a dot",
                          host_prefix, domain)
                    return False
        return True
1021
1022 def set_ok_port(self, cookie, request):
1023 if cookie.port_specified:
1024 req_port = request_port(request)
1025 if req_port is None:
1026 req_port = "80"
1027 else:
1028 req_port = str(req_port)
1029 for p in cookie.port.split(","):
1030 try:
1031 int(p)
1032 except ValueError:
1033 debug(" bad port %s (not numeric)", p)
1034 return False
1035 if p == req_port:
1036 break
1037 else:
1038 debug(" request port (%s) not found in %s",
1039 req_port, cookie.port)
1040 return False
1041 return True
1042
1043 def return_ok(self, cookie, request):
1044 """
1045 If you override .return_ok(), be sure to call this method. If it
1046 returns false, so should your subclass (assuming your subclass wants to
1047 be more strict about which cookies to return).
1048
1049 """
1050 # Path has already been checked by .path_return_ok(), and domain
1051 # blocking done by .domain_return_ok().
1052 debug(" - checking cookie %s=%s", cookie.name, cookie.value)
1053
1054 for n in "version", "verifiability", "secure", "expires", "port", "domain":
1055 fn_name = "return_ok_"+n
1056 fn = getattr(self, fn_name)
1057 if not fn(cookie, request):
1058 return False
1059 return True
1060
1061 def return_ok_version(self, cookie, request):
1062 if cookie.version > 0 and not self.rfc2965:
1063 debug(" RFC 2965 cookies are switched off")
1064 return False
1065 elif cookie.version == 0 and not self.netscape:
1066 debug(" Netscape cookies are switched off")
1067 return False
1068 return True
1069
1070 def return_ok_verifiability(self, cookie, request):
1071 if request.is_unverifiable() and is_third_party(request):
1072 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
1073 debug(" third-party RFC 2965 cookie during unverifiable "
1074 "transaction")
1075 return False
1076 elif cookie.version == 0 and self.strict_ns_unverifiable:
1077 debug(" third-party Netscape cookie during unverifiable "
1078 "transaction")
1079 return False
1080 return True
1081
1082 def return_ok_secure(self, cookie, request):
1083 if cookie.secure and request.get_type() != "https":
1084 debug(" secure cookie with non-secure request")
1085 return False
1086 return True
1087
1088 def return_ok_expires(self, cookie, request):
1089 if cookie.is_expired(self._now):
1090 debug(" cookie expired")
1091 return False
1092 return True
1093
1094 def return_ok_port(self, cookie, request):
1095 if cookie.port:
1096 req_port = request_port(request)
1097 if req_port is None:
1098 req_port = "80"
1099 for p in cookie.port.split(","):
1100 if p == req_port:
1101 break
1102 else:
1103 debug(" request port %s does not match cookie port %s",
1104 req_port, cookie.port)
1105 return False
1106 return True
1107
1108 def return_ok_domain(self, cookie, request):
1109 req_host, erhn = eff_request_host(request)
1110 domain = cookie.domain
1111
1112 # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
1113 if (cookie.version == 0 and
1114 (self.strict_ns_domain & self.DomainStrictNonDomain) and
1115 not cookie.domain_specified and domain != erhn):
1116 debug(" cookie with unspecified domain does not string-compare "
1117 "equal to request domain")
1118 return False
1119
1120 if cookie.version > 0 and not domain_match(erhn, domain):
1121 debug(" effective request-host name %s does not domain-match "
1122 "RFC 2965 cookie domain %s", erhn, domain)
1123 return False
1124 if cookie.version == 0 and not ("."+erhn).endswith(domain):
1125 debug(" request-host %s does not match Netscape cookie domain "
1126 "%s", req_host, domain)
1127 return False
1128 return True
1129
1130 def domain_return_ok(self, domain, request):
1131 # Liberal check of. This is here as an optimization to avoid
1132 # having to load lots of MSIE cookie files unless necessary.
1133 req_host, erhn = eff_request_host(request)
1134 if not req_host.startswith("."):
Raymond Hettingerbab41432005-02-05 01:31:19 +00001135 req_host = "."+req_host
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001136 if not erhn.startswith("."):
Raymond Hettingerbab41432005-02-05 01:31:19 +00001137 erhn = "."+erhn
1138 if not (req_host.endswith(domain) or erhn.endswith(domain)):
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001139 #debug(" request domain %s does not match cookie domain %s",
1140 # req_host, domain)
1141 return False
1142
1143 if self.is_blocked(domain):
1144 debug(" domain %s is in user block-list", domain)
1145 return False
1146 if self.is_not_allowed(domain):
1147 debug(" domain %s is not in user allow-list", domain)
1148 return False
1149
1150 return True
1151
1152 def path_return_ok(self, path, request):
1153 debug("- checking cookie path=%s", path)
1154 req_path = request_path(request)
1155 if not req_path.startswith(path):
1156 debug(" %s does not path-match %s", req_path, path)
1157 return False
1158 return True
1159
1160
def vals_sorted_by_key(adict):
    """Return the values of adict as a list, ordered by sorted key."""
    # The list is built eagerly so that callers may delete entries from
    # adict while they walk the result.
    return [adict.get(key) for key in sorted(adict.keys())]

def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key."""
    for obj in vals_sorted_by_key(mapping):
        # Anything exposing an .items attribute is treated as a nested
        # mapping and descended into; everything else is a leaf.
        try:
            obj.items
        except AttributeError:
            is_mapping = False
        else:
            is_mapping = True
        if is_mapping:
            for subobj in deepvalues(obj):
                yield subobj
        else:
            yield obj
1181
1182
# Used as second parameter to dict.get() method, to distinguish absent
# dict key from one with a None value.
class Absent:
    """Sentinel class meaning "this key was not present at all"."""
1186
class CookieJar:
    """Collection of HTTP cookies.

    You may not need to know about this class: try
    urllib2.build_opener(HTTPCookieProcessor).open(url).

    Cookies are held in self._cookies, a nested dictionary keyed by domain,
    then path, then cookie name.

    """

    # Matches any non-word character; _cookie_attrs uses it to decide
    # whether an RFC 2965 cookie value needs escaping.
    non_word_re = re.compile(r"\W")
    # Matches the characters (double quote and backslash) that
    # _cookie_attrs backslash-escapes in such values.
    quote_re = re.compile(r"([\"\\])")
    # NOTE(review): the three patterns below are not referenced in this
    # part of the file; presumably kept for other code -- confirm.
    strict_domain_re = re.compile(r"\.?[^.]*")
    domain_re = re.compile(r"[^.]*")
    dots_re = re.compile(r"^\.+")

    # Uncompiled pattern string; presumably the LWP cookie-file header line
    # -- confirm against the LWP loader.
    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
1202
1203 def __init__(self, policy=None):
1204 if policy is None:
1205 policy = DefaultCookiePolicy()
1206 self._policy = policy
1207
1208 self._cookies_lock = _threading.RLock()
1209 self._cookies = {}
1210
1211 def set_policy(self, policy):
1212 self._policy = policy
1213
1214 def _cookies_for_domain(self, domain, request):
1215 cookies = []
1216 if not self._policy.domain_return_ok(domain, request):
1217 return []
1218 debug("Checking %s for cookies to return", domain)
1219 cookies_by_path = self._cookies[domain]
1220 for path in cookies_by_path.keys():
1221 if not self._policy.path_return_ok(path, request):
1222 continue
1223 cookies_by_name = cookies_by_path[path]
1224 for cookie in cookies_by_name.values():
1225 if not self._policy.return_ok(cookie, request):
1226 debug(" not returning cookie")
1227 continue
1228 debug(" it's a match")
1229 cookies.append(cookie)
1230 return cookies
1231
1232 def _cookies_for_request(self, request):
1233 """Return a list of cookies to be returned to server."""
1234 cookies = []
1235 for domain in self._cookies.keys():
1236 cookies.extend(self._cookies_for_domain(domain, request))
1237 return cookies
1238
1239 def _cookie_attrs(self, cookies):
1240 """Return a list of cookie-attributes to be returned to server.
1241
1242 like ['foo="bar"; $Path="/"', ...]
1243
1244 The $Version attribute is also added when appropriate (currently only
1245 once per request).
1246
1247 """
1248 # add cookies in order of most specific (ie. longest) path first
1249 def decreasing_size(a, b): return cmp(len(b.path), len(a.path))
1250 cookies.sort(decreasing_size)
1251
1252 version_set = False
1253
1254 attrs = []
1255 for cookie in cookies:
1256 # set version of Cookie header
1257 # XXX
1258 # What should it be if multiple matching Set-Cookie headers have
1259 # different versions themselves?
1260 # Answer: there is no answer; was supposed to be settled by
1261 # RFC 2965 errata, but that may never appear...
1262 version = cookie.version
1263 if not version_set:
1264 version_set = True
1265 if version > 0:
1266 attrs.append("$Version=%s" % version)
1267
1268 # quote cookie value if necessary
1269 # (not for Netscape protocol, which already has any quotes
1270 # intact, due to the poorly-specified Netscape Cookie: syntax)
1271 if ((cookie.value is not None) and
1272 self.non_word_re.search(cookie.value) and version > 0):
1273 value = self.quote_re.sub(r"\\\1", cookie.value)
1274 else:
1275 value = cookie.value
1276
1277 # add cookie-attributes to be returned in Cookie header
1278 if cookie.value is None:
1279 attrs.append(cookie.name)
1280 else:
1281 attrs.append("%s=%s" % (cookie.name, value))
1282 if version > 0:
1283 if cookie.path_specified:
1284 attrs.append('$Path="%s"' % cookie.path)
1285 if cookie.domain.startswith("."):
1286 domain = cookie.domain
1287 if (not cookie.domain_initial_dot and
1288 domain.startswith(".")):
1289 domain = domain[1:]
1290 attrs.append('$Domain="%s"' % domain)
1291 if cookie.port is not None:
1292 p = "$Port"
1293 if cookie.port_specified:
1294 p = p + ('="%s"' % cookie.port)
1295 attrs.append(p)
1296
1297 return attrs
1298
1299 def add_cookie_header(self, request):
1300 """Add correct Cookie: header to request (urllib2.Request object).
1301
1302 The Cookie2 header is also added unless policy.hide_cookie2 is true.
1303
1304 """
1305 debug("add_cookie_header")
1306 self._cookies_lock.acquire()
1307
1308 self._policy._now = self._now = int(time.time())
1309
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001310 cookies = self._cookies_for_request(request)
1311
1312 attrs = self._cookie_attrs(cookies)
1313 if attrs:
1314 if not request.has_header("Cookie"):
1315 request.add_unredirected_header(
1316 "Cookie", "; ".join(attrs))
1317
1318 # if necessary, advertise that we know RFC 2965
1319 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1320 not request.has_header("Cookie2")):
1321 for cookie in cookies:
1322 if cookie.version != 1:
1323 request.add_unredirected_header("Cookie2", '$Version="1"')
1324 break
1325
1326 self._cookies_lock.release()
1327
1328 self.clear_expired_cookies()
1329
1330 def _normalized_cookie_tuples(self, attrs_set):
1331 """Return list of tuples containing normalised cookie information.
1332
1333 attrs_set is the list of lists of key,value pairs extracted from
1334 the Set-Cookie or Set-Cookie2 headers.
1335
1336 Tuples are name, value, standard, rest, where name and value are the
1337 cookie name and value, standard is a dictionary containing the standard
1338 cookie-attributes (discard, secure, version, expires or max-age,
1339 domain, path and port) and rest is a dictionary containing the rest of
1340 the cookie-attributes.
1341
1342 """
1343 cookie_tuples = []
1344
1345 boolean_attrs = "discard", "secure"
1346 value_attrs = ("version",
1347 "expires", "max-age",
1348 "domain", "path", "port",
1349 "comment", "commenturl")
1350
1351 for cookie_attrs in attrs_set:
1352 name, value = cookie_attrs[0]
1353
1354 # Build dictionary of standard cookie-attributes (standard) and
1355 # dictionary of other cookie-attributes (rest).
1356
1357 # Note: expiry time is normalised to seconds since epoch. V0
1358 # cookies should have the Expires cookie-attribute, and V1 cookies
1359 # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1360 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1361 # accept either (but prefer Max-Age).
1362 max_age_set = False
1363
1364 bad_cookie = False
1365
1366 standard = {}
1367 rest = {}
1368 for k, v in cookie_attrs[1:]:
1369 lc = k.lower()
1370 # don't lose case distinction for unknown fields
1371 if lc in value_attrs or lc in boolean_attrs:
1372 k = lc
1373 if k in boolean_attrs and v is None:
1374 # boolean cookie-attribute is present, but has no value
1375 # (like "discard", rather than "port=80")
1376 v = True
1377 if k in standard:
1378 # only first value is significant
1379 continue
1380 if k == "domain":
1381 if v is None:
1382 debug(" missing value for domain attribute")
1383 bad_cookie = True
1384 break
1385 # RFC 2965 section 3.3.3
1386 v = v.lower()
1387 if k == "expires":
1388 if max_age_set:
1389 # Prefer max-age to expires (like Mozilla)
1390 continue
1391 if v is None:
1392 debug(" missing or invalid value for expires "
1393 "attribute: treating as session cookie")
1394 continue
1395 if k == "max-age":
1396 max_age_set = True
1397 try:
1398 v = int(v)
1399 except ValueError:
1400 debug(" missing or invalid (non-numeric) value for "
1401 "max-age attribute")
1402 bad_cookie = True
1403 break
1404 # convert RFC 2965 Max-Age to seconds since epoch
1405 # XXX Strictly you're supposed to follow RFC 2616
1406 # age-calculation rules. Remember that zero Max-Age is a
1407 # is a request to discard (old and new) cookie, though.
1408 k = "expires"
1409 v = self._now + v
1410 if (k in value_attrs) or (k in boolean_attrs):
1411 if (v is None and
Raymond Hettingerdbecd932005-02-06 06:57:08 +00001412 k not in ("port", "comment", "commenturl")):
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001413 debug(" missing value for %s attribute" % k)
1414 bad_cookie = True
1415 break
1416 standard[k] = v
1417 else:
1418 rest[k] = v
1419
1420 if bad_cookie:
1421 continue
1422
1423 cookie_tuples.append((name, value, standard, rest))
1424
1425 return cookie_tuples
1426
1427 def _cookie_from_cookie_tuple(self, tup, request):
1428 # standard is dict of standard cookie-attributes, rest is dict of the
1429 # rest of them
1430 name, value, standard, rest = tup
1431
1432 domain = standard.get("domain", Absent)
1433 path = standard.get("path", Absent)
1434 port = standard.get("port", Absent)
1435 expires = standard.get("expires", Absent)
1436
1437 # set the easy defaults
1438 version = standard.get("version", None)
1439 if version is not None: version = int(version)
1440 secure = standard.get("secure", False)
1441 # (discard is also set if expires is Absent)
1442 discard = standard.get("discard", False)
1443 comment = standard.get("comment", None)
1444 comment_url = standard.get("commenturl", None)
1445
1446 # set default path
1447 if path is not Absent and path != "":
1448 path_specified = True
1449 path = escape_path(path)
1450 else:
1451 path_specified = False
1452 path = request_path(request)
1453 i = path.rfind("/")
1454 if i != -1:
1455 if version == 0:
1456 # Netscape spec parts company from reality here
1457 path = path[:i]
1458 else:
1459 path = path[:i+1]
1460 if len(path) == 0: path = "/"
1461
1462 # set default domain
1463 domain_specified = domain is not Absent
1464 # but first we have to remember whether it starts with a dot
1465 domain_initial_dot = False
1466 if domain_specified:
1467 domain_initial_dot = bool(domain.startswith("."))
1468 if domain is Absent:
1469 req_host, erhn = eff_request_host(request)
1470 domain = erhn
1471 elif not domain.startswith("."):
1472 domain = "."+domain
1473
1474 # set default port
1475 port_specified = False
1476 if port is not Absent:
1477 if port is None:
1478 # Port attr present, but has no value: default to request port.
1479 # Cookie should then only be sent back on that port.
1480 port = request_port(request)
1481 else:
1482 port_specified = True
1483 port = re.sub(r"\s+", "", port)
1484 else:
1485 # No port attr present. Cookie can be sent back on any port.
1486 port = None
1487
1488 # set default expires and discard
1489 if expires is Absent:
1490 expires = None
1491 discard = True
1492 elif expires <= self._now:
1493 # Expiry date in past is request to delete cookie. This can't be
1494 # in DefaultCookiePolicy, because can't delete cookies there.
1495 try:
1496 self.clear(domain, path, name)
1497 except KeyError:
1498 pass
1499 debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1500 domain, path, name)
1501 return None
1502
1503 return Cookie(version,
1504 name, value,
1505 port, port_specified,
1506 domain, domain_specified, domain_initial_dot,
1507 path, path_specified,
1508 secure,
1509 expires,
1510 discard,
1511 comment,
1512 comment_url,
1513 rest)
1514
1515 def _cookies_from_attrs_set(self, attrs_set, request):
1516 cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1517
1518 cookies = []
1519 for tup in cookie_tuples:
1520 cookie = self._cookie_from_cookie_tuple(tup, request)
1521 if cookie: cookies.append(cookie)
1522 return cookies
1523
Neal Norwitz71dad722005-12-23 21:43:48 +00001524 def _process_rfc2109_cookies(self, cookies):
1525 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1526 if rfc2109_as_ns is None:
1527 rfc2109_as_ns = not self._policy.rfc2965
1528 for cookie in cookies:
1529 if cookie.version == 1:
1530 cookie.rfc2109 = True
Tim Peters536cf992005-12-25 23:18:31 +00001531 if rfc2109_as_ns:
Neal Norwitz71dad722005-12-23 21:43:48 +00001532 # treat 2109 cookies as Netscape cookies rather than
1533 # as RFC2965 cookies
1534 cookie.version = 0
1535
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001536 def make_cookies(self, response, request):
1537 """Return sequence of Cookie objects extracted from response object."""
1538 # get cookie-attributes for RFC 2965 and Netscape protocols
1539 headers = response.info()
1540 rfc2965_hdrs = headers.getheaders("Set-Cookie2")
1541 ns_hdrs = headers.getheaders("Set-Cookie")
1542
1543 rfc2965 = self._policy.rfc2965
1544 netscape = self._policy.netscape
1545
1546 if ((not rfc2965_hdrs and not ns_hdrs) or
1547 (not ns_hdrs and not rfc2965) or
1548 (not rfc2965_hdrs and not netscape) or
1549 (not netscape and not rfc2965)):
1550 return [] # no relevant cookie headers: quick exit
1551
1552 try:
1553 cookies = self._cookies_from_attrs_set(
1554 split_header_words(rfc2965_hdrs), request)
1555 except:
1556 reraise_unmasked_exceptions()
1557 cookies = []
1558
1559 if ns_hdrs and netscape:
1560 try:
Neal Norwitz71dad722005-12-23 21:43:48 +00001561 # RFC 2109 and Netscape cookies
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001562 ns_cookies = self._cookies_from_attrs_set(
1563 parse_ns_headers(ns_hdrs), request)
1564 except:
1565 reraise_unmasked_exceptions()
1566 ns_cookies = []
Neal Norwitz71dad722005-12-23 21:43:48 +00001567 self._process_rfc2109_cookies(ns_cookies)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001568
1569 # Look for Netscape cookies (from Set-Cookie headers) that match
1570 # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1571 # For each match, keep the RFC 2965 cookie and ignore the Netscape
1572 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are
1573 # bundled in with the Netscape cookies for this purpose, which is
1574 # reasonable behaviour.
1575 if rfc2965:
1576 lookup = {}
1577 for cookie in cookies:
1578 lookup[(cookie.domain, cookie.path, cookie.name)] = None
1579
1580 def no_matching_rfc2965(ns_cookie, lookup=lookup):
1581 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1582 return key not in lookup
1583 ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1584
1585 if ns_cookies:
1586 cookies.extend(ns_cookies)
1587
1588 return cookies
1589
1590 def set_cookie_if_ok(self, cookie, request):
1591 """Set a cookie if policy says it's OK to do so."""
1592 self._cookies_lock.acquire()
1593 self._policy._now = self._now = int(time.time())
1594
1595 if self._policy.set_ok(cookie, request):
1596 self.set_cookie(cookie)
1597
1598 self._cookies_lock.release()
1599
1600 def set_cookie(self, cookie):
1601 """Set a cookie, without checking whether or not it should be set."""
1602 c = self._cookies
1603 self._cookies_lock.acquire()
1604 try:
1605 if cookie.domain not in c: c[cookie.domain] = {}
1606 c2 = c[cookie.domain]
1607 if cookie.path not in c2: c2[cookie.path] = {}
1608 c3 = c2[cookie.path]
1609 c3[cookie.name] = cookie
1610 finally:
1611 self._cookies_lock.release()
1612
1613 def extract_cookies(self, response, request):
1614 """Extract cookies from response, where allowable given the request."""
1615 debug("extract_cookies: %s", response.info())
1616 self._cookies_lock.acquire()
1617 self._policy._now = self._now = int(time.time())
1618
1619 for cookie in self.make_cookies(response, request):
1620 if self._policy.set_ok(cookie, request):
1621 debug(" setting cookie: %s", cookie)
1622 self.set_cookie(cookie)
1623 self._cookies_lock.release()
1624
1625 def clear(self, domain=None, path=None, name=None):
1626 """Clear some cookies.
1627
1628 Invoking this method without arguments will clear all cookies. If
1629 given a single argument, only cookies belonging to that domain will be
1630 removed. If given two arguments, cookies belonging to the specified
1631 path within that domain are removed. If given three arguments, then
1632 the cookie with the specified name, path and domain is removed.
1633
1634 Raises KeyError if no matching cookie exists.
1635
1636 """
1637 if name is not None:
1638 if (domain is None) or (path is None):
1639 raise ValueError(
1640 "domain and path must be given to remove a cookie by name")
1641 del self._cookies[domain][path][name]
1642 elif path is not None:
1643 if domain is None:
1644 raise ValueError(
1645 "domain must be given to remove cookies by path")
1646 del self._cookies[domain][path]
1647 elif domain is not None:
1648 del self._cookies[domain]
1649 else:
1650 self._cookies = {}
1651
1652 def clear_session_cookies(self):
1653 """Discard all session cookies.
1654
1655 Note that the .save() method won't save session cookies anyway, unless
1656 you ask otherwise by passing a true ignore_discard argument.
1657
1658 """
1659 self._cookies_lock.acquire()
1660 for cookie in self:
1661 if cookie.discard:
1662 self.clear(cookie.domain, cookie.path, cookie.name)
1663 self._cookies_lock.release()
1664
1665 def clear_expired_cookies(self):
1666 """Discard all expired cookies.
1667
1668 You probably don't need to call this method: expired cookies are never
1669 sent back to the server (provided you're using DefaultCookiePolicy),
1670 this method is called by CookieJar itself every so often, and the
1671 .save() method won't save expired cookies anyway (unless you ask
1672 otherwise by passing a true ignore_expires argument).
1673
1674 """
1675 self._cookies_lock.acquire()
1676 now = time.time()
1677 for cookie in self:
1678 if cookie.is_expired(now):
1679 self.clear(cookie.domain, cookie.path, cookie.name)
1680 self._cookies_lock.release()
1681
1682 def __iter__(self):
1683 return deepvalues(self._cookies)
1684
1685 def __len__(self):
1686 """Return number of contained cookies."""
1687 i = 0
1688 for cookie in self: i = i + 1
1689 return i
1690
1691 def __repr__(self):
1692 r = []
1693 for cookie in self: r.append(repr(cookie))
1694 return "<%s[%s]>" % (self.__class__, ", ".join(r))
1695
1696 def __str__(self):
1697 r = []
1698 for cookie in self: r.append(str(cookie))
1699 return "<%s[%s]>" % (self.__class__, ", ".join(r))
1700
1701
# derives from IOError for backwards-compatibility with Python 2.4.0
class LoadError(IOError):
    """Raised when cookies cannot be loaded from a file."""
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001704
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    This class does not define a file format itself: .save() raises
    NotImplementedError and .load() delegates to a ._really_load() method
    that is not defined here.

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        Raises ValueError if filename is neither None nor string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            # string-likeness is tested by concatenating with a string
            try:
                filename+""
            except Exception:
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file."""
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised when neither
        is available.  The file is always closed again.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename)
        try:
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        self._cookies_lock.acquire()
        # try/finally: the lock must be released whether or not load() fails
        try:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                self._cookies = old_state
                raise
        finally:
            self._cookies_lock.release()
1762
1763from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
1764from _MozillaCookieJar import MozillaCookieJar