"""HTTP cookie handling for web clients.

This module has (now fairly distant) origins in Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

Class diagram (note that the classes which do not derive from
FileCookieJar are not distributed with the Python standard library, but
are available from http://wwwsearch.sf.net/):

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

"""
27
28import sys, re, urlparse, copy, time, struct, urllib, types, logging
29from types import StringTypes
30try:
31 import threading as _threading
32except ImportError:
33 import dummy_threading as _threading
34import httplib # only for the default HTTP port
35from calendar import timegm
36
# Module-wide debug hook: the bound ``debug`` method of the "cookielib" logger.
debug = logging.getLogger("cookielib").debug

# The default HTTP port, held as a string.
DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
# Message text about a missing filename.
# NOTE(review): the code that uses it lies outside this section.
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
42
def reraise_unmasked_exceptions(unmasked=()):
    """Re-raise the exception being handled unless it may be swallowed.

    Intended to be called from inside a catch-all ``except`` clause.
    KeyboardInterrupt, SystemExit, MemoryError and any exception class listed
    in the tuple *unmasked* are re-raised.  Anything else is swallowed: a
    "cookielib bug!" warning is issued and the traceback is printed.

    """
    never_swallowed = unmasked + (KeyboardInterrupt, SystemExit, MemoryError)
    current_type = sys.exc_info()[0]
    if issubclass(current_type, never_swallowed):
        raise
    # Anything reaching this point is being swallowed: report it loudly.
    import warnings
    warnings.warn("cookielib bug!", stacklevel=2)
    import traceback
    traceback.print_exc()
56
57
58# Date/time conversion
59# -----------------------------------------------------------------------------
60
61EPOCH_YEAR = 1970
62def _timegm(tt):
63 year, month, mday, hour, min, sec = tt[:6]
64 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
65 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
66 return timegm(tt)
67 else:
68 return None
69
# Abbreviated weekday names, Monday first (the order of tm_wday).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# Abbreviated month names, January first.
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copy of MONTHS, for case-insensitive lookups.
MONTHS_LOWER = [month.lower() for month in MONTHS]
75
def time2isoz(t=None):
    """Format t, a time in seconds since the epoch, as an ISO-style string.

    When t is omitted (or None) the current time is used.

    The result has the form "YYYY-MM-DD hh:mm:ssZ" and is expressed in
    Universal Time (UTC, aka GMT), for example:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    # gmtime's first six fields are year, month, day, hour, minute, second.
    fields = tuple(time.gmtime(t)[:6])
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % fields
92
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec, wday = time.gmtime(t)[:7]
    # The comma after the weekday is part of the documented format.
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[wday], mday, MONTHS[mon-1], year, hour, minute, sec)
108
109
# Zone names that mean UTC; only the keys matter.
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric zone: optional sign, one or two hour digits, optional minutes.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
def offset_from_tz_string(tz):
    """Return the UTC offset in seconds described by tz, or None.

    tz is either one of the names in UTC_ZONES (offset 0) or a numeric
    offset such as "-0800", "+01:00" or "5".  None means the string was not
    understood.

    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if not match:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds = seconds + 60 * int(minutes)
    if sign == '-':
        seconds = -seconds
    return seconds
126
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Turn the date/time fields of a parsed string into epoch seconds.

    mon may be a month name found in MONTHS_LOWER (compared ignoring case)
    or a month number; hr, min and sec may be None, meaning zero; tz may be
    None, meaning UTC.  Returns None when the month or the timezone is not
    recognised or when _timegm rejects the date.

    """
    # Month names are tried first; month numbers start with 1 (January).
    month_key = mon.lower()
    if month_key in MONTHS_LOWER:
        mon = MONTHS_LOWER.index(month_key) + 1
    else:
        # maybe it's already a number
        try:
            month_number = int(mon)
        except ValueError:
            return None
        if not 1 <= month_number <= 12:
            return None
        mon = month_number

    # A missing clock element counts as zero.
    if hr is None:
        hr = 0
    if min is None:
        min = 0
    if sec is None:
        sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    sec = int(sec)

    if yr < 1000:
        # Short year: place it in the current century, then shift it by a
        # century if that leaves it more than 50 years from the current year.
        this_year = time.localtime(time.time())[0]
        year_in_century = this_year % 100
        distance = year_in_century - yr
        yr = yr + this_year - year_in_century
        if distance > 50:
            yr = yr + 100
        elif distance < -50:
            yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))
    if t is None:
        return None

    # adjust time using timezone string, to get absolute time since epoch
    if tz is None:
        tz = "UTC"
    offset = offset_from_tz_string(tz.upper())
    if offset is None:
        return None
    return t - offset
179
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    The return value is a number; the fast path for strictly conforming
    strings converts the seconds field with float().

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # STRICT_DATE_RE only checks the shape of the month ("Jax"
            # passes); an unknown name is an unrecognised date, not an error.
            return None
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), float(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
257
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    A fractional part of the seconds field is accepted and discarded.

    """
    # clean up
    text = text.lstrip()

    # loose regexp parse
    m = ISO_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    yr, mon, day, hr, min, sec, tz, _ = m.groups()

    if sec is not None:
        # ISO_DATE_RE admits fractional seconds ("29.5"), which the int()
        # conversion in _str2time would reject with ValueError.
        sec = sec.split(".", 1)[0]

    return _str2time(day, mon, yr, hr, min, sec, tz)
302
303
304# Header parsing
305# -----------------------------------------------------------------------------
306
def unmatched(match):
    """Return unmatched part of re.Match object."""
    start, end = match.span(0)
    return match.string[:start]+match.string[end:]

HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    header_values must be a sequence of strings, not a single string.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    # A lone string would be walked character by character.  The tuple is
    # the byte-string and unicode-string types (the pair in
    # types.StringTypes); isinstance also catches their subclasses.
    assert not isinstance(header_values, (type(""), type(u"")))
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    # drop the backslash of each quoted-pair
                    value = HEADER_ESCAPE_RE.sub(r"\1", m.group(1))
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1).rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs:
                    result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs:
            result.append(pairs)
    return result
400
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Build one header value from lists of (key, value) pairs.

    This is (almost) the inverse of split_header_words.  Pairs within a list
    are joined with "; ", the lists themselves with ", ".  A value of None
    yields the bare key; any other value is quoted when needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    rendered = []
    for pairs in lists:
        parts = []
        for key, value in pairs:
            if value is None:
                parts.append(key)
                continue
            if not re.search(r"^\w+$", value):
                # escape " and \ and wrap the result in double quotes
                value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
            parts.append("%s=%s" % (key, value))
        if parts:
            rendered.append("; ".join(parts))
    return ", ".join(rendered)
426
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    Returns one list of (key, value) pairs per non-empty header.  Names of
    known attributes are lower-cased, a token without "=" gets the value
    None, an expires value is converted with http2time (None if invalid or
    absent), and ("version", "0") is appended when no "version" key was seen.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for param in re.split(r";\s*", ns_header):
            param = param.rstrip()
            if param == "":
                continue
            if "=" not in param:
                # a flag attribute such as "secure", or a cookie with
                # missing value
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, maxsplit=1)
                k = k.lstrip()
            lc = k.lower()
            if lc in known_attrs:
                k = lc
            if k == "version":
                # This is an RFC 2109 cookie.  Will be treated as RFC 2965
                # cookie in rest of code.
                # Probably it should be parsed with split_header_words, but
                # that's too much hassle.
                version_set = True
            if k == "expires" and v is not None:
                # convert expires date to seconds since epoch; a bare
                # "expires" token has no date and keeps the value None
                if v.startswith('"'):
                    v = v[1:]
                if v.endswith('"'):
                    v = v[:-1]
                v = http2time(v)  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
485
486
IPV4_RE = re.compile(r"\.\d+$")
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        # ends in ".<digits>": treated as an IP address
        return False
    if text == "":
        return False
    if text[0] == "." or text[-1] == ".":
        return False
    return True

def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    if not A.endswith(B):
        # A does not have form NB.  B must be a suffix of A: merely occurring
        # somewhere inside A (x.y.com.evil.net vs .y.com) is not a match.
        # N is non-empty here because A != B.
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
541
def liberal_is_HDN(text):
    """Return True if text looks roughly like a host domain name.

    Used when accepting/blocking domains: anything that does not end in
    ".<digits>" (the IPV4_RE test) qualifies.

    """
    return not IPV4_RE.search(text)
551
def user_domain_match(A, B):
    """Return True if A matches the user-supplied domain pattern B.

    For blocking/accepting domains.  A and B may be host domain names or IP
    addresses; the comparison ignores case.  A pattern starting with a dot
    matches every A that ends with it; any other pattern (and anything that
    looks like an IP address) must equal A exactly.

    """
    A = A.lower()
    B = B.lower()
    if liberal_is_HDN(A) and liberal_is_HDN(B):
        if B.startswith("."):
            return A.endswith(B)
        return A == B
    # At least one side looks like an IP address: only equality counts.
    return A == B
571
cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    The host is taken from the request's full URL, falling back to its
    "Host" header when the URL has no network location; a trailing
    ":<digits>" port is dropped.

    """
    netloc = urlparse.urlparse(request.get_full_url())[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    without_port = cut_port_re.sub("", netloc, 1)
    return without_port.lower()
588
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  A request-host
    without any dot (and not matching IPV4_RE) gets ".local" appended to form
    the effective name; otherwise both items are the same string.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        erhn = req_host + ".local"
    else:
        erhn = req_host
    return req_host, erhn
599
def request_path(request):
    """request-URI, as defined by RFC 2965.

    Built from the path (with any ";parameters" re-attached and escaped by
    escape_path), the query and the fragment of the request's full URL.  The
    result always starts with "/".

    """
    parsed = urlparse.urlparse(request.get_full_url())
    path, parameters, query, frag = parsed[2:]
    if parameters:
        path = "%s;%s" % (path, parameters)
    escaped = escape_path(path)
    req_path = urlparse.urlunparse(("", "", escaped, "", query, frag))
    if req_path.startswith("/"):
        return req_path
    # fix bad RFC 2396 absoluteURI
    return "/" + req_path
614
def request_port(request):
    """Return the port of the request's host, as a string.

    The text after the first ":" in request.get_host() is returned when it
    is numeric; a non-numeric port is logged and None is returned.  Without
    a ":" the result is DEFAULT_HTTP_PORT.

    """
    host = request.get_host()
    colon = host.find(':')
    if colon < 0:
        return DEFAULT_HTTP_PORT
    port = host[colon+1:]
    try:
        int(port)
    except ValueError:
        debug("nonnumeric port: '%s'", port)
        return None
    return port
628
629# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
630# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# A %-escape: "%" followed by two hexadecimal digits (captured).
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the %-escape matched by ESCAPED_CHAR_RE with upper-case digits."""
    hex_digits = match.group(1)
    return "%" + hex_digits.upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, types.UnicodeType):
        path = path.encode("utf-8")
    quoted = urllib.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
650
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    first_dot = h.find(".")
    if first_dot < 0:
        # no A.B form at all
        return h
    # h is A.B, where A is everything before the first dot
    b = h[first_dot+1:]
    b_has_dot = b.find(".") >= 0
    if is_HDN(h) and (b_has_dot or b == "local"):
        return "." + b
    return h
685
def is_third_party(request):
    """Return True if request goes to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(req_host, origin_reach)
701
702
class Cookie:
    """HTTP Cookie.

    One class serves both Netscape and RFC 2965 cookies.

    It is deliberately a plain attribute holder, so instances that violate
    the cookie standards can be constructed.  CookieJar.make_cookies is the
    factory function for Cookie objects -- it deals with cookie parsing,
    supplying defaults, and normalising to the representation used in this
    class.  CookiePolicy is responsible for checking them to see whether
    they should be accepted from and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than"Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest):
        # version and expires are stored as integers (or None)
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value

        self.port = port
        self.port_specified = port_specified

        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot

        self.path = path
        self.path_specified = path_specified

        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url

        # non-standard cookie-attributes; a shallow copy is kept
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return true if a non-standard attribute called name is stored."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard attribute name, or default if absent."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Store value as the non-standard attribute name."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        now defaults to the current time; a cookie whose expires is None
        never counts as expired.

        """
        if now is None:
            now = time.time()
        return (self.expires is not None) and (self.expires <= now)

    def __str__(self):
        limit = self.domain
        if self.port is not None:
            limit = limit + ":" + self.port
        limit = limit + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        attr_names = ("version", "name", "value",
                      "port", "port_specified",
                      "domain", "domain_specified", "domain_initial_dot",
                      "path", "path_specified",
                      "secure", "expires", "discard", "comment", "comment_url",
                      )
        args = ["%s=%s" % (attr_name, repr(getattr(self, attr_name)))
                for attr_name in attr_names]
        args.append("rest=%s" % repr(self._rest))
        return "Cookie(%s)" % ", ".join(args)
795
796
class CookiePolicy:
    """Decides which cookies are accepted from and returned to a server.

    A policy may also modify cookies, though this is probably a bad idea.

    The standard rules for Netscape and RFC 2965 cookies live in the subclass
    DefaultCookiePolicy -- override that if you want a customised policy.

    """
    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation always returns True.

        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation always returns True.

        """
        return True
828
829
830class DefaultCookiePolicy(CookiePolicy):
831 """Implements the standard rules for accepting and returning cookies."""
832
833 DomainStrictNoDots = 1
834 DomainStrictNonDomain = 2
835 DomainRFC2965Match = 4
836
837 DomainLiberal = 0
838 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
839
840 def __init__(self,
841 blocked_domains=None, allowed_domains=None,
842 netscape=True, rfc2965=False,
843 hide_cookie2=False,
844 strict_domain=False,
845 strict_rfc2965_unverifiable=True,
846 strict_ns_unverifiable=False,
847 strict_ns_domain=DomainLiberal,
848 strict_ns_set_initial_dollar=False,
849 strict_ns_set_path=False,
850 ):
851 """Constructor arguments should be passed as keyword arguments only."""
852 self.netscape = netscape
853 self.rfc2965 = rfc2965
854 self.hide_cookie2 = hide_cookie2
855 self.strict_domain = strict_domain
856 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
857 self.strict_ns_unverifiable = strict_ns_unverifiable
858 self.strict_ns_domain = strict_ns_domain
859 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
860 self.strict_ns_set_path = strict_ns_set_path
861
862 if blocked_domains is not None:
863 self._blocked_domains = tuple(blocked_domains)
864 else:
865 self._blocked_domains = ()
866
867 if allowed_domains is not None:
868 allowed_domains = tuple(allowed_domains)
869 self._allowed_domains = allowed_domains
870
871 def blocked_domains(self):
872 """Return the sequence of blocked domains (as a tuple)."""
873 return self._blocked_domains
874 def set_blocked_domains(self, blocked_domains):
875 """Set the sequence of blocked domains."""
876 self._blocked_domains = tuple(blocked_domains)
877
878 def is_blocked(self, domain):
879 for blocked_domain in self._blocked_domains:
880 if user_domain_match(domain, blocked_domain):
881 return True
882 return False
883
884 def allowed_domains(self):
885 """Return None, or the sequence of allowed domains (as a tuple)."""
886 return self._allowed_domains
887 def set_allowed_domains(self, allowed_domains):
888 """Set the sequence of allowed domains, or None."""
889 if allowed_domains is not None:
890 allowed_domains = tuple(allowed_domains)
891 self._allowed_domains = allowed_domains
892
893 def is_not_allowed(self, domain):
894 if self._allowed_domains is None:
895 return False
896 for allowed_domain in self._allowed_domains:
897 if user_domain_match(domain, allowed_domain):
898 return False
899 return True
900
901 def set_ok(self, cookie, request):
902 """
903 If you override .set_ok(), be sure to call this method. If it returns
904 false, so should your subclass (assuming your subclass wants to be more
905 strict about which cookies to accept).
906
907 """
908 debug(" - checking cookie %s=%s", cookie.name, cookie.value)
909
910 assert cookie.name is not None
911
912 for n in "version", "verifiability", "name", "path", "domain", "port":
913 fn_name = "set_ok_"+n
914 fn = getattr(self, fn_name)
915 if not fn(cookie, request):
916 return False
917
918 return True
919
920 def set_ok_version(self, cookie, request):
921 if cookie.version is None:
922 # Version is always set to 0 by parse_ns_headers if it's a Netscape
923 # cookie, so this must be an invalid RFC 2965 cookie.
924 debug(" Set-Cookie2 without version attribute (%s=%s)",
925 cookie.name, cookie.value)
926 return False
927 if cookie.version > 0 and not self.rfc2965:
928 debug(" RFC 2965 cookies are switched off")
929 return False
930 elif cookie.version == 0 and not self.netscape:
931 debug(" Netscape cookies are switched off")
932 return False
933 return True
934
935 def set_ok_verifiability(self, cookie, request):
936 if request.is_unverifiable() and is_third_party(request):
937 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
938 debug(" third-party RFC 2965 cookie during "
939 "unverifiable transaction")
940 return False
941 elif cookie.version == 0 and self.strict_ns_unverifiable:
942 debug(" third-party Netscape cookie during "
943 "unverifiable transaction")
944 return False
945 return True
946
947 def set_ok_name(self, cookie, request):
948 # Try and stop servers setting V0 cookies designed to hack other
949 # servers that know both V0 and V1 protocols.
950 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
951 cookie.name.startswith("$")):
952 debug(" illegal name (starts with '$'): '%s'", cookie.name)
953 return False
954 return True
955
956 def set_ok_path(self, cookie, request):
957 if cookie.path_specified:
958 req_path = request_path(request)
959 if ((cookie.version > 0 or
960 (cookie.version == 0 and self.strict_ns_set_path)) and
961 not req_path.startswith(cookie.path)):
962 debug(" path attribute %s is not a prefix of request "
963 "path %s", cookie.path, req_path)
964 return False
965 return True
966
    def set_ok_domain(self, cookie, request):
        """Return True if the cookie's domain is acceptable for this request.

        The user block-list and allow-list are consulted first.  The
        remaining checks run only when the cookie carried an explicit domain
        (cookie.domain_specified); each failure is logged via debug() and
        makes the result False.

        """
        if self.is_blocked(cookie.domain):
            debug(" domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            debug(" domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            # Optional strict check: refuse domains of the form ".sld.tld"
            # where sld is one of the listed second-level names and tld has
            # two characters.
            if self.strict_domain and (domain.count(".") >= 2):
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    if (sld.lower() in [
                        "co", "ac",
                        "com", "edu", "org", "net", "gov", "mil", "int"] and
                        len(tld) == 2):
                        # domain like .co.uk
                        debug(" country-code second level domain %s", domain)
                        return False
            # Apart from ".local", the domain (ignoring one leading dot) must
            # contain a dot.
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                debug(" non-local domain %s contains no embedded dot",
                      domain)
                return False
            # Version 0: the effective request-host, as is or with a dot
            # prepended, must end with the domain.
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    debug(" effective request-host %s (even with added "
                          "initial dot) does not end end with %s",
                          erhn, domain)
                    return False
            # Version > 0 always, version 0 only when the DomainRFC2965Match
            # bit is set: require a domain-match.
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    debug(" effective request-host %s does not domain-match "
                          "%s", erhn, domain)
                    return False
            # Version > 0 always, version 0 only when the DomainStrictNoDots
            # bit is set: the part of the request-host in front of the domain
            # must not contain a dot (hosts matching IPV4_RE are exempt).
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    debug(" host prefix %s for domain %s contains a dot",
                          host_prefix, domain)
                    return False
        return True
1022
def set_ok_port(self, cookie, request):
    """Return false if the cookie's explicit port list rules out the request.

    A cookie without an explicit port list is always accepted.  Otherwise
    the comma-separated list is scanned in order: a non-numeric entry seen
    before a match rejects the cookie, and so does reaching the end of the
    list without finding the request port.

    """
    if not cookie.port_specified:
        return True

    req_port = request_port(request)
    if req_port is not None:
        req_port = str(req_port)
    else:
        req_port = "80"

    for candidate in cookie.port.split(","):
        try:
            int(candidate)
        except ValueError:
            debug(" bad port %s (not numeric)", candidate)
            return False
        if candidate == req_port:
            return True

    debug(" request port (%s) not found in %s",
          req_port, cookie.port)
    return False
1043
def return_ok(self, cookie, request):
    """
    Run every return_ok_* check; the first one that fails vetoes the cookie.

    If you override .return_ok(), be sure to call this method.  If it
    returns false, so should your subclass (assuming your subclass wants to
    be more strict about which cookies to return).

    """
    # Path has already been checked by .path_return_ok(), and domain
    # blocking done by .domain_return_ok().
    debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    checks = ("version", "verifiability", "secure", "expires", "port",
              "domain")
    for suffix in checks:
        # Looked up by name on self so that subclasses can override
        # individual checks.
        check = getattr(self, "return_ok_" + suffix)
        if not check(cookie, request):
            return False
    return True
1061
def return_ok_version(self, cookie, request):
    """Return false if the cookie's protocol version is switched off.

    Cookies with version > 0 need self.rfc2965 to be true; version 0
    cookies need self.netscape to be true.

    """
    version = cookie.version
    if version > 0:
        if not self.rfc2965:
            debug(" RFC 2965 cookies are switched off")
            return False
    elif version == 0 and not self.netscape:
        debug(" Netscape cookies are switched off")
        return False
    return True
1070
def return_ok_verifiability(self, cookie, request):
    """Return false for a third-party cookie in an unverifiable transaction.

    Only applies when the request is both unverifiable and third-party, and
    then only if the strictness switch matching the cookie's version
    (strict_rfc2965_unverifiable or strict_ns_unverifiable) is set.

    """
    if not (request.is_unverifiable() and is_third_party(request)):
        return True

    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        debug(" third-party RFC 2965 cookie during unverifiable "
              "transaction")
        return False
    if cookie.version == 0 and self.strict_ns_unverifiable:
        debug(" third-party Netscape cookie during unverifiable "
              "transaction")
        return False
    return True
1082
def return_ok_secure(self, cookie, request):
    """Return false if a secure cookie would go out on a non-https request."""
    if not cookie.secure:
        return True
    if request.get_type() == "https":
        return True
    debug(" secure cookie with non-secure request")
    return False
1088
def return_ok_expires(self, cookie, request):
    """Return false if the cookie has expired as of self._now."""
    expired = cookie.is_expired(self._now)
    if not expired:
        return True
    debug(" cookie expired")
    return False
1094
def return_ok_port(self, cookie, request):
    """Return false if the cookie is port-restricted and the port differs.

    A cookie with a false cookie.port (None or empty) may be returned on
    any port.  Otherwise the request port, taken as "80" when
    request_port() returns None, must appear in the cookie's
    comma-separated port list.

    """
    if not cookie.port:
        return True

    req_port = request_port(request)
    if req_port is None:
        req_port = "80"

    if req_port not in cookie.port.split(","):
        debug(" request port %s does not match cookie port %s",
              req_port, cookie.port)
        return False
    return True
1108
def return_ok_domain(self, cookie, request):
    """Return false if the cookie's domain does not match the request host.

    Version > 0 cookies must domain-match the effective request-host.
    Version 0 (Netscape) cookies are matched by suffix against the dotted
    form of the cookie domain, so that a cookie stored for "example.com"
    is not returned to an unrelated host such as "badexample.com".

    """
    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    # Compare on a label boundary: a bare suffix test against an undotted
    # domain would also accept hosts that merely end with the same
    # characters.
    if domain and not domain.startswith("."):
        dotted_domain = "."+domain
    else:
        dotted_domain = domain

    # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
    if (cookie.version == 0 and
            (self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
        debug(" cookie with unspecified domain does not string-compare "
              "equal to request domain")
        return False

    if cookie.version > 0 and not domain_match(erhn, domain):
        debug(" effective request-host name %s does not domain-match "
              "RFC 2965 cookie domain %s", erhn, domain)
        return False
    if cookie.version == 0 and not ("."+erhn).endswith(dotted_domain):
        debug(" request-host %s does not match Netscape cookie domain "
              "%s", req_host, domain)
        return False
    return True
1130
def domain_return_ok(self, domain, request):
    """Cheap first-pass check of whether cookies for domain may be returned.

    Returns false if neither the request host nor the effective
    request-host ends with domain (all compared in dotted form), or if the
    domain is in the user block-list or missing from the user allow-list.

    """
    # Liberal check of domain.  This is here as an optimization to avoid
    # having to load lots of MSIE cookie files unless necessary.
    req_host, erhn = eff_request_host(request)
    # Normalise to dotted form in place, so both names are always bound
    # even when the host already starts with a dot.
    if not req_host.startswith("."):
        req_host = "."+req_host
    if not erhn.startswith("."):
        erhn = "."+erhn
    # Dot the domain too, so that "example.com" cannot suffix-match an
    # unrelated host such as "badexample.com".
    if domain and not domain.startswith("."):
        dotted_domain = "."+domain
    else:
        dotted_domain = domain
    if not (req_host.endswith(dotted_domain) or
            erhn.endswith(dotted_domain)):
        return False

    if self.is_blocked(domain):
        debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        debug(" domain %s is not in user allow-list", domain)
        return False

    return True
1153
def path_return_ok(self, path, request):
    """Return true if the request path path-matches the cookie path.

    The paths match when they are identical, or when the cookie path is a
    prefix of the request path that ends on a "/" boundary.  A plain prefix
    test is not enough: cookie path "/foo" must not match request path
    "/foobar".

    """
    debug("- checking cookie path=%s", path)
    req_path = request_path(request)
    if req_path == path:
        return True
    pathlen = len(path)
    if (req_path.startswith(path) and
            (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")):
        return True

    debug(" %s does not path-match %s", req_path, path)
    return False
1161
1162
def vals_sorted_by_key(adict):
    """Return a list of adict's values, ordered by their sorted keys."""
    return [adict.get(key) for key in sorted(adict.keys())]
1167
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    Any value that has an .items attribute is treated as a nested mapping
    and descended into; every other value is yielded as a leaf.

    """
    # Snapshot this level's values (ordered by key) before yielding
    # anything.
    ordered_values = [mapping.get(key) for key in sorted(mapping.keys())]
    for value in ordered_values:
        try:
            value.items
        except AttributeError:
            nested = False
        else:
            nested = True

        if nested:
            for leaf in deepvalues(value):
                yield leaf
        else:
            yield value
1183
1184
1185# Used as second parameter to dict.get() method, to distinguish absent
1186# dict key from one with a None value.
class Absent:
    """Marker passed as the default to dict.get() to detect a missing key."""
1188
class CookieJar:
    """Collection of HTTP cookies.

    You may not need to know about this class: try
    urllib2.build_opener(HTTPCookieProcessor).open(url).

    Cookies are kept in the nested dict self._cookies, indexed as
    self._cookies[domain][path][name].  Methods that take
    self._cookies_lock release it again even when the policy, the request
    or the response raises.

    """

    non_word_re = re.compile(r"\W")
    quote_re = re.compile(r"([\"\\])")
    strict_domain_re = re.compile(r"\.?[^.]*")
    domain_re = re.compile(r"[^.]*")
    dots_re = re.compile(r"^\.+")

    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"

    def __init__(self, policy=None):
        """Create an empty jar; policy defaults to DefaultCookiePolicy()."""
        if policy is None:
            policy = DefaultCookiePolicy()
        self._policy = policy

        self._cookies_lock = _threading.RLock()
        self._cookies = {}

    def set_policy(self, policy):
        """Replace the cookie policy used by this jar."""
        self._policy = policy

    def _cookies_for_domain(self, domain, request):
        """Return the cookies stored under domain that the policy allows."""
        cookies = []
        if not self._policy.domain_return_ok(domain, request):
            return []
        debug("Checking %s for cookies to return", domain)
        cookies_by_path = self._cookies[domain]
        for path in cookies_by_path.keys():
            if not self._policy.path_return_ok(path, request):
                continue
            cookies_by_name = cookies_by_path[path]
            for cookie in cookies_by_name.values():
                if not self._policy.return_ok(cookie, request):
                    debug(" not returning cookie")
                    continue
                debug(" it's a match")
                cookies.append(cookie)
        return cookies

    def _cookies_for_request(self, request):
        """Return a list of cookies to be returned to server."""
        cookies = []
        for domain in self._cookies.keys():
            cookies.extend(self._cookies_for_domain(domain, request))
        return cookies

    def _cookie_attrs(self, cookies):
        """Return a list of cookie-attributes to be returned to server.

        like ['foo="bar"; $Path="/"', ...]

        The $Version attribute is also added when appropriate (currently only
        once per request).

        Note that the cookies list is sorted in place.

        """
        # add cookies in order of most specific (ie. longest) path first
        # (the sort is stable, so equal-length paths keep their order)
        cookies.sort(key=lambda c: len(c.path), reverse=True)

        version_set = False

        attrs = []
        for cookie in cookies:
            # set version of Cookie header
            # XXX
            # What should it be if multiple matching Set-Cookie headers have
            #  different versions themselves?
            # Answer: there is no answer; was supposed to be settled by
            #  RFC 2965 errata, but that may never appear...
            version = cookie.version
            if not version_set:
                version_set = True
                if version > 0:
                    attrs.append("$Version=%s" % version)

            # quote cookie value if necessary
            # (not for Netscape protocol, which already has any quotes
            #  intact, due to the poorly-specified Netscape Cookie: syntax)
            if ((cookie.value is not None) and
                    self.non_word_re.search(cookie.value) and version > 0):
                value = self.quote_re.sub(r"\\\1", cookie.value)
            else:
                value = cookie.value

            # add cookie-attributes to be returned in Cookie header
            if cookie.value is None:
                attrs.append(cookie.name)
            else:
                attrs.append("%s=%s" % (cookie.name, value))
            if version > 0:
                if cookie.path_specified:
                    attrs.append('$Path="%s"' % cookie.path)
                if cookie.domain.startswith("."):
                    domain = cookie.domain
                    if (not cookie.domain_initial_dot and
                            domain.startswith(".")):
                        domain = domain[1:]
                    attrs.append('$Domain="%s"' % domain)
                if cookie.port is not None:
                    p = "$Port"
                    if cookie.port_specified:
                        p = p + ('="%s"' % cookie.port)
                    attrs.append(p)

        return attrs

    def add_cookie_header(self, request):
        """Add correct Cookie: header to request (urllib2.Request object).

        The Cookie2 header is also added unless policy.hide_cookie2 is true.

        """
        debug("add_cookie_header")
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            cookies = self._cookies_for_request(request)

            attrs = self._cookie_attrs(cookies)
            if attrs:
                if not request.has_header("Cookie"):
                    request.add_unredirected_header(
                        "Cookie", "; ".join(attrs))

            # if necessary, advertise that we know RFC 2965
            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
                    not request.has_header("Cookie2")):
                for cookie in cookies:
                    if cookie.version != 1:
                        request.add_unredirected_header(
                            "Cookie2", '$Version="1"')
                        break
        finally:
            self._cookies_lock.release()

        self.clear_expired_cookies()

    def _normalized_cookie_tuples(self, attrs_set):
        """Return list of tuples containing normalised cookie information.

        attrs_set is the list of lists of key,value pairs extracted from
        the Set-Cookie or Set-Cookie2 headers.

        Tuples are name, value, standard, rest, where name and value are the
        cookie name and value, standard is a dictionary containing the standard
        cookie-attributes (discard, secure, version, expires or max-age,
        domain, path and port) and rest is a dictionary containing the rest of
        the cookie-attributes.

        Cookies with a missing domain value, an unusable max-age, or a
        missing value for another standard attribute that requires one are
        left out of the result.

        """
        cookie_tuples = []

        boolean_attrs = "discard", "secure"
        value_attrs = ("version",
                       "expires", "max-age",
                       "domain", "path", "port",
                       "comment", "commenturl")

        for cookie_attrs in attrs_set:
            name, value = cookie_attrs[0]

            # Build dictionary of standard cookie-attributes (standard) and
            # dictionary of other cookie-attributes (rest).

            # Note: expiry time is normalised to seconds since epoch.  V0
            # cookies should have the Expires cookie-attribute, and V1 cookies
            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
            # accept either (but prefer Max-Age).
            max_age_set = False

            bad_cookie = False

            standard = {}
            rest = {}
            for k, v in cookie_attrs[1:]:
                lc = k.lower()
                # don't lose case distinction for unknown fields
                if lc in value_attrs or lc in boolean_attrs:
                    k = lc
                if k in boolean_attrs and v is None:
                    # boolean cookie-attribute is present, but has no value
                    # (like "discard", rather than "port=80")
                    v = True
                if k in standard:
                    # only first value is significant
                    continue
                if k == "domain":
                    if v is None:
                        debug(" missing value for domain attribute")
                        bad_cookie = True
                        break
                    # RFC 2965 section 3.3.3
                    v = v.lower()
                if k == "expires":
                    if max_age_set:
                        # Prefer max-age to expires (like Mozilla)
                        continue
                    if v is None:
                        debug(" missing or invalid value for expires "
                              "attribute: treating as session cookie")
                        continue
                if k == "max-age":
                    max_age_set = True
                    try:
                        v = int(v)
                    except (TypeError, ValueError):
                        # TypeError covers a max-age with no value at all
                        # (v is None); ValueError a non-numeric one.
                        debug(" missing or invalid (non-numeric) value for "
                              "max-age attribute")
                        bad_cookie = True
                        break
                    # convert RFC 2965 Max-Age to seconds since epoch
                    # XXX Strictly you're supposed to follow RFC 2616
                    #   age-calculation rules.  Remember that zero Max-Age
                    #   is a request to discard (old and new) cookie, though.
                    k = "expires"
                    v = self._now + v
                if (k in value_attrs) or (k in boolean_attrs):
                    if (v is None and
                            k not in ["port", "comment", "commenturl"]):
                        debug(" missing value for %s attribute" % k)
                        bad_cookie = True
                        break
                    standard[k] = v
                else:
                    rest[k] = v

            if bad_cookie:
                continue

            cookie_tuples.append((name, value, standard, rest))

        return cookie_tuples

    def _cookie_from_cookie_tuple(self, tup, request):
        """Build a Cookie from one normalised tuple, filling in defaults.

        Returns None when the cookie is to be ignored: either its version is
        not a valid integer, or its expiry time is not in the future.  In the
        latter case any stored cookie with the same domain, path and name is
        removed from the jar.

        """
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set the easy defaults
        version = standard.get("version", None)
        if version is not None:
            try:
                version = int(version)
            except ValueError:
                # Drop just this cookie rather than letting the error abort
                # processing of every cookie in the header.
                debug(" invalid version %s: ignoring cookie", version)
                return None
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            path_specified = False
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i+1]
            if len(path) == 0: path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            req_host, erhn = eff_request_host(request)
            domain = erhn
        elif not domain.startswith("."):
            domain = "."+domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present.  Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            expires = None
            discard = True
        elif expires <= self._now:
            # Expiry date in past is request to delete cookie.  This can't be
            # in DefaultCookiePolicy, because can't delete cookies there.
            try:
                self.clear(domain, path, name)
            except KeyError:
                pass
            debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                  domain, path, name)
            return None

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)

    def _cookies_from_attrs_set(self, attrs_set, request):
        """Return the Cookie objects that could be built from attrs_set."""
        cookie_tuples = self._normalized_cookie_tuples(attrs_set)

        cookies = []
        for tup in cookie_tuples:
            cookie = self._cookie_from_cookie_tuple(tup, request)
            if cookie: cookies.append(cookie)
        return cookies

    def make_cookies(self, response, request):
        """Return sequence of Cookie objects extracted from response object."""
        # get cookie-attributes for RFC 2965 and Netscape protocols
        headers = response.info()
        rfc2965_hdrs = headers.getheaders("Set-Cookie2")
        ns_hdrs = headers.getheaders("Set-Cookie")

        rfc2965 = self._policy.rfc2965
        netscape = self._policy.netscape

        if ((not rfc2965_hdrs and not ns_hdrs) or
                (not ns_hdrs and not rfc2965) or
                (not rfc2965_hdrs and not netscape) or
                (not netscape and not rfc2965)):
            return []  # no relevant cookie headers: quick exit

        # The catch-all handlers below are deliberate: malformed headers are
        # dropped, and reraise_unmasked_exceptions() re-raises the exceptions
        # that must not be swallowed.
        try:
            cookies = self._cookies_from_attrs_set(
                split_header_words(rfc2965_hdrs), request)
        except:
            reraise_unmasked_exceptions()
            cookies = []

        if ns_hdrs and netscape:
            try:
                ns_cookies = self._cookies_from_attrs_set(
                    parse_ns_headers(ns_hdrs), request)
            except:
                reraise_unmasked_exceptions()
                ns_cookies = []

            # Look for Netscape cookies (from Set-Cookie headers) that match
            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
            # For each match, keep the RFC 2965 cookie and ignore the Netscape
            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
            # bundled in with the Netscape cookies for this purpose, which is
            # reasonable behaviour.
            if rfc2965:
                lookup = {}
                for cookie in cookies:
                    lookup[(cookie.domain, cookie.path, cookie.name)] = None

                ns_cookies = [
                    c for c in ns_cookies
                    if (c.domain, c.path, c.name) not in lookup]

            if ns_cookies:
                cookies.extend(ns_cookies)

        return cookies

    def set_cookie_if_ok(self, cookie, request):
        """Set a cookie if policy says it's OK to do so."""
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            if self._policy.set_ok(cookie, request):
                self.set_cookie(cookie)
        finally:
            self._cookies_lock.release()

    def set_cookie(self, cookie):
        """Set a cookie, without checking whether or not it should be set."""
        c = self._cookies
        self._cookies_lock.acquire()
        try:
            if cookie.domain not in c: c[cookie.domain] = {}
            c2 = c[cookie.domain]
            if cookie.path not in c2: c2[cookie.path] = {}
            c3 = c2[cookie.path]
            c3[cookie.name] = cookie
        finally:
            self._cookies_lock.release()

    def extract_cookies(self, response, request):
        """Extract cookies from response, where allowable given the request."""
        debug("extract_cookies: %s", response.info())
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            for cookie in self.make_cookies(response, request):
                if self._policy.set_ok(cookie, request):
                    debug(" setting cookie: %s", cookie)
                    self.set_cookie(cookie)
        finally:
            self._cookies_lock.release()

    def clear(self, domain=None, path=None, name=None):
        """Clear some cookies.

        Invoking this method without arguments will clear all cookies.  If
        given a single argument, only cookies belonging to that domain will be
        removed.  If given two arguments, cookies belonging to the specified
        path within that domain are removed.  If given three arguments, then
        the cookie with the specified name, path and domain is removed.

        Raises KeyError if no matching cookie exists, and ValueError if name
        or path is given without the arguments that precede it.

        """
        if name is not None:
            if (domain is None) or (path is None):
                raise ValueError(
                    "domain and path must be given to remove a cookie by name")
            del self._cookies[domain][path][name]
        elif path is not None:
            if domain is None:
                raise ValueError(
                    "domain must be given to remove cookies by path")
            del self._cookies[domain][path]
        elif domain is not None:
            del self._cookies[domain]
        else:
            self._cookies = {}

    def clear_session_cookies(self):
        """Discard all session cookies.

        Note that the .save() method won't save session cookies anyway, unless
        you ask otherwise by passing a true ignore_discard argument.

        """
        self._cookies_lock.acquire()
        try:
            for cookie in self:
                if cookie.discard:
                    self.clear(cookie.domain, cookie.path, cookie.name)
        finally:
            self._cookies_lock.release()

    def clear_expired_cookies(self):
        """Discard all expired cookies.

        You probably don't need to call this method: expired cookies are never
        sent back to the server (provided you're using DefaultCookiePolicy),
        this method is called by CookieJar itself every so often, and the
        .save() method won't save expired cookies anyway (unless you ask
        otherwise by passing a true ignore_expires argument).

        """
        self._cookies_lock.acquire()
        try:
            now = time.time()
            for cookie in self:
                if cookie.is_expired(now):
                    self.clear(cookie.domain, cookie.path, cookie.name)
        finally:
            self._cookies_lock.release()

    def __iter__(self):
        """Iterate over every stored cookie (see deepvalues)."""
        return deepvalues(self._cookies)

    def __len__(self):
        """Return number of contained cookies."""
        i = 0
        for cookie in self: i = i + 1
        return i

    def __repr__(self):
        r = []
        for cookie in self: r.append(repr(cookie))
        return "<%s[%s]>" % (self.__class__, ", ".join(r))

    def __str__(self):
        r = []
        for cookie in self: r.append(str(cookie))
        return "<%s[%s]>" % (self.__class__, ", ".join(r))
1692
1693
class LoadError(Exception):
    """Raised when cookies cannot be loaded from a file."""
1695
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        Raises ValueError if filename is given but is not string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            # Duck-type check: anything that can be concatenated with a
            # string is accepted as a filename.
            try:
                filename+""
            except:
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file."""
        # Not implemented here.
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is available.  Parsing is delegated to self._really_load(), which
        this class does not define.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename)
        try:
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        self._cookies_lock.acquire()
        try:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                # Put the previous cookies back before propagating.
                self._cookies = old_state
                raise
        finally:
            # Release on every path, including the re-raise above.
            self._cookies_lock.release()
1753
1754from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
1755from _MozillaCookieJar import MozillaCookieJar