"""HTTP cookie handling for web clients.

This module has (now fairly distant) origins in Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

Class diagram (note that the classes which do not derive from
FileCookieJar are not distributed with the Python standard library, but
are available from http://wwwsearch.sf.net/):

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

"""
27
import sys
import re
import urlparse
import copy
import time
import urllib
import logging
try:
    import threading as _threading
except ImportError:
    import dummy_threading as _threading
import httplib  # only for the default HTTP port
from calendar import timegm
35
# Module-wide debug hook: messages go to the "cookielib" logger at DEBUG level.
debug = logging.getLogger("cookielib").debug

# Port used when a request does not name one explicitly (kept as a string,
# because cookie port lists are compared as strings).
DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
# Error text for operations that need a filename but were not given one.
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
41
def reraise_unmasked_exceptions(unmasked=()):
    """Re-raise the exception being handled unless it may be swallowed.

    There are a few catch-all except: statements in this module, for
    catching input that's bad in unexpected ways.  This function must be
    called from inside such an except block.

    unmasked: tuple of extra exception classes that must propagate, in
        addition to KeyboardInterrupt, SystemExit and MemoryError.

    If the active exception is one of those, it is re-raised.  Otherwise it
    is swallowed and reported through warnings.warn(), with the formatted
    traceback included in the message.

    """
    unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError)
    etype = sys.exc_info()[0]
    if issubclass(etype, unmasked):
        raise
    # swallowed an exception
    import warnings, traceback
    msg = traceback.format_exc()
    warnings.warn("cookielib bug!\n%s" % msg, stacklevel=2)
Martin v. Löwis2a6ba902004-05-31 18:22:40 +000056
57
58# Date/time conversion
59# -----------------------------------------------------------------------------
60
61EPOCH_YEAR = 1970
62def _timegm(tt):
63 year, month, mday, hour, min, sec = tt[:6]
64 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
65 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
66 return timegm(tt)
67 else:
68 return None
69
# Abbreviated weekday names, indexed like time.gmtime()'s tm_wday (Monday=0).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# Abbreviated month names; index + 1 is the month number.
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lowercased copy of MONTHS, for case-insensitive month lookup.
MONTHS_LOWER = [month.lower() for month in MONTHS]
75
def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec = time.gmtime(t)[:6]
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        year, mon, mday, hour, minute, sec)
92
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec, wday = time.gmtime(t)[:7]
    # The weekday is followed by a comma, as in the documented format.
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[wday], mday, MONTHS[mon-1], year, hour, minute, sec)
108
109
# Timezone names treated as zero offset from UTC (only the keys are used).
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric timezone: optional sign, 1-2 digit hours, optional 2-digit minutes.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
def offset_from_tz_string(tz):
    """Return the UTC offset in seconds described by timezone string tz.

    tz is either one of the names in UTC_ZONES (offset 0) or a numeric zone
    such as "-0800", "+01:00" or "5".  None is returned for anything else.

    """
    if tz in UTC_ZONES:
        return 0
    m = TIMEZONE_RE.search(tz)
    if not m:
        return None
    sign, hours, minutes = m.groups()
    offset = 3600 * int(hours)
    if minutes:
        offset = offset + 60 * int(minutes)
    if sign == '-':
        offset = -offset
    return offset
126
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date fields to seconds since the epoch, or None.

    day, yr: numeric strings.
    mon: month name abbreviation (any case) or numeric string.
    hr, min, sec: numeric strings, or None (treated as 0).
    tz: timezone string understood by offset_from_tz_string, or None for UTC.

    None is returned if the month or timezone is not recognised, or if
    _timegm rejects the resulting date.

    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    # ISO_DATE_RE can capture fractional seconds ("29.5"), which int() alone
    # rejects; go through float and truncate.
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year: pick the century that puts the year within
        # 50 years of the current one
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec))
    if t is None:
        return None

    # adjust time using timezone string, to get absolute time since epoch
    if tz is None:
        tz = "UTC"
    offset = offset_from_tz_string(tz.upper())
    if offset is None:
        return None
    return t - offset
179
# Exactly "Wed, 09 Feb 1994 22:23:32 GMT".
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
# Leading weekday name (abbreviated or full) with optional comma.
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # The regexp accepts any capitalised three-letter word as the
            # month (e.g. "Foo"); treat an unknown one as a bad date.
            return None
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), float(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
257
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # clean up
    text = text.lstrip()

    # loose regexp parse
    m = ISO_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    yr, mon, day, hr, min, sec, tz, _ = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
301 return _str2time(day, mon, yr, hr, min, sec, tz)
302
303
304# Header parsing
305# -----------------------------------------------------------------------------
306
def unmatched(match):
    """Return unmatched part of re.Match object.

    That is, the searched string with the span of the whole match removed.

    """
    start, end = match.span(0)
    text = match.string
    return text[:start] + text[end:]
311
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    # A bare string would be iterated character by character; reject it.
    assert not isinstance(header_values, basestring)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    # undo backslash quoting inside the quoted string
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result
400
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        attr = []
        for k, v in pairs:
            if v is not None:
                # anything but a plain word gets quoted
                if not re.search(r"^\w+$", v):
                    v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v)  # escape " and \
                    v = '"%s"' % v
                k = "%s=%s" % (k, v)
            attr.append(k)
        if attr:
            headers.append("; ".join(attr))
    return ", ".join(headers)
425 return ", ".join(headers)
426
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    Returns one list of (key, value) pairs per non-empty header.  The
    first pair is the cookie name/value; names of known cookie-attributes
    are lowercased; an "expires" value is converted with http2time (None
    if invalid or absent); ("version", "0") is appended when no "version"
    key was seen.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            if ii != 0:
                # not the name=value pair: normalise known attribute names
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                # NOTE(review): "version" is not in known_attrs, so only an
                # all-lowercase "version" key is recognised here.
                if k == "version":
                    # This is an RFC 2109 cookie.  Will be treated as RFC 2965
                    # cookie in rest of code.
                    # Probably it should be parsed with split_header_words, but
                    # that's too much hassle.
                    version_set = True
                if k == "expires" and v is not None:
                    # convert expires date to seconds since epoch
                    # (a bare "expires" with no value is left as None)
                    if v.startswith('"'): v = v[1:]
                    if v.endswith('"'): v = v[:-1]
                    v = http2time(v)  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
481
482
# Matches text ending in ".<digits>"; used as a cheap "looks like an IPv4
# address" test.
IPV4_RE = re.compile(r"\.\d+$")
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        return False
    if text == "":
        return False
    # neither a leading nor a trailing dot is allowed
    if text[0] == "." or text[-1] == ".":
        return False
    return True
498
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not B.startswith("."):
        return False
    # A must have the form NB: B has to be a suffix of A, not merely occur
    # somewhere inside it.  N is non-empty because A != B at this point.
    if not A.endswith(B):
        return False
    if not is_HDN(A):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
537
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  Anything that does not match IPV4_RE
    is accepted.

    """
    if IPV4_RE.search(text):
        return False
    return True
547
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.

    The comparison is case-insensitive.  If either argument fails
    liberal_is_HDN, only exact equality matches.  Otherwise, a B with an
    initial dot matches any A ending with B, and a B without one matches
    only an identical A.

    """
    A = A.lower()
    B = B.lower()
    if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
        # equal IP addresses match; nothing else does
        return A == B
    if B.startswith("."):
        return A.endswith(B)
    return A == B
567
# Trailing ":<digits>" port suffix of a host string.
cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    The host is taken from the request's full URL, falling back to its
    "Host" header when the URL has no network location.

    """
    url = request.get_full_url()
    host = urlparse.urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = cut_port_re.sub("", host, 1)
    return host.lower()
584
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.

    The effective name is the request-host with ".local" appended when the
    request-host contains no dot and does not match IPV4_RE.

    """
    erhn = req_host = request_host(request)
    if req_host.find(".") == -1 and not IPV4_RE.search(req_host):
        erhn = req_host + ".local"
    return req_host, erhn
595
def request_path(request):
    """request-URI, as defined by RFC 2965.

    Built from the path, parameters, query and fragment of the request's
    full URL; the path (with parameters) is passed through escape_path, and
    a leading "/" is added if missing.

    """
    url = request.get_full_url()
    path, parameters, query, frag = urlparse.urlparse(url)[2:]
    if parameters:
        path = "%s;%s" % (path, parameters)
    path = escape_path(path)
    req_path = urlparse.urlunparse(("", "", path, "", query, frag))
    if not req_path.startswith("/"):
        # fix bad RFC 2396 absoluteURI
        req_path = "/"+req_path
    return req_path
610
def request_port(request):
    """Return the port of the request's host, as a string.

    If request.get_host() contains a ":", the text after the first one is
    returned when it is numeric; otherwise None is returned (and a debug
    message logged).  Without a ":", DEFAULT_HTTP_PORT is returned.

    """
    host = request.get_host()
    i = host.find(':')
    if i < 0:
        return DEFAULT_HTTP_PORT
    port = host[i+1:]
    try:
        int(port)  # validation only; the string itself is returned
    except ValueError:
        debug("nonnumeric port: '%s'", port)
        return None
    return port
624
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# A %-escape: percent sign followed by two hex digits.
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the %-escape in match with its hex digits uppercased.

    Intended as the replacement callback for ESCAPED_CHAR_RE.sub().

    """
    return "%%%s" % match.group(1).upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, unicode):
        path = path.encode("utf-8")
    # existing %-escapes are kept ("%" is in HTTP_PATH_SAFE) and then
    # normalised to uppercase hex digits
    path = urllib.quote(path, HTTP_PATH_SAFE)
    path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path)
    return path
646
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    i = h.find(".")
    if i >= 0:
        # h has the form A.B, with A = h[:i] containing no dots
        b = h[i+1:]
        if (b.find(".") >= 0 or b == "local") and is_HDN(h):
            return "."+b
    return h
681
def is_third_party(request):
    """Return True if request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(req_host, origin_reach)
697
698
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than"Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest):
        """Store the cookie-attributes on the instance.

        version and expires are converted with int() unless None; domain is
        lowercased; rest (a mapping of nonstandard cookie-attributes) is
        shallow-copied.

        Raises ValueError if port is None but port_specified is True.

        """
        if version is not None: version = int(version)
        if expires is not None: expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return true if nonstandard cookie-attribute name is present."""
        return name in self._rest
    def get_nonstandard_attr(self, name, default=None):
        """Return nonstandard cookie-attribute name, or default if absent."""
        return self._rest.get(name, default)
    def set_nonstandard_attr(self, name, value):
        """Set nonstandard cookie-attribute name to value."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time at or before now.

        now defaults to the current time; a cookie whose expires is None
        never expires.

        """
        if now is None: now = time.time()
        if (self.expires is not None) and (self.expires <= now):
            return True
        return False

    def __str__(self):
        if self.port is None: p = ""
        else: p = ":"+self.port
        limit = self.domain + p + self.path
        if self.value is not None:
            namevalue = "%s=%s" % (self.name, self.value)
        else:
            namevalue = self.name
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        args = []
        for name in ("version", "name", "value",
                     "port", "port_specified",
                     "domain", "domain_specified", "domain_initial_dot",
                     "path", "path_specified",
                     "secure", "expires", "discard", "comment", "comment_url",
                     ):
            attr = getattr(self, name)
            args.append("%s=%s" % (name, repr(attr)))
        args.append("rest=%s" % repr(self._rest))
        return "Cookie(%s)" % ", ".join(args)
791
792
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """
    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        This base implementation raises NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        This base implementation raises NotImplementedError.

        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation always returns True.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation always returns True.
        """
        return True
824
825
class DefaultCookiePolicy(CookiePolicy):
    """Implements the standard rules for accepting and returning cookies."""

    # Bit flags for the strict_ns_domain setting; set_ok_domain tests
    # DomainStrictNoDots and DomainRFC2965Match with "&".
    DomainStrictNoDots = 1
    DomainStrictNonDomain = 2
    DomainRFC2965Match = 4

    # Preset combinations: no strictness flags, or both "strict" flags.
    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
835
836 def __init__(self,
837 blocked_domains=None, allowed_domains=None,
838 netscape=True, rfc2965=False,
839 hide_cookie2=False,
840 strict_domain=False,
841 strict_rfc2965_unverifiable=True,
842 strict_ns_unverifiable=False,
843 strict_ns_domain=DomainLiberal,
844 strict_ns_set_initial_dollar=False,
845 strict_ns_set_path=False,
846 ):
847 """Constructor arguments should be passed as keyword arguments only."""
848 self.netscape = netscape
849 self.rfc2965 = rfc2965
850 self.hide_cookie2 = hide_cookie2
851 self.strict_domain = strict_domain
852 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
853 self.strict_ns_unverifiable = strict_ns_unverifiable
854 self.strict_ns_domain = strict_ns_domain
855 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
856 self.strict_ns_set_path = strict_ns_set_path
857
858 if blocked_domains is not None:
859 self._blocked_domains = tuple(blocked_domains)
860 else:
861 self._blocked_domains = ()
862
863 if allowed_domains is not None:
864 allowed_domains = tuple(allowed_domains)
865 self._allowed_domains = allowed_domains
866
867 def blocked_domains(self):
868 """Return the sequence of blocked domains (as a tuple)."""
869 return self._blocked_domains
870 def set_blocked_domains(self, blocked_domains):
871 """Set the sequence of blocked domains."""
872 self._blocked_domains = tuple(blocked_domains)
873
874 def is_blocked(self, domain):
875 for blocked_domain in self._blocked_domains:
876 if user_domain_match(domain, blocked_domain):
877 return True
878 return False
879
880 def allowed_domains(self):
881 """Return None, or the sequence of allowed domains (as a tuple)."""
882 return self._allowed_domains
883 def set_allowed_domains(self, allowed_domains):
884 """Set the sequence of allowed domains, or None."""
885 if allowed_domains is not None:
886 allowed_domains = tuple(allowed_domains)
887 self._allowed_domains = allowed_domains
888
889 def is_not_allowed(self, domain):
890 if self._allowed_domains is None:
891 return False
892 for allowed_domain in self._allowed_domains:
893 if user_domain_match(domain, allowed_domain):
894 return False
895 return True
896
897 def set_ok(self, cookie, request):
898 """
899 If you override .set_ok(), be sure to call this method. If it returns
900 false, so should your subclass (assuming your subclass wants to be more
901 strict about which cookies to accept).
902
903 """
904 debug(" - checking cookie %s=%s", cookie.name, cookie.value)
905
906 assert cookie.name is not None
907
908 for n in "version", "verifiability", "name", "path", "domain", "port":
909 fn_name = "set_ok_"+n
910 fn = getattr(self, fn_name)
911 if not fn(cookie, request):
912 return False
913
914 return True
915
916 def set_ok_version(self, cookie, request):
917 if cookie.version is None:
918 # Version is always set to 0 by parse_ns_headers if it's a Netscape
919 # cookie, so this must be an invalid RFC 2965 cookie.
920 debug(" Set-Cookie2 without version attribute (%s=%s)",
921 cookie.name, cookie.value)
922 return False
923 if cookie.version > 0 and not self.rfc2965:
924 debug(" RFC 2965 cookies are switched off")
925 return False
926 elif cookie.version == 0 and not self.netscape:
927 debug(" Netscape cookies are switched off")
928 return False
929 return True
930
931 def set_ok_verifiability(self, cookie, request):
932 if request.is_unverifiable() and is_third_party(request):
933 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
934 debug(" third-party RFC 2965 cookie during "
935 "unverifiable transaction")
936 return False
937 elif cookie.version == 0 and self.strict_ns_unverifiable:
938 debug(" third-party Netscape cookie during "
939 "unverifiable transaction")
940 return False
941 return True
942
943 def set_ok_name(self, cookie, request):
944 # Try and stop servers setting V0 cookies designed to hack other
945 # servers that know both V0 and V1 protocols.
946 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
947 cookie.name.startswith("$")):
948 debug(" illegal name (starts with '$'): '%s'", cookie.name)
949 return False
950 return True
951
952 def set_ok_path(self, cookie, request):
953 if cookie.path_specified:
954 req_path = request_path(request)
955 if ((cookie.version > 0 or
956 (cookie.version == 0 and self.strict_ns_set_path)) and
957 not req_path.startswith(cookie.path)):
958 debug(" path attribute %s is not a prefix of request "
959 "path %s", cookie.path, req_path)
960 return False
961 return True
962
963 def set_ok_domain(self, cookie, request):
964 if self.is_blocked(cookie.domain):
965 debug(" domain %s is in user block-list", cookie.domain)
966 return False
967 if self.is_not_allowed(cookie.domain):
968 debug(" domain %s is not in user allow-list", cookie.domain)
969 return False
970 if cookie.domain_specified:
971 req_host, erhn = eff_request_host(request)
972 domain = cookie.domain
973 if self.strict_domain and (domain.count(".") >= 2):
974 i = domain.rfind(".")
975 j = domain.rfind(".", 0, i)
976 if j == 0: # domain like .foo.bar
977 tld = domain[i+1:]
978 sld = domain[j+1:i]
Raymond Hettingerdbecd932005-02-06 06:57:08 +0000979 if (sld.lower() in (
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000980 "co", "ac",
Raymond Hettingerdbecd932005-02-06 06:57:08 +0000981 "com", "edu", "org", "net", "gov", "mil", "int") and
Martin v. Löwis2a6ba902004-05-31 18:22:40 +0000982 len(tld) == 2):
983 # domain like .co.uk
984 debug(" country-code second level domain %s", domain)
985 return False
986 if domain.startswith("."):
987 undotted_domain = domain[1:]
988 else:
989 undotted_domain = domain
990 embedded_dots = (undotted_domain.find(".") >= 0)
991 if not embedded_dots and domain != ".local":
992 debug(" non-local domain %s contains no embedded dot",
993 domain)
994 return False
995 if cookie.version == 0:
996 if (not erhn.endswith(domain) and
997 (not erhn.startswith(".") and
998 not ("."+erhn).endswith(domain))):
999 debug(" effective request-host %s (even with added "
1000 "initial dot) does not end end with %s",
1001 erhn, domain)
1002 return False
1003 if (cookie.version > 0 or
1004 (self.strict_ns_domain & self.DomainRFC2965Match)):
1005 if not domain_match(erhn, domain):
1006 debug(" effective request-host %s does not domain-match "
1007 "%s", erhn, domain)
1008 return False
1009 if (cookie.version > 0 or
1010 (self.strict_ns_domain & self.DomainStrictNoDots)):
1011 host_prefix = req_host[:-len(domain)]
1012 if (host_prefix.find(".") >= 0 and
1013 not IPV4_RE.search(req_host)):
1014 debug(" host prefix %s for domain %s contains a dot",
1015 host_prefix, domain)
1016 return False
1017 return True
1018
1019 def set_ok_port(self, cookie, request):
1020 if cookie.port_specified:
1021 req_port = request_port(request)
1022 if req_port is None:
1023 req_port = "80"
1024 else:
1025 req_port = str(req_port)
1026 for p in cookie.port.split(","):
1027 try:
1028 int(p)
1029 except ValueError:
1030 debug(" bad port %s (not numeric)", p)
1031 return False
1032 if p == req_port:
1033 break
1034 else:
1035 debug(" request port (%s) not found in %s",
1036 req_port, cookie.port)
1037 return False
1038 return True
1039
1040 def return_ok(self, cookie, request):
1041 """
1042 If you override .return_ok(), be sure to call this method. If it
1043 returns false, so should your subclass (assuming your subclass wants to
1044 be more strict about which cookies to return).
1045
1046 """
1047 # Path has already been checked by .path_return_ok(), and domain
1048 # blocking done by .domain_return_ok().
1049 debug(" - checking cookie %s=%s", cookie.name, cookie.value)
1050
1051 for n in "version", "verifiability", "secure", "expires", "port", "domain":
1052 fn_name = "return_ok_"+n
1053 fn = getattr(self, fn_name)
1054 if not fn(cookie, request):
1055 return False
1056 return True
1057
1058 def return_ok_version(self, cookie, request):
1059 if cookie.version > 0 and not self.rfc2965:
1060 debug(" RFC 2965 cookies are switched off")
1061 return False
1062 elif cookie.version == 0 and not self.netscape:
1063 debug(" Netscape cookies are switched off")
1064 return False
1065 return True
1066
1067 def return_ok_verifiability(self, cookie, request):
1068 if request.is_unverifiable() and is_third_party(request):
1069 if cookie.version > 0 and self.strict_rfc2965_unverifiable:
1070 debug(" third-party RFC 2965 cookie during unverifiable "
1071 "transaction")
1072 return False
1073 elif cookie.version == 0 and self.strict_ns_unverifiable:
1074 debug(" third-party Netscape cookie during unverifiable "
1075 "transaction")
1076 return False
1077 return True
1078
1079 def return_ok_secure(self, cookie, request):
1080 if cookie.secure and request.get_type() != "https":
1081 debug(" secure cookie with non-secure request")
1082 return False
1083 return True
1084
1085 def return_ok_expires(self, cookie, request):
1086 if cookie.is_expired(self._now):
1087 debug(" cookie expired")
1088 return False
1089 return True
1090
1091 def return_ok_port(self, cookie, request):
1092 if cookie.port:
1093 req_port = request_port(request)
1094 if req_port is None:
1095 req_port = "80"
1096 for p in cookie.port.split(","):
1097 if p == req_port:
1098 break
1099 else:
1100 debug(" request port %s does not match cookie port %s",
1101 req_port, cookie.port)
1102 return False
1103 return True
1104
1105 def return_ok_domain(self, cookie, request):
1106 req_host, erhn = eff_request_host(request)
1107 domain = cookie.domain
1108
1109 # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
1110 if (cookie.version == 0 and
1111 (self.strict_ns_domain & self.DomainStrictNonDomain) and
1112 not cookie.domain_specified and domain != erhn):
1113 debug(" cookie with unspecified domain does not string-compare "
1114 "equal to request domain")
1115 return False
1116
1117 if cookie.version > 0 and not domain_match(erhn, domain):
1118 debug(" effective request-host name %s does not domain-match "
1119 "RFC 2965 cookie domain %s", erhn, domain)
1120 return False
1121 if cookie.version == 0 and not ("."+erhn).endswith(domain):
1122 debug(" request-host %s does not match Netscape cookie domain "
1123 "%s", req_host, domain)
1124 return False
1125 return True
1126
1127 def domain_return_ok(self, domain, request):
1128 # Liberal check of. This is here as an optimization to avoid
1129 # having to load lots of MSIE cookie files unless necessary.
1130 req_host, erhn = eff_request_host(request)
1131 if not req_host.startswith("."):
Raymond Hettingerbab41432005-02-05 01:31:19 +00001132 req_host = "."+req_host
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001133 if not erhn.startswith("."):
Raymond Hettingerbab41432005-02-05 01:31:19 +00001134 erhn = "."+erhn
1135 if not (req_host.endswith(domain) or erhn.endswith(domain)):
Martin v. Löwis2a6ba902004-05-31 18:22:40 +00001136 #debug(" request domain %s does not match cookie domain %s",
1137 # req_host, domain)
1138 return False
1139
1140 if self.is_blocked(domain):
1141 debug(" domain %s is in user block-list", domain)
1142 return False
1143 if self.is_not_allowed(domain):
1144 debug(" domain %s is not in user allow-list", domain)
1145 return False
1146
1147 return True
1148
1149 def path_return_ok(self, path, request):
1150 debug("- checking cookie path=%s", path)
1151 req_path = request_path(request)
1152 if not req_path.startswith(path):
1153 debug(" %s does not path-match %s", req_path, path)
1154 return False
1155 return True
1156
1157
def vals_sorted_by_key(adict):
    """Return a list of the values of adict, ordered by their sorted keys."""
    # sorted() copies the keys, so this does not depend on keys() returning
    # a mutable list, and the comprehension always produces a real list.
    return [adict[key] for key in sorted(adict.keys())]
1162
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    Any value that has an "items" attribute is treated as a nested mapping
    and descended into; every other value is yielded as a leaf.

    """
    for item in vals_sorted_by_key(mapping):
        try:
            item.items
        except AttributeError:
            is_mapping = False
        else:
            is_mapping = True

        if is_mapping:
            for leaf in deepvalues(item):
                yield leaf
        else:
            yield item
1178
1179
1180# Used as second parameter to dict.get() method, to distinguish absent
1181# dict key from one with a None value.
class Absent:
    """Sentinel default for dict.get(): marks a key that is not present, as
    opposed to one that is present with a None value."""
1183
class CookieJar:
    """Collection of HTTP cookies.

    You may not need to know about this class: try
    urllib2.build_opener(HTTPCookieProcessor).open(url).

    Cookies are kept in self._cookies, a nested dictionary indexed by
    domain, then path, then cookie name.  Most methods that read or modify
    it on behalf of a request or response hold self._cookies_lock while
    doing so.

    """

    non_word_re = re.compile(r"\W")
    quote_re = re.compile(r"([\"\\])")
    strict_domain_re = re.compile(r"\.?[^.]*")
    domain_re = re.compile(r"[^.]*")
    dots_re = re.compile(r"^\.+")

    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"

    def __init__(self, policy=None):
        """Create an empty jar; policy defaults to DefaultCookiePolicy()."""
        if policy is None:
            policy = DefaultCookiePolicy()
        self._policy = policy

        self._cookies_lock = _threading.RLock()
        self._cookies = {}

    def set_policy(self, policy):
        """Replace the policy object consulted by this jar."""
        self._policy = policy

    def _cookies_for_domain(self, domain, request):
        """Return the cookies stored under domain that the policy allows to
        be returned with request."""
        cookies = []
        if not self._policy.domain_return_ok(domain, request):
            return []
        debug("Checking %s for cookies to return", domain)
        cookies_by_path = self._cookies[domain]
        for path in cookies_by_path.keys():
            if not self._policy.path_return_ok(path, request):
                continue
            cookies_by_name = cookies_by_path[path]
            for cookie in cookies_by_name.values():
                if not self._policy.return_ok(cookie, request):
                    debug(" not returning cookie")
                    continue
                debug(" it's a match")
                cookies.append(cookie)
        return cookies

    def _cookies_for_request(self, request):
        """Return a list of cookies to be returned to server."""
        cookies = []
        for domain in self._cookies.keys():
            cookies.extend(self._cookies_for_domain(domain, request))
        return cookies

    def _cookie_attrs(self, cookies):
        """Return a list of cookie-attributes to be returned to server.

        like ['foo="bar"; $Path="/"', ...]

        The $Version attribute is also added when appropriate (currently only
        once per request).

        Note that the cookies list is sorted in place.

        """
        # add cookies in order of most specific (ie. longest) path first;
        # the sort is stable, so cookies with equally long paths keep their
        # relative order
        cookies.sort(key=lambda cookie: len(cookie.path), reverse=True)

        version_set = False

        attrs = []
        for cookie in cookies:
            # set version of Cookie header
            # XXX
            # What should it be if multiple matching Set-Cookie headers have
            #  different versions themselves?
            # Answer: there is no answer; was supposed to be settled by
            #  RFC 2965 errata, but that may never appear...
            version = cookie.version
            if not version_set:
                version_set = True
                if version > 0:
                    attrs.append("$Version=%s" % version)

            # quote cookie value if necessary
            # (not for Netscape protocol, which already has any quotes
            #  intact, due to the poorly-specified Netscape Cookie: syntax)
            if ((cookie.value is not None) and
                self.non_word_re.search(cookie.value) and version > 0):
                value = self.quote_re.sub(r"\\\1", cookie.value)
            else:
                value = cookie.value

            # add cookie-attributes to be returned in Cookie header
            if cookie.value is None:
                attrs.append(cookie.name)
            else:
                attrs.append("%s=%s" % (cookie.name, value))
            if version > 0:
                if cookie.path_specified:
                    attrs.append('$Path="%s"' % cookie.path)
                if cookie.domain.startswith("."):
                    domain = cookie.domain
                    # send the domain back the way the server spelt it
                    if not cookie.domain_initial_dot:
                        domain = domain[1:]
                    attrs.append('$Domain="%s"' % domain)
                if cookie.port is not None:
                    p = "$Port"
                    if cookie.port_specified:
                        p = p + ('="%s"' % cookie.port)
                    attrs.append(p)

        return attrs

    def add_cookie_header(self, request):
        """Add correct Cookie: header to request (urllib2.Request object).

        The Cookie2 header is also added unless policy.hide_cookie2 is true.

        """
        debug("add_cookie_header")
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            cookies = self._cookies_for_request(request)

            attrs = self._cookie_attrs(cookies)
            if attrs:
                if not request.has_header("Cookie"):
                    request.add_unredirected_header(
                        "Cookie", "; ".join(attrs))

            # if necessary, advertise that we know RFC 2965
            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
                not request.has_header("Cookie2")):
                for cookie in cookies:
                    if cookie.version != 1:
                        request.add_unredirected_header(
                            "Cookie2", '$Version="1"')
                        break
        finally:
            # never leave the jar locked if the policy or request raised
            self._cookies_lock.release()

        self.clear_expired_cookies()

    def _normalized_cookie_tuples(self, attrs_set):
        """Return list of tuples containing normalised cookie information.

        attrs_set is the list of lists of key,value pairs extracted from
        the Set-Cookie or Set-Cookie2 headers.

        Tuples are name, value, standard, rest, where name and value are the
        cookie name and value, standard is a dictionary containing the standard
        cookie-attributes (discard, secure, version, expires or max-age,
        domain, path and port) and rest is a dictionary containing the rest of
        the cookie-attributes.

        Cookies with a malformed standard attribute are left out of the
        result.

        """
        cookie_tuples = []

        boolean_attrs = "discard", "secure"
        value_attrs = ("version",
                       "expires", "max-age",
                       "domain", "path", "port",
                       "comment", "commenturl")

        for cookie_attrs in attrs_set:
            name, value = cookie_attrs[0]

            # Build dictionary of standard cookie-attributes (standard) and
            # dictionary of other cookie-attributes (rest).

            # Note: expiry time is normalised to seconds since epoch.  V0
            # cookies should have the Expires cookie-attribute, and V1 cookies
            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
            # accept either (but prefer Max-Age).
            max_age_set = False

            bad_cookie = False

            standard = {}
            rest = {}
            for k, v in cookie_attrs[1:]:
                lc = k.lower()
                # don't lose case distinction for unknown fields
                if lc in value_attrs or lc in boolean_attrs:
                    k = lc
                if k in boolean_attrs and v is None:
                    # boolean cookie-attribute is present, but has no value
                    # (like "discard", rather than "port=80")
                    v = True
                if k in standard:
                    # only first value is significant
                    continue
                if k == "domain":
                    if v is None:
                        debug(" missing value for domain attribute")
                        bad_cookie = True
                        break
                    # RFC 2965 section 3.3.3
                    v = v.lower()
                if k == "expires":
                    if max_age_set:
                        # Prefer max-age to expires (like Mozilla)
                        continue
                    if v is None:
                        debug(" missing or invalid value for expires "
                              "attribute: treating as session cookie")
                        continue
                if k == "max-age":
                    max_age_set = True
                    try:
                        v = int(v)
                    except (ValueError, TypeError):
                        # TypeError: the attribute had no value at all
                        debug(" missing or invalid (non-numeric) value for "
                              "max-age attribute")
                        bad_cookie = True
                        break
                    # convert RFC 2965 Max-Age to seconds since epoch
                    # XXX Strictly you're supposed to follow RFC 2616
                    #   age-calculation rules.  Remember that zero Max-Age
                    #   is a request to discard (old and new) cookie, though.
                    k = "expires"
                    v = self._now + v
                if (k in value_attrs) or (k in boolean_attrs):
                    if (v is None and
                        k not in ("port", "comment", "commenturl")):
                        debug(" missing value for %s attribute", k)
                        bad_cookie = True
                        break
                    standard[k] = v
                else:
                    rest[k] = v

            if bad_cookie:
                continue

            cookie_tuples.append((name, value, standard, rest))

        return cookie_tuples

    def _cookie_from_cookie_tuple(self, tup, request):
        """Build a Cookie from a normalised cookie tuple, filling in defaults
        from request.

        Returns None if the Version attribute is not an integer, or if the
        cookie's expiry time is not in the future (in which case any stored
        cookie with the same domain, path and name is removed).

        """
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        version = standard.get("version", None)
        if version is not None:
            try:
                version = int(version)
            except ValueError:
                return None  # invalid version, ignore cookie

        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set the easy defaults
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            path_specified = False
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i+1]
            if len(path) == 0:
                path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            req_host, erhn = eff_request_host(request)
            domain = erhn
        elif not domain.startswith("."):
            domain = "."+domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present.  Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            expires = None
            discard = True
        elif expires <= self._now:
            # Expiry date in past is request to delete cookie.  This can't be
            # in DefaultCookiePolicy, because can't delete cookies there.
            try:
                self.clear(domain, path, name)
            except KeyError:
                pass
            debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                  domain, path, name)
            return None

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)

    def _cookies_from_attrs_set(self, attrs_set, request):
        """Return the Cookie objects that can be built from attrs_set."""
        cookie_tuples = self._normalized_cookie_tuples(attrs_set)

        cookies = []
        for tup in cookie_tuples:
            cookie = self._cookie_from_cookie_tuple(tup, request)
            if cookie:
                cookies.append(cookie)
        return cookies

    def make_cookies(self, response, request):
        """Return sequence of Cookie objects extracted from response object."""
        # get cookie-attributes for RFC 2965 and Netscape protocols
        headers = response.info()
        rfc2965_hdrs = headers.getheaders("Set-Cookie2")
        ns_hdrs = headers.getheaders("Set-Cookie")

        rfc2965 = self._policy.rfc2965
        netscape = self._policy.netscape

        if ((not rfc2965_hdrs and not ns_hdrs) or
            (not ns_hdrs and not rfc2965) or
            (not rfc2965_hdrs and not netscape) or
            (not netscape and not rfc2965)):
            return []  # no relevant cookie headers: quick exit

        # The catch-all handlers below are deliberate: headers can be bad in
        # unexpected ways, and reraise_unmasked_exceptions() re-raises the
        # exceptions that must not be trapped.
        try:
            cookies = self._cookies_from_attrs_set(
                split_header_words(rfc2965_hdrs), request)
        except:
            reraise_unmasked_exceptions()
            cookies = []

        if ns_hdrs and netscape:
            try:
                ns_cookies = self._cookies_from_attrs_set(
                    parse_ns_headers(ns_hdrs), request)
            except:
                reraise_unmasked_exceptions()
                ns_cookies = []

            # Look for Netscape cookies (from Set-Cookie headers) that match
            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
            # For each match, keep the RFC 2965 cookie and ignore the Netscape
            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
            # bundled in with the Netscape cookies for this purpose, which is
            # reasonable behaviour.
            if rfc2965:
                lookup = {}
                for cookie in cookies:
                    lookup[(cookie.domain, cookie.path, cookie.name)] = None

                ns_cookies = [
                    ns_cookie for ns_cookie in ns_cookies
                    if (ns_cookie.domain, ns_cookie.path,
                        ns_cookie.name) not in lookup]

            if ns_cookies:
                cookies.extend(ns_cookies)

        return cookies

    def set_cookie_if_ok(self, cookie, request):
        """Set a cookie if policy says it's OK to do so."""
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            if self._policy.set_ok(cookie, request):
                self.set_cookie(cookie)
        finally:
            self._cookies_lock.release()

    def set_cookie(self, cookie):
        """Set a cookie, without checking whether or not it should be set."""
        c = self._cookies
        self._cookies_lock.acquire()
        try:
            if cookie.domain not in c:
                c[cookie.domain] = {}
            c2 = c[cookie.domain]
            if cookie.path not in c2:
                c2[cookie.path] = {}
            c3 = c2[cookie.path]
            c3[cookie.name] = cookie
        finally:
            self._cookies_lock.release()

    def extract_cookies(self, response, request):
        """Extract cookies from response, where allowable given the request."""
        debug("extract_cookies: %s", response.info())
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            for cookie in self.make_cookies(response, request):
                if self._policy.set_ok(cookie, request):
                    debug(" setting cookie: %s", cookie)
                    self.set_cookie(cookie)
        finally:
            self._cookies_lock.release()

    def clear(self, domain=None, path=None, name=None):
        """Clear some cookies.

        Invoking this method without arguments will clear all cookies.  If
        given a single argument, only cookies belonging to that domain will be
        removed.  If given two arguments, cookies belonging to the specified
        path within that domain are removed.  If given three arguments, then
        the cookie with the specified name, path and domain is removed.

        Raises KeyError if no matching cookie exists.  Raises ValueError if
        name is given without domain and path, or path without domain.

        """
        if name is not None:
            if (domain is None) or (path is None):
                raise ValueError(
                    "domain and path must be given to remove a cookie by name")
            del self._cookies[domain][path][name]
        elif path is not None:
            if domain is None:
                raise ValueError(
                    "domain must be given to remove cookies by path")
            del self._cookies[domain][path]
        elif domain is not None:
            del self._cookies[domain]
        else:
            self._cookies = {}

    def clear_session_cookies(self):
        """Discard all session cookies.

        Note that the .save() method won't save session cookies anyway, unless
        you ask otherwise by passing a true ignore_discard argument.

        """
        self._cookies_lock.acquire()
        try:
            for cookie in self:
                if cookie.discard:
                    self.clear(cookie.domain, cookie.path, cookie.name)
        finally:
            self._cookies_lock.release()

    def clear_expired_cookies(self):
        """Discard all expired cookies.

        You probably don't need to call this method: expired cookies are never
        sent back to the server (provided you're using DefaultCookiePolicy),
        this method is called by CookieJar itself every so often, and the
        .save() method won't save expired cookies anyway (unless you ask
        otherwise by passing a true ignore_expires argument).

        """
        self._cookies_lock.acquire()
        try:
            now = time.time()
            for cookie in self:
                if cookie.is_expired(now):
                    self.clear(cookie.domain, cookie.path, cookie.name)
        finally:
            self._cookies_lock.release()

    def __iter__(self):
        return deepvalues(self._cookies)

    def __len__(self):
        """Return number of contained cookies."""
        i = 0
        for cookie in self:
            i = i + 1
        return i

    def __repr__(self):
        r = []
        for cookie in self:
            r.append(repr(cookie))
        return "<%s[%s]>" % (self.__class__, ", ".join(r))

    def __str__(self):
        r = []
        for cookie in self:
            r.append(str(cookie))
        return "<%s[%s]>" % (self.__class__, ", ".join(r))
1687
1688
class LoadError(Exception):
    """Failure to load cookies from a file.

    FileCookieJar.revert() restores the previous cookies when load() raises
    this (or IOError).
    """
1690
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    save() is not implemented here, and load() delegates the parsing to
    self._really_load(), which this class does not define either;
    presumably both are supplied by concrete subclasses.

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        Raises ValueError if filename is given but is not string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            # anything that can be concatenated with a string is accepted
            try:
                filename+""
            except:
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file."""
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is available.

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename)
        try:
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        self._cookies_lock.acquire()
        try:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                self._cookies = old_state
                raise
        finally:
            # release the lock even when the reload failed
            self._cookies_lock.release()
1748
1749from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
1750from _MozillaCookieJar import MozillaCookieJar