blob: 200452455851a9d6b9a22361f17a27d1acc16cbe [file] [log] [blame]
Guido van Rossum9ab94c11997-12-10 16:17:39 +00001"""RFC-822 message manipulation class.
Guido van Rossum01ca3361992-07-13 14:28:59 +00002
Guido van Rossum9ab94c11997-12-10 16:17:39 +00003XXX This is only a very rough sketch of a full RFC-822 parser;
4in particular the tokenizing of addresses does not adhere to all the
5quoting rules.
6
7Directions for use:
8
9To create a Message object: first open a file, e.g.:
10 fp = open(file, 'r')
Guido van Rossumc7bb8571998-06-10 21:31:01 +000011You can use any other legal way of getting an open file object, e.g. use
12sys.stdin or call os.popen().
Guido van Rossum9ab94c11997-12-10 16:17:39 +000013Then pass the open file object to the Message() constructor:
14 m = Message(fp)
15
Guido van Rossume894fc01998-06-11 13:58:40 +000016This class can work with any input object that supports a readline
17method. If the input object has seek and tell capability, the
18rewindbody method will work; also illegal lines will be pushed back
19onto the input stream. If the input object lacks seek but has an
20`unread' method that can push back a line of input, Message will use
21that to push back illegal lines. Thus this class can be used to parse
22messages coming from a buffered stream.
Guido van Rossumc7bb8571998-06-10 21:31:01 +000023
24The optional `seekable' argument is provided as a workaround for
25certain stdio libraries in which tell() discards buffered data before
26discovering that the lseek() system call doesn't work. For maximum
27portability, you should set the seekable argument to zero to prevent
that initial tell() call when passing in an unseekable object such as
a file object created from a socket object.  If it is 1 on entry --
30which it is by default -- the tell() method of the open file object is
31called once; if this raises an exception, seekable is reset to 0. For
32other nonzero values of seekable, this test is not made.
33
Guido van Rossum9ab94c11997-12-10 16:17:39 +000034To get the text of a particular header there are several methods:
35 str = m.getheader(name)
36 str = m.getrawheader(name)
37where name is the name of the header, e.g. 'Subject'.
38The difference is that getheader() strips the leading and trailing
39whitespace, while getrawheader() doesn't. Both functions retain
40embedded whitespace (including newlines) exactly as they are
41specified in the header, and leave the case of the text unchanged.
42
43For addresses and address lists there are functions
44 realname, mailaddress = m.getaddr(name) and
45 list = m.getaddrlist(name)
46where the latter returns a list of (realname, mailaddr) tuples.
47
48There is also a method
49 time = m.getdate(name)
50which parses a Date-like field and returns a time-compatible tuple,
51i.e. a tuple such as returned by time.localtime() or accepted by
52time.mktime().
53
54See the class definition for lower level access methods.
55
56There are also some utility functions here.
57"""
Guido van Rossum4d4ab921998-06-16 22:27:09 +000058# Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
Guido van Rossum01ca3361992-07-13 14:28:59 +000059
Guido van Rossum01ca3361992-07-13 14:28:59 +000060import string
Guido van Rossumb6775db1994-08-01 11:34:53 +000061import time
Guido van Rossum01ca3361992-07-13 14:28:59 +000062
63
# Lines that Message.islast() accepts as the end of the header section:
# a bare newline, or CRLF (convenient for data read straight from a socket).
# Kept as a module-level tuple so islast() is a single membership test.
_blanklines = ('\r\n', '\n')
Guido van Rossum92457b91995-06-22 19:06:57 +000065
66
class Message:
    """Represents a single RFC-822-compliant message.

    The headers are read from the file object as soon as the instance is
    created.  Afterwards:

      self.headers   raw header lines, in file order, newlines included
      self.dict      maps lower-cased header name -> stripped value
                     (the *last* occurrence wins for repeated headers)
      self.unixfrom  any leading Unix 'From ' envelope line(s)
      self.status    '' if parsing went well, else an error message
      self.fp        the file object, positioned at the start of the body
    """

    def __init__(self, fp, seekable = 1):
        """Initialize the class instance and read the headers.

        `fp' is any object with a readline() method.  If `seekable' is 1
        (the default), fp.tell() is tried once and seekable is reset to 0
        when that fails; any other value is taken at face value.
        """
        if seekable == 1:
            # Exercise tell() to make sure it works
            # (and then assume seek() works, too).  Arbitrary file-like
            # objects may fail in arbitrary ways (missing attribute,
            # I/O error, ...), so any ordinary exception means "no".
            try:
                fp.tell()
            except Exception:
                seekable = 0
        self.fp = fp
        self.seekable = seekable
        self.startofheaders = None
        self.startofbody = None

        if self.seekable:
            try:
                self.startofheaders = self.fp.tell()
            except IOError:
                self.seekable = 0

        self.readheaders()

        if self.seekable:
            try:
                self.startofbody = self.fp.tell()
            except IOError:
                self.seekable = 0

    def rewindbody(self):
        """Rewind the file to the start of the body (if seekable).

        Raises IOError if the underlying file is not seekable.
        """
        if not self.seekable:
            raise IOError("unseekable file")
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines.

        Read header lines up to the entirely blank line that
        terminates them.  The (normally blank) line that ends the
        headers is skipped, and is not stored in self.headers.
        If a non-header line ends the headers, (which is an error),
        an attempt is made to backspace over it; it is never
        stored in self.headers.

        The variable self.status is set to the empty string if all
        went well, otherwise it is an error message.
        The variable self.headers is a completely uninterpreted list
        of lines contained in the header (so printing them will
        reproduce the header exactly as it appears in the file).
        """
        self.dict = {}
        self.unixfrom = ''
        self.headers = lines = []
        self.status = ''
        headerseen = ""
        firstline = 1
        startofline = unread = tell = None
        # Pick a way to push back an illegal line: prefer an explicit
        # unread() method, otherwise remember positions with tell().
        if hasattr(self.fp, 'unread'):
            unread = self.fp.unread
        elif self.seekable:
            tell = self.fp.tell
        while 1:
            if tell:
                try:
                    startofline = tell()
                except IOError:
                    # The stream turned out not to be seekable after
                    # all; carry on without pushback support.
                    startofline = tell = None
                    self.seekable = 0
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            # Skip unix From name time lines
            if firstline and line[:5] == 'From ':
                self.unixfrom = self.unixfrom + line
                continue
            firstline = 0
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                lines.append(line)
                x = self.dict[headerseen] + "\n " + line.strip()
                self.dict[headerseen] = x.strip()
                continue
            elif self.iscomment(line):
                # It's a comment.  Ignore it.
                continue
            elif self.islast(line):
                # Note! No pushback here!  The delimiter line gets eaten.
                break
            headerseen = self.isheader(line)
            if headerseen:
                # It's a legal header line, save it.
                lines.append(line)
                self.dict[headerseen] = line[len(headerseen)+1:].strip()
                continue
            else:
                # It's not a header line; throw it back and stop here.
                if not self.dict:
                    self.status = 'No headers'
                else:
                    self.status = 'Non-header line where header expected'
                # Try to undo the read.
                if unread:
                    unread(line)
                elif tell:
                    self.fp.seek(startofline)
                else:
                    self.status = self.status + '; bad seek'
                break

    def isheader(self, line):
        """Determine whether a given line is a legal header.

        This method should return the header name, suitably canonicalized
        (here: lower-cased), or None if the line is not a header.
        You may override this method in order to use Message parsing
        on tagged data in RFC822-like formats with special header formats.
        """
        i = line.find(':')
        if i > 0:
            return line[:i].lower()
        else:
            return None

    def islast(self, line):
        """Determine whether a line is a legal end of RFC-822 headers.

        You may override this method if your application wants
        to bend the rules, e.g. to strip trailing whitespace,
        or to recognize MH template separators ('--------').
        For convenience (e.g. for code reading from sockets) a
        line consisting of \\r\\n also matches.
        """
        return line in _blanklines

    def iscomment(self, line):
        """Determine whether a line should be skipped entirely.

        You may override this method in order to use Message parsing
        on tagged data in RFC822-like formats that support embedded
        comments or free-text data.  The default never skips a line.
        """
        return None

    def getallmatchingheaders(self, name):
        """Find all header lines matching a given header name.

        Look through the list of headers and find all lines
        matching a given header name (and their continuation
        lines).  A list of the lines is returned, without
        interpretation.  If the header does not occur, an
        empty list is returned.  If the header occurs multiple
        times, all occurrences are returned.  Case is not
        important in the header name.
        """
        name = name.lower() + ':'
        n = len(name)
        matches = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif line[:1] not in string.whitespace:
                # A new, different header starts here.
                hit = 0
            if hit:
                matches.append(line)
        return matches

    def getfirstmatchingheader(self, name):
        """Get the first header line matching name.

        This is similar to getallmatchingheaders, but it returns
        only the first matching header (and its continuation
        lines).
        """
        name = name.lower() + ':'
        n = len(name)
        matches = []
        hit = 0
        for line in self.headers:
            if hit:
                if line[:1] not in string.whitespace:
                    # End of the first match's continuation lines.
                    break
            elif line[:n].lower() == name:
                hit = 1
            if hit:
                matches.append(line)
        return matches

    def getrawheader(self, name):
        """A higher-level interface to getfirstmatchingheader().

        Return a string containing the literal text of the
        header but with the keyword stripped.  All leading,
        trailing and embedded whitespace is kept in the
        string, however.
        Return None if the header does not occur.
        """
        matches = self.getfirstmatchingheader(name)
        if not matches:
            return None
        # Drop "<name>:" from the first line only.
        matches[0] = matches[0][len(name) + 1:]
        return ''.join(matches)

    def getheader(self, name, default=None):
        """Get the header value for a name.

        This is the normal interface: it returns a stripped
        version of the header value for a given header name,
        or `default' (None unless given) if it doesn't exist.
        This uses the dictionary version which finds the *last*
        such header.
        """
        try:
            return self.dict[name.lower()]
        except KeyError:
            return default
    get = getheader

    def getheaders(self, name):
        """Get all values for a header.

        This returns a list of values for headers given more than once;
        each value in the result list is stripped in the same way as the
        result of getheader().  If the header is not given, return an
        empty list.
        """
        result = []
        current = ''
        have_header = 0
        for s in self.getallmatchingheaders(name):
            if s[0] in string.whitespace:
                # Continuation line: fold it into the current value.
                if current:
                    current = "%s\n %s" % (current, s.strip())
                else:
                    current = s.strip()
            else:
                if have_header:
                    result.append(current)
                current = s[s.find(":") + 1:].strip()
                have_header = 1
        if have_header:
            result.append(current)
        return result

    def getaddr(self, name):
        """Get a single address from a header, as a tuple.

        An example return value:
        ('Guido van Rossum', 'guido@cwi.nl')

        Returns (None, None) if the header yields no address.
        """
        # New, by Ben Escoto
        alist = self.getaddrlist(name)
        if alist:
            return alist[0]
        else:
            return (None, None)

    def getaddrlist(self, name):
        """Get a list of addresses from a header.

        Retrieves a list of addresses from a header, where each address is a
        tuple as returned by getaddr().  Scans all named headers, so it works
        properly with multiple To: or Cc: headers for example.
        """
        raw = []
        for h in self.getallmatchingheaders(name):
            if h[0] in ' \t':
                raw.append(h)
            else:
                if raw:
                    # Separate the values of repeated headers.
                    raw.append(', ')
                i = h.find(':')
                if i > 0:
                    addr = h[i+1:]
                raw.append(addr)
        alladdrs = ''.join(raw)
        a = AddrlistClass(alladdrs)
        return a.getaddrlist()

    def getdate(self, name):
        """Retrieve a date field from a header.

        Retrieves a date field from the named header and hands it to
        parsedate().  Returns None if the header is missing.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate(data)

    def getdate_tz(self, name):
        """Retrieve a date field from a header as a 10-tuple.

        The first 9 elements make up a tuple compatible with
        time.mktime(), and the 10th is the offset of the poster's
        time zone from GMT/UTC.  Returns None if the header is missing.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate_tz(data)


    # Access as a dictionary (only finds *last* header of each type):

    def __len__(self):
        """Get the number of headers in a message."""
        return len(self.dict)

    def __getitem__(self, name):
        """Get a specific header, as from a dictionary.

        Raises KeyError if the header is not present.
        """
        return self.dict[name.lower()]

    def __setitem__(self, name, value):
        """Set the value of a header.

        Note: This is not a perfect inversion of __getitem__, because
        any changed headers get stuck at the end of the raw-headers list
        rather than where the altered header was.
        """
        del self[name] # Won't fail if it doesn't exist
        self.dict[name.lower()] = value
        text = name + ": " + value
        for line in text.split("\n"):
            self.headers.append(line + "\n")

    def __delitem__(self, name):
        """Delete all occurrences of a specific header, if it is present."""
        name = name.lower()
        if name not in self.dict:
            return
        del self.dict[name]
        name = name + ':'
        n = len(name)
        doomed = []
        hit = 0
        for i in range(len(self.headers)):
            line = self.headers[i]
            if line[:n].lower() == name:
                hit = 1
            elif line[:1] not in string.whitespace:
                hit = 0
            if hit:
                doomed.append(i)
        # Delete from the back so the remaining indexes stay valid.
        doomed.reverse()
        for i in doomed:
            del self.headers[i]

    def __contains__(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def has_key(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def keys(self):
        """Get all of a message's header field names (as a list)."""
        return list(self.dict.keys())

    def values(self):
        """Get all of a message's header field values (as a list)."""
        return list(self.dict.values())

    def items(self):
        """Get all of a message's headers.

        Returns a list of name, value tuples.
        """
        return list(self.dict.items())

    def __str__(self):
        """Return the raw header lines concatenated, as they were read."""
        return ''.join(self.headers)
Guido van Rossum01ca3361992-07-13 14:28:59 +0000444
445
446# Utility functions
447# -----------------
448
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000449# XXX Should fix unquote() and quote() to be really conformant.
Guido van Rossumb6775db1994-08-01 11:34:53 +0000450# XXX The inverses of the parse functions may also be useful.
451
Guido van Rossum01ca3361992-07-13 14:28:59 +0000452
def unquote(str):
    """Remove quotes from a string.

    A string wrapped in double quotes has the quotes removed and the
    backslash escapes produced by quote() undone (\\\\ -> \\ and \\" -> "),
    so unquote(quote(s)) == s.  A string wrapped in angle brackets just
    has the brackets removed.  Anything else, including strings shorter
    than two characters, is returned unchanged.
    """
    if len(str) > 1:
        if str[0] == '"' and str[-1:] == '"':
            # Undo the escaping in the same order quote() applied it.
            return str[1:-1].replace('\\\\', '\\').replace('\\"', '"')
        if str[0] == '<' and str[-1:] == '>':
            return str[1:-1]
    return str
Guido van Rossumb6775db1994-08-01 11:34:53 +0000461
462
def quote(str):
    """Add quotes around a string.

    Backslashes and double quotes inside the string are escaped with a
    backslash, and the result is wrapped in double quotes.
    """
    # Backslashes must be doubled first, otherwise the backslashes added
    # in front of the double quotes would themselves get doubled.
    return '"%s"' % str.replace('\\', '\\\\').replace('"', '\\"')
Guido van Rossumb6775db1994-08-01 11:34:53 +0000472
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000473
def parseaddr(address):
    """Parse an address into a (realname, mailaddr) tuple.

    Only the first address found in `address' is returned; if none can
    be parsed the result is (None, None).
    """
    parsed = AddrlistClass(address).getaddrlist()
    if parsed:
        return parsed[0]
    return (None, None)
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000482
483
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of
    RFC-822 in front of you.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0                    # index of the next unparsed char
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.atomends = self.specials + self.LWS + self.CR
        self.field = field
        self.commentlist = []           # comments seen for current address

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line ends; any comments passed on the way
        are collected in self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else: break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each one a
        (realname, mailaddr) tuple.  Parsing stops at the first place
        where getaddress() yields nothing.
        """
        # Iterate rather than recurse, so that very long address lists
        # cannot exhaust the interpreter's recursion limit.
        result = []
        ad = self.getaddress()
        while ad:
            result.extend(ad)
            ad = self.getaddress()
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, mailaddr) tuples: one entry for a
        plain address, several for a group, none if nothing was parsed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Take a copy: getphraselist() appends to self.commentlist in
        # place, and the addr-spec branch below needs the list as it was
        # here, or re-parsed comments would be recorded twice.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos = self.pos + 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos = self.pos + 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else: returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the parser keeps moving.
                self.pos = self.pos + 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos = self.pos + 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the parser is not looking at a '<', or if no
        addr-spec is found before the closing '>'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = 0
        self.pos = self.pos + 1
        self.gotonext()
        adlist = None
        while self.pos < len(self.field):
            if expectroute:
                # Parse and discard the route's domain.
                self.getdomain()
                expectroute = 0
            elif self.field[self.pos] == '>':
                self.pos = self.pos + 1
                break
            elif self.field[self.pos] == '@':
                self.pos = self.pos + 1
                expectroute = 1
            elif self.field[self.pos] == ':':
                self.pos = self.pos + 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addr-spec.
                self.pos = self.pos + 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC-822 addr-spec.

        Returns 'local-part@domain', or just the local part if no '@'
        follows it.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos = self.pos + 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else: aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos = self.pos + 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos = self.pos + 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else: sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments = 1):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC-822 comments
        are allowed within the parsed fragment.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        escaped = 0     # set after a backslash: take next char literally
        self.pos = self.pos + 1
        while self.pos < len(self.field):
            if escaped == 1:
                slist.append(self.field[self.pos])
                escaped = 0
            elif self.field[self.pos] in endchars:
                self.pos = self.pos + 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
            elif self.field[self.pos] == '\\':
                escaped = 1
            else:
                slist.append(self.field[self.pos])
            self.pos = self.pos + 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', 0)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', 1)

    def getdomainliteral(self):
        """Parse an RFC-822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', 0)

    def getatom(self):
        """Parse an RFC-822 atom.

        Consumes characters up to (not including) the next character
        listed in self.atomends.
        """
        atomlist = ['']

        while self.pos < len(self.field):
            if self.field[self.pos] in self.atomends:
                break
            else: atomlist.append(self.field[self.pos])
            self.pos = self.pos + 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC-822 phrases.

        A phrase is a sequence of words, which are in turn either
        RFC-822 atoms or quoted-strings.  Returns the list of words;
        comments met on the way go to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.atomends:
                break
            else: plist.append(self.getatom())

        return plist
Guido van Rossumb6775db1994-08-01 11:34:53 +0000741
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC822 addresses.

    The parsed (realname, mailaddr) tuples are kept in self.addresslist.
    The +, -, += and -= operators treat two AddressLists as sets of
    such tuples.
    """
    def __init__(self, field):
        """Parse `field'; a false value (None, '') gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        """Return the number of addresses held."""
        return len(self.addresslist)

    def __str__(self):
        """Return the addresses in canonical form, joined by ', '."""
        return ", ".join(map(dump_address_pair, self.addresslist))

    def __add__(self, other):
        """Set union: a new list with other's unseen addresses appended."""
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if not x in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        """Set union, in-place."""
        for x in other.addresslist:
            if not x in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        """Set difference: a new list without the addresses in other."""
        newaddr = AddressList(None)
        for x in self.addresslist:
            if not x in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        """Set difference, in-place."""
        for x in other.addresslist:
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        """Index into the address list.

        Make indexing, slices, and 'in' work.
        """
        return self.addresslist[index]
Guido van Rossum81d10b41998-06-16 22:29:03 +0000791
def dump_address_pair(pair):
    """Dump a (name, address) pair in a canonicalized form.

    With a non-empty name the result is '"name" <address>'; otherwise
    it is the bare address.
    """
    realname, mailaddr = pair[0], pair[1]
    if not realname:
        return mailaddr
    return '"' + realname + '" <' + mailaddr + '>'
Guido van Rossumb6775db1994-08-01 11:34:53 +0000798
799# Parse a date field
800
# Month names, lower case.  The twelve abbreviations come first and the
# twelve full names follow in the same order, so a match at list index i
# (0-based) denotes month (i % 12) + 1.
_monthnames = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
]
# Abbreviated day names, lower case.
_daynames = [
    'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
]
Guido van Rossumb6775db1994-08-01 11:34:53 +0000806
Guido van Rossum27cb8a41996-11-20 22:12:26 +0000807# The timezone table does not include the military time zones defined
808# in RFC822, other than Z. According to RFC1123, the description in
809# RFC822 gets the signs wrong, so we can't rely on any such time
810# zones. RFC1123 recommends that numeric timezone indicators be used
811# instead of timezone names.
812
# Named time zones and their numeric values.
# NOTE(review): the values look like signed hours*100 offsets from UTC
# (e.g. -500 for EST) -- confirm against the code that consumes them.
_timezones = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    # Atlantic (used in Canada)
    'AST': -400, 'ADT': -300,
    # Eastern
    'EST': -500, 'EDT': -400,
    # Central
    'CST': -600, 'CDT': -500,
    # Mountain
    'MST': -700, 'MDT': -600,
    # Pacific
    'PST': -800, 'PDT': -700,
}
Guido van Rossum9ab94c11997-12-10 16:17:39 +0000820
Guido van Rossum27cb8a41996-11-20 22:12:26 +0000821
def parsedate_tz(data):
    """Convert a date string to a 10-tuple, or None if it cannot be parsed.

    The result is

        (year, month, day, hour, minute, second, 0, 0, 0, tzoffset)

    The first nine items are laid out like a time tuple; the weekday,
    day-of-year and DST slots are always 0.  tzoffset is the zone's
    offset from UTC in seconds (negative west of Greenwich), or None
    when the string carries no zone that could be interpreted.

    A leading day name is skipped.  The day and month may appear in
    either order, the year and time may be swapped, and the deprecated
    RFC 850 form (dd-mon-yy) is accepted.  Zones may be one of the names
    in _timezones or numeric (+hhmm / -hhmm), given either as a separate
    word or glued to the time.  Military zones other than Z are not
    recognized.  The year is returned exactly as written; two-digit
    years are not expanded.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Nothing but whitespace.
        return None
    if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
        # There's a dayname here.  Skip it.
        del data[0]
    if len(data) == 3:
        # RFC 850 date, deprecated: the date is a single dd-mon-yy word.
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # No separate zone word.  A numeric zone may be glued to the last
        # word (e.g. "08:49:37-0500"); split it off, keeping its sign.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')     # Dummy tz
    if len(data) < 5:
        return None
    dd, mm, yy, tm, tz = data[:5]

    # Month, tolerating "month day" order as well as "day month".
    mm = mm.lower()
    if mm not in _monthnames:
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Matched a full month name from the second half of the table.
        mm = mm - 12

    # The [-1:] / [:1] slices below are safe on empty strings, which can
    # arise from input such as "-Nov-94" or a lone ",".
    if dd[-1:] == ',':
        dd = dd[:-1]
    if yy.find(':') > 0:
        # Year and time are swapped (ctime-like order).
        yy, tm = tm, yy
    if yy[-1:] == ',':
        yy = yy[:-1]
    if not yy[:1].isdigit():
        # What sits in the year slot is really the zone.
        yy, tz = tz, yy
    if tm[-1:] == ',':
        tm = tm[:-1]

    tm = tm.split(':')
    if len(tm) == 2:
        thh, tmm = tm
        tss = '0'
    elif len(tm) == 3:
        thh, tmm, tss = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None

    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            # Unknown zone name (or none at all): leave tzoffset as None.
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        # divmod keeps the arithmetic in integers on every interpreter.
        hours, minutes = divmod(tzoffset, 100)
        tzoffset = tzsign * (hours * 3600 + minutes * 60)
    return (yy, mm, dd, thh, tmm, tss, 0, 0, 0, tzoffset)
899
Guido van Rossumb6775db1994-08-01 11:34:53 +0000900
def parsedate(data):
    """Convert a time string to a time tuple.

    data is parsed by parsedate_tz().  When that yields a tuple, its
    first nine items are returned (the trailing timezone offset is
    dropped); any other result -- None for an unparseable string -- is
    passed through unchanged.
    """
    t = parsedate_tz(data)
    if isinstance(t, type(())):
        return t[:9]
    return t
907
Guido van Rossum27cb8a41996-11-20 22:12:26 +0000908
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    data[:8] are the leading time-tuple fields and data[9] is the zone
    offset in seconds, or None when the zone is unknown.  Any sequence
    with that layout is accepted.  The result is whatever time.mktime()
    returns, adjusted for the zone.
    """
    fields = tuple(data[:8])
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(fields + (-1,))
    # Have mktime() read the fields as local time with the DST flag off,
    # then take out the local zone's offset (time.timezone) and the
    # offset stated in the tuple.
    t = time.mktime(fields + (0,))
    return t - data[9] - time.timezone
Guido van Rossum6cdd7a01996-12-12 18:39:54 +0000917
def formatdate(timeval=None):
    """Returns time format preferred for Internet standards.

    Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123

    timeval is a timestamp in seconds since the epoch; when it is None
    (the default) the current time is used.  The result is always
    expressed in GMT.

    The day and month names are taken from fixed English tables rather
    than from strftime(), because strftime() follows the process locale
    and could produce non-English names, which the RFCs do not allow.
    """
    if timeval is None:
        timeval = time.time()
    year, month, day, hour, minute, second, weekday = time.gmtime(timeval)[:7]
    # gmtime() numbers weekdays from Monday = 0 and months from 1.
    dayname = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')[weekday]
    monthname = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')[month - 1]
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        dayname, day, monthname, year, hour, minute, second)
927
Guido van Rossumb6775db1994-08-01 11:34:53 +0000928
# When used as script, run a small test program.
# The first command line argument, if given, must be a filename containing
# one message in RFC-822 format; without it, $HOME/Mail/inbox/1 is read.
932
if __name__ == '__main__':
    import os
    import sys

    def _say(*fields):
        # Write the fields separated by single spaces and followed by a
        # newline -- the layout a print statement would produce.
        sys.stdout.write(' '.join([str(x) for x in fields]) + '\n')

    if sys.argv[1:]:
        path = sys.argv[1]
    else:
        # Only consult $HOME when no filename was given.
        path = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
    f = open(path, 'r')
    try:
        m = Message(f)
        _say('From:', m.getaddr('from'))
        _say('To:', m.getaddrlist('to'))
        _say('Subject:', m.getheader('subject'))
        _say('Date:', m.getheader('date'))
        # NOTE(review): getdate_tz() is assumed to return a false value
        # when the header is missing or unparseable; test the result
        # before indexing into it.
        date = m.getdate_tz('date')
        if date:
            tz = date[-1]
            stamp = 'ParsedDate: ' + time.asctime(
                time.localtime(mktime_tz(date)))
            if tz is not None:
                # Show the zone offset (seconds) as +hhmm / -hhmm.  The
                # sign is split off first so the minutes of a negative
                # offset come out right (-5400 -> -0130).
                if tz < 0:
                    sign = '-'
                    tz = -tz
                else:
                    sign = '+'
                hhmm, ss = divmod(tz, 60)
                hh, mm = divmod(hhmm, 60)
                stamp = stamp + ' ' + "%s%02d%02d" % (sign, hh, mm)
                if ss:
                    stamp = stamp + ' ' + ".%02d" % ss
            _say(stamp)
        else:
            _say('ParsedDate:', None)
        # Count the lines of the message body.
        m.rewindbody()
        n = 0
        while f.readline():
            n = n + 1
        _say('Lines:', n)
        _say('-'*70)
        # Exercise the mapping-style interface of the Message object.
        _say('len =', len(m))
        if m.has_key('Date'):
            _say('Date =', m['Date'])
        if m.has_key('X-Nonsense'):
            pass
        _say('keys =', m.keys())
        _say('values =', m.values())
        _say('items =', m.items())
    finally:
        f.close()
967 print 'items =', m.items()