blob: 2f1a268a42ddfdd598f737bae6b0e9129e6551c5 [file] [log] [blame]
Guido van Rossum9ab94c11997-12-10 16:17:39 +00001"""RFC-822 message manipulation class.
Guido van Rossum01ca3361992-07-13 14:28:59 +00002
Guido van Rossum9ab94c11997-12-10 16:17:39 +00003XXX This is only a very rough sketch of a full RFC-822 parser;
4in particular the tokenizing of addresses does not adhere to all the
5quoting rules.
6
7Directions for use:
8
9To create a Message object: first open a file, e.g.:
10 fp = open(file, 'r')
Guido van Rossumc7bb8571998-06-10 21:31:01 +000011You can use any other legal way of getting an open file object, e.g. use
12sys.stdin or call os.popen().
Guido van Rossum9ab94c11997-12-10 16:17:39 +000013Then pass the open file object to the Message() constructor:
14 m = Message(fp)
15
Guido van Rossume894fc01998-06-11 13:58:40 +000016This class can work with any input object that supports a readline
17method. If the input object has seek and tell capability, the
18rewindbody method will work; also illegal lines will be pushed back
19onto the input stream. If the input object lacks seek but has an
20`unread' method that can push back a line of input, Message will use
21that to push back illegal lines. Thus this class can be used to parse
22messages coming from a buffered stream.
Guido van Rossumc7bb8571998-06-10 21:31:01 +000023
24The optional `seekable' argument is provided as a workaround for
25certain stdio libraries in which tell() discards buffered data before
26discovering that the lseek() system call doesn't work. For maximum
27portability, you should set the seekable argument to zero to prevent
28that initial tell() call when passing in an unseekable object such as
29a file object created from a socket object. If it is 1 on entry --
30which it is by default -- the tell() method of the open file object is
31called once; if this raises an exception, seekable is reset to 0. For
32other nonzero values of seekable, this test is not made.
33
Guido van Rossum9ab94c11997-12-10 16:17:39 +000034To get the text of a particular header there are several methods:
35 str = m.getheader(name)
36 str = m.getrawheader(name)
37where name is the name of the header, e.g. 'Subject'.
38The difference is that getheader() strips the leading and trailing
39whitespace, while getrawheader() doesn't. Both functions retain
40embedded whitespace (including newlines) exactly as they are
41specified in the header, and leave the case of the text unchanged.
42
43For addresses and address lists there are functions
44 realname, mailaddress = m.getaddr(name) and
45 list = m.getaddrlist(name)
46where the latter returns a list of (realname, mailaddr) tuples.
47
48There is also a method
49 time = m.getdate(name)
50which parses a Date-like field and returns a time-compatible tuple,
51i.e. a tuple such as returned by time.localtime() or accepted by
52time.mktime().
53
54See the class definition for lower level access methods.
55
56There are also some utility functions here.
57"""
Guido van Rossum4d4ab921998-06-16 22:27:09 +000058# Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
Guido van Rossum01ca3361992-07-13 14:28:59 +000059
Guido van Rossum01ca3361992-07-13 14:28:59 +000060import string
Guido van Rossumb6775db1994-08-01 11:34:53 +000061import time
Guido van Rossum01ca3361992-07-13 14:28:59 +000062
63
_blanklines = ('\r\n', '\n')            # Optimization for islast()


class Message:
    """Represents a single RFC-822-compliant message.

    The headers are read from the input object when the instance is
    created; whatever follows them is left unread on that object.

    Attributes set by this class:

    fp -- the input object passed to the constructor
    seekable -- true if tell()/seek() are believed to work on fp
    startofheaders, startofbody -- tell() positions, or None
    headers -- list of the raw header lines, exactly as read
    dict -- maps canonical (by default lower-cased) header names to
            their stripped values; a repeated header keeps the last one
    unixfrom -- leading Unix "From " envelope line(s), or ''
    status -- '' if header parsing went well, else an error message
    """

    def __init__(self, fp, seekable = 1):
        """Initialize the class instance and read the headers.

        `fp' is any object with a readline() method.  If `seekable'
        is exactly 1 (the default), fp.tell() is called once to find
        out whether the object really is seekable; any other value is
        taken at face value without probing.
        """
        if seekable == 1:
            # Exercise tell() to make sure it works
            # (and then assume seek() works, too).  Only the failures
            # that mean "cannot tell" are caught: a missing method or
            # an I/O error.  Anything else is a real error and must
            # propagate instead of being silently swallowed.
            try:
                fp.tell()
            except (AttributeError, IOError):
                seekable = 0
        self.fp = fp
        self.seekable = seekable
        self.startofheaders = None
        self.startofbody = None

        if self.seekable:
            try:
                self.startofheaders = self.fp.tell()
            except IOError:
                self.seekable = 0

        self.readheaders()

        if self.seekable:
            try:
                self.startofbody = self.fp.tell()
            except IOError:
                self.seekable = 0

    def rewindbody(self):
        """Rewind the file to the start of the body (if seekable).

        Raises IOError if the input object is not seekable.
        """
        if not self.seekable:
            raise IOError("unseekable file")
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines.

        Read header lines up to the entirely blank line that
        terminates them.  The (normally blank) line that ends the
        headers is skipped, but not included in the returned list.
        If a non-header line ends the headers, (which is an error),
        an attempt is made to backspace over it; it is never
        included in the returned list.

        The variable self.status is set to the empty string if all
        went well, otherwise it is an error message.
        The variable self.headers is a completely uninterpreted list
        of lines contained in the header (so printing them will
        reproduce the header exactly as it appears in the file).
        """
        self.dict = {}
        self.unixfrom = ''
        self.headers = rawlines = []
        self.status = ''
        headerseen = ""
        firstline = 1
        # Exactly one push-back strategy is selected: an unread()
        # method when the input has one, otherwise tell()/seek().
        startofline = unread = tell = None
        if hasattr(self.fp, 'unread'):
            unread = self.fp.unread
        elif self.seekable:
            tell = self.fp.tell
        while 1:
            if tell:
                # Remember where this line starts so that it can be
                # pushed back if it turns out not to be a header.
                try:
                    startofline = tell()
                except IOError:
                    startofline = tell = None
                    self.seekable = 0
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            # Skip unix From name time lines
            if firstline and line[:5] == 'From ':
                self.unixfrom = self.unixfrom + line
                continue
            firstline = 0
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                rawlines.append(line)
                joined = self.dict[headerseen] + "\n " + line.strip()
                self.dict[headerseen] = joined.strip()
                continue
            elif self.iscomment(line):
                # It's a comment.  Ignore it.
                continue
            elif self.islast(line):
                # Note! No pushback here!  The delimiter line gets eaten.
                break
            headerseen = self.isheader(line)
            if headerseen:
                # It's a legal header line, save it.
                rawlines.append(line)
                self.dict[headerseen] = line[len(headerseen)+1:].strip()
                continue
            # It's not a header line; throw it back and stop here.
            if not self.dict:
                self.status = 'No headers'
            else:
                self.status = 'Non-header line where header expected'
            # Try to undo the read.
            if unread:
                unread(line)
            elif tell:
                self.fp.seek(startofline)
            else:
                self.status = self.status + '; bad seek'
            break

    def isheader(self, line):
        """Determine whether a given line is a legal header.

        This method should return the header name, suitably canonicalized.
        You may override this method in order to use Message parsing
        on tagged data in RFC822-like formats with special header formats.

        The default returns the lower-cased text before the first colon,
        or None if the line has no colon or starts with one.
        """
        colon = line.find(':')
        if colon > 0:
            return line[:colon].lower()
        return None

    def islast(self, line):
        """Determine whether a line is a legal end of RFC-822 headers.

        You may override this method if your application wants
        to bend the rules, e.g. to strip trailing whitespace,
        or to recognize MH template separators ('--------').
        For convenience (e.g. for code reading from sockets) a
        line consisting of \\r\\n also matches.
        """
        return line in _blanklines

    def iscomment(self, line):
        """Determine whether a line should be skipped entirely.

        You may override this method in order to use Message parsing
        on tagged data in RFC822-like formats that support embedded
        comments or free-text data.  The default never skips a line.
        """
        return None

    def getallmatchingheaders(self, name):
        """Find all header lines matching a given header name.

        Look through the list of headers and find all lines
        matching a given header name (and their continuation
        lines).  A list of the lines is returned, without
        interpretation.  If the header does not occur, an
        empty list is returned.  If the header occurs multiple
        times, all occurrences are returned.  Case is not
        important in the header name.
        """
        prefix = name.lower() + ':'
        n = len(prefix)
        found = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == prefix:
                hit = 1
            elif line[:1] not in string.whitespace:
                # A line that does not start with whitespace begins
                # some other header and ends the current match.
                hit = 0
            if hit:
                found.append(line)
        return found

    def getfirstmatchingheader(self, name):
        """Get the first header line matching name.

        This is similar to getallmatchingheaders, but it returns
        only the first matching header (and its continuation
        lines).
        """
        prefix = name.lower() + ':'
        n = len(prefix)
        found = []
        hit = 0
        for line in self.headers:
            if hit:
                if line[:1] not in string.whitespace:
                    break
            elif line[:n].lower() == prefix:
                hit = 1
            if hit:
                found.append(line)
        return found

    def getrawheader(self, name):
        """A higher-level interface to getfirstmatchingheader().

        Return a string containing the literal text of the
        header but with the keyword stripped.  All leading,
        trailing and embedded whitespace is kept in the
        string, however.
        Return None if the header does not occur.
        """
        lines = self.getfirstmatchingheader(name)
        if not lines:
            return None
        # Drop "name:" from the first line; keep everything else.
        lines[0] = lines[0][len(name) + 1:]
        return ''.join(lines)

    def getheader(self, name, default=None):
        """Get the header value for a name.

        This is the normal interface: it returns a stripped
        version of the header value for a given header name,
        or `default' (None unless given) if it doesn't exist.
        This uses the dictionary version which finds the *last*
        such header.
        """
        try:
            return self.dict[name.lower()]
        except KeyError:
            return default
    get = getheader

    def getheaders(self, name):
        """Get all values for a header.

        This returns a list of values for headers given more than once;
        each value in the result list is stripped in the same way as the
        result of getheader().  If the header is not given, return an
        empty list.
        """
        result = []
        current = ''
        have_header = 0
        for s in self.getallmatchingheaders(name):
            if s[0] in string.whitespace:
                # Continuation line: fold it into the current value.
                if current:
                    current = "%s\n %s" % (current, s.strip())
                else:
                    current = s.strip()
            else:
                if have_header:
                    result.append(current)
                current = s[s.find(":") + 1:].strip()
                have_header = 1
        if have_header:
            result.append(current)
        return result

    def getaddr(self, name):
        """Get a single address from a header, as a tuple.

        An example return value:
        ('Guido van Rossum', 'guido@cwi.nl')

        Returns (None, None) if the header yields no address.
        """
        # New, by Ben Escoto
        alist = self.getaddrlist(name)
        if alist:
            return alist[0]
        return (None, None)

    def getaddrlist(self, name):
        """Get a list of addresses from a header.

        Retrieves a list of addresses from a header, where each address is a
        tuple as returned by getaddr().  Scans all named headers, so it works
        properly with multiple To: or Cc: headers for example.
        """
        raw = []
        for h in self.getallmatchingheaders(name):
            if h[0] in ' \t':
                raw.append(h)
            else:
                if raw:
                    # Separate the values of repeated headers.
                    raw.append(', ')
                # Always take the text after the first colon; never
                # reuse a value left over from an earlier line.
                raw.append(h[h.find(':') + 1:])
        alladdrs = ''.join(raw)
        a = AddrlistClass(alladdrs)
        return a.getaddrlist()

    def getdate(self, name):
        """Retrieve a date field from a header.

        Retrieves a date field from the named header, returning
        a tuple compatible with time.mktime().  Returns None if
        the header is absent.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate(data)

    def getdate_tz(self, name):
        """Retrieve a date field from a header as a 10-tuple.

        The first 9 elements make up a tuple compatible with
        time.mktime(), and the 10th is the offset of the poster's
        time zone from GMT/UTC.  Returns None if the header is absent.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate_tz(data)


    # Access as a dictionary (only finds *last* header of each type):

    def __len__(self):
        """Get the number of headers in a message."""
        return len(self.dict)

    def __getitem__(self, name):
        """Get a specific header, as from a dictionary.

        Raises KeyError if the header is not present.
        """
        return self.dict[name.lower()]

    def __setitem__(self, name, value):
        """Set the value of a header.

        Note: This is not a perfect inversion of __getitem__, because
        any changed headers get stuck at the end of the raw-headers list
        rather than where the altered header was.
        """
        del self[name] # Won't fail if it doesn't exist
        self.dict[name.lower()] = value
        text = name + ": " + value
        for line in text.split("\n"):
            self.headers.append(line + "\n")

    def __delitem__(self, name):
        """Delete all occurrences of a specific header, if it is present."""
        name = name.lower()
        try:
            del self.dict[name]
        except KeyError:
            return
        prefix = name + ':'
        n = len(prefix)
        kept = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == prefix:
                hit = 1
            elif line[:1] not in string.whitespace:
                hit = 0
            if not hit:
                kept.append(line)
        # Update in place so other references to the list stay valid.
        self.headers[:] = kept

    def has_key(self, name):
        """Determine whether a message contains the named header."""
        try:
            self.dict[name.lower()]
        except KeyError:
            return 0
        return 1

    def keys(self):
        """Get all of a message's header field names."""
        return self.dict.keys()

    def values(self):
        """Get all of a message's header field values."""
        return self.dict.values()

    def items(self):
        """Get all of a message's headers.

        Returns a list of name, value tuples.
        """
        return self.dict.items()

    def __str__(self):
        """Return the raw header lines joined into one string."""
        return ''.join(self.headers)
Guido van Rossum01ca3361992-07-13 14:28:59 +0000448
449
450# Utility functions
451# -----------------
452
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000453# XXX Should fix unquote() and quote() to be really conformant.
Guido van Rossumb6775db1994-08-01 11:34:53 +0000454# XXX The inverses of the parse functions may also be useful.
455
Guido van Rossum01ca3361992-07-13 14:28:59 +0000456
def unquote(str):
    """Strip one pair of enclosing "..." or <...> from a string.

    Strings of length 0 or 1, and strings whose first and last
    characters are not a matching pair, are returned unchanged.
    """
    if len(str) <= 1:
        return str
    ends = (str[0], str[-1:])
    if ends == ('"', '"') or ends == ('<', '>'):
        return str[1:-1]
    return str
Guido van Rossumb6775db1994-08-01 11:34:53 +0000465
466
def quote(str):
    """Return the string wrapped in double quotes.

    Backslashes are doubled first, then every double quote is
    preceded by a backslash.
    """
    escaped = str.replace('\\', '\\\\')
    escaped = escaped.replace('"', '\\"')
    return '"%s"' % escaped
Guido van Rossumb6775db1994-08-01 11:34:53 +0000476
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000477
def parseaddr(address):
    """Parse an address into a (realname, mailaddr) tuple.

    Only the first address found is returned; (None, None) is
    returned when nothing could be parsed.
    """
    parsed = AddrlistClass(address).getaddrlist()
    if parsed:
        return parsed[0]
    return (None, None)
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000486
487
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of
    RFC-822 in front of you.

    The parser keeps a cursor (self.pos) into the header text
    (self.field); each get*() method consumes text at the cursor and
    advances it.  Comments met along the way are collected in
    self.commentlist.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        # Any of these characters terminates an atom.
        self.atomends = self.specials + self.LWS + self.CR
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line breaks; comments are consumed and
        added to self.commentlist.
        """
        skippable = self.LWS + '\n\r'
        while self.pos < len(self.field):
            ch = self.field[self.pos]
            if ch in skippable:
                self.pos = self.pos + 1
            elif ch == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each one a
        (realname, mailaddr) tuple.  Parsing stops at the first point
        where getaddress() yields nothing.
        """
        # Iterative on purpose: recursing once per address overflows
        # the interpreter stack on long recipient lists.
        result = []
        ad = self.getaddress()
        while ad:
            result.extend(ad)
            ad = self.getaddress()
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, mailaddr) tuples: one element
        for an ordinary address, several for a group, and an empty
        list if no address could be parsed at the cursor.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        oldcl = self.commentlist
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos = self.pos + 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos = self.pos + 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                realname = (' '.join(plist) + ' (' +
                            ' '.join(self.commentlist) + ')')
            else:
                realname = ' '.join(plist)
            returnlist = [(realname, routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special character.
                self.pos = self.pos + 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos = self.pos + 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not at '<' or if no addrspec is
        found before the closing '>'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = 0
        self.pos = self.pos + 1
        self.gotonext()
        adlist = None
        while self.pos < len(self.field):
            ch = self.field[self.pos]
            if expectroute:
                # The domain after '@' in a route is parsed and dropped.
                self.getdomain()
                expectroute = 0
            elif ch == '>':
                self.pos = self.pos + 1
                break
            elif ch == '@':
                self.pos = self.pos + 1
                expectroute = 1
            elif ch == ':':
                self.pos = self.pos + 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addrspec
                # (normally the closing '>').
                self.pos = self.pos + 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC-822 addr-spec.

        Returns the local part, followed by '@' and the domain if an
        '@' is present.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            ch = self.field[self.pos]
            if ch == '.':
                aslist.append('.')
                self.pos = self.pos + 1
            elif ch == '"':
                aslist.append('"%s"' % self.getquote())
            elif ch in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos = self.pos + 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            ch = self.field[self.pos]
            if ch in self.LWS:
                self.pos = self.pos + 1
            elif ch == '(':
                self.commentlist.append(self.getcomment())
            elif ch == '[':
                sdlist.append(self.getdomainliteral())
            elif ch == '.':
                self.pos = self.pos + 1
                sdlist.append('.')
            elif ch in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments = 1):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC-822 comments
        are allowed within the parsed fragment.

        Returns the text between the delimiters.  A backslash is
        dropped and the character after it is taken literally; the
        parentheses of an embedded comment are dropped as well.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        quote = 0
        self.pos = self.pos + 1
        while self.pos < len(self.field):
            ch = self.field[self.pos]
            if quote == 1:
                slist.append(ch)
                quote = 0
            elif ch in endchars:
                self.pos = self.pos + 1
                break
            elif allowcomments and ch == '(':
                slist.append(self.getcomment())
                # getcomment() has already moved the cursor past the
                # nested comment.  Advancing again would drop the next
                # character, possibly our own closing delimiter.
                continue
            elif ch == '\\':
                quote = 1
            else:
                slist.append(ch)
            self.pos = self.pos + 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', 0)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', 1)

    def getdomainliteral(self):
        """Parse an RFC-822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', 0)

    def getatom(self):
        """Parse an RFC-822 atom.

        Consumes characters up to, but not including, the next
        character found in self.atomends.
        """
        atomlist = ['']

        while self.pos < len(self.field):
            ch = self.field[self.pos]
            if ch in self.atomends:
                break
            atomlist.append(ch)
            self.pos = self.pos + 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC-822 phrases.

        A phrase is a sequence of words, which are in turn either
        RFC-822 atoms or quoted-strings.  Phrases are canonicalized
        by squeezing all runs of continuous whitespace into one space.

        Returns the list of words; comments go to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            ch = self.field[self.pos]
            if ch in self.LWS:
                self.pos = self.pos + 1
            elif ch == '"':
                plist.append(self.getquote())
            elif ch == '(':
                self.commentlist.append(self.getcomment())
            elif ch in self.atomends:
                break
            else:
                plist.append(self.getatom())

        return plist
Guido van Rossumb6775db1994-08-01 11:34:53 +0000745
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC822 addresses.

    The parsed (realname, mailaddr) tuples are kept in
    self.addresslist.  The +, -, += and -= operators combine two
    instances as set union and set difference of their tuples.
    """

    def __init__(self, field):
        """Parse `field'; an empty or None field gives an empty list."""
        AddrlistClass.__init__(self, field)
        self.addresslist = []
        if field:
            self.addresslist = self.getaddrlist()

    def __len__(self):
        """Return the number of parsed addresses."""
        return len(self.addresslist)

    def __str__(self):
        """Return the addresses in canonical form, joined by ", "."""
        return ", ".join(map(dump_address_pair, self.addresslist))

    def __add__(self, other):
        """Set union: a new list with other's entries that self lacks."""
        union = AddressList(None)
        extras = [pair for pair in other.addresslist
                  if pair not in self.addresslist]
        union.addresslist = self.addresslist[:] + extras
        return union

    def __iadd__(self, other):
        """Set union, in-place."""
        mine = self.addresslist
        for pair in other.addresslist:
            if pair in mine:
                continue
            mine.append(pair)
        return self

    def __sub__(self, other):
        """Set difference: a new list of self's entries not in other."""
        difference = AddressList(None)
        for pair in self.addresslist:
            if pair in other.addresslist:
                continue
            difference.addresslist.append(pair)
        return difference

    def __isub__(self, other):
        """Set difference, in-place."""
        mine = self.addresslist
        for pair in other.addresslist:
            if pair in mine:
                mine.remove(pair)
        return self

    def __getitem__(self, index):
        """Delegate indexing to the list (also enables slices and 'in')."""
        return self.addresslist[index]
Guido van Rossum81d10b41998-06-16 22:29:03 +0000795
def dump_address_pair(pair):
    """Format a (name, address) pair in canonical form.

    With a non-empty name the result is '"name" <address>';
    otherwise it is the address alone.
    """
    realname, mailaddr = pair[0], pair[1]
    if not realname:
        return mailaddr
    return '"' + realname + '" <' + mailaddr + '>'
Guido van Rossumb6775db1994-08-01 11:34:53 +0000802
803# Parse a date field
804
# Lower-case month names recognized in date fields.  The first twelve
# entries are the three-letter abbreviations and the next twelve the full
# names, in calendar order ('may' is both, so it appears twice).
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
# Lower-case three-letter day-of-week names, Monday first.
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Maps an upper-case zone name to its offset from UTC.
# NOTE(review): the values look like signed HHMM-style integers
# (e.g. EST is -500); confirm against the code that consumes this table.
_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }
Guido van Rossum9ab94c11997-12-10 16:17:39 +0000824
Guido van Rossum27cb8a41996-11-20 22:12:26 +0000825
826def parsedate_tz(data):
Guido van Rossum9ab94c11997-12-10 16:17:39 +0000827 """Convert a date string to a time tuple.
828
829 Accounts for military timezones.
830 """
831 data = string.split(data)
Guido van Rossumdb01ee01998-12-23 22:22:10 +0000832 if data[0][-1] in (',', '.') or string.lower(data[0]) in _daynames:
Guido van Rossum9ab94c11997-12-10 16:17:39 +0000833 # There's a dayname here. Skip it
834 del data[0]
835 if len(data) == 3: # RFC 850 date, deprecated
836 stuff = string.split(data[0], '-')
837 if len(stuff) == 3:
838 data = stuff + data[1:]
839 if len(data) == 4:
840 s = data[3]
841 i = string.find(s, '+')
842 if i > 0:
843 data[3:] = [s[:i], s[i+1:]]
844 else:
845 data.append('') # Dummy tz
846 if len(data) < 5:
847 return None
848 data = data[:5]
849 [dd, mm, yy, tm, tz] = data
Guido van Rossumdb01ee01998-12-23 22:22:10 +0000850 mm = string.lower(mm)
Guido van Rossum9ab94c11997-12-10 16:17:39 +0000851 if not mm in _monthnames:
Guido van Rossumdb01ee01998-12-23 22:22:10 +0000852 dd, mm = mm, string.lower(dd)
Guido van Rossum9ab94c11997-12-10 16:17:39 +0000853 if not mm in _monthnames:
854 return None
855 mm = _monthnames.index(mm)+1
Guido van Rossumb08f51b1999-04-29 12:50:36 +0000856 if mm > 12: mm = mm - 12
Guido van Rossumdb01ee01998-12-23 22:22:10 +0000857 if dd[-1] == ',':
Fred Drake13a2c272000-02-10 17:17:14 +0000858 dd = dd[:-1]
Guido van Rossumdb01ee01998-12-23 22:22:10 +0000859 i = string.find(yy, ':')
860 if i > 0:
Fred Drake13a2c272000-02-10 17:17:14 +0000861 yy, tm = tm, yy
Guido van Rossumdb01ee01998-12-23 22:22:10 +0000862 if yy[-1] == ',':
Fred Drake13a2c272000-02-10 17:17:14 +0000863 yy = yy[:-1]
Guido van Rossumdb01ee01998-12-23 22:22:10 +0000864 if yy[0] not in string.digits:
Fred Drake13a2c272000-02-10 17:17:14 +0000865 yy, tz = tz, yy
Guido van Rossumdb01ee01998-12-23 22:22:10 +0000866 if tm[-1] == ',':
Fred Drake13a2c272000-02-10 17:17:14 +0000867 tm = tm[:-1]
Guido van Rossum9ab94c11997-12-10 16:17:39 +0000868 tm = string.splitfields(tm, ':')
869 if len(tm) == 2:
870 [thh, tmm] = tm
871 tss = '0'
Guido van Rossum99e11311998-12-23 21:58:38 +0000872 elif len(tm) == 3:
Guido van Rossum9ab94c11997-12-10 16:17:39 +0000873 [thh, tmm, tss] = tm
Guido van Rossum99e11311998-12-23 21:58:38 +0000874 else:
875 return None
Guido van Rossum9ab94c11997-12-10 16:17:39 +0000876 try:
877 yy = string.atoi(yy)
878 dd = string.atoi(dd)
879 thh = string.atoi(thh)
880 tmm = string.atoi(tmm)
881 tss = string.atoi(tss)
882 except string.atoi_error:
883 return None
Guido van Rossuma73033f1998-02-19 00:28:58 +0000884 tzoffset=None
Guido van Rossum9ab94c11997-12-10 16:17:39 +0000885 tz=string.upper(tz)
886 if _timezones.has_key(tz):
887 tzoffset=_timezones[tz]
888 else:
889 try:
890 tzoffset=string.atoi(tz)
891 except string.atoi_error:
892 pass
893 # Convert a timezone offset into seconds ; -0500 -> -18000
Guido van Rossuma73033f1998-02-19 00:28:58 +0000894 if tzoffset:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000895 if tzoffset < 0:
896 tzsign = -1
897 tzoffset = -tzoffset
898 else:
899 tzsign = 1
900 tzoffset = tzsign * ( (tzoffset/100)*3600 + (tzoffset % 100)*60)
Guido van Rossum9ab94c11997-12-10 16:17:39 +0000901 tuple = (yy, mm, dd, thh, tmm, tss, 0, 0, 0, tzoffset)
902 return tuple
903
Guido van Rossumb6775db1994-08-01 11:34:53 +0000904
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    This is parsedate_tz() with the trailing timezone offset dropped; a
    non-tuple result from parsedate_tz() is passed through unchanged.
    """
    parsed = parsedate_tz(data)
    if type(parsed) != type(()):
        return parsed
    return parsed[:9]
911
Guido van Rossum27cb8a41996-11-20 22:12:26 +0000912
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    data[9] is the zone offset in seconds, or None when the zone is
    unknown, in which case the fields are taken to be local time.
    """
    fields, offset = data[:8], data[9]
    if offset is not None:
        # Evaluate the fields as local standard time (DST flag 0), then
        # remove both the local zone and the message's own zone offset.
        stamp = time.mktime(fields + (0,))
        return stamp - offset - time.timezone
    # No zone info, so localtime is better assumption than GMT
    return time.mktime(fields + (-1,))
Guido van Rossum6cdd7a01996-12-12 18:39:54 +0000921
def formatdate(timeval=None):
    """Returns time format preferred for Internet standards.

    Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123

    timeval is a time in seconds since the epoch; when it is None (the
    default) the current time is used.  The result is always expressed
    in GMT.

    Day and month names are taken from fixed English tables rather than
    from time.strftime(), because strftime()'s %a and %b honor the
    current locale and could produce non-English names, which the
    Internet date format does not allow.
    """
    if timeval is None:
        timeval = time.time()
    now = time.gmtime(timeval)
    # gmtime() fields: 0=year 1=month 2=day 3=hour 4=minute 5=second
    # 6=weekday (Monday is 0).
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")[now[6]],
        now[2],
        ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[now[1] - 1],
        now[0], now[3], now[4], now[5])
931
Guido van Rossumb6775db1994-08-01 11:34:53 +0000932
# When used as a script, run a small test program.
# The optional first command line argument is the name of a file
# containing one message in RFC-822 format; when it is omitted,
# $HOME/Mail/inbox/1 is read instead.
936
if __name__ == '__main__':
    # Small self-test: parse one message and print what was found.
    import os
    import sys
    # Only fall back to the default mailbox (which needs $HOME) when no
    # filename was given on the command line.
    if sys.argv[1:]:
        path = sys.argv[1]
    else:
        path = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
    f = open(path, 'r')
    try:
        m = Message(f)
        print('From: %s' % (m.getaddr('from'),))
        print('To: %s' % (m.getaddrlist('to'),))
        print('Subject: %s' % (m.getheader('subject'),))
        print('Date: %s' % (m.getheader('date'),))
        date = m.getdate_tz('date')
        # Test the parse result before touching it: a missing or
        # unparseable Date header yields no tuple at all.
        if date:
            tz = date[-1]
            line = 'ParsedDate: ' + time.asctime(time.localtime(mktime_tz(date)))
            if tz is not None:
                # Format the offset from its magnitude so that negative
                # zones with a minutes part (e.g. -0130) come out right.
                if tz < 0:
                    sign = '-'
                    tz = -tz
                else:
                    sign = '+'
                hhmm, ss = divmod(tz, 60)
                hh, mm = divmod(hhmm, 60)
                line = line + ' ' + '%s%02d%02d' % (sign, hh, mm)
                if ss:
                    line = line + ' ' + '.%02d' % ss
            print(line)
        else:
            print('ParsedDate: None')
        m.rewindbody()
        n = 0
        while f.readline():
            n = n + 1
        print('Lines: %d' % n)
        print('-'*70)
        print('len = %d' % len(m))
        if m.has_key('Date'): print('Date = %s' % (m['Date'],))
        # Exercise has_key() on a header that should be absent.
        if m.has_key('X-Nonsense'): pass
        print('keys = %s' % (m.keys(),))
        print('values = %s' % (m.values(),))
        print('items = %s' % (m.items(),))
    finally:
        f.close()