blob: 8af8ad2487176cba61c0dd2cfeb5a0fe6da2bd50 [file] [log] [blame]
Guido van Rossum9ab94c11997-12-10 16:17:39 +00001"""RFC-822 message manipulation class.
Guido van Rossum01ca3361992-07-13 14:28:59 +00002
Guido van Rossum9ab94c11997-12-10 16:17:39 +00003XXX This is only a very rough sketch of a full RFC-822 parser;
4in particular the tokenizing of addresses does not adhere to all the
5quoting rules.
6
7Directions for use:
8
9To create a Message object: first open a file, e.g.:
10 fp = open(file, 'r')
Guido van Rossumc7bb8571998-06-10 21:31:01 +000011You can use any other legal way of getting an open file object, e.g. use
12sys.stdin or call os.popen().
Guido van Rossum9ab94c11997-12-10 16:17:39 +000013Then pass the open file object to the Message() constructor:
14 m = Message(fp)
15
Guido van Rossume894fc01998-06-11 13:58:40 +000016This class can work with any input object that supports a readline
17method. If the input object has seek and tell capability, the
18rewindbody method will work; also illegal lines will be pushed back
19onto the input stream. If the input object lacks seek but has an
20`unread' method that can push back a line of input, Message will use
21that to push back illegal lines. Thus this class can be used to parse
22messages coming from a buffered stream.
Guido van Rossumc7bb8571998-06-10 21:31:01 +000023
24The optional `seekable' argument is provided as a workaround for
25certain stdio libraries in which tell() discards buffered data before
26discovering that the lseek() system call doesn't work. For maximum
27portability, you should set the seekable argument to zero to prevent
that initial tell() call when passing in an unseekable object such as
a file object created from a socket object. If it is 1 on entry --
30which it is by default -- the tell() method of the open file object is
31called once; if this raises an exception, seekable is reset to 0. For
32other nonzero values of seekable, this test is not made.
33
Guido van Rossum9ab94c11997-12-10 16:17:39 +000034To get the text of a particular header there are several methods:
35 str = m.getheader(name)
36 str = m.getrawheader(name)
37where name is the name of the header, e.g. 'Subject'.
38The difference is that getheader() strips the leading and trailing
39whitespace, while getrawheader() doesn't. Both functions retain
40embedded whitespace (including newlines) exactly as they are
41specified in the header, and leave the case of the text unchanged.
42
43For addresses and address lists there are functions
44 realname, mailaddress = m.getaddr(name) and
45 list = m.getaddrlist(name)
46where the latter returns a list of (realname, mailaddr) tuples.
47
48There is also a method
49 time = m.getdate(name)
50which parses a Date-like field and returns a time-compatible tuple,
51i.e. a tuple such as returned by time.localtime() or accepted by
52time.mktime().
53
54See the class definition for lower level access methods.
55
56There are also some utility functions here.
57"""
Guido van Rossum4d4ab921998-06-16 22:27:09 +000058# Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
Guido van Rossum01ca3361992-07-13 14:28:59 +000059
Guido van Rossum01ca3361992-07-13 14:28:59 +000060import string
Guido van Rossumb6775db1994-08-01 11:34:53 +000061import time
Guido van Rossum01ca3361992-07-13 14:28:59 +000062
63
# The two line values that Message.islast() accepts as the end of the
# header section: a bare CR LF pair or a bare LF.  Kept as a module-level
# tuple so the check is a single membership test.
_blanklines = ('\r\n', '\n')            # Optimization for islast()
Guido van Rossum92457b91995-06-22 19:06:57 +000065
66
class Message:
    """Represents a single RFC-822-compliant message.

    The headers are read from a file-like object when the instance is
    created.  Afterwards the following attributes are available:

    fp             -- the input object that was passed in
    seekable       -- true if tell()/seek() are believed to work on fp
    startofheaders -- fp.tell() before the headers were read, or None
    startofbody    -- fp.tell() after the headers were read, or None
    headers        -- list of the raw header lines (continuation lines
                      included) in the order they were read
    dict           -- maps the lower-cased header name to its stripped
                      value; if a header occurs more than once the last
                      occurrence wins
    unixfrom       -- the leading Unix "From " line(s), or ''
    status         -- '' if the headers were read cleanly, otherwise an
                      error message
    """

    def __init__(self, fp, seekable=1):
        """Initialize the class instance and read the headers.

        `fp' is any object with a readline() method.  If `seekable' is
        exactly 1, fp.tell() is called once as a probe and seekable is
        reset to 0 when the probe fails; for any other value no probe is
        made.
        """
        if seekable == 1:
            # Exercise tell() to make sure it works
            # (and then assume seek() works, too).
            # Any ordinary failure means "not seekable"; unlike a bare
            # except this lets KeyboardInterrupt/SystemExit propagate on
            # interpreters where they do not derive from Exception.
            try:
                fp.tell()
            except Exception:
                seekable = 0
        self.fp = fp
        self.seekable = seekable
        self.startofheaders = None
        self.startofbody = None
        #
        if self.seekable:
            try:
                self.startofheaders = self.fp.tell()
            except IOError:
                self.seekable = 0
        #
        self.readheaders()
        #
        if self.seekable:
            try:
                self.startofbody = self.fp.tell()
            except IOError:
                self.seekable = 0

    def rewindbody(self):
        """Rewind the file to the start of the body (if seekable).

        Raises IOError if the input object is not seekable.
        """
        if not self.seekable:
            # Call syntax is accepted by every Python version, unlike the
            # old "raise IOError, msg" statement form.
            raise IOError("unseekable file")
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines.

        Read header lines up to the entirely blank line that
        terminates them.  The (normally blank) line that ends the
        headers is skipped, but not included in the returned list.
        If a non-header line ends the headers, (which is an error),
        an attempt is made to backspace over it; it is never
        included in the returned list.

        The variable self.status is set to the empty string if all
        went well, otherwise it is an error message.
        The variable self.headers is a completely uninterpreted list
        of lines contained in the header (so printing them will
        reproduce the header exactly as it appears in the file).
        """
        self.dict = {}
        self.unixfrom = ''
        self.headers = headers = []
        self.status = ''
        headerseen = ""
        firstline = 1
        # Pick the push-back strategy: an unread() method on the input
        # object is preferred, otherwise tell()/seek() if seekable.
        startofline = unread = tell = None
        if hasattr(self.fp, 'unread'):
            unread = self.fp.unread
        elif self.seekable:
            tell = self.fp.tell
        while 1:
            if tell:
                # Remember where this line starts so it can be pushed
                # back with seek() if it turns out not to be a header.
                try:
                    startofline = tell()
                except IOError:
                    startofline = tell = None
                    self.seekable = 0
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            # Skip unix From name time lines
            if firstline and line.startswith('From '):
                self.unixfrom = self.unixfrom + line
                continue
            firstline = 0
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                headers.append(line)
                joined = self.dict[headerseen] + "\n " + line.strip()
                self.dict[headerseen] = joined.strip()
                continue
            elif self.iscomment(line):
                # It's a comment.  Ignore it.
                continue
            elif self.islast(line):
                # Note! No pushback here!  The delimiter line gets eaten.
                break
            headerseen = self.isheader(line)
            if headerseen:
                # It's a legal header line, save it.
                headers.append(line)
                self.dict[headerseen] = line[len(headerseen)+1:].strip()
                continue
            else:
                # It's not a header line; throw it back and stop here.
                if not self.dict:
                    self.status = 'No headers'
                else:
                    self.status = 'Non-header line where header expected'
                # Try to undo the read.
                if unread:
                    unread(line)
                elif tell:
                    self.fp.seek(startofline)
                else:
                    self.status = self.status + '; bad seek'
                break

    def isheader(self, line):
        """Determine whether a given line is a legal header.

        This method should return the header name, suitably canonicalized.
        You may override this method in order to use Message parsing
        on tagged data in RFC822-like formats with special header formats.

        This implementation returns the lower-cased text before the first
        colon, or None if there is no colon or the line starts with one.
        """
        i = line.find(':')
        if i > 0:
            return line[:i].lower()
        else:
            return None

    def islast(self, line):
        """Determine whether a line is a legal end of RFC-822 headers.

        You may override this method if your application wants
        to bend the rules, e.g. to strip trailing whitespace,
        or to recognize MH template separators ('--------').
        For convenience (e.g. for code reading from sockets) a
        line consisting of CR LF also matches.
        """
        return line in _blanklines

    def iscomment(self, line):
        """Determine whether a line should be skipped entirely.

        You may override this method in order to use Message parsing
        on tagged data in RFC822-like formats that support embedded
        comments or free-text data.  This implementation never skips
        a line (it always returns None).
        """
        return None

    def getallmatchingheaders(self, name):
        """Find all header lines matching a given header name.

        Look through the list of headers and find all lines
        matching a given header name (and their continuation
        lines).  A list of the lines is returned, without
        interpretation.  If the header does not occur, an
        empty list is returned.  If the header occurs multiple
        times, all occurrences are returned.  Case is not
        important in the header name.
        """
        name = name.lower() + ':'
        n = len(name)
        matches = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif line[:1] not in string.whitespace:
                # A different header starts here; continuation lines
                # (leading whitespace) leave `hit' unchanged.
                hit = 0
            if hit:
                matches.append(line)
        return matches

    def getfirstmatchingheader(self, name):
        """Get the first header line matching name.

        This is similar to getallmatchingheaders, but it returns
        only the first matching header (and its continuation
        lines).
        """
        name = name.lower() + ':'
        n = len(name)
        matches = []
        hit = 0
        for line in self.headers:
            if hit:
                if line[:1] not in string.whitespace:
                    break
            elif line[:n].lower() == name:
                hit = 1
            if hit:
                matches.append(line)
        return matches

    def getrawheader(self, name):
        """A higher-level interface to getfirstmatchingheader().

        Return a string containing the literal text of the
        header but with the keyword stripped.  All leading,
        trailing and embedded whitespace is kept in the
        string, however.
        Return None if the header does not occur.
        """
        lines = self.getfirstmatchingheader(name)
        if not lines:
            return None
        # Drop "<name>:" from the first line only.
        lines[0] = lines[0][len(name) + 1:]
        return ''.join(lines)

    def getheader(self, name, default=None):
        """Get the header value for a name.

        This is the normal interface: it returns a stripped
        version of the header value for a given header name,
        or `default' (None unless given) if it doesn't exist.
        This uses the dictionary version which finds the *last*
        such header.
        """
        try:
            return self.dict[name.lower()]
        except KeyError:
            return default
    get = getheader

    def getheaders(self, name):
        """Get all values for a header.

        This returns a list of values for headers given more than once;
        each value in the result list is stripped in the same way as the
        result of getheader().  If the header is not given, return an
        empty list.
        """
        result = []
        current = ''
        have_header = 0
        for s in self.getallmatchingheaders(name):
            if s[0] in string.whitespace:
                # Continuation line: fold it into the current value.
                if current:
                    current = "%s\n %s" % (current, s.strip())
                else:
                    current = s.strip()
            else:
                # A new occurrence starts; flush the previous one.
                if have_header:
                    result.append(current)
                current = s[s.find(":") + 1:].strip()
                have_header = 1
        if have_header:
            result.append(current)
        return result

    def getaddr(self, name):
        """Get a single address from a header, as a tuple.

        An example return value:
        ('Guido van Rossum', 'guido@cwi.nl')

        Returns (None, None) if the header yields no address.
        """
        # New, by Ben Escoto
        alist = self.getaddrlist(name)
        if alist:
            return alist[0]
        else:
            return (None, None)

    def getaddrlist(self, name):
        """Get a list of addresses from a header.

        Retrieves a list of addresses from a header, where each address is a
        tuple as returned by getaddr().  Scans all named headers, so it works
        properly with multiple To: or Cc: headers for example.
        """
        raw = []
        for h in self.getallmatchingheaders(name):
            if h[0] in ' \t':
                raw.append(h)
            else:
                # Separate the values of repeated headers with a comma so
                # they parse as one address list.
                if raw:
                    raw.append(', ')
                i = h.find(':')
                if i > 0:
                    addr = h[i+1:]
                raw.append(addr)
        alladdrs = ''.join(raw)
        a = AddrlistClass(alladdrs)
        return a.getaddrlist()

    def getdate(self, name):
        """Retrieve a date field from a header.

        Retrieves a date field from the named header, returning
        a tuple compatible with time.mktime().  Returns None if
        the header is not present.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate(data)

    def getdate_tz(self, name):
        """Retrieve a date field from a header as a 10-tuple.

        The first 9 elements make up a tuple compatible with
        time.mktime(), and the 10th is the offset of the poster's
        time zone from GMT/UTC.  Returns None if the header is
        not present.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate_tz(data)


    # Access as a dictionary (only finds *last* header of each type):

    def __len__(self):
        """Get the number of headers in a message."""
        return len(self.dict)

    def __getitem__(self, name):
        """Get a specific header, as from a dictionary."""
        return self.dict[name.lower()]

    def __setitem__(self, name, value):
        """Set the value of a header.

        Note: This is not a perfect inversion of __getitem__, because
        any changed headers get stuck at the end of the raw-headers list
        rather than where the altered header was.
        """
        del self[name] # Won't fail if it doesn't exist
        self.dict[name.lower()] = value
        text = name + ": " + value
        for line in text.split("\n"):
            self.headers.append(line + "\n")

    def __delitem__(self, name):
        """Delete all occurrences of a specific header, if it is present."""
        name = name.lower()
        try:
            del self.dict[name]
        except KeyError:
            return
        prefix = name + ':'
        n = len(prefix)
        kept = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == prefix:
                hit = 1
            elif line[:1] not in string.whitespace:
                hit = 0
            if not hit:
                kept.append(line)
        # Replace the contents in place so other references to the
        # headers list observe the deletion.
        self.headers[:] = kept

    def __contains__(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def has_key(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def keys(self):
        """Get all of a message's header field names."""
        return self.dict.keys()

    def values(self):
        """Get all of a message's header field values."""
        return self.dict.values()

    def items(self):
        """Get all of a message's headers.

        Returns a list of name, value tuples.
        """
        return self.dict.items()

    def __str__(self):
        """Return the raw header lines concatenated into one string."""
        return ''.join(self.headers)
Guido van Rossum01ca3361992-07-13 14:28:59 +0000448
449
450# Utility functions
451# -----------------
452
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000453# XXX Should fix unquote() and quote() to be really conformant.
Guido van Rossumb6775db1994-08-01 11:34:53 +0000454# XXX The inverses of the parse functions may also be useful.
455
Guido van Rossum01ca3361992-07-13 14:28:59 +0000456
def unquote(str):
    """Strip one pair of enclosing "..." or <...> delimiters, if present.

    Strings shorter than two characters, and strings not wrapped in a
    matching pair, are returned unchanged.
    """
    if len(str) < 2:
        return str
    ends = (str[0], str[-1])
    if ends == ('"', '"') or ends == ('<', '>'):
        return str[1:-1]
    return str
Guido van Rossumb6775db1994-08-01 11:34:53 +0000465
466
def quote(str):
    """Escape a string for use inside a quoted-string.

    Every backslash is doubled and every double quote is preceded by a
    backslash.  No surrounding quote characters are added.
    """
    # Backslashes must be escaped first so the ones introduced for the
    # double quotes are not doubled again.
    escaped = str.replace('\\', '\\\\')
    return escaped.replace('"', '\\"')
Guido van Rossumb6775db1994-08-01 11:34:53 +0000470
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000471
def parseaddr(address):
    """Parse an address into a (realname, mailaddr) tuple.

    Only the first address found in `address' is returned; if none is
    found the result is (None, None).
    """
    parsed = AddrlistClass(address).getaddrlist()
    if parsed:
        return parsed[0]
    return (None, None)
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000480
481
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of
    RFC-822 in front of you.

    The parser keeps a cursor (self.pos) into the header text
    (self.field); each get*() method consumes text starting at the
    cursor and leaves the cursor after what it parsed.  Comments met
    along the way are collected in self.commentlist.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        # Any of these characters terminates an atom.
        self.atomends = self.specials + self.LWS + self.CR
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips white space and line breaks; comments are consumed and
        appended to self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each one a
        (realname, mailaddr) tuple.  Parsing stops at the first
        position where getaddress() yields nothing.
        """
        # Iterative on purpose: one recursion level per address would hit
        # the interpreter's recursion limit on long address lists.
        result = []
        ad = self.getaddress()
        while ad:
            result.extend(ad)
            ad = self.getaddress()
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, mailaddr) tuples: one entry for a
        plain address, several for a group, and an empty list if no
        address could be parsed at the current position.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Take a real snapshot (a copy).  getphraselist() appends to
        # self.commentlist in place, so an alias would still contain the
        # comments that are collected again when the addr-spec branch
        # below re-parses from oldpos, recording them twice.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []
        fieldlen = len(self.field)

        if self.pos >= fieldlen:
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group: parse members up to the closing ';'
            self.pos = self.pos + 1
            while self.pos < fieldlen:
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos = self.pos + 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special so the caller makes progress.
                self.pos = self.pos + 1

        self.gotonext()
        if self.pos < fieldlen and self.field[self.pos] == ',':
            self.pos = self.pos + 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not at '<' or no addrspec is found.
        """
        if self.field[self.pos] != '<':
            return None

        expectroute = 0
        self.pos = self.pos + 1
        self.gotonext()
        adlist = None
        while self.pos < len(self.field):
            if expectroute:
                # Domain of a route element; parsed and discarded.
                self.getdomain()
                expectroute = 0
            elif self.field[self.pos] == '>':
                self.pos = self.pos + 1
                break
            elif self.field[self.pos] == '@':
                self.pos = self.pos + 1
                expectroute = 1
            elif self.field[self.pos] == ':':
                self.pos = self.pos + 1
            else:
                adlist = self.getaddrspec()
                # Step over the character following the addrspec
                # (normally the closing '>').
                self.pos = self.pos + 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC-822 addr-spec.

        Returns the local part, followed by '@' and the domain when an
        '@' is present.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos = self.pos + 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos = self.pos + 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos = self.pos + 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments = 1):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC-822 comments
        are allowed within the parsed fragment.

        The returned text excludes the delimiters; a backslash is
        dropped and the character after it is taken literally.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        quote = 0
        self.pos = self.pos + 1
        while self.pos < len(self.field):
            if quote == 1:
                slist.append(self.field[self.pos])
                quote = 0
            elif self.field[self.pos] in endchars:
                self.pos = self.pos + 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                # getcomment() already moved the cursor past the nested
                # comment; advancing again would skip a character (and
                # could step over our own end delimiter).
                continue
            elif self.field[self.pos] == '\\':
                quote = 1
            else:
                slist.append(self.field[self.pos])
            self.pos = self.pos + 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', 0)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', 1)

    def getdomainliteral(self):
        """Parse an RFC-822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', 0)

    def getatom(self):
        """Parse an RFC-822 atom."""
        atomlist = ['']

        while self.pos < len(self.field):
            if self.field[self.pos] in self.atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos = self.pos + 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC-822 phrases.

        A phrase is a sequence of words, which are in turn either
        RFC-822 atoms or quoted-strings.  Phrases are canonicalized
        by squeezing all runs of continuous whitespace into one space.

        Returns the list of words; comments met between the words are
        appended to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.atomends:
                break
            else:
                plist.append(self.getatom())

        return plist
Guido van Rossumb6775db1994-08-01 11:34:53 +0000739
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC822 addresses.

    The parsed (realname, mailaddr) tuples are kept in self.addresslist.
    The arithmetic operators treat two address lists as sets.
    """

    def __init__(self, field):
        """Parse `field'; a false value gives an empty address list."""
        AddrlistClass.__init__(self, field)
        parsed = []
        if field:
            parsed = self.getaddrlist()
        self.addresslist = parsed

    def __len__(self):
        """Return the number of addresses held."""
        return len(self.addresslist)

    def __str__(self):
        """Return the addresses in canonical form, separated by ', '."""
        dumped = [dump_address_pair(pair) for pair in self.addresslist]
        return ", ".join(dumped)

    def __add__(self, other):
        """Set union: our addresses plus those of `other' we lack."""
        extra = [pair for pair in other.addresslist
                 if pair not in self.addresslist]
        union = AddressList(None)
        union.addresslist = self.addresslist + extra
        return union

    def __iadd__(self, other):
        """Set union, in-place."""
        # The membership test must see entries appended earlier in this
        # loop, so this stays an explicit loop.
        mine = self.addresslist
        for pair in other.addresslist:
            if pair not in mine:
                mine.append(pair)
        return self

    def __sub__(self, other):
        """Set difference: our addresses that `other' does not hold."""
        difference = AddressList(None)
        difference.addresslist.extend(
            [pair for pair in self.addresslist
             if pair not in other.addresslist])
        return difference

    def __isub__(self, other):
        """Set difference, in-place."""
        mine = self.addresslist
        for pair in other.addresslist:
            if pair in mine:
                mine.remove(pair)
        return self

    def __getitem__(self, index):
        """Make indexing, slices, and 'in' work."""
        return self.addresslist[index]
Guido van Rossum81d10b41998-06-16 22:29:03 +0000789
def dump_address_pair(pair):
    """Dump a (name, address) pair in a canonicalized form.

    With a non-empty name the result is `"name" <address>'.  Any
    backslash or double quote inside the name is backslash-escaped so
    that the quoted string stays well-formed.  With an empty (or None)
    name just the address is returned.
    """
    name, address = pair[0], pair[1]
    if not name:
        return address
    # Escape backslashes first so the ones added for quotes stay single.
    quoted = name.replace('\\', '\\\\').replace('"', '\\"')
    return '"' + quoted + '" <' + address + '>'
Guido van Rossumb6775db1994-08-01 11:34:53 +0000796
797# Parse a date field
798
# Month names as they may appear in a date: the twelve abbreviations
# first, then the twelve full names, so that (index % 12) + 1 is the
# month number either way.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.
#
# Values are written like numeric zones: a signed integer HHMM.

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-tuple, or None if it can't be parsed.

    The result is (year, month, day, hour, minute, second, 0, 0, 0,
    tzoffset).  tzoffset is the zone's offset from UTC in seconds
    (-0500 -> -18000), or None when the string carries no zone that is
    understood.  Zone names are looked up in _timezones (which has no
    military zones other than Z); any other zone must be numeric.  The
    year is returned as written: a two-digit year is not expanded.
    """
    data = data.split()
    if not data:
        # Empty or all-whitespace field
        return None
    if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # Four fields: the zone is either glued to the last field
        # ("08:49:37+0100", "08:49:37-0500") or missing altogether.
        s = data[3]
        i = s.find('+')
        if i > 0:
            data[3:] = [s[:i], s[i+1:]]
        else:
            i = s.find('-')
            if i > 0:
                # Keep the sign with the zone, it matters here
                data[3:] = [s[:i], s[i:]]
            else:
                data.append('') # Dummy tz
    if len(data) < 5:
        return None
    dd, mm, yy, tm, tz = data[:5]
    mm = mm.lower()
    if mm not in _monthnames:
        # Perhaps the month comes before the day
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        mm = mm - 12
    # Slices rather than indexing below: a component can be empty
    # (e.g. from "-Nov-94"), and that must end in None, not IndexError.
    if dd[-1:] == ',':
        dd = dd[:-1]
    if yy.find(':') > 0:
        # Year and time are the other way around
        yy, tm = tm, yy
    if yy[-1:] == ',':
        yy = yy[:-1]
    if yy and yy[0] not in string.digits:
        # What sits in the year slot is really the zone
        yy, tz = tz, yy
    if tm[-1:] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        thh, tmm = tm
        tss = '0'
    elif len(tm) == 3:
        thh, tmm, tss = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tz = tz.upper()
    try:
        tzoffset = _timezones[tz]
    except KeyError:
        try:
            tzoffset = int(tz)
        except ValueError:
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        # divmod keeps the arithmetic in integers
        hours, minutes = divmod(tzoffset, 100)
        tzoffset = tzsign * (hours*3600 + minutes*60)
    return (yy, mm, dd, thh, tmm, tss, 0, 0, 0, tzoffset)
897
Guido van Rossumb6775db1994-08-01 11:34:53 +0000898
def parsedate(data):
    """Convert a time string to a time tuple.

    This is parsedate_tz() without the timezone: only the first nine
    items of its result are returned.  Anything parsedate_tz() returns
    that is not a tuple (None, for an unparsable string) is passed on
    unchanged.
    """
    parsed = parsedate_tz(data)
    if type(parsed) != type( () ):
        return parsed
    return parsed[:9]
905
Guido van Rossum27cb8a41996-11-20 22:12:26 +0000906
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    data[9] is the zone offset in seconds, or None.  Without an offset
    the first six items are taken to be local time and the local zone
    rules (including DST) decide.  With an offset the result does not
    depend on the local zone at all.  A float is returned in both cases.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(tuple(data[:8]) + (-1,))
    # Imported here so the block does not depend on a module-level import.
    import calendar
    # timegm() treats the fields as UTC; subtracting the zone offset then
    # gives the real instant without going through the local timezone.
    return float(calendar.timegm(data[:6]) - data[9])
Guido van Rossum6cdd7a01996-12-12 18:39:54 +0000915
def formatdate(timeval=None):
    """Returns time format preferred for Internet standards.

    Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123

    timeval is a time in seconds since the epoch, as accepted by
    time.gmtime(); when it is omitted or None the current time is used.
    """
    if timeval is None:
        timeval = time.time()
    tt = time.gmtime(timeval)
    # The day and month names come from fixed English tables and not
    # from time.strftime(): its %a and %b follow the current locale,
    # while the date format requires the English abbreviations.
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")[tt[6]],
        tt[2],
        ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[tt[1] - 1],
        tt[0], tt[3], tt[4], tt[5])
925
Guido van Rossumb6775db1994-08-01 11:34:53 +0000926
# When used as script, run a small test program.
# The optional first command line argument is the name of a file
# containing one message in RFC-822 format; without it the file
# Mail/inbox/1 under $HOME is used.

if __name__ == '__main__':
    import sys
    import os
    if sys.argv[1:]:
        path = sys.argv[1]
    else:
        # Only look at $HOME when it is really needed
        path = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
    f = open(path, 'r')
    try:
        m = Message(f)
        # print() is always given one ready-made string, so the output
        # is the same whether print is a statement or a function.
        print('From: %s' % (m.getaddr('from'),))
        print('To: %s' % (m.getaddrlist('to'),))
        print('Subject: %s' % (m.getheader('subject'),))
        print('Date: %s' % (m.getheader('date'),))
        date = m.getdate_tz('date')
        if date:
            tz = date[-1]
            pieces = ['ParsedDate:',
                      time.asctime(time.localtime(mktime_tz(date)))]
            if tz is not None:
                # Split the magnitude and add the sign afterwards: divmod
                # on a negative offset rounds down and would show -0430
                # as -0530.
                if tz < 0:
                    sign = '-'
                else:
                    sign = '+'
                hhmm, ss = divmod(abs(tz), 60)
                hh, mm = divmod(hhmm, 60)
                pieces.append('%s%02d%02d' % (sign, hh, mm))
                if ss:
                    pieces.append('.%02d' % ss)
            print(' '.join(pieces))
        else:
            # Date header missing or not parsable
            print('ParsedDate: %s' % (None,))
        m.rewindbody()
        n = 0
        while f.readline():
            n = n + 1
        print('Lines: %d' % n)
        print('-' * 70)
        print('len = %d' % len(m))
        if m.has_key('Date'):
            print('Date = %s' % (m['Date'],))
        if m.has_key('X-Nonsense'):
            pass
        print('keys = %s' % (m.keys(),))
        print('values = %s' % (m.values(),))
        print('items = %s' % (m.items(),))
    finally:
        f.close()
965 print 'items =', m.items()