blob: 782054bcd8424855d0b2f5d675db8449b48cf222 [file] [log] [blame]
Guido van Rossum9ab94c11997-12-10 16:17:39 +00001"""RFC-822 message manipulation class.
Guido van Rossum01ca3361992-07-13 14:28:59 +00002
Guido van Rossum9ab94c11997-12-10 16:17:39 +00003XXX This is only a very rough sketch of a full RFC-822 parser;
4in particular the tokenizing of addresses does not adhere to all the
5quoting rules.
6
7Directions for use:
8
9To create a Message object: first open a file, e.g.:
10 fp = open(file, 'r')
Guido van Rossumc7bb8571998-06-10 21:31:01 +000011You can use any other legal way of getting an open file object, e.g. use
12sys.stdin or call os.popen().
Guido van Rossum9ab94c11997-12-10 16:17:39 +000013Then pass the open file object to the Message() constructor:
14 m = Message(fp)
15
Guido van Rossume894fc01998-06-11 13:58:40 +000016This class can work with any input object that supports a readline
17method. If the input object has seek and tell capability, the
18rewindbody method will work; also illegal lines will be pushed back
19onto the input stream. If the input object lacks seek but has an
20`unread' method that can push back a line of input, Message will use
21that to push back illegal lines. Thus this class can be used to parse
22messages coming from a buffered stream.
Guido van Rossumc7bb8571998-06-10 21:31:01 +000023
The optional `seekable' argument is provided as a workaround for
certain stdio libraries in which tell() discards buffered data before
discovering that the lseek() system call doesn't work.  For maximum
portability, you should set the seekable argument to zero to prevent
that initial tell() call when passing in an unseekable object such as
a file object created from a socket object.  If it is 1 on entry --
which it is by default -- the tell() method of the open file object is
called once; if this raises an exception, seekable is reset to 0.  For
other nonzero values of seekable, this test is not made.
33
Guido van Rossum9ab94c11997-12-10 16:17:39 +000034To get the text of a particular header there are several methods:
35 str = m.getheader(name)
36 str = m.getrawheader(name)
37where name is the name of the header, e.g. 'Subject'.
38The difference is that getheader() strips the leading and trailing
39whitespace, while getrawheader() doesn't. Both functions retain
40embedded whitespace (including newlines) exactly as they are
41specified in the header, and leave the case of the text unchanged.
42
43For addresses and address lists there are functions
44 realname, mailaddress = m.getaddr(name) and
45 list = m.getaddrlist(name)
46where the latter returns a list of (realname, mailaddr) tuples.
47
48There is also a method
49 time = m.getdate(name)
50which parses a Date-like field and returns a time-compatible tuple,
51i.e. a tuple such as returned by time.localtime() or accepted by
52time.mktime().
53
54See the class definition for lower level access methods.
55
56There are also some utility functions here.
57"""
Guido van Rossum4d4ab921998-06-16 22:27:09 +000058# Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
Guido van Rossum01ca3361992-07-13 14:28:59 +000059
Guido van Rossumb6775db1994-08-01 11:34:53 +000060import time
Guido van Rossum01ca3361992-07-13 14:28:59 +000061
62
_blanklines = ('\r\n', '\n')            # Optimization for islast()


class Message:
    """Represents a single RFC-822-compliant message.

    The constructor reads the header block from an open file-like object
    and leaves the object positioned at the start of the body.  Headers
    are available both as the raw list of lines (self.headers) and as a
    dictionary keyed by lower-cased header name (self.dict); the
    dictionary only remembers the *last* occurrence of each header.
    """

    def __init__(self, fp, seekable = 1):
        """Initialize the class instance and read the headers.

        `fp' is any object with a readline() method.  `seekable' tells
        whether fp supports tell()/seek(); when it is exactly 1 this is
        verified by calling fp.tell() once.
        """
        if seekable == 1:
            # Exercise tell() to make sure it works
            # (and then assume seek() works, too).  Only the failures
            # that mean "cannot tell" are caught: a missing method or an
            # I/O error.  Anything else propagates instead of being
            # silently swallowed.
            try:
                fp.tell()
            except (AttributeError, IOError):
                seekable = 0
        self.fp = fp
        self.seekable = seekable
        self.startofheaders = None
        self.startofbody = None
        #
        if self.seekable:
            try:
                self.startofheaders = self.fp.tell()
            except IOError:
                self.seekable = 0
        #
        self.readheaders()
        #
        if self.seekable:
            try:
                self.startofbody = self.fp.tell()
            except IOError:
                self.seekable = 0

    def rewindbody(self):
        """Rewind the file to the start of the body (if seekable).

        Raises IOError if the underlying file is not seekable.
        """
        if not self.seekable:
            raise IOError("unseekable file")
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines.

        Read header lines up to the entirely blank line that
        terminates them.  The (normally blank) line that ends the
        headers is skipped, but not included in the returned list.
        If a non-header line ends the headers, (which is an error),
        an attempt is made to backspace over it; it is never
        included in the returned list.

        The variable self.status is set to the empty string if all
        went well, otherwise it is an error message.
        The variable self.headers is a completely uninterpreted list
        of lines contained in the header (so printing them will
        reproduce the header exactly as it appears in the file).
        """
        self.dict = {}
        self.unixfrom = ''
        self.headers = lines = []
        self.status = ''
        headerseen = ""
        firstline = 1
        startofline = unread = tell = None
        # Choose a push-back strategy: an explicit unread() method wins,
        # otherwise remember the position of every line via tell().
        if hasattr(self.fp, 'unread'):
            unread = self.fp.unread
        elif self.seekable:
            tell = self.fp.tell
        while 1:
            if tell:
                try:
                    startofline = tell()
                except IOError:
                    startofline = tell = None
                    self.seekable = 0
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            # Skip unix From name time lines
            if firstline and line.startswith('From '):
                self.unixfrom = self.unixfrom + line
                continue
            firstline = 0
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                lines.append(line)
                x = (self.dict[headerseen] + "\n " + line.strip())
                self.dict[headerseen] = x.strip()
                continue
            elif self.iscomment(line):
                # It's a comment.  Ignore it.
                continue
            elif self.islast(line):
                # Note! No pushback here!  The delimiter line gets eaten.
                break
            headerseen = self.isheader(line)
            if headerseen:
                # It's a legal header line, save it.
                lines.append(line)
                self.dict[headerseen] = line[len(headerseen)+1:].strip()
                continue
            else:
                # It's not a header line; throw it back and stop here.
                if not self.dict:
                    self.status = 'No headers'
                else:
                    self.status = 'Non-header line where header expected'
                # Try to undo the read.
                if unread:
                    unread(line)
                elif tell:
                    self.fp.seek(startofline)
                else:
                    self.status = self.status + '; bad seek'
                break

    def isheader(self, line):
        """Determine whether a given line is a legal header.

        This method should return the header name, suitably canonicalized
        (here: the lower-cased text before the first colon), or None if
        the line is not a header.
        You may override this method in order to use Message parsing
        on tagged data in RFC822-like formats with special header formats.
        """
        i = line.find(':')
        if i > 0:
            return line[:i].lower()
        else:
            return None

    def islast(self, line):
        """Determine whether a line is a legal end of RFC-822 headers.

        You may override this method if your application wants
        to bend the rules, e.g. to strip trailing whitespace,
        or to recognize MH template separators ('--------').
        For convenience (e.g. for code reading from sockets) a
        line consisting of CR LF also matches.
        """
        return line in _blanklines

    def iscomment(self, line):
        """Determine whether a line should be skipped entirely.

        You may override this method in order to use Message parsing
        on tagged data in RFC822-like formats that support embedded
        comments or free-text data.  This base implementation never
        treats a line as a comment.
        """
        return None

    def getallmatchingheaders(self, name):
        """Find all header lines matching a given header name.

        Look through the list of headers and find all lines
        matching a given header name (and their continuation
        lines).  A list of the lines is returned, without
        interpretation.  If the header does not occur, an
        empty list is returned.  If the header occurs multiple
        times, all occurrences are returned.  Case is not
        important in the header name.
        """
        name = name.lower() + ':'
        n = len(name)
        matches = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif not line[:1].isspace():
                # A new, different header ends the current match.
                hit = 0
            if hit:
                matches.append(line)
        return matches

    def getfirstmatchingheader(self, name):
        """Get the first header line matching name.

        This is similar to getallmatchingheaders, but it returns
        only the first matching header (and its continuation
        lines).
        """
        name = name.lower() + ':'
        n = len(name)
        matches = []
        hit = 0
        for line in self.headers:
            if hit:
                if not line[:1].isspace():
                    break
            elif line[:n].lower() == name:
                hit = 1
            if hit:
                matches.append(line)
        return matches

    def getrawheader(self, name):
        """A higher-level interface to getfirstmatchingheader().

        Return a string containing the literal text of the
        header but with the keyword stripped.  All leading,
        trailing and embedded whitespace is kept in the
        string, however.
        Return None if the header does not occur.
        """
        lines = self.getfirstmatchingheader(name)
        if not lines:
            return None
        # Drop "<name>:" from the first line only.
        lines[0] = lines[0][len(name) + 1:]
        return ''.join(lines)

    def getheader(self, name, default=None):
        """Get the header value for a name.

        This is the normal interface: it returns a stripped
        version of the header value for a given header name,
        or `default' (None unless given) if it doesn't exist.
        This uses the dictionary version which finds the *last*
        such header.
        """
        try:
            return self.dict[name.lower()]
        except KeyError:
            return default
    get = getheader

    def getheaders(self, name):
        """Get all values for a header.

        This returns a list of values for headers given more than once;
        each value in the result list is stripped in the same way as the
        result of getheader().  If the header is not given, return an
        empty list.
        """
        result = []
        current = ''
        have_header = 0
        for s in self.getallmatchingheaders(name):
            if s[0].isspace():
                # Continuation line: fold it into the value in progress.
                if current:
                    current = "%s\n %s" % (current, s.strip())
                else:
                    current = s.strip()
            else:
                if have_header:
                    result.append(current)
                current = s[s.find(":") + 1:].strip()
                have_header = 1
        if have_header:
            result.append(current)
        return result

    def getaddr(self, name):
        """Get a single address from a header, as a tuple.

        An example return value:
        ('Guido van Rossum', 'guido@cwi.nl')

        Returns (None, None) if the header yields no address.
        """
        # New, by Ben Escoto
        alist = self.getaddrlist(name)
        if alist:
            return alist[0]
        else:
            return (None, None)

    def getaddrlist(self, name):
        """Get a list of addresses from a header.

        Retrieves a list of addresses from a header, where each address is a
        tuple as returned by getaddr().  Scans all named headers, so it works
        properly with multiple To: or Cc: headers for example.

        """
        raw = []
        for h in self.getallmatchingheaders(name):
            if h[0] in ' \t':
                raw.append(h)
            else:
                if raw:
                    # Separate the contents of successive headers.
                    raw.append(', ')
                i = h.find(':')
                if i > 0:
                    addr = h[i+1:]
                raw.append(addr)
        alladdrs = ''.join(raw)
        a = AddrlistClass(alladdrs)
        return a.getaddrlist()

    def getdate(self, name):
        """Retrieve a date field from a header.

        Retrieves a date field from the named header, returning
        a tuple compatible with time.mktime().  Returns None if the
        header is absent.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate(data)

    def getdate_tz(self, name):
        """Retrieve a date field from a header as a 10-tuple.

        The first 9 elements make up a tuple compatible with
        time.mktime(), and the 10th is the offset of the poster's
        time zone from GMT/UTC.  Returns None if the header is absent.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate_tz(data)


    # Access as a dictionary (only finds *last* header of each type):

    def __len__(self):
        """Get the number of headers in a message."""
        return len(self.dict)

    def __getitem__(self, name):
        """Get a specific header, as from a dictionary."""
        return self.dict[name.lower()]

    def __setitem__(self, name, value):
        """Set the value of a header.

        Note: This is not a perfect inversion of __getitem__, because
        any changed headers get stuck at the end of the raw-headers list
        rather than where the altered header was.
        """
        del self[name] # Won't fail if it doesn't exist
        self.dict[name.lower()] = value
        text = name + ": " + value
        for line in text.split("\n"):
            self.headers.append(line + "\n")

    def __delitem__(self, name):
        """Delete all occurrences of a specific header, if it is present."""
        name = name.lower()
        if name not in self.dict:
            return
        del self.dict[name]
        name = name + ':'
        n = len(name)
        kept = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif not line[:1].isspace():
                hit = 0
            if not hit:
                kept.append(line)
        # Slice-assign so that self.headers stays the same list object.
        self.headers[:] = kept

    def __contains__(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def __iter__(self):
        """Iterate over the (lower-cased) header field names."""
        return iter(self.dict)

    def has_key(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def keys(self):
        """Get all of a message's header field names."""
        return self.dict.keys()

    def values(self):
        """Get all of a message's header field values."""
        return self.dict.values()

    def items(self):
        """Get all of a message's headers.

        Returns a list of name, value tuples.
        """
        return self.dict.items()

    def __str__(self):
        """Return the raw header lines joined into one string."""
        return ''.join(self.headers)
Guido van Rossum01ca3361992-07-13 14:28:59 +0000447
448
449# Utility functions
450# -----------------
451
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000452# XXX Should fix unquote() and quote() to be really conformant.
Guido van Rossumb6775db1994-08-01 11:34:53 +0000453# XXX The inverses of the parse functions may also be useful.
454
Guido van Rossum01ca3361992-07-13 14:28:59 +0000455
def unquote(str):
    """Remove quotes from a string.

    A string enclosed in double quotes has the quotes removed and the
    backslash escapes produced by quote() (for backslash and double
    quote) undone, so that unquote('"' + quote(s) + '"') == s.  A string
    enclosed in angle brackets just has the brackets removed.  Any other
    string, including one shorter than two characters, is returned
    unchanged.
    """
    if len(str) > 1:
        if str[0] == '"' and str[-1:] == '"':
            # Undo the escaping in the reverse order of quote().
            return str[1:-1].replace('\\\\', '\\').replace('\\"', '"')
        if str[0] == '<' and str[-1:] == '>':
            return str[1:-1]
    return str
Guido van Rossumb6775db1994-08-01 11:34:53 +0000464
465
def quote(str):
    """Escape a string for use inside a quoted-string.

    Every backslash is doubled and every double quote is preceded by a
    backslash.  The surrounding double quotes themselves are NOT added.
    """
    # Backslashes must be escaped first so that the backslashes added
    # for the double quotes are not doubled again.
    escaped = str.replace('\\', '\\\\')
    escaped = escaped.replace('"', '\\"')
    return escaped
Guido van Rossumb6775db1994-08-01 11:34:53 +0000469
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000470
def parseaddr(address):
    """Parse an address into a (realname, mailaddr) tuple.

    Only the first address found in `address' is returned; if none can
    be parsed the result is (None, None).
    """
    parsed = AddrlistClass(address).getaddrlist()
    if parsed:
        return parsed[0]
    return (None, None)
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000479
480
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of
    RFC-822 in front of you.

    The parser is a hand-written scanner over self.field; self.pos is
    the index of the next unread character and every get*() method
    advances it.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        # Any of these characters terminates an atom.
        self.atomends = self.specials + self.LWS + self.CR
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line breaks; any comments met on the way
        are collected in self.commentlist.
        """
        skippable = self.LWS + '\n\r'
        while self.pos < len(self.field):
            if self.field[self.pos] in skippable:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else: break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each one a
        (realname, mailaddr) tuple.  Parsing stops at the first call to
        getaddress() that yields nothing.
        """
        # Iterative on purpose: one recursive call per address would hit
        # the interpreter's recursion limit on long address lists.
        result = []
        while 1:
            ad = self.getaddress()
            if not ad:
                break
            result.extend(ad)
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, mailaddr) tuples: empty if nothing
        could be parsed, one element for a plain address, and possibly
        several for a group.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        oldcl = self.commentlist
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos = self.pos + 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos = self.pos + 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' + \
                         ' '.join(self.commentlist) + ')', routeaddr)]
            else: returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the scan always makes progress.
                self.pos = self.pos + 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos = self.pos + 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if self is not looking at a '<', or if no addrspec
        is found before the closing '>'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = 0
        self.pos = self.pos + 1
        self.gotonext()
        adlist = None
        while self.pos < len(self.field):
            if expectroute:
                # The domain following an '@' in the route is discarded.
                self.getdomain()
                expectroute = 0
            elif self.field[self.pos] == '>':
                self.pos = self.pos + 1
                break
            elif self.field[self.pos] == '@':
                self.pos = self.pos + 1
                expectroute = 1
            elif self.field[self.pos] == ':':
                self.pos = self.pos + 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addrspec
                # (normally the closing '>').
                self.pos = self.pos + 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC-822 addr-spec.

        Returns the local part alone if no '@' follows it, otherwise
        local-part + '@' + domain.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos = self.pos + 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else: aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos = self.pos + 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos = self.pos + 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else: sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments = 1):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC-822 comments
        are allowed within the parsed fragment.

        The delimiters are not part of the returned string, and a
        backslash makes the following character literal.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        quote = 0
        self.pos = self.pos + 1
        while self.pos < len(self.field):
            if quote == 1:
                # Previous character was a backslash: take this one as is.
                slist.append(self.field[self.pos])
                quote = 0
            elif self.field[self.pos] in endchars:
                self.pos = self.pos + 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
            elif self.field[self.pos] == '\\':
                quote = 1
            else:
                slist.append(self.field[self.pos])
            self.pos = self.pos + 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', 0)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', 1)

    def getdomainliteral(self):
        """Parse an RFC-822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', 0)

    def getatom(self):
        """Parse an RFC-822 atom.

        Collects characters up to (not including) the next character
        found in self.atomends.
        """
        atomlist = ['']

        while self.pos < len(self.field):
            if self.field[self.pos] in self.atomends:
                break
            else: atomlist.append(self.field[self.pos])
            self.pos = self.pos + 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC-822 phrases.

        A phrase is a sequence of words, which are in turn either
        RFC-822 atoms or quoted-strings.  Phrases are canonicalized
        by squeezing all runs of continuous whitespace into one space.

        Returns the list of words; comments met on the way are added to
        self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.atomends:
                break
            else: plist.append(self.getatom())

        return plist
Guido van Rossumb6775db1994-08-01 11:34:53 +0000738
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC822 addresses.

    The parsed (realname, mailaddr) tuples are kept in self.addresslist.
    The +, +=, - and -= operators treat two address lists as sets.
    """

    def __init__(self, field):
        """Parse `field'; a false value (None, '') gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __str__(self):
        return ", ".join(map(dump_address_pair, self.addresslist))

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place.  The result is computed before
        # self.addresslist is touched, so `a -= a' is safe, and every
        # duplicate of a removed address goes, exactly as in __sub__.
        unwanted = other.addresslist[:]
        self.addresslist[:] = [x for x in self.addresslist
                               if x not in unwanted]
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]
Guido van Rossum81d10b41998-06-16 22:29:03 +0000788
def dump_address_pair(pair):
    """Dump a (name, address) pair in a canonicalized form.

    pair[0] is the display name and pair[1] the address.  With a
    non-empty name the result is '"name" <address>'; otherwise it is
    the bare address.

    Backslashes and double quotes inside the name are backslash-escaped
    so that the result is a well-formed quoted-string.
    """
    name = pair[0]
    address = pair[1]
    if not name:
        return address
    # Escape backslashes first so the ones added for quotes stay intact.
    escaped = name.replace('\\', '\\\\').replace('"', '\\"')
    return '"' + escaped + '" <' + address + '>'
Guido van Rossumb6775db1994-08-01 11:34:53 +0000795
# Parse a date field

# Abbreviated month names first, then the full names; an index past 12
# is folded back onto 1..12 by parsedate_tz().
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Values are in the same +/-hhmm form as numeric zones (e.g. -500).
_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-tuple.

    Returns (year, month, day, hour, minute, second, 0, 0, 0, tzoffset)
    or None when the string cannot be parsed (including empty input).
    tzoffset is the zone's offset from UTC in seconds (-0500 -> -18000),
    or None when no zone was recognized.  The year is returned exactly
    as written (two-digit years are not expanded).

    Zone names are looked up in _timezones; of the military zones only
    Z is recognized.  Numeric +hhmm/-hhmm zones are accepted whether
    separated from the time by whitespace or attached directly to it.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input.
        return None
    if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time ("08:49:37+0100").
        s = data[3]
        i = s.find('+')
        if i > 0:
            data[3:] = [s[:i], s[i+1:]]
        else:
            i = s.find('-')
            if i > 0:
                # Keep the sign so the offset stays negative.
                data[3:] = [s[:i], s[i:]]
            else:
                data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    dd, mm, yy, tm, tz = data[:5]
    mm = mm.lower()
    if mm not in _monthnames:
        # Maybe "Nov 6" rather than "6 Nov": swap day and month.
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        mm = mm - 12
    # Slices ([-1:], [:1]) are used below so that an empty token falls
    # through to the int() check instead of raising IndexError.
    if dd[-1:] == ',':
        dd = dd[:-1]
    if yy.find(':') > 0:
        # Year and time are in the opposite order.
        yy, tm = tm, yy
    if yy[-1:] == ',':
        yy = yy[:-1]
    if not yy[:1].isdigit():
        # Year and zone are in the opposite order.
        yy, tz = tz, yy
    if tm[-1:] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        thh, tmm = tm
        tss = '0'
    elif len(tm) == 3:
        thh, tmm, tss = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tz = tz.upper()
    tzoffset = _timezones.get(tz)
    if tzoffset is None:
        try:
            tzoffset = int(tz)
        except ValueError:
            # Unknown or missing zone: report it as None.
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        hours, minutes = divmod(tzoffset, 100)
        tzoffset = tzsign * (hours * 3600 + minutes * 60)
    return (yy, mm, dd, thh, tmm, tss, 0, 0, 0, tzoffset)
896
Guido van Rossumb6775db1994-08-01 11:34:53 +0000897
def parsedate(data):
    """Convert a time string to a time tuple.

    Delegates to parsedate_tz() and drops the trailing timezone offset,
    leaving the first nine fields.  Any non-tuple result (None on a
    parse failure) is passed through unchanged.
    """
    parsed = parsedate_tz(data)
    if type(parsed) != type(()):
        return parsed
    return parsed[:9]
904
Guido van Rossum27cb8a41996-11-20 22:12:26 +0000905
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    data[9] is the zone offset in seconds, or None.  With an offset the
    date fields are converted with calendar.timegm(), so the result
    does not depend on the local timezone or its daylight-saving
    rules.  Without one the fields are interpreted as local time by
    time.mktime().
    """
    import calendar

    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    # Read the fields as UTC, then subtract the zone's offset.
    return calendar.timegm(data[:9]) - data[9]
Guido van Rossum6cdd7a01996-12-12 18:39:54 +0000914
def formatdate(timeval=None):
    """Returns time format preferred for Internet standards.

    Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123

    timeval is a timestamp in seconds since the epoch; when omitted or
    None the current time is used.  Day and month names come from
    fixed English tables rather than strftime(), so the result does
    not vary with the process locale.
    """
    if timeval is None:
        timeval = time.time()
    fields = time.gmtime(timeval)
    # gmtime() numbers weekdays from Monday = 0 and months from 1.
    dayname = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")[fields[6]]
    monthname = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[fields[1] - 1]
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        dayname, fields[2], monthname, fields[0],
        fields[3], fields[4], fields[5])
924
Guido van Rossumb6775db1994-08-01 11:34:53 +0000925
926# When used as script, run a small test program.
927# The first command line argument must be a filename containing one
928# message in RFC-822 format.
929
930if __name__ == '__main__':
Guido van Rossum9ab94c11997-12-10 16:17:39 +0000931 import sys, os
932 file = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
933 if sys.argv[1:]: file = sys.argv[1]
934 f = open(file, 'r')
935 m = Message(f)
936 print 'From:', m.getaddr('from')
937 print 'To:', m.getaddrlist('to')
938 print 'Subject:', m.getheader('subject')
939 print 'Date:', m.getheader('date')
940 date = m.getdate_tz('date')
Guido van Rossum1d2b23e2000-01-17 14:11:04 +0000941 tz = date[-1]
942 date = time.localtime(mktime_tz(date))
Guido van Rossum9ab94c11997-12-10 16:17:39 +0000943 if date:
Guido van Rossum1d2b23e2000-01-17 14:11:04 +0000944 print 'ParsedDate:', time.asctime(date),
945 hhmmss = tz
Guido van Rossum9ab94c11997-12-10 16:17:39 +0000946 hhmm, ss = divmod(hhmmss, 60)
947 hh, mm = divmod(hhmm, 60)
948 print "%+03d%02d" % (hh, mm),
949 if ss: print ".%02d" % ss,
950 print
951 else:
952 print 'ParsedDate:', None
953 m.rewindbody()
954 n = 0
955 while f.readline():
956 n = n + 1
957 print 'Lines:', n
958 print '-'*70
959 print 'len =', len(m)
960 if m.has_key('Date'): print 'Date =', m['Date']
961 if m.has_key('X-Nonsense'): pass
962 print 'keys =', m.keys()
963 print 'values =', m.values()
964 print 'items =', m.items()