blob: 662703beca3b74860ed99a4fac7b3d02097d8745 [file] [log] [blame]
Guido van Rossum9ab94c11997-12-10 16:17:39 +00001"""RFC-822 message manipulation class.
Guido van Rossum01ca3361992-07-13 14:28:59 +00002
Guido van Rossum9ab94c11997-12-10 16:17:39 +00003XXX This is only a very rough sketch of a full RFC-822 parser;
4in particular the tokenizing of addresses does not adhere to all the
5quoting rules.
6
7Directions for use:
8
9To create a Message object: first open a file, e.g.:
10 fp = open(file, 'r')
Guido van Rossumc7bb8571998-06-10 21:31:01 +000011You can use any other legal way of getting an open file object, e.g. use
12sys.stdin or call os.popen().
Guido van Rossum9ab94c11997-12-10 16:17:39 +000013Then pass the open file object to the Message() constructor:
14 m = Message(fp)
15
Guido van Rossume894fc01998-06-11 13:58:40 +000016This class can work with any input object that supports a readline
17method. If the input object has seek and tell capability, the
18rewindbody method will work; also illegal lines will be pushed back
19onto the input stream. If the input object lacks seek but has an
20`unread' method that can push back a line of input, Message will use
21that to push back illegal lines. Thus this class can be used to parse
22messages coming from a buffered stream.
Guido van Rossumc7bb8571998-06-10 21:31:01 +000023
The optional `seekable' argument is provided as a workaround for
certain stdio libraries in which tell() discards buffered data before
discovering that the lseek() system call doesn't work.  For maximum
portability, you should set the seekable argument to zero to prevent
that initial tell() call when passing in an unseekable object such as
a file object created from a socket object.  If it is 1 on entry --
which it is by default -- the tell() method of the open file object is
called once; if this raises an exception, seekable is reset to 0.  For
other nonzero values of seekable, this test is not made.
33
Guido van Rossum9ab94c11997-12-10 16:17:39 +000034To get the text of a particular header there are several methods:
35 str = m.getheader(name)
36 str = m.getrawheader(name)
37where name is the name of the header, e.g. 'Subject'.
38The difference is that getheader() strips the leading and trailing
39whitespace, while getrawheader() doesn't. Both functions retain
40embedded whitespace (including newlines) exactly as they are
41specified in the header, and leave the case of the text unchanged.
42
43For addresses and address lists there are functions
44 realname, mailaddress = m.getaddr(name) and
45 list = m.getaddrlist(name)
46where the latter returns a list of (realname, mailaddr) tuples.
47
48There is also a method
49 time = m.getdate(name)
50which parses a Date-like field and returns a time-compatible tuple,
51i.e. a tuple such as returned by time.localtime() or accepted by
52time.mktime().
53
54See the class definition for lower level access methods.
55
56There are also some utility functions here.
57"""
Guido van Rossum4d4ab921998-06-16 22:27:09 +000058# Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
Guido van Rossum01ca3361992-07-13 14:28:59 +000059
Guido van Rossum01ca3361992-07-13 14:28:59 +000060import string
Guido van Rossumb6775db1994-08-01 11:34:53 +000061import time
Guido van Rossum01ca3361992-07-13 14:28:59 +000062
63
Guido van Rossum9ab94c11997-12-10 16:17:39 +000064_blanklines = ('\r\n', '\n') # Optimization for islast()
Guido van Rossum92457b91995-06-22 19:06:57 +000065
66
class Message:
    """Represents a single RFC-822-compliant message.

    The constructor reads the header block from an open file-like object.
    Afterwards the instance exposes:

    fp             -- the input object, positioned just past the headers
    seekable       -- true if tell()/seek() may be used on fp
    startofheaders -- fp.tell() before the headers were read (or None)
    startofbody    -- fp.tell() after the headers were read (or None)
    headers        -- list of the raw header lines, in file order
    dict           -- maps lower-cased header names to stripped values
                      (only the *last* occurrence of each name is kept)
    unixfrom       -- any leading Unix 'From ' envelope line(s)
    status         -- '' if parsing went well, else an error message
    """

    def __init__(self, fp, seekable = 1):
        """Initialize the class instance and read the headers.

        fp must support readline().  If seekable is exactly 1 (the
        default), fp.tell() is called once as a probe and seekable is
        reset to 0 if that fails; any other true value is trusted as is.
        """
        if seekable == 1:
            # Exercise tell() to make sure it works
            # (and then assume seek() works, too).  Only the failures an
            # unseekable or tell()-less object produces are treated as
            # "not seekable"; anything else (e.g. KeyboardInterrupt)
            # propagates instead of being silently swallowed.
            try:
                fp.tell()
            except (AttributeError, IOError):
                seekable = 0
        self.fp = fp
        self.seekable = seekable
        self.startofheaders = None
        self.startofbody = None
        #
        if self.seekable:
            try:
                self.startofheaders = self.fp.tell()
            except IOError:
                self.seekable = 0
        #
        self.readheaders()
        #
        if self.seekable:
            try:
                self.startofbody = self.fp.tell()
            except IOError:
                self.seekable = 0

    def rewindbody(self):
        """Rewind the file to the start of the body (if seekable).

        Raises IOError if the input object is not seekable.
        """
        if not self.seekable:
            raise IOError("unseekable file")
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines.

        Read header lines up to the entirely blank line that
        terminates them.  The (normally blank) line that ends the
        headers is skipped, but not included in the returned list.
        If a non-header line ends the headers, (which is an error),
        an attempt is made to backspace over it; it is never
        included in the returned list.

        The variable self.status is set to the empty string if all
        went well, otherwise it is an error message.
        The variable self.headers is a completely uninterpreted list
        of lines contained in the header (so printing them will
        reproduce the header exactly as it appears in the file).
        """
        self.dict = {}
        self.unixfrom = ''
        self.headers = lines = []
        self.status = ''
        headerseen = ""
        firstline = 1
        # Pick a push-back strategy: an explicit unread() method wins,
        # otherwise remember each line's start offset so we can seek back.
        startofline = unread = tell = None
        if hasattr(self.fp, 'unread'):
            unread = self.fp.unread
        elif self.seekable:
            tell = self.fp.tell
        while 1:
            if tell:
                startofline = tell()
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            # Skip unix From name time lines
            if firstline and line[:5] == 'From ':
                self.unixfrom = self.unixfrom + line
                continue
            firstline = 0
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                lines.append(line)
                folded = self.dict[headerseen] + "\n " + line.strip()
                self.dict[headerseen] = folded.strip()
                continue
            elif self.iscomment(line):
                # It's a comment.  Ignore it.
                continue
            elif self.islast(line):
                # Note! No pushback here!  The delimiter line gets eaten.
                break
            headerseen = self.isheader(line)
            if headerseen:
                # It's a legal header line, save it.
                lines.append(line)
                # Skip only the name and the colon: the space after the
                # colon is optional, and strip() removes it when present.
                # (Skipping two characters ate the first value character
                # of headers written as "Name:value".)
                value = line[len(headerseen) + 1:]
                self.dict[headerseen] = value.strip()
                continue
            else:
                # It's not a header line; throw it back and stop here.
                if not self.dict:
                    self.status = 'No headers'
                else:
                    self.status = 'Non-header line where header expected'
                # Try to undo the read.
                if unread:
                    unread(line)
                elif tell:
                    self.fp.seek(startofline)
                else:
                    self.status = self.status + '; bad seek'
                break

    def isheader(self, line):
        """Determine whether a given line is a legal header.

        This method should return the header name, suitably canonicalized.
        You may override this method in order to use Message parsing
        on tagged data in RFC822-like formats with special header formats.

        The default returns the lower-cased text before the first colon,
        or None if there is no colon or the line starts with one.
        """
        i = line.find(':')
        if i > 0:
            return line[:i].lower()
        else:
            return None

    def islast(self, line):
        """Determine whether a line is a legal end of RFC-822 headers.

        You may override this method if your application wants
        to bend the rules, e.g. to strip trailing whitespace,
        or to recognise MH template separators ('--------').
        For convenience (e.g. for code reading from sockets) a
        line consisting of \\r\\n also matches.
        """
        return line in _blanklines

    def iscomment(self, line):
        """Determine whether a line should be skipped entirely.

        You may override this method in order to use Message parsing
        on tagged data in RFC822-like formats that support embedded
        comments or free-text data.  The default never skips a line.
        """
        return None

    def getallmatchingheaders(self, name):
        """Find all header lines matching a given header name.

        Look through the list of headers and find all lines
        matching a given header name (and their continuation
        lines).  A list of the lines is returned, without
        interpretation.  If the header does not occur, an
        empty list is returned.  If the header occurs multiple
        times, all occurrences are returned.  Case is not
        important in the header name.
        """
        name = name.lower() + ':'
        n = len(name)
        matches = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif line[:1] not in string.whitespace:
                # A new, different header ends the current match;
                # continuation lines (leading whitespace) keep it going.
                hit = 0
            if hit:
                matches.append(line)
        return matches

    def getfirstmatchingheader(self, name):
        """Get the first header line matching name.

        This is similar to getallmatchingheaders, but it returns
        only the first matching header (and its continuation
        lines).
        """
        name = name.lower() + ':'
        n = len(name)
        matches = []
        hit = 0
        for line in self.headers:
            if hit:
                if line[:1] not in string.whitespace:
                    break
            elif line[:n].lower() == name:
                hit = 1
            if hit:
                matches.append(line)
        return matches

    def getrawheader(self, name):
        """A higher-level interface to getfirstmatchingheader().

        Return a string containing the literal text of the
        header but with the keyword stripped.  All leading,
        trailing and embedded whitespace is kept in the
        string, however.
        Return None if the header does not occur.
        """
        matches = self.getfirstmatchingheader(name)
        if not matches:
            return None
        # Drop "name:" from the first line; everything after it is kept.
        matches[0] = matches[0][len(name) + 1:]
        return ''.join(matches)

    def getheader(self, name, default=None):
        """Get the header value for a name.

        This is the normal interface: it returns a stripped
        version of the header value for a given header name,
        or `default' (None unless given) if it doesn't exist.
        This uses the dictionary version which finds the *last*
        such header.
        """
        try:
            return self.dict[name.lower()]
        except KeyError:
            return default
    get = getheader

    def getheaders(self, name):
        """Get all values for a header.

        This returns a list of values for headers given more than once;
        each value in the result list is stripped in the same way as the
        result of getheader().  If the header is not given, return an
        empty list.
        """
        result = []
        current = ''
        have_header = 0
        for s in self.getallmatchingheaders(name):
            if s[0] in string.whitespace:
                # Continuation line: fold it onto the value being built.
                if current:
                    current = "%s\n %s" % (current, s.strip())
                else:
                    current = s.strip()
            else:
                # A new occurrence starts; flush the previous one first.
                if have_header:
                    result.append(current)
                current = s[s.find(":") + 1:].strip()
                have_header = 1
        if have_header:
            result.append(current)
        return result

    def getaddr(self, name):
        """Get a single address from a header, as a tuple.

        An example return value:
        ('Guido van Rossum', 'guido@cwi.nl')

        Returns (None, None) if the header yields no address.
        """
        # New, by Ben Escoto
        alist = self.getaddrlist(name)
        if alist:
            return alist[0]
        else:
            return (None, None)

    def getaddrlist(self, name):
        """Get a list of addresses from a header.

        Retrieves a list of addresses from a header, where each address is a
        tuple as returned by getaddr().  Scans all named headers, so it works
        properly with multiple To: or Cc: headers for example.
        """
        raw = []
        for h in self.getallmatchingheaders(name):
            if h[0] in ' \t':
                raw.append(h)
            else:
                # Separate the addresses of repeated headers with a comma
                # so they parse as one address list.
                if raw:
                    raw.append(', ')
                i = h.find(':')
                if i > 0:
                    addr = h[i+1:]
                raw.append(addr)
        alladdrs = ''.join(raw)
        a = AddrlistClass(alladdrs)
        return a.getaddrlist()

    def getdate(self, name):
        """Retrieve a date field from a header.

        Retrieves a date field from the named header, returning
        a tuple compatible with time.mktime(), or None if the header
        is missing (or whatever parsedate() returns for its value).
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate(data)

    def getdate_tz(self, name):
        """Retrieve a date field from a header as a 10-tuple.

        The first 9 elements make up a tuple compatible with
        time.mktime(), and the 10th is the offset of the poster's
        time zone from GMT/UTC.  Returns None if the header is missing.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate_tz(data)


    # Access as a dictionary (only finds *last* header of each type):

    def __len__(self):
        """Get the number of headers in a message."""
        return len(self.dict)

    def __getitem__(self, name):
        """Get a specific header, as from a dictionary.

        Raises KeyError if the header is not present.
        """
        return self.dict[name.lower()]

    def __setitem__(self, name, value):
        """Set the value of a header.

        Note: This is not a perfect inversion of __getitem__, because
        any changed headers get stuck at the end of the raw-headers list
        rather than where the altered header was.
        """
        del self[name] # Won't fail if it doesn't exist
        self.dict[name.lower()] = value
        text = name + ": " + value
        for line in text.split("\n"):
            self.headers.append(line + "\n")

    def __delitem__(self, name):
        """Delete all occurrences of a specific header, if it is present."""
        name = name.lower()
        if name not in self.dict:
            return
        del self.dict[name]
        name = name + ':'
        n = len(name)
        kept = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif line[:1] not in string.whitespace:
                hit = 0
            if not hit:
                kept.append(line)
        # Slice-assign so the existing list object is updated in place.
        self.headers[:] = kept

    def has_key(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def keys(self):
        """Get all of a message's header field names."""
        return list(self.dict.keys())

    def values(self):
        """Get all of a message's header field values."""
        return list(self.dict.values())

    def items(self):
        """Get all of a message's headers.

        Returns a list of name, value tuples.
        """
        return list(self.dict.items())

    def __str__(self):
        """Return the raw header lines joined back into one string."""
        return ''.join(self.headers)
Guido van Rossum01ca3361992-07-13 14:28:59 +0000444
445
446# Utility functions
447# -----------------
448
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000449# XXX Should fix unquote() and quote() to be really conformant.
Guido van Rossumb6775db1994-08-01 11:34:53 +0000450# XXX The inverses of the parse functions may also be useful.
451
Guido van Rossum01ca3361992-07-13 14:28:59 +0000452
def unquote(str):
    """Remove quotes from a string.

    A string wrapped in double quotes has the quotes removed and its
    backslash escapes (quoted-pairs such as \\" and \\\\) resolved, so
    that unquote() undoes quote().  A string wrapped in angle brackets
    just has the brackets removed.  Anything else, including strings
    shorter than two characters, is returned unchanged.
    """
    if len(str) > 1:
        if str[0] == '"' and str[-1:] == '"':
            # Resolve quoted-pairs in a single left-to-right pass so that
            # an escaped backslash is never mistaken for the start of
            # another escape.
            chars = []
            escaped = 0
            for ch in str[1:-1]:
                if escaped:
                    chars.append(ch)
                    escaped = 0
                elif ch == '\\':
                    escaped = 1
                else:
                    chars.append(ch)
            if escaped:
                # A lone trailing backslash escapes nothing; keep it.
                chars.append('\\')
            return ''.join(chars)
        if str[0] == '<' and str[-1:] == '>':
            return str[1:-1]
    return str
Guido van Rossumb6775db1994-08-01 11:34:53 +0000461
462
def quote(str):
    """Return str as a double-quoted string.

    Backslashes are doubled first and embedded double quotes are then
    backslash-escaped; the result is wrapped in double quotes.
    """
    escaped = str.replace('\\', '\\\\')
    escaped = escaped.replace('"', '\\"')
    return '"%s"' % escaped
Guido van Rossumb6775db1994-08-01 11:34:53 +0000472
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000473
def parseaddr(address):
    """Parse an address into a (realname, mailaddr) tuple.

    Only the first address found in `address' is returned; if none is
    found the result is (None, None).
    """
    parsed = AddrlistClass(address).getaddrlist()
    if parsed:
        return parsed[0]
    return (None, None)
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000482
483
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of
    RFC-822 in front of you.

    The parser is a hand-written scanner over self.field; self.pos is
    the index of the next unread character and self.commentlist collects
    the text of the comments seen while parsing the current address.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        # Any of these characters terminates an atom.
        self.atomends = self.specials + self.LWS + self.CR
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line breaks; comments met on the way are
        appended to self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else: break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each one a
        (realname, mailaddr) tuple.  Parsing stops at the first position
        where getaddress() yields nothing.
        """
        # Iterate rather than recurse: one stack frame per address made
        # long address lists overflow the interpreter's recursion limit.
        result = []
        ad = self.getaddress()
        while ad:
            result = result + ad
            ad = self.getaddress()
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, mailaddr) tuples: one element for a
        plain address, several for a group, and an empty list when no
        address could be parsed at the current position.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot (not alias) the comments seen so far, so that rewinding
        # below really discards comments collected while scanning the
        # phrase; otherwise they are collected a second time on re-parse.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []
        fieldlen = len(self.field)

        if self.pos >= fieldlen:
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            self.pos = self.pos + 1
            while self.pos < fieldlen:
                self.gotonext()
                # gotonext() may have run off the end of an unterminated
                # group, so re-check the bound before looking for ';'.
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos = self.pos + 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else: returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the scanner keeps making progress.
                self.pos = self.pos + 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos = self.pos + 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the scanner is not at a '<' or if no addrspec is
        found before the closing '>'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = 0
        self.pos = self.pos + 1
        self.gotonext()
        adlist = None
        while self.pos < len(self.field):
            if expectroute:
                # Consume (and discard) the domain of a source route.
                self.getdomain()
                expectroute = 0
            elif self.field[self.pos] == '>':
                self.pos = self.pos + 1
                break
            elif self.field[self.pos] == '@':
                self.pos = self.pos + 1
                expectroute = 1
            elif self.field[self.pos] == ':':
                self.pos = self.pos + 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addrspec
                # (normally the closing '>').
                self.pos = self.pos + 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC-822 addr-spec.

        Returns 'local-part@domain', or just the local part if no '@'
        follows it.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos = self.pos + 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else: aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos = self.pos + 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address.

        Whitespace is skipped and comments are moved to self.commentlist.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos = self.pos + 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else: sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments = 1):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC-822 comments
        are allowed within the parsed fragment.

        The delimiters themselves are not part of the returned text, and
        a backslash makes the following character literal.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        quote = 0
        self.pos = self.pos + 1
        while self.pos < len(self.field):
            if quote == 1:
                # Previous character was a backslash: take this one as is.
                slist.append(self.field[self.pos])
                quote = 0
            elif self.field[self.pos] in endchars:
                self.pos = self.pos + 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
            elif self.field[self.pos] == '\\':
                quote = 1
            else:
                slist.append(self.field[self.pos])
            self.pos = self.pos + 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', 0)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', 1)

    def getdomainliteral(self):
        """Parse an RFC-822 domain-literal."""
        return self.getdelimited('[', ']\r', 0)

    def getatom(self):
        """Parse an RFC-822 atom.

        Consumes characters up to (not including) the next character in
        self.atomends and returns them as one string.
        """
        atomlist = ['']

        while self.pos < len(self.field):
            if self.field[self.pos] in self.atomends:
                break
            else: atomlist.append(self.field[self.pos])
            self.pos = self.pos + 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC-822 phrases.

        A phrase is a sequence of words, which are in turn either
        RFC-822 atoms or quoted-strings.  Phrases are canonicalized
        by squeezing all runs of continuous whitespace into one space.

        Returns the list of words; comments met on the way are moved to
        self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.atomends:
                break
            else: plist.append(self.getatom())

        return plist
Guido van Rossumb6775db1994-08-01 11:34:53 +0000740
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC822 addresses.

    The parsed (realname, mailaddr) tuples are kept in self.addresslist.
    Instances support len(), indexing, str(), and set-like `+' (union)
    and `-' (difference).
    """

    def __init__(self, field):
        """Parse `field'; a false value (e.g. None) gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        """Return the number of addresses."""
        return len(self.addresslist)

    def __str__(self):
        """Return the addresses in canonical form, joined by ', '."""
        return ", ".join(map(dump_address_pair, self.addresslist))

    def __add__(self, other):
        """Set union: self's addresses, then those of other not in self."""
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if not x in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __sub__(self, other):
        """Set difference: self's addresses that are not in other."""
        newaddr = AddressList(None)
        for x in self.addresslist:
            if not x in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __getitem__(self, index):
        """Make indexing, slices, and 'in' work."""
        # The list lives in `addresslist'; the old spelling `addrlist'
        # named an attribute that is never set, so every index operation
        # raised AttributeError.
        return self.addresslist[index]
776
def dump_address_pair(pair):
    """Dump a (name, address) pair in a canonicalized form.

    With a non-empty name the result is '"name" <address>'; otherwise
    it is the bare address.
    """
    realname, mailaddr = pair[0], pair[1]
    if not realname:
        return mailaddr
    return ''.join(['"', realname, '" <', mailaddr, '>'])
Guido van Rossumb6775db1994-08-01 11:34:53 +0000783
784# Parse a date field
785
# Abbreviated month names first, then the full names: an index past 12
# is folded back into 1..12 by parsedate_tz().
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-tuple.

    The result is (year, month, day, hour, minute, second, 0, 0, 0,
    tzoffset).  tzoffset is the zone's offset from UTC in seconds, or
    None when the zone is missing or not recognized.  Named zones are
    looked up in _timezones (military zones other than Z are not
    supported); numeric zones such as -0500 are converted directly.
    The year is returned as written (two-digit years are not expanded).

    Returns None if the string cannot be parsed as a date.
    """
    data = data.split()
    if not data:
        # Empty or all-whitespace input has no first token to inspect.
        return None
    if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time, e.g. "19:12:08+0100".
        s = data[3]
        i = s.find('+')
        if i > 0:
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    dd, mm, yy, tm, tz = data[:5]
    mm = mm.lower()
    if mm not in _monthnames:
        # Perhaps the month comes before the day ("Nov 20 ...").
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12: mm = mm - 12
    # endswith()/slicing instead of [-1]/[0] indexing: a token can be
    # empty (e.g. from "1--95"), and that must yield None, not IndexError.
    if dd.endswith(','):
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # Year and time are swapped ("... 19:12:08 1995").
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if yy[:1] not in string.digits:
        # Year and zone are swapped ("... GMT 1995").
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        thh, tmm = tm
        tss = '0'
    elif len(tm) == 3:
        thh, tmm, tss = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        # Floor division keeps the result an integer on every Python.
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    return (yy, mm, dd, thh, tmm, tss, 0, 0, 0, tzoffset)
884
Guido van Rossumb6775db1994-08-01 11:34:53 +0000885
def parsedate(data):
    """Convert a time string to a time tuple.

    This is parsedate_tz() without the trailing timezone offset: a tuple
    result is cut to its first nine elements, anything else (None for an
    unparsable string) is passed through unchanged.
    """
    parsed = parsedate_tz(data)
    if isinstance(parsed, tuple):
        return parsed[:9]
    return parsed
892
Guido van Rossum27cb8a41996-11-20 22:12:26 +0000893
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    data[9] is the zone offset in seconds, or None when the zone is
    unknown, in which case the fields are interpreted as local time.
    """
    offset = data[9]
    if offset is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    # Convert the fields as non-DST local time, then remove both the
    # message's own offset and the local zone's offset.
    local_stamp = time.mktime(data[:8] + (0,))
    return local_stamp - offset - time.timezone
Guido van Rossum6cdd7a01996-12-12 18:39:54 +0000902
def formatdate(timeval=None):
    """Returns time format preferred for Internet standards.

    Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123

    `timeval' is a timestamp in seconds since the epoch; the current
    time is used when it is None.  The result is always expressed in GMT.
    """
    if timeval is None:
        timeval = time.time()
    tm = time.gmtime(timeval)
    # Day and month names are spelled out here rather than taken from
    # strftime('%a'/'%b'), which follows the process locale and would
    # emit non-English names that the RFC date syntax does not allow.
    dayname = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")[tm[6]]
    monthname = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[tm[1] - 1]
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        dayname, tm[2], monthname, tm[0], tm[3], tm[4], tm[5])
912
Guido van Rossumb6775db1994-08-01 11:34:53 +0000913
914# When used as script, run a small test program.
915# The first command line argument must be a filename containing one
916# message in RFC-822 format.
917
if __name__ == '__main__':
    import sys, os
    # Look up the default mailbox only when no file is named, so that an
    # explicit argument works even if HOME is not set.
    if sys.argv[1:]:
        file = sys.argv[1]
    else:
        file = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
    f = open(file, 'r')
    try:
        m = Message(f)
        # Each print() gets one pre-formatted string, which prints the
        # same text under both the print statement and print function.
        print('From: %s' % (m.getaddr('from'),))
        print('To: %s' % (m.getaddrlist('to'),))
        print('Subject: %s' % (m.getheader('subject'),))
        print('Date: %s' % (m.getheader('date'),))
        date = m.getdate_tz('date')
        if date:
            pieces = ['ParsedDate:', time.asctime(date[:-1])]
            offset = date[-1]
            # The offset is None when the Date header carries no usable
            # zone; show just the date then instead of crashing.
            if offset is not None:
                # Split the magnitude and add the sign afterwards:
                # divmod() on a negative offset rounds the hours down,
                # which turned -0530 into -0630.
                if offset < 0:
                    sign = '-'
                    offset = -offset
                else:
                    sign = '+'
                hhmm, ss = divmod(offset, 60)
                hh, mm = divmod(hhmm, 60)
                pieces.append("%s%02d%02d" % (sign, hh, mm))
                if ss:
                    pieces.append(".%02d" % ss)
            print(' '.join(pieces))
        else:
            print('ParsedDate: %s' % (None,))
        m.rewindbody()
        # Count the body lines that follow the headers.
        n = 0
        while f.readline():
            n = n + 1
        print('Lines: %s' % (n,))
        print('-'*70)
        print('len = %s' % (len(m),))
        if m.has_key('Date'): print('Date = %s' % (m['Date'],))
        if m.has_key('X-Nonsense'): pass
        print('keys = %s' % (m.keys(),))
        print('values = %s' % (m.values(),))
        print('items = %s' % (m.items(),))
    finally:
        f.close()