blob: 86727d0b233106dcd176c5c801c18219e0b99aff [file] [log] [blame]
Guido van Rossum9ab94c11997-12-10 16:17:39 +00001"""RFC-822 message manipulation class.
Guido van Rossum01ca3361992-07-13 14:28:59 +00002
Guido van Rossum9ab94c11997-12-10 16:17:39 +00003XXX This is only a very rough sketch of a full RFC-822 parser;
4in particular the tokenizing of addresses does not adhere to all the
5quoting rules.
6
7Directions for use:
8
9To create a Message object: first open a file, e.g.:
10 fp = open(file, 'r')
Guido van Rossumc7bb8571998-06-10 21:31:01 +000011You can use any other legal way of getting an open file object, e.g. use
12sys.stdin or call os.popen().
Guido van Rossum9ab94c11997-12-10 16:17:39 +000013Then pass the open file object to the Message() constructor:
14 m = Message(fp)
15
Guido van Rossume894fc01998-06-11 13:58:40 +000016This class can work with any input object that supports a readline
17method. If the input object has seek and tell capability, the
18rewindbody method will work; also illegal lines will be pushed back
19onto the input stream. If the input object lacks seek but has an
20`unread' method that can push back a line of input, Message will use
21that to push back illegal lines. Thus this class can be used to parse
22messages coming from a buffered stream.
Guido van Rossumc7bb8571998-06-10 21:31:01 +000023
The optional `seekable' argument is provided as a workaround for
certain stdio libraries in which tell() discards buffered data before
discovering that the lseek() system call doesn't work.  For maximum
portability, you should set the seekable argument to zero to prevent
that initial tell() when passing in an unseekable object such as
a file object created from a socket object.  If it is 1 on entry --
which it is by default -- the tell() method of the open file object is
called once; if this raises an exception, seekable is reset to 0.  For
other nonzero values of seekable, this test is not made.
33
Guido van Rossum9ab94c11997-12-10 16:17:39 +000034To get the text of a particular header there are several methods:
35 str = m.getheader(name)
36 str = m.getrawheader(name)
37where name is the name of the header, e.g. 'Subject'.
38The difference is that getheader() strips the leading and trailing
39whitespace, while getrawheader() doesn't. Both functions retain
40embedded whitespace (including newlines) exactly as they are
41specified in the header, and leave the case of the text unchanged.
42
43For addresses and address lists there are functions
44 realname, mailaddress = m.getaddr(name) and
45 list = m.getaddrlist(name)
46where the latter returns a list of (realname, mailaddr) tuples.
47
48There is also a method
49 time = m.getdate(name)
50which parses a Date-like field and returns a time-compatible tuple,
51i.e. a tuple such as returned by time.localtime() or accepted by
52time.mktime().
53
54See the class definition for lower level access methods.
55
56There are also some utility functions here.
57"""
Guido van Rossum4d4ab921998-06-16 22:27:09 +000058# Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
Guido van Rossum01ca3361992-07-13 14:28:59 +000059
Guido van Rossum01ca3361992-07-13 14:28:59 +000060import string
Guido van Rossumb6775db1994-08-01 11:34:53 +000061import time
Guido van Rossum01ca3361992-07-13 14:28:59 +000062
63
_blanklines = ('\r\n', '\n')            # Optimization for islast()


class Message:
    """Represents a single RFC-822-compliant message.

    The headers are read from `fp' when the instance is created.  They
    are kept both as the raw list of lines (self.headers) and as a
    dictionary mapping lower-cased header names to stripped values
    (self.dict).  The dictionary only remembers the *last* header of
    each name.
    """

    def __init__(self, fp, seekable = 1):
        """Initialize the class instance and read the headers.

        `fp' is any object with a readline() method.  If `seekable' is 1
        (the default), fp.tell() is called once as a probe and seekable
        is reset to 0 when that fails.  For other nonzero values the
        probe is skipped.
        """
        if seekable == 1:
            # Exercise tell() to make sure it works
            # (and then assume seek() works, too)
            try:
                fp.tell()
            except (AttributeError, IOError):
                seekable = 0
        self.fp = fp
        self.seekable = seekable
        self.startofheaders = None
        self.startofbody = None
        #
        if self.seekable:
            try:
                self.startofheaders = self.fp.tell()
            except IOError:
                self.seekable = 0
        #
        self.readheaders()
        #
        if self.seekable:
            try:
                self.startofbody = self.fp.tell()
            except IOError:
                self.seekable = 0

    def rewindbody(self):
        """Rewind the file to the start of the body (if seekable).

        Raises IOError when the input was found to be unseekable.
        """
        if not self.seekable:
            raise IOError("unseekable file")
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines.

        Read header lines up to the entirely blank line that
        terminates them.  The (normally blank) line that ends the
        headers is skipped, but not included in the returned list.
        If a non-header line ends the headers, (which is an error),
        an attempt is made to backspace over it; it is never
        included in the returned list.

        The variable self.status is set to the empty string if all
        went well, otherwise it is an error message.
        The variable self.headers is a completely uninterpreted list
        of lines contained in the header (so printing them will
        reproduce the header exactly as it appears in the file).
        """
        self.dict = {}
        self.unixfrom = ''
        self.headers = lines = []
        self.status = ''
        headerseen = ""
        firstline = 1
        startofline = unread = tell = None
        # Prefer an explicit unread() hook; fall back on tell()/seek().
        if hasattr(self.fp, 'unread'):
            unread = self.fp.unread
        elif self.seekable:
            tell = self.fp.tell
        while 1:
            if tell:
                # Remember where this line starts so it can be pushed back.
                startofline = tell()
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            # Skip unix From name time lines
            if firstline and line[:5] == 'From ':
                self.unixfrom = self.unixfrom + line
                continue
            firstline = 0
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                lines.append(line)
                x = self.dict[headerseen] + "\n " + line.strip()
                self.dict[headerseen] = x.strip()
                continue
            elif self.iscomment(line):
                # It's a comment.  Ignore it.
                continue
            elif self.islast(line):
                # Note! No pushback here!  The delimiter line gets eaten.
                break
            headerseen = self.isheader(line)
            if headerseen:
                # It's a legal header line, save it.
                lines.append(line)
                # Skip only the name and the colon; strip() removes any
                # blank that follows.  Skipping one more character would
                # eat the first value character of "Name:value".
                self.dict[headerseen] = line[len(headerseen)+1:].strip()
                continue
            else:
                # It's not a header line; throw it back and stop here.
                if not self.dict:
                    self.status = 'No headers'
                else:
                    self.status = 'Non-header line where header expected'
                # Try to undo the read.
                if unread:
                    unread(line)
                elif tell:
                    self.fp.seek(startofline)
                else:
                    self.status = self.status + '; bad seek'
                break

    def isheader(self, line):
        """Determine whether a given line is a legal header.

        This method should return the header name, suitably canonicalized.
        You may override this method in order to use Message parsing
        on tagged data in RFC822-like formats with special header formats.
        """
        i = line.find(':')
        if i > 0:
            return line[:i].lower()
        else:
            return None

    def islast(self, line):
        """Determine whether a line is a legal end of RFC-822 headers.

        You may override this method if your application wants
        to bend the rules, e.g. to strip trailing whitespace,
        or to recognise MH template separators ('--------').
        For convenience (e.g. for code reading from sockets) a
        line consisting of \\r\\n also matches.
        """
        return line in _blanklines

    def iscomment(self, line):
        """Determine whether a line should be skipped entirely.

        You may override this method in order to use Message parsing
        on tagged data in RFC822-like formats that support embedded
        comments or free-text data.
        """
        return None

    def getallmatchingheaders(self, name):
        """Find all header lines matching a given header name.

        Look through the list of headers and find all lines
        matching a given header name (and their continuation
        lines).  A list of the lines is returned, without
        interpretation.  If the header does not occur, an
        empty list is returned.  If the header occurs multiple
        times, all occurrences are returned.  Case is not
        important in the header name.
        """
        name = name.lower() + ':'
        n = len(name)
        matches = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif line[:1] not in string.whitespace:
                # A new, different header starts here.
                hit = 0
            if hit:
                matches.append(line)
        return matches

    def getfirstmatchingheader(self, name):
        """Get the first header line matching name.

        This is similar to getallmatchingheaders, but it returns
        only the first matching header (and its continuation
        lines).
        """
        name = name.lower() + ':'
        n = len(name)
        matches = []
        hit = 0
        for line in self.headers:
            if hit:
                if line[:1] not in string.whitespace:
                    break
            elif line[:n].lower() == name:
                hit = 1
            if hit:
                matches.append(line)
        return matches

    def getrawheader(self, name):
        """A higher-level interface to getfirstmatchingheader().

        Return a string containing the literal text of the
        header but with the keyword stripped.  All leading,
        trailing and embedded whitespace is kept in the
        string, however.
        Return None if the header does not occur.
        """
        lines = self.getfirstmatchingheader(name)
        if not lines:
            return None
        # Drop the header name and its colon from the first line only.
        lines[0] = lines[0][len(name) + 1:]
        return ''.join(lines)

    def getheader(self, name, default=None):
        """Get the header value for a name.

        This is the normal interface: it returns a stripped
        version of the header value for a given header name,
        or `default' (None unless given) if it doesn't exist.
        This uses the dictionary version which finds the *last*
        such header.
        """
        try:
            return self.dict[name.lower()]
        except KeyError:
            return default
    get = getheader

    def getaddr(self, name):
        """Get a single address from a header, as a tuple.

        An example return value:
        ('Guido van Rossum', 'guido@cwi.nl')

        Returns (None, None) when the header yields no address.
        """
        # New, by Ben Escoto
        alist = self.getaddrlist(name)
        if alist:
            return alist[0]
        else:
            return (None, None)

    def getaddrlist(self, name):
        """Get a list of addresses from a header.

        Retrieves a list of addresses from a header, where each address is a
        tuple as returned by getaddr().  Scans all named headers, so it works
        properly with multiple To: or Cc: headers for example.
        """
        raw = []
        for h in self.getallmatchingheaders(name):
            if h[0] in ' \t':
                # Continuation line: belongs to the previous header.
                raw.append(h)
            else:
                if raw:
                    # Separate the addresses of successive headers.
                    raw.append(', ')
                i = h.find(':')
                if i > 0:
                    addr = h[i+1:]
                raw.append(addr)
        alladdrs = ''.join(raw)
        a = AddrlistClass(alladdrs)
        return a.getaddrlist()

    def getdate(self, name):
        """Retrieve a date field from a header.

        Retrieves a date field from the named header, returning
        a tuple compatible with time.mktime().  Returns None if
        the header does not occur.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate(data)

    def getdate_tz(self, name):
        """Retrieve a date field from a header as a 10-tuple.

        The first 9 elements make up a tuple compatible with
        time.mktime(), and the 10th is the offset of the poster's
        time zone from GMT/UTC.  Returns None if the header does
        not occur.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate_tz(data)


    # Access as a dictionary (only finds *last* header of each type):

    def __len__(self):
        """Get the number of headers in a message."""
        return len(self.dict)

    def __getitem__(self, name):
        """Get a specific header, as from a dictionary."""
        return self.dict[name.lower()]

    def __setitem__(self, name, value):
        """Set the value of a header.

        Note: This is not a perfect inversion of __getitem__, because
        any changed headers get stuck at the end of the raw-headers list
        rather than where the altered header was.
        """
        del self[name] # Won't fail if it doesn't exist
        self.dict[name.lower()] = value
        text = name + ": " + value
        for line in text.split("\n"):
            self.headers.append(line + "\n")

    def __delitem__(self, name):
        """Delete all occurrences of a specific header, if it is present."""
        name = name.lower()
        if name not in self.dict:
            return
        del self.dict[name]
        name = name + ':'
        n = len(name)
        kept = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif line[:1] not in string.whitespace:
                hit = 0
            if not hit:
                kept.append(line)
        # Update in place so other references to the list stay valid.
        self.headers[:] = kept

    def __contains__(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def has_key(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def keys(self):
        """Get all of a message's header field names, as a list."""
        return list(self.dict.keys())

    def values(self):
        """Get all of a message's header field values, as a list."""
        return list(self.dict.values())

    def items(self):
        """Get all of a message's headers.

        Returns a list of name, value tuples.
        """
        return list(self.dict.items())

    def __str__(self):
        """Return the raw header lines concatenated into one string."""
        return ''.join(self.headers)
Guido van Rossum01ca3361992-07-13 14:28:59 +0000418
419
420# Utility functions
421# -----------------
422
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000423# XXX Should fix unquote() and quote() to be really conformant.
Guido van Rossumb6775db1994-08-01 11:34:53 +0000424# XXX The inverses of the parse functions may also be useful.
425
Guido van Rossum01ca3361992-07-13 14:28:59 +0000426
def unquote(str):
    """Strip one enclosing pair of double quotes or angle brackets.

    Strings shorter than two characters, and strings not wrapped in a
    matching "..." or <...> pair, are returned unchanged.
    """
    if len(str) < 2:
        return str
    for opener, closer in (('"', '"'), ('<', '>')):
        if str[0] == opener and str[-1] == closer:
            return str[1:-1]
    return str
Guido van Rossumb6775db1994-08-01 11:34:53 +0000435
436
def quote(str):
    """Add quotes around a string.

    Backslashes are doubled first, then embedded double quotes are
    escaped with a backslash, and the result is wrapped in double quotes.
    """
    # Escape backslashes before quotes so the backslashes added for the
    # quotes are not doubled again.
    escaped = str.replace('\\', '\\\\').replace('"', '\\"')
    return '"%s"' % escaped
Guido van Rossumb6775db1994-08-01 11:34:53 +0000446
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000447
def parseaddr(address):
    """Parse an address into a (realname, mailaddr) tuple.

    Only the first address found is returned; (None, None) is returned
    when the parser finds no address at all.
    """
    parsed = AddrlistClass(address).getaddrlist()
    if parsed:
        return parsed[0]
    return (None, None)
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000456
457
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of
    RFC-822 in front of you.

    The parser keeps a cursor (self.pos) into the header text
    (self.field); every get*() method reads from the cursor and
    advances it.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        # Any of these characters terminates an atom.
        self.atomends = self.specials + self.LWS + self.CR
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line breaks; comments met on the way are
        collected in self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, as
        (realname, mailaddr) tuples.  Parsing stops at the first
        position where getaddress() finds nothing.
        """
        # Iterate instead of recursing once per address, so that long
        # address lists cannot exhaust the interpreter's recursion limit.
        result = []
        ad = self.getaddress()
        while ad:
            result = result + ad
            ad = self.getaddress()
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, mailaddr) tuples: empty when
        nothing could be parsed, several entries for a group.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot (not alias) the comments seen so far: the addr-spec
        # branch below re-parses from oldpos, and comments met while
        # scanning the phrase must not be recorded twice.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos = self.pos + 1
            while self.pos < len(self.field):
                self.gotonext()
                # gotonext() may have run off the end of the field when
                # the terminating ';' is missing.
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos = self.pos + 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special character.
                self.pos = self.pos + 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos = self.pos + 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not on a '<', or if no addrspec
        is found before the closing '>'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = 0
        self.pos = self.pos + 1
        self.gotonext()
        adlist = None
        while self.pos < len(self.field):
            if expectroute:
                # Skip the domain of an "@domain" route element.
                self.getdomain()
                expectroute = 0
            elif self.field[self.pos] == '>':
                self.pos = self.pos + 1
                break
            elif self.field[self.pos] == '@':
                self.pos = self.pos + 1
                expectroute = 1
            elif self.field[self.pos] == ':':
                self.pos = self.pos + 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addrspec
                # (normally the closing '>').
                self.pos = self.pos + 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC-822 addr-spec.

        Returns the local part alone when no '@' follows it.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos = self.pos + 1
            elif self.field[self.pos] == '"':
                aslist.append(self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos = self.pos + 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos = self.pos + 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments = 1):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC-822 comments
        are allowed within the parsed fragment.

        A backslash makes the following character literal.  The
        delimiters themselves are not part of the returned string.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        quote = 0
        self.pos = self.pos + 1
        while self.pos < len(self.field):
            if quote == 1:
                slist.append(self.field[self.pos])
                quote = 0
            elif self.field[self.pos] in endchars:
                self.pos = self.pos + 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                # getcomment() already moved the cursor past the nested
                # comment; advancing again would skip a character.
                continue
            elif self.field[self.pos] == '\\':
                quote = 1
            else:
                slist.append(self.field[self.pos])
            self.pos = self.pos + 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', 0)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', 1)

    def getdomainliteral(self):
        """Parse an RFC-822 domain-literal."""
        return self.getdelimited('[', ']\r', 0)

    def getatom(self):
        """Parse an RFC-822 atom."""
        atomlist = ['']

        while self.pos < len(self.field):
            if self.field[self.pos] in self.atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos = self.pos + 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC-822 phrases.

        A phrase is a sequence of words, which are in turn either
        RFC-822 atoms or quoted-strings.  The words are returned as a
        list; whitespace between them is skipped and comments are
        collected in self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.atomends:
                break
            else:
                plist.append(self.getatom())

        return plist
Guido van Rossumb6775db1994-08-01 11:34:53 +0000714
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC822 addresses.

    The parsed (realname, mailaddr) tuples are kept in self.addresslist.
    """

    def __init__(self, field):
        """Parse `field'; an empty or None field gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        """Return the number of addresses in the list."""
        return len(self.addresslist)

    def __str__(self):
        """Return the addresses in canonical form, separated by ", "."""
        return ", ".join(map(dump_address_pair, self.addresslist))

    def __add__(self, other):
        """Set union: self's addresses plus those of other not in self."""
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __sub__(self, other):
        """Set difference: self's addresses that are not in other."""
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __getitem__(self, index):
        """Make indexing, slices, and 'in' work."""
        # The attribute is `addresslist'; `addrlist' never existed and
        # made every index access raise AttributeError.
        return self.addresslist[index]
750
def dump_address_pair(pair):
    """Dump a (name, address) pair in a canonicalized form.

    With a real name the result is '"name" <address>'; without one it
    is the bare address.
    """
    realname = pair[0]
    if not realname:
        return pair[1]
    return '"' + realname + '" <' + pair[1] + '>'
Guido van Rossumb6775db1994-08-01 11:34:53 +0000757
758# Parse a date field
759
# Abbreviated names first, then full names; parsedate_tz() folds an
# index in the second half back onto months 1..12.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a time tuple.

    Returns a 10-tuple (year, month, day, hour, minute, second, 0, 0, 0,
    tzoffset), where tzoffset is the zone's offset from UTC in seconds,
    or None when the zone is missing or not recognized.  Returns None
    when the string cannot be parsed.

    Zone names are looked up in _timezones; of the military zones only
    Z is known.
    """
    data = data.split()
    if not data:
        return None
    if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        s = data[3]
        i = s.find('+')
        if i > 0:
            # The zone is glued to the time, as in "19:12:08+0100".
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    mm = mm.lower()
    if mm not in _monthnames:
        # Perhaps the month comes before the day.
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # A full month name was matched in the second half of the table.
        mm = mm - 12
    # Slices instead of indexing below: a field may be empty (for example
    # the dummy tz, or a piece of a malformed RFC 850 date), and an empty
    # field must end up as a None result, not an IndexError.
    if dd[-1:] == ',':
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # The time and the year are the other way around.
        yy, tm = tm, yy
    if yy[-1:] == ',':
        yy = yy[:-1]
    if yy[:1] not in string.digits:
        yy, tz = tz, yy
    if tm[-1:] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        # Floor division keeps the result an integer on every Python.
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    return (yy, mm, dd, thh, tmm, tss, 0, 0, 0, tzoffset)
857
Guido van Rossumb6775db1994-08-01 11:34:53 +0000858
def parsedate(data):
    """Convert a time string to a time tuple.

    Same as parsedate_tz(), but a tuple result is cut down to its
    first nine elements; any other result is passed through unchanged.
    """
    parsed = parsedate_tz(data)
    if type(parsed) != type(()):
        return parsed
    return parsed[:9]
865
Guido van Rossum27cb8a41996-11-20 22:12:26 +0000866
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
    offset = data[9]
    fields = data[:8]
    if offset is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(fields + (-1,))
    # Read the fields as local standard time, then remove both the
    # local zone and the zone recorded in the tuple.
    local_stamp = time.mktime(fields + (0,))
    return local_stamp - offset - time.timezone
Guido van Rossum6cdd7a01996-12-12 18:39:54 +0000875
Guido van Rossumb6775db1994-08-01 11:34:53 +0000876
877# When used as script, run a small test program.
878# The first command line argument must be a filename containing one
879# message in RFC-822 format.
880
if __name__ == '__main__':
    import os
    import sys

    # Only consult $HOME when no filename was given on the command line.
    if sys.argv[1:]:
        file = sys.argv[1]
    else:
        file = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
    # Write through sys.stdout so the output is the same whether or not
    # print is a statement in the running interpreter.
    out = sys.stdout.write
    f = open(file, 'r')
    try:
        m = Message(f)
        out('From: %s\n' % (m.getaddr('from'),))
        out('To: %s\n' % (m.getaddrlist('to'),))
        out('Subject: %s\n' % (m.getheader('subject'),))
        out('Date: %s\n' % (m.getheader('date'),))
        date = m.getdate_tz('date')
        if date:
            parts = ['ParsedDate:', time.asctime(date[:-1])]
            hhmmss = date[-1]
            # The offset is None when the date carried no usable zone.
            if hhmmss is not None:
                # Split the magnitude and add the sign afterwards, so that
                # negative offsets with a minutes part come out right.
                if hhmmss < 0:
                    sign = '-'
                    hhmmss = -hhmmss
                else:
                    sign = '+'
                hhmm, ss = divmod(hhmmss, 60)
                hh, mm = divmod(hhmm, 60)
                parts.append("%s%02d%02d" % (sign, hh, mm))
                if ss:
                    parts.append(".%02d" % ss)
            out(' '.join(parts) + '\n')
        else:
            out('ParsedDate: %s\n' % (None,))
        m.rewindbody()
        n = 0
        while f.readline():
            n = n + 1
        out('Lines: %s\n' % (n,))
        out('-'*70 + '\n')
        out('len = %s\n' % (len(m),))
        if m.has_key('Date'):
            out('Date = %s\n' % (m['Date'],))
        if m.has_key('X-Nonsense'):
            pass
        out('keys = %s\n' % (m.keys(),))
        out('values = %s\n' % (m.values(),))
        out('items = %s\n' % (m.items(),))
    finally:
        f.close()
913 print 'items =', m.items()