blob: 87d7d394b9d09b9e906db4e2de8237f00bad235e [file] [log] [blame]
Guido van Rossum9ab94c11997-12-10 16:17:39 +00001"""RFC-822 message manipulation class.
Guido van Rossum01ca3361992-07-13 14:28:59 +00002
Guido van Rossum9ab94c11997-12-10 16:17:39 +00003XXX This is only a very rough sketch of a full RFC-822 parser;
4in particular the tokenizing of addresses does not adhere to all the
5quoting rules.
6
7Directions for use:
8
9To create a Message object: first open a file, e.g.:
10 fp = open(file, 'r')
Guido van Rossumc7bb8571998-06-10 21:31:01 +000011You can use any other legal way of getting an open file object, e.g. use
12sys.stdin or call os.popen().
Guido van Rossum9ab94c11997-12-10 16:17:39 +000013Then pass the open file object to the Message() constructor:
14 m = Message(fp)
15
Guido van Rossume894fc01998-06-11 13:58:40 +000016This class can work with any input object that supports a readline
17method. If the input object has seek and tell capability, the
18rewindbody method will work; also illegal lines will be pushed back
19onto the input stream. If the input object lacks seek but has an
20`unread' method that can push back a line of input, Message will use
21that to push back illegal lines. Thus this class can be used to parse
22messages coming from a buffered stream.
Guido van Rossumc7bb8571998-06-10 21:31:01 +000023
24The optional `seekable' argument is provided as a workaround for
25certain stdio libraries in which tell() discards buffered data before
26discovering that the lseek() system call doesn't work. For maximum
27portability, you should set the seekable argument to zero to prevent
28that initial tell() call when passing in an unseekable object such as
29a file object created from a socket object. If it is 1 on entry --
30which it is by default -- the tell() method of the open file object is
31called once; if this raises an exception, seekable is reset to 0. For
32other nonzero values of seekable, this test is not made.
33
Guido van Rossum9ab94c11997-12-10 16:17:39 +000034To get the text of a particular header there are several methods:
35 str = m.getheader(name)
36 str = m.getrawheader(name)
37where name is the name of the header, e.g. 'Subject'.
38The difference is that getheader() strips the leading and trailing
39whitespace, while getrawheader() doesn't. Both functions retain
40embedded whitespace (including newlines) exactly as they are
41specified in the header, and leave the case of the text unchanged.
42
43For addresses and address lists there are functions
44 realname, mailaddress = m.getaddr(name) and
45 list = m.getaddrlist(name)
46where the latter returns a list of (realname, mailaddr) tuples.
47
48There is also a method
49 time = m.getdate(name)
50which parses a Date-like field and returns a time-compatible tuple,
51i.e. a tuple such as returned by time.localtime() or accepted by
52time.mktime().
53
54See the class definition for lower level access methods.
55
56There are also some utility functions here.
57"""
Guido van Rossum4d4ab921998-06-16 22:27:09 +000058# Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
Guido van Rossum01ca3361992-07-13 14:28:59 +000059
Guido van Rossum01ca3361992-07-13 14:28:59 +000060import string
Guido van Rossumb6775db1994-08-01 11:34:53 +000061import time
Guido van Rossum01ca3361992-07-13 14:28:59 +000062
63
_blanklines = ('\r\n', '\n')    # Optimization for islast()


class Message:
    """Represents a single RFC-822-compliant message.

    Instance attributes set while parsing:

    fp             -- the input object the headers were read from
    seekable       -- true if fp.tell()/fp.seek() are believed to work
    startofheaders -- fp position before the headers (None if unknown)
    startofbody    -- fp position after the headers (None if unknown)
    headers        -- list of raw header lines, exactly as read
    dict           -- maps lower-cased header names to stripped values
                      (only the *last* occurrence of each name is kept)
    unixfrom       -- any leading Unix "From " envelope line(s)
    status         -- '' if parsing went well, else an error message
    """

    def __init__(self, fp, seekable = 1):
        """Initialize the class instance and read the headers.

        `fp' is any object with a readline() method.  If `seekable' is
        exactly 1, fp.tell() is probed once and seekable is reset to 0
        when the probe fails; other values are taken at face value.
        """
        if seekable == 1:
            # Exercise tell() to make sure it works
            # (and then assume seek() works, too)
            try:
                fp.tell()
            except Exception:
                # Best-effort probe: any failure means "not seekable".
                # (Exception rather than a bare except, so that
                # KeyboardInterrupt/SystemExit are not swallowed.)
                seekable = 0
        self.fp = fp
        self.seekable = seekable
        self.startofheaders = None
        self.startofbody = None
        #
        if self.seekable:
            try:
                self.startofheaders = self.fp.tell()
            except IOError:
                self.seekable = 0
        #
        self.readheaders()
        #
        if self.seekable:
            try:
                self.startofbody = self.fp.tell()
            except IOError:
                self.seekable = 0

    def rewindbody(self):
        """Rewind the file to the start of the body (if seekable).

        Raises IOError if the input object is not seekable.
        """
        if not self.seekable:
            raise IOError("unseekable file")
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines.

        Read header lines up to the entirely blank line that
        terminates them.  The (normally blank) line that ends the
        headers is skipped, but not included in the returned list.
        If a non-header line ends the headers, (which is an error),
        an attempt is made to backspace over it; it is never
        included in the returned list.

        The variable self.status is set to the empty string if all
        went well, otherwise it is an error message.
        The variable self.headers is a completely uninterpreted list
        of lines contained in the header (so printing them will
        reproduce the header exactly as it appears in the file).
        """
        self.dict = {}
        self.unixfrom = ''
        self.headers = lines = []
        self.status = ''
        headerseen = ""
        firstline = 1
        while 1:
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            # Skip unix From name time lines
            if firstline and line[:5] == 'From ':
                self.unixfrom = self.unixfrom + line
                continue
            firstline = 0
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                lines.append(line)
                x = self.dict[headerseen] + "\n " + line.strip()
                self.dict[headerseen] = x.strip()
                continue
            elif self.iscomment(line):
                # It's a comment.  Ignore it.
                continue
            elif self.islast(line):
                # Note! No pushback here!  The delimiter line gets eaten.
                break
            headerseen = self.isheader(line)
            if headerseen:
                # It's a legal header line, save it.
                lines.append(line)
                # Skip only the name and the colon; strip() removes any
                # blank that follows, so "Name:value" keeps its first
                # character.
                self.dict[headerseen] = line[len(headerseen)+1:].strip()
                continue
            else:
                # It's not a header line; throw it back and stop here.
                if not self.dict:
                    self.status = 'No headers'
                else:
                    self.status = 'Non-header line where header expected'
                # Try to undo the read.  hasattr() (not getattr()) so
                # that input objects without an unread method fall
                # through to the seek/"bad seek" cases instead of
                # raising AttributeError.
                if hasattr(self.fp, 'unread'):
                    self.fp.unread(line)
                elif self.seekable:
                    self.fp.seek(-len(line), 1)
                else:
                    self.status = self.status + '; bad seek'
                break

    def isheader(self, line):
        """Determine whether a given line is a legal header.

        This method should return the header name, suitably canonicalized.
        You may override this method in order to use Message parsing
        on tagged data in RFC822-like formats with special header formats.
        """
        i = line.find(':')
        if i > 0:
            return line[:i].lower()
        else:
            return None

    def islast(self, line):
        """Determine whether a line is a legal end of RFC-822 headers.

        You may override this method if your application wants
        to bend the rules, e.g. to strip trailing whitespace,
        or to recognise MH template separators ('--------').
        For convenience (e.g. for code reading from sockets) a
        line consisting of \\r\\n also matches.
        """
        return line in _blanklines

    def iscomment(self, line):
        """Determine whether a line should be skipped entirely.

        You may override this method in order to use Message parsing
        on tagged data in RFC822-like formats that support embedded
        comments or free-text data.
        """
        return None

    def getallmatchingheaders(self, name):
        """Find all header lines matching a given header name.

        Look through the list of headers and find all lines
        matching a given header name (and their continuation
        lines).  A list of the lines is returned, without
        interpretation.  If the header does not occur, an
        empty list is returned.  If the header occurs multiple
        times, all occurrences are returned.  Case is not
        important in the header name.
        """
        name = name.lower() + ':'
        n = len(name)
        matches = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif line[:1] not in string.whitespace:
                # A new, different header starts here.
                hit = 0
            if hit:
                matches.append(line)
        return matches

    def getfirstmatchingheader(self, name):
        """Get the first header line matching name.

        This is similar to getallmatchingheaders, but it returns
        only the first matching header (and its continuation
        lines).
        """
        name = name.lower() + ':'
        n = len(name)
        matches = []
        hit = 0
        for line in self.headers:
            if hit:
                if line[:1] not in string.whitespace:
                    # End of the first match's continuation lines.
                    break
            elif line[:n].lower() == name:
                hit = 1
            if hit:
                matches.append(line)
        return matches

    def getrawheader(self, name):
        """A higher-level interface to getfirstmatchingheader().

        Return a string containing the literal text of the
        header but with the keyword stripped.  All leading,
        trailing and embedded whitespace is kept in the
        string, however.
        Return None if the header does not occur.
        """
        lines = self.getfirstmatchingheader(name)
        if not lines:
            return None
        # Drop "name:" from the first line only.
        lines[0] = lines[0][len(name) + 1:]
        return ''.join(lines)

    def getheader(self, name, default=None):
        """Get the header value for a name.

        This is the normal interface: it returns a stripped
        version of the header value for a given header name,
        or `default' (None unless given) if it doesn't exist.
        This uses the dictionary version which finds the *last*
        such header.
        """
        try:
            return self.dict[name.lower()]
        except KeyError:
            return default
    get = getheader

    def getaddr(self, name):
        """Get a single address from a header, as a tuple.

        An example return value:
        ('Guido van Rossum', 'guido@cwi.nl')

        Returns (None, None) if the header is missing or holds no
        address.
        """
        # New, by Ben Escoto
        alist = self.getaddrlist(name)
        if alist:
            return alist[0]
        else:
            return (None, None)

    def getaddrlist(self, name):
        """Get a list of addresses from a header.

        Retrieves a list of addresses from a header, where each
        address is a tuple as returned by getaddr().  Returns an
        empty list if the header does not occur.
        """
        # New, by Ben Escoto
        try:
            data = self[name]
        except KeyError:
            return []
        a = AddrlistClass(data)
        return a.getaddrlist()

    def getdate(self, name):
        """Retrieve a date field from a header.

        Retrieves a date field from the named header, returning
        a tuple compatible with time.mktime(), or None if the
        header does not occur.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate(data)

    def getdate_tz(self, name):
        """Retrieve a date field from a header as a 10-tuple.

        The first 9 elements make up a tuple compatible with
        time.mktime(), and the 10th is the offset of the poster's
        time zone from GMT/UTC.  Returns None if the header does
        not occur.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate_tz(data)


    # Access as a dictionary (only finds *last* header of each type):

    def __len__(self):
        """Get the number of headers in a message."""
        return len(self.dict)

    def __getitem__(self, name):
        """Get a specific header, as from a dictionary.

        Raises KeyError if the header does not occur.
        """
        return self.dict[name.lower()]

    def __setitem__(self, name, value):
        """Set the value of a header.

        Note: This is not a perfect inversion of __getitem__, because
        any changed headers get stuck at the end of the raw-headers list
        rather than where the altered header was.
        """
        del self[name] # Won't fail if it doesn't exist
        self.dict[name.lower()] = value
        text = name + ": " + value
        for line in text.split("\n"):
            self.headers.append(line + "\n")

    def __delitem__(self, name):
        """Delete all occurrences of a specific header, if it is present."""
        name = name.lower()
        if name not in self.dict:
            return
        del self.dict[name]
        name = name + ':'
        n = len(name)
        doomed = []
        hit = 0
        for i in range(len(self.headers)):
            line = self.headers[i]
            if line[:n].lower() == name:
                hit = 1
            elif line[:1] not in string.whitespace:
                hit = 0
            if hit:
                doomed.append(i)
        # Delete from the back so earlier indices stay valid.
        doomed.reverse()
        for i in doomed:
            del self.headers[i]

    def has_key(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def keys(self):
        """Get all of a message's header field names."""
        return self.dict.keys()

    def values(self):
        """Get all of a message's header field values."""
        return self.dict.values()

    def items(self):
        """Get all of a message's headers.

        Returns a list of name, value tuples.
        """
        return self.dict.items()

    def __str__(self):
        """Return the raw header lines joined back into one string."""
        return ''.join(self.headers)
Guido van Rossum01ca3361992-07-13 14:28:59 +0000402
403
404# Utility functions
405# -----------------
406
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000407# XXX Should fix unquote() and quote() to be really conformant.
Guido van Rossumb6775db1994-08-01 11:34:53 +0000408# XXX The inverses of the parse functions may also be useful.
409
Guido van Rossum01ca3361992-07-13 14:28:59 +0000410
def unquote(str):
    """Strip one enclosing pair of double quotes or angle brackets.

    Strings shorter than two characters, or not wrapped in a matching
    "..." or <...> pair, are returned unchanged.
    """
    if len(str) > 1:
        ends = (str[0], str[-1:])
        if ends in (('"', '"'), ('<', '>')):
            return str[1:-1]
    return str
Guido van Rossumb6775db1994-08-01 11:34:53 +0000419
420
def quote(str):
    """Add quotes around a string.

    Backslashes and double quotes inside `str' are backslash-escaped
    (backslashes first, so the escapes added for the quotes are not
    themselves doubled), and the result is wrapped in double quotes.
    """
    escaped = str.replace('\\', '\\\\').replace('"', '\\"')
    return '"%s"' % escaped
Guido van Rossumb6775db1994-08-01 11:34:53 +0000430
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000431
def parseaddr(address):
    """Parse an address into a (realname, mailaddr) tuple.

    Only the first address found in `address' is returned; if none
    can be parsed the result is (None, None).
    """
    parsed = AddrlistClass(address).getaddrlist()
    if parsed:
        return parsed[0]
    return (None, None)
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000440
441
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of
    RFC-822 in front of you.

    The parser keeps a cursor (self.pos) into self.field; every get*
    method consumes input starting at the cursor and leaves the cursor
    just past what it consumed.  Comments encountered along the way are
    collected in self.commentlist.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r'
        self.atomends = self.specials + self.LWS + self.CR

        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and collects any comments on the way.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else: break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, as
        (realname, mailaddr) tuples.  Parsing stops at the first
        position where no address can be extracted.
        """
        # Iterative rather than recursive so that a long address list
        # cannot exhaust the interpreter's recursion limit.
        result = []
        ad = self.getaddress()
        while ad:
            result = result + ad
            ad = self.getaddress()
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, mailaddr) tuples: one entry for a
        plain address, several for a group, or an empty list if there
        is nothing parseable at the current position.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Take a copy: getphraselist() appends to self.commentlist in
        # place, so saving a bare reference could never be "restored"
        # and comments would be collected twice on the re-parse below.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            self.pos = self.pos + 1
            while self.pos < len(self.field):
                self.gotonext()
                # gotonext() may have run off the end of the field
                # (e.g. "group: "), so re-check before indexing.
                if self.pos < len(self.field) and \
                   self.field[self.pos] == ';':
                    self.pos = self.pos + 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' + \
                         ' '.join(self.commentlist) + ')', routeaddr)]
            else: returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Stray special character: step over it so that callers
                # looping on getaddress() (the group branch above) always
                # make progress instead of spinning forever.
                self.pos = self.pos + 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos = self.pos + 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not at a '<' or if no addr-spec
        is found before the closing '>'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = 0
        self.pos = self.pos + 1
        self.gotonext()
        adlist = None
        while self.pos < len(self.field):
            if expectroute:
                # Domain of a source route ("@domain"); value is discarded.
                self.getdomain()
                expectroute = 0
            elif self.field[self.pos] == '>':
                self.pos = self.pos + 1
                break
            elif self.field[self.pos] == '@':
                self.pos = self.pos + 1
                expectroute = 1
            elif self.field[self.pos] == ':':
                # End of the route part; the addr-spec follows.
                self.pos = self.pos + 1
            else:
                adlist = self.getaddrspec()
                # Step over the character following the addr-spec
                # (normally the closing '>').
                self.pos = self.pos + 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC-822 addr-spec.

        Returns the local part alone if no '@' follows it, otherwise
        local-part@domain.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos = self.pos + 1
            elif self.field[self.pos] == '"':
                aslist.append(self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else: aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos = self.pos + 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos = self.pos + 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else: sdlist.append(self.getatom())

        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments = 1):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC-822 comments
        are allowed within the parsed fragment.

        The returned text excludes the delimiters; a backslash quotes
        the character that follows it.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        quote = 0
        self.pos = self.pos + 1
        while self.pos < len(self.field):
            if quote == 1:
                slist.append(self.field[self.pos])
                quote = 0
            elif self.field[self.pos] in endchars:
                self.pos = self.pos + 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                # getcomment() already moved the cursor past the nested
                # comment; advancing again would swallow the next
                # character (typically our own closing delimiter).
                continue
            elif self.field[self.pos] == '\\':
                quote = 1
            else:
                slist.append(self.field[self.pos])
            self.pos = self.pos + 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', 0)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', 1)

    def getdomainliteral(self):
        """Parse an RFC-822 domain-literal."""
        return self.getdelimited('[', ']\r', 0)

    def getatom(self):
        """Parse an RFC-822 atom.

        Consumes characters up to (not including) the next character
        listed in self.atomends.
        """
        atomlist = ['']

        while self.pos < len(self.field):
            if self.field[self.pos] in self.atomends:
                break
            else: atomlist.append(self.field[self.pos])
            self.pos = self.pos + 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC-822 phrases.

        A phrase is a sequence of words, which are in turn either
        RFC-822 atoms or quoted-strings.  Phrases are canonicalized
        by squeezing all runs of continuous whitespace into one space.

        Returns the list of words; comments met on the way are added
        to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.atomends:
                break
            else: plist.append(self.getatom())

        return plist
Guido van Rossumb6775db1994-08-01 11:34:53 +0000698
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC822 addresses.

    The parsed (realname, mailaddr) tuples are kept in self.addresslist.
    Instances support len(), str(), indexing, `+' (set union) and
    `-' (set difference).
    """

    def __init__(self, field):
        """Parse `field'; a false value (None, '') gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        """Return the number of addresses."""
        return len(self.addresslist)

    def __str__(self):
        """Return the addresses in canonical form, comma-separated."""
        return ", ".join(map(dump_address_pair, self.addresslist))

    def __add__(self, other):
        """Set union: self's addresses plus those only in `other'."""
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if not x in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __sub__(self, other):
        """Set difference: self's addresses that are not in `other'."""
        newaddr = AddressList(None)
        for x in self.addresslist:
            if not x in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __getitem__(self, index):
        """Make indexing, slices, and 'in' work."""
        # The parsed list lives in `addresslist'; the instance has no
        # `addrlist' attribute, so reading that name raised AttributeError.
        return self.addresslist[index]
734
def dump_address_pair(pair):
    """Render a (name, address) pair in canonical form.

    With a non-empty name the result is '"name" <address>'; otherwise
    it is the bare address.
    """
    if not pair[0]:
        return pair[1]
    return '"' + pair[0] + '" <' + pair[1] + '>'
Guido van Rossumb6775db1994-08-01 11:34:53 +0000741
742# Parse a date field
743
_monthnames = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul',
               'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
_daynames = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a time tuple.

    Returns a 10-tuple (year, month, day, hour, minute, second, 0, 0,
    0, tzoffset), or None if the string cannot be parsed.  tzoffset is
    the zone's offset from UTC in seconds (e.g. -0500 -> -18000), or
    None when no zone could be recognized.  Zone names are looked up
    in the _timezones table; of the military zones only 'Z' is known.
    The year is returned as written (two-digit years are not expanded).
    """
    data = data.split()
    if not data:
        # Empty or all-blank field: nothing to parse.
        return None
    if data[0][-1] == ',' or data[0] in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # Either the zone is glued to the time ("09:55:06+0100") or
        # there is no zone at all.
        s = data[3]
        i = s.find('+')
        if i > 0:
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    if not mm in _monthnames:
        # Try the ctime()-style order: month day time year.
        dd, mm, yy, tm, tz = mm, dd, tm, yy, tz
        if not mm in _monthnames:
            return None
    mm = _monthnames.index(mm)+1
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    else:
        # Not hh:mm or hh:mm:ss.
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        # Floor division keeps the result an integer on every Python.
        tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
    return (yy, mm, dd, thh, tmm, tss, 0, 0, 0, tzoffset)
825
Guido van Rossumb6775db1994-08-01 11:34:53 +0000826
def parsedate(data):
    """Convert a time string to a 9-element time tuple.

    This is parsedate_tz() with the trailing zone offset dropped; any
    non-tuple result (None for an unparseable string) is passed through.
    """
    parsed = parsedate_tz(data)
    if isinstance(parsed, tuple):
        return parsed[:9]
    return parsed
833
Guido van Rossum27cb8a41996-11-20 22:12:26 +0000834
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    When the zone offset (element 9) is None the fields are treated as
    local time and DST is left for mktime() to decide.
    """
    zone = data[9]
    fields = data[:8]
    if zone is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(fields + (-1,))
    # Interpret the fields as local standard time, then take out both
    # the message's own offset and the local zone's offset.
    stamp = time.mktime(fields + (0,))
    return stamp - zone - time.timezone
Guido van Rossum6cdd7a01996-12-12 18:39:54 +0000843
Guido van Rossumb6775db1994-08-01 11:34:53 +0000844
845# When used as script, run a small test program.
846# The first command line argument must be a filename containing one
847# message in RFC-822 format.
848
if __name__ == '__main__':
    # Small self-test: parse one message file and dump what was found.
    # The first command line argument, if given, names the file;
    # otherwise $HOME/Mail/inbox/1 is used.
    import os
    import sys
    path = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
    if sys.argv[1:]: path = sys.argv[1]
    f = open(path, 'r')
    try:
        m = Message(f)
        print('From: %s' % (m.getaddr('from'),))
        print('To: %s' % (m.getaddrlist('to'),))
        print('Subject: %s' % (m.getheader('subject'),))
        print('Date: %s' % (m.getheader('date'),))
        date = m.getdate_tz('date')
        if date:
            parts = ['ParsedDate:', time.asctime(date[:-1])]
            hhmmss = date[-1]
            # The zone offset is None when the header carried no
            # recognizable zone; print the date alone in that case.
            if hhmmss is not None:
                # Split the magnitude and re-attach the sign, so that
                # e.g. -19800 seconds prints as -0530.
                sign = '+'
                if hhmmss < 0:
                    sign = '-'
                    hhmmss = -hhmmss
                hhmm, ss = divmod(hhmmss, 60)
                hh, mm = divmod(hhmm, 60)
                parts.append("%s%02d%02d" % (sign, hh, mm))
                if ss: parts.append(".%02d" % ss)
            print(' '.join(parts))
        else:
            print('ParsedDate: %s' % (None,))
        m.rewindbody()
        n = 0
        while f.readline():
            n = n + 1
        print('Lines: %s' % (n,))
        print('-'*70)
        print('len = %s' % (len(m),))
        if m.has_key('Date'): print('Date = %s' % (m['Date'],))
        if m.has_key('X-Nonsense'): pass
        print('keys = %s' % (m.keys(),))
        print('values = %s' % (m.values(),))
        print('items = %s' % (m.items(),))
    finally:
        # Always release the file, even if parsing or printing fails.
        f.close()
881 print 'items =', m.items()