blob: ef5a8512f06c7ae1f8d94a5c798877fb02f582bb [file] [log] [blame]
Guido van Rossum9ab94c11997-12-10 16:17:39 +00001"""RFC-822 message manipulation class.
Guido van Rossum01ca3361992-07-13 14:28:59 +00002
Guido van Rossum9ab94c11997-12-10 16:17:39 +00003XXX This is only a very rough sketch of a full RFC-822 parser;
4in particular the tokenizing of addresses does not adhere to all the
5quoting rules.
6
7Directions for use:
8
9To create a Message object: first open a file, e.g.:
10 fp = open(file, 'r')
Guido van Rossumc7bb8571998-06-10 21:31:01 +000011You can use any other legal way of getting an open file object, e.g. use
12sys.stdin or call os.popen().
Guido van Rossum9ab94c11997-12-10 16:17:39 +000013Then pass the open file object to the Message() constructor:
14 m = Message(fp)
15
Guido van Rossume894fc01998-06-11 13:58:40 +000016This class can work with any input object that supports a readline
17method. If the input object has seek and tell capability, the
18rewindbody method will work; also illegal lines will be pushed back
19onto the input stream. If the input object lacks seek but has an
20`unread' method that can push back a line of input, Message will use
21that to push back illegal lines. Thus this class can be used to parse
22messages coming from a buffered stream.
Guido van Rossumc7bb8571998-06-10 21:31:01 +000023
24The optional `seekable' argument is provided as a workaround for
25certain stdio libraries in which tell() discards buffered data before
26discovering that the lseek() system call doesn't work. For maximum
27portability, you should set the seekable argument to zero to prevent
that initial tell() call when passing in an unseekable object such as
a file object created from a socket object.  If it is 1 on entry --
30which it is by default -- the tell() method of the open file object is
31called once; if this raises an exception, seekable is reset to 0. For
32other nonzero values of seekable, this test is not made.
33
Guido van Rossum9ab94c11997-12-10 16:17:39 +000034To get the text of a particular header there are several methods:
35 str = m.getheader(name)
36 str = m.getrawheader(name)
37where name is the name of the header, e.g. 'Subject'.
38The difference is that getheader() strips the leading and trailing
39whitespace, while getrawheader() doesn't. Both functions retain
40embedded whitespace (including newlines) exactly as they are
41specified in the header, and leave the case of the text unchanged.
42
43For addresses and address lists there are functions
44 realname, mailaddress = m.getaddr(name) and
45 list = m.getaddrlist(name)
46where the latter returns a list of (realname, mailaddr) tuples.
47
48There is also a method
49 time = m.getdate(name)
50which parses a Date-like field and returns a time-compatible tuple,
51i.e. a tuple such as returned by time.localtime() or accepted by
52time.mktime().
53
54See the class definition for lower level access methods.
55
56There are also some utility functions here.
57"""
Guido van Rossum4d4ab921998-06-16 22:27:09 +000058# Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
Guido van Rossum01ca3361992-07-13 14:28:59 +000059
Guido van Rossum01ca3361992-07-13 14:28:59 +000060import string
Guido van Rossumb6775db1994-08-01 11:34:53 +000061import time
Guido van Rossum01ca3361992-07-13 14:28:59 +000062
63
_blanklines = ('\r\n', '\n')            # Optimization for islast()


class Message:
    """Represents a single RFC-822-compliant message.

    The headers are read from the file object handed to the constructor.
    Afterwards they are available both as the raw list of lines
    (self.headers) and as a dictionary keyed by lower-cased header name
    (self.dict), which keeps only the *last* occurrence of each header.
    """

    def __init__(self, fp, seekable = 1):
        """Initialize the class instance and read the headers.

        `fp' is any object with a readline() method.  `seekable' says
        whether fp supports tell()/seek(): when it is exactly 1, tell()
        is tried once and seekable is reset to 0 if that fails; any
        other value is taken at face value without testing.
        """
        if seekable == 1:
            # Exercise tell() to make sure it works
            # (and then assume seek() works, too)
            try:
                fp.tell()
            except Exception:
                # Any ordinary failure means "not seekable", but do not
                # swallow KeyboardInterrupt/SystemExit like a bare except.
                seekable = 0
        self.fp = fp
        self.seekable = seekable
        self.startofheaders = None
        self.startofbody = None
        #
        if self.seekable:
            try:
                self.startofheaders = self.fp.tell()
            except IOError:
                self.seekable = 0
        #
        self.readheaders()
        #
        if self.seekable:
            try:
                self.startofbody = self.fp.tell()
            except IOError:
                self.seekable = 0

    def rewindbody(self):
        """Rewind the file to the start of the body (if seekable).

        Raises IOError when the underlying file is not seekable.
        """
        if not self.seekable:
            raise IOError("unseekable file")
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines.

        Read header lines up to the entirely blank line that
        terminates them.  The (normally blank) line that ends the
        headers is skipped, but not included in the returned list.
        If a non-header line ends the headers, (which is an error),
        an attempt is made to backspace over it; it is never
        included in the returned list.

        The variable self.status is set to the empty string if all
        went well, otherwise it is an error message.
        The variable self.headers is a completely uninterpreted list
        of lines contained in the header (so printing them will
        reproduce the header exactly as it appears in the file).
        """
        self.dict = {}
        self.unixfrom = ''
        self.headers = lines = []
        self.status = ''
        headerseen = ""
        firstline = 1
        startofline = unread = tell = None
        # Prefer an explicit push-back method; fall back to tell()/seek().
        if hasattr(self.fp, 'unread'):
            unread = self.fp.unread
        elif self.seekable:
            tell = self.fp.tell
        while 1:
            if tell:
                startofline = tell()
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            # Skip unix From name time lines
            if firstline and line[:5] == 'From ':
                self.unixfrom = self.unixfrom + line
                continue
            firstline = 0
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                lines.append(line)
                value = self.dict[headerseen] + "\n " + line.strip()
                self.dict[headerseen] = value.strip()
                continue
            elif self.iscomment(line):
                # It's a comment.  Ignore it.
                continue
            elif self.islast(line):
                # Note! No pushback here!  The delimiter line gets eaten.
                break
            headerseen = self.isheader(line)
            if headerseen:
                # It's a legal header line, save it.  Skip only the name
                # and the colon; strip() removes any blank after the
                # colon, so "Name:value" keeps its first character.
                lines.append(line)
                self.dict[headerseen] = line[len(headerseen)+1:].strip()
                continue
            else:
                # It's not a header line; throw it back and stop here.
                if not self.dict:
                    self.status = 'No headers'
                else:
                    self.status = 'Non-header line where header expected'
                # Try to undo the read.
                if unread:
                    unread(line)
                elif tell:
                    self.fp.seek(startofline)
                else:
                    self.status = self.status + '; bad seek'
                break

    def isheader(self, line):
        """Determine whether a given line is a legal header.

        This method should return the header name, suitably canonicalized.
        You may override this method in order to use Message parsing
        on tagged data in RFC822-like formats with special header formats.
        """
        i = line.find(':')
        if i > 0:
            return line[:i].lower()
        return None

    def islast(self, line):
        """Determine whether a line is a legal end of RFC-822 headers.

        You may override this method if your application wants
        to bend the rules, e.g. to strip trailing whitespace,
        or to recognise MH template separators ('--------').
        For convenience (e.g. for code reading from sockets) a
        line consisting of \\r\\n also matches.
        """
        return line in _blanklines

    def iscomment(self, line):
        """Determine whether a line should be skipped entirely.

        You may override this method in order to use Message parsing
        on tagged data in RFC822-like formats that support embedded
        comments or free-text data.
        """
        return None

    def getallmatchingheaders(self, name):
        """Find all header lines matching a given header name.

        Look through the list of headers and find all lines
        matching a given header name (and their continuation
        lines).  A list of the lines is returned, without
        interpretation.  If the header does not occur, an
        empty list is returned.  If the header occurs multiple
        times, all occurrences are returned.  Case is not
        important in the header name.
        """
        name = name.lower() + ':'
        n = len(name)
        matches = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif line[:1] not in string.whitespace:
                # A new, different header starts here.
                hit = 0
            if hit:
                matches.append(line)
        return matches

    def getfirstmatchingheader(self, name):
        """Get the first header line matching name.

        This is similar to getallmatchingheaders, but it returns
        only the first matching header (and its continuation
        lines).
        """
        name = name.lower() + ':'
        n = len(name)
        matches = []
        hit = 0
        for line in self.headers:
            if hit:
                if line[:1] not in string.whitespace:
                    break
            elif line[:n].lower() == name:
                hit = 1
            if hit:
                matches.append(line)
        return matches

    def getrawheader(self, name):
        """A higher-level interface to getfirstmatchingheader().

        Return a string containing the literal text of the
        header but with the keyword stripped.  All leading,
        trailing and embedded whitespace is kept in the
        string, however.
        Return None if the header does not occur.
        """
        lines = self.getfirstmatchingheader(name)
        if not lines:
            return None
        # Drop the header name and the colon from the first line.
        lines[0] = lines[0][len(name) + 1:]
        return ''.join(lines)

    def getheader(self, name, default=None):
        """Get the header value for a name.

        This is the normal interface: it returns a stripped
        version of the header value for a given header name,
        or `default' (None unless given) if it doesn't exist.
        This uses the dictionary version which finds the *last*
        such header.
        """
        try:
            return self.dict[name.lower()]
        except KeyError:
            return default
    get = getheader

    def getheaders(self, name):
        """Get all values for a header.

        This returns a list of values for headers given more than once;
        each value in the result list is stripped in the same way as the
        result of getheader().  If the header is not given, return None.
        """
        result = []
        current = ''
        have_header = 0
        for s in self.getallmatchingheaders(name):
            if s[0] in string.whitespace:
                # Continuation line: fold it into the current value.
                if current:
                    current = "%s\n %s" % (current, s.strip())
                else:
                    current = s.strip()
            else:
                if have_header:
                    result.append(current)
                current = s[s.find(":") + 1:].strip()
                have_header = 1
        if have_header:
            result.append(current)
        return result or None

    def getaddr(self, name):
        """Get a single address from a header, as a tuple.

        An example return value:
        ('Guido van Rossum', 'guido@cwi.nl')

        Returns (None, None) when the header yields no address.
        """
        # New, by Ben Escoto
        alist = self.getaddrlist(name)
        if alist:
            return alist[0]
        return (None, None)

    def getaddrlist(self, name):
        """Get a list of addresses from a header.

        Retrieves a list of addresses from a header, where each address is a
        tuple as returned by getaddr().  Scans all named headers, so it works
        properly with multiple To: or Cc: headers for example.
        """
        raw = []
        for h in self.getallmatchingheaders(name):
            if h[0] in ' \t':
                raw.append(h)
            else:
                if raw:
                    # Separate the addresses of successive headers.
                    raw.append(', ')
                i = h.find(':')
                if i > 0:
                    raw.append(h[i+1:])
        return AddrlistClass(''.join(raw)).getaddrlist()

    def getdate(self, name):
        """Retrieve a date field from a header.

        Retrieves a date field from the named header, returning
        the result of parsedate() on its value, or None if the
        header is absent.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate(data)

    def getdate_tz(self, name):
        """Retrieve a date field from a header as a 10-tuple.

        Returns the result of parsedate_tz() on the header's value:
        the first 9 elements make up a tuple compatible with
        time.mktime(), and the 10th is the offset of the poster's
        time zone from GMT/UTC.  Returns None if the header is absent.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate_tz(data)


    # Access as a dictionary (only finds *last* header of each type):

    def __len__(self):
        """Get the number of headers in a message."""
        return len(self.dict)

    def __getitem__(self, name):
        """Get a specific header, as from a dictionary.

        Raises KeyError if the header is not present.
        """
        return self.dict[name.lower()]

    def __setitem__(self, name, value):
        """Set the value of a header.

        Note: This is not a perfect inversion of __getitem__, because
        any changed headers get stuck at the end of the raw-headers list
        rather than where the altered header was.
        """
        del self[name] # Won't fail if it doesn't exist
        self.dict[name.lower()] = value
        text = name + ": " + value
        for line in text.split("\n"):
            self.headers.append(line + "\n")

    def __delitem__(self, name):
        """Delete all occurrences of a specific header, if it is present."""
        name = name.lower()
        try:
            del self.dict[name]
        except KeyError:
            return
        name = name + ':'
        n = len(name)
        kept = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif line[:1] not in string.whitespace:
                hit = 0
            if not hit:
                kept.append(line)
        # Update in place so other references to the list stay valid.
        self.headers[:] = kept

    def __contains__(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def has_key(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def keys(self):
        """Get all of a message's header field names (as a list)."""
        return list(self.dict.keys())

    def values(self):
        """Get all of a message's header field values (as a list)."""
        return list(self.dict.values())

    def items(self):
        """Get all of a message's headers.

        Returns a list of name, value tuples.
        """
        return list(self.dict.items())

    def __str__(self):
        """Return the raw header lines joined back together."""
        return ''.join(self.headers)
Guido van Rossum01ca3361992-07-13 14:28:59 +0000443
444
445# Utility functions
446# -----------------
447
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000448# XXX Should fix unquote() and quote() to be really conformant.
Guido van Rossumb6775db1994-08-01 11:34:53 +0000449# XXX The inverses of the parse functions may also be useful.
450
Guido van Rossum01ca3361992-07-13 14:28:59 +0000451
def unquote(str):
    """Strip one pair of enclosing "..." or <...> delimiters, if present.

    Strings shorter than two characters, or without a matching pair of
    delimiters at both ends, are returned unchanged.
    """
    if len(str) <= 1:
        return str
    first, last = str[0], str[-1:]
    if (first == '"' and last == '"') or (first == '<' and last == '>'):
        return str[1:-1]
    return str
Guido van Rossumb6775db1994-08-01 11:34:53 +0000460
461
def quote(str):
    """Return str wrapped in double quotes.

    Backslashes are doubled first, then embedded double quotes are
    escaped with a backslash.
    """
    escaped = str.replace('\\', '\\\\')
    escaped = escaped.replace('"', '\\"')
    return '"%s"' % escaped
Guido van Rossumb6775db1994-08-01 11:34:53 +0000471
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000472
def parseaddr(address):
    """Parse an address into a (realname, mailaddr) tuple.

    Only the first address found in `address' is returned; if none is
    found the result is (None, None).
    """
    addresses = AddrlistClass(address).getaddrlist()
    if addresses:
        return addresses[0]
    return (None, None)
Guido van Rossumbe7c45e1997-11-22 21:49:19 +0000481
482
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of
    RFC-822 in front of you.

    The parser keeps a cursor (self.pos) into the header text
    (self.field); every get*() method consumes input from the cursor
    onwards and leaves the cursor after what it parsed.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        # Any of these characters terminates an atom.
        self.atomends = self.specials + self.LWS + self.CR
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line breaks; comments met on the way are
        collected in self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, as
        (realname, mailaddr) tuples.  Parsing stops at the first
        position where getaddress() yields nothing.
        """
        # Iterative rather than recursive so that very long address
        # lists cannot exhaust the interpreter's recursion limit.
        result = []
        ad = self.getaddress()
        while ad:
            result = result + ad
            ad = self.getaddress()
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, mailaddr) tuples: usually one
        element, several for a group, or empty when nothing was found.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        oldcl = self.commentlist
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos = self.pos + 1
            while self.pos < fieldlen:
                self.gotonext()
                # gotonext() may have run off the end of the field
                # (group without a terminating ';'), so re-check.
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos = self.pos + 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the parser keeps advancing.
                self.pos = self.pos + 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos = self.pos + 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not on a '<' or no addrspec is found.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = 0
        self.pos = self.pos + 1
        self.gotonext()
        adlist = None
        while self.pos < len(self.field):
            if expectroute:
                self.getdomain()
                expectroute = 0
            elif self.field[self.pos] == '>':
                self.pos = self.pos + 1
                break
            elif self.field[self.pos] == '@':
                self.pos = self.pos + 1
                expectroute = 1
            elif self.field[self.pos] == ':':
                self.pos = self.pos + 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addrspec
                # (normally the closing '>').
                self.pos = self.pos + 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC-822 addr-spec.

        Returns the local part, followed by '@' and the domain when an
        '@' is present.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos = self.pos + 1
            elif self.field[self.pos] == '"':
                aslist.append(self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos = self.pos + 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos = self.pos + 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments = 1):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC-822 comments
        are allowed within the parsed fragment.

        The delimiters themselves are not part of the returned string;
        a backslash quotes the character that follows it.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        quote = 0
        self.pos = self.pos + 1
        while self.pos < len(self.field):
            if quote == 1:
                slist.append(self.field[self.pos])
                quote = 0
            elif self.field[self.pos] in endchars:
                self.pos = self.pos + 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                # getcomment() already moved the cursor past the nested
                # comment; advancing again would skip a character.
                continue
            elif self.field[self.pos] == '\\':
                quote = 1
            else:
                slist.append(self.field[self.pos])
            self.pos = self.pos + 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', 0)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', 1)

    def getdomainliteral(self):
        """Parse an RFC-822 domain-literal."""
        return self.getdelimited('[', ']\r', 0)

    def getatom(self):
        """Parse an RFC-822 atom.

        Consumes characters up to (not including) the next character
        in self.atomends and returns them as a string.
        """
        atomlist = ['']

        while self.pos < len(self.field):
            if self.field[self.pos] in self.atomends:
                break
            atomlist.append(self.field[self.pos])
            self.pos = self.pos + 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC-822 phrases.

        A phrase is a sequence of words, which are in turn either
        RFC-822 atoms or quoted-strings.  The words are returned as a
        list; whitespace between them is dropped and comments are
        collected in self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.atomends:
                break
            else:
                plist.append(self.getatom())

        return plist
Guido van Rossumb6775db1994-08-01 11:34:53 +0000739
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC822 addresses.

    The parsed (realname, mailaddr) tuples are kept in self.addresslist.
    """

    def __init__(self, field):
        """Parse `field'; an empty or None field gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        """Return the number of addresses held."""
        return len(self.addresslist)

    def __str__(self):
        """Return the addresses in canonical form, separated by ', '."""
        return ", ".join(map(dump_address_pair, self.addresslist))

    def __add__(self, other):
        """Set union: own addresses plus those of `other' not already here."""
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if not x in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __sub__(self, other):
        """Set difference: own addresses that do not occur in `other'."""
        newaddr = AddressList(None)
        for x in self.addresslist:
            if not x in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __getitem__(self, index):
        """Make indexing, slices, and 'in' work."""
        # The parsed addresses live in self.addresslist (see __init__).
        return self.addresslist[index]
775
def dump_address_pair(pair):
    """Dump a (name, address) pair in a canonicalized form.

    With a non-empty name the result is '"name" <address>', otherwise
    it is just the address.
    """
    if not pair[0]:
        return pair[1]
    return ''.join(['"', pair[0], '" <', pair[1], '>'])
Guido van Rossumb6775db1994-08-01 11:34:53 +0000782
783# Parse a date field
784
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a time tuple.

    Returns a 10-tuple (year, month, day, hour, minute, second, 0, 0, 0,
    tzoffset), where tzoffset is the time zone's offset from UTC in
    seconds, or None when the zone is missing or not recognized.
    Zone names are looked up in _timezones (which has no military
    zones other than Z); numeric zones such as -0500 are converted.
    Returns None when the string cannot be parsed.
    """
    data = data.split()
    if not data:
        # Empty or all-whitespace input: nothing to parse.
        return None
    if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time, e.g. 12:00:00+0100.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            # Keep the sign with the zone so negative offsets survive.
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    dd, mm, yy, tm, tz = data[:5]
    mm = mm.lower()
    if mm not in _monthnames:
        # Maybe the day and the month are the other way round.
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Full month names follow the 12 abbreviations in the table.
        mm = mm - 12
    if dd.endswith(','):
        dd = dd[:-1]
    if yy.find(':') > 0:
        # Year and time are swapped.
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if yy and yy[0] not in string.digits:
        # Year and zone are swapped.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        thh, tmm = tm
        tss = '0'
    elif len(tm) == 3:
        thh, tmm, tss = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        # Floor division keeps the result an integer on every Python.
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    return (yy, mm, dd, thh, tmm, tss, 0, 0, 0, tzoffset)
883
Guido van Rossumb6775db1994-08-01 11:34:53 +0000884
def parsedate(data):
    """Convert a time string to a time tuple.

    Returns the first nine elements of the parsedate_tz() result, or
    whatever non-tuple value parsedate_tz() returned.
    """
    parsed = parsedate_tz(data)
    if isinstance(parsed, tuple):
        return parsed[:9]
    return parsed
891
Guido van Rossum27cb8a41996-11-20 22:12:26 +0000892
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
    zone = data[9]
    fields = data[:8]
    if zone is not None:
        # Interpret the fields as local standard time, then remove both
        # the message's zone offset and the local zone offset.
        return time.mktime(fields + (0,)) - zone - time.timezone
    # No zone info, so localtime is better assumption than GMT
    return time.mktime(fields + (-1,))
Guido van Rossum6cdd7a01996-12-12 18:39:54 +0000901
def formatdate(timeval=None):
    """Returns time format preferred for Internet standards.

    Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123

    `timeval' is a timestamp in seconds since the epoch; the current
    time is used when it is None.  The day and month names are always
    the English abbreviations, whatever the current locale is.
    """
    if timeval is None:
        timeval = time.time()
    now = time.gmtime(timeval)
    # Spell the names out here: strftime's %a and %b follow the locale.
    days = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    months = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        days[now[6]], now[2], months[now[1] - 1], now[0],
        now[3], now[4], now[5])
911
Guido van Rossumb6775db1994-08-01 11:34:53 +0000912
913# When used as script, run a small test program.
914# The first command line argument must be a filename containing one
915# message in RFC-822 format.
916
if __name__ == '__main__':
    # Small self-test: parse one message file and print what was found.
    import sys, os
    if sys.argv[1:]:
        file = sys.argv[1]
    else:
        # Only consult $HOME when no file name was given on the command line.
        file = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
    f = open(file, 'r')
    try:
        m = Message(f)
        print('From: %s' % (m.getaddr('from'),))
        print('To: %s' % (m.getaddrlist('to'),))
        print('Subject: %s' % (m.getheader('subject'),))
        print('Date: %s' % (m.getheader('date'),))
        date = m.getdate_tz('date')
        if date:
            # parsedate_tz() leaves the day-of-year field at 0, which
            # some Python versions reject in asctime(); pass 1 instead.
            # NOTE: the weekday field is not computed either, so the
            # day name printed by asctime() is not meaningful.
            parts = ['ParsedDate:', time.asctime(date[:7] + (1, date[8]))]
            offset = date[-1]
            if offset is not None:
                # Format the magnitude and the sign separately so that
                # e.g. -05:30 is not shown as -0630.
                sign = '+'
                if offset < 0:
                    sign = '-'
                    offset = -offset
                hhmm, ss = divmod(offset, 60)
                hh, mm = divmod(hhmm, 60)
                parts.append("%s%02d%02d" % (sign, hh, mm))
                if ss:
                    parts.append(".%02d" % ss)
            print(' '.join(parts))
        else:
            print('ParsedDate: None')
        m.rewindbody()
        n = 0
        while f.readline():
            n = n + 1
        print('Lines: %s' % (n,))
        print('-'*70)
        print('len = %s' % (len(m),))
        if m.has_key('Date'):
            print('Date = %s' % (m['Date'],))
        if m.has_key('X-Nonsense'):
            pass
        print('keys = %s' % (m.keys(),))
        print('values = %s' % (m.values(),))
        print('items = %s' % (m.items(),))
    finally:
        f.close()
949 print 'items =', m.items()