blob: db5e0d442c118f2708282b17e7a5705761c1356d [file] [log] [blame]
# RFC-822 message manipulation class.
#
# XXX This is only a very rough sketch of a full RFC-822 parser;
# in particular the tokenizing of addresses does not adhere to all the
# quoting rules.
#
# Directions for use:
#
# To create a Message object: first open a file, e.g.:
#     fp = open(file, 'r')
# (or use any other legal way of getting an open file object, e.g. use
# sys.stdin or call os.popen()).
# Then pass the open file object to the Message() constructor:
#     m = Message(fp)
#
# To get the text of a particular header there are several methods:
#     str = m.getheader(name)
#     str = m.getrawheader(name)
# where name is the name of the header, e.g. 'Subject'.
# The difference is that getheader() strips the leading and trailing
# whitespace, while getrawheader() doesn't.  Both methods retain
# embedded whitespace (including newlines) exactly as they are
# specified in the header, and leave the case of the text unchanged.
#
# For addresses and address lists there are the methods
#     realname, mailaddress = m.getaddr(name) and
#     list = m.getaddrlist(name)
# where the latter returns a list of (realname, mailaddr) tuples.
#
# There is also a method
#     time = m.getdate(name)
# which parses a Date-like field and returns a time-compatible tuple,
# i.e. a tuple such as returned by time.localtime() or accepted by
# time.mktime().
#
# See the class definition for lower level access methods.
#
# There are also some utility functions here.
39
40
import re
import regex
import string
import time
Guido van Rossum01ca3361992-07-13 14:28:59 +000044
45
class Message:
    """The header section of an RFC-822 message read from an open file.

    The constructor reads header lines from the file object `fp` up to
    the line that terminates the headers.  Afterwards:

    fp             -- the file object, positioned at the start of the body
    headers        -- list of the raw header lines, uninterpreted
    status         -- '' if the headers were read cleanly, otherwise an
                      error message
    startofheaders -- fp.tell() before reading, or None if fp cannot tell
    startofbody    -- fp.tell() after reading, or None if fp cannot tell

    Header values can be fetched with getheader() and friends, or with
    dictionary-style access (m[name], len(m), keys(), ...), which only
    sees the first header of each name.
    """

    # Start of a header line: a field name made of printable ASCII
    # characters other than ':' and space, followed by a colon.
    _header_start = re.compile('[!-9;-~]+:')

    def __init__(self, fp):
        """Remember `fp` and read the headers from it."""
        self.fp = fp
        self.startofheaders = self._tell()
        self.readheaders()
        self.startofbody = self._tell()

    def _tell(self):
        """Return self.fp.tell(), or None if that raises IOError."""
        try:
            return self.fp.tell()
        except IOError:
            return None

    def rewindbody(self):
        """Seek the file back to the start of the body.

        Raises IOError if the body position is unknown because the file
        could not report its position when the headers were read.
        """
        if self.startofbody is None:
            raise IOError('unseekable file')
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines up to the line that terminates them.

        The (normally blank) terminating line is consumed but not
        stored.  If a non-header line ends the headers (which is an
        error), an attempt is made to seek back to its start so that it
        can be read again; it is never stored.  Leading unix
        'From ' / '>From ' envelope lines are skipped.

        Sets self.headers to the list of raw header lines (printing
        them reproduces the header exactly as it appears in the file)
        and self.status to '' on success or to one of 'EOF in headers',
        'No headers', 'Bad header', possibly followed by '; bad seek'.
        """
        self.headers = headers = []
        self.status = ''
        headerseen = 0
        firstline = 1
        seekable = 1
        while 1:
            # Remember where this line starts so the read can be undone
            # with an absolute seek; a relative seek by -len(line) is
            # unreliable on text-mode files.
            startofline = None
            if seekable:
                startofline = self._tell()
                seekable = startofline is not None
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            # Skip unix From name time lines.  firstline stays set
            # until a different kind of line is seen, so several
            # consecutive envelope lines are all skipped.
            if firstline and (line[:5] == 'From ' or line[:6] == '>From '):
                continue
            firstline = 0
            if self.islast(line):
                break
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                headers.append(line)
            elif self._header_start.match(line):
                # It's a header line.
                headers.append(line)
                headerseen = 1
            else:
                # It's not a header line; stop here.
                if headerseen:
                    self.status = 'Bad header'
                else:
                    self.status = 'No headers'
                # Try to undo the read.
                undone = 0
                if startofline is not None:
                    try:
                        self.fp.seek(startofline)
                        undone = 1
                    except IOError:
                        pass
                if not undone:
                    self.status = self.status + '; bad seek'
                break

    def islast(self, line):
        """Return true if `line` is a legal end of RFC-822 headers.

        You may override this method if your application wants to bend
        the rules, e.g. to strip trailing whitespace, or to recognise
        MH template separators ('--------').  For convenience (e.g. for
        code reading from sockets) a line consisting of \\r\\n also
        matches.
        """
        return line == '\n' or line == '\r\n'

    def getallmatchingheaders(self, name):
        """Return every raw line belonging to headers called `name`.

        Continuation lines are included and the lines are returned
        without interpretation.  If the header occurs several times all
        occurrences are returned; if it does not occur the result is an
        empty list.  Case is not important in the header name.
        """
        prefix = name.lower() + ':'
        n = len(prefix)
        matches = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == prefix:
                hit = 1
            elif line[:1] not in string.whitespace:
                # A different header starts here.
                hit = 0
            if hit:
                matches.append(line)
        return matches

    def getfirstmatchingheader(self, name):
        """Like getallmatchingheaders(), but only the first occurrence.

        Returns the first header line called `name` plus its
        continuation lines, or an empty list.
        """
        prefix = name.lower() + ':'
        n = len(prefix)
        matches = []
        hit = 0
        for line in self.headers:
            if hit:
                # Anything that is not a continuation line ends the header.
                if line[:1] not in string.whitespace:
                    break
            elif line[:n].lower() == prefix:
                hit = 1
            if hit:
                matches.append(line)
        return matches

    def getrawheader(self, name):
        """Return the literal text of the first header called `name`.

        The field name and its colon are stripped; all leading,
        trailing and embedded whitespace (including newlines) is kept.
        Returns None if the header does not occur.
        """
        lines = self.getfirstmatchingheader(name)
        if not lines:
            return None
        # Drop "<name>:" from the front of the first line.
        lines[0] = lines[0][len(name) + 1:]
        return ''.join(lines)

    def getheader(self, name):
        """Like getrawheader(), with leading/trailing whitespace stripped.

        Returns None if the header does not occur.
        """
        text = self.getrawheader(name)
        if text is None:
            return None
        return text.strip()

    def getaddr(self, name):
        """Return header `name` as a (realname, mailaddress) tuple.

        E.g. ('Guido van Rossum', 'guido@cwi.nl').  The value is parsed
        with the module-level parseaddr().  Returns (None, None) if the
        header is missing or empty.
        """
        data = self.getheader(name)
        if not data:
            return None, None
        return parseaddr(data)

    def getaddrlist(self, name):
        """Return header `name` as a list of (realname, mailaddress) tuples.

        Returns an empty list if the header is missing or empty.
        """
        # XXX This function is not really correct.  The split
        # on ',' might fail in the case of commas within
        # quoted strings.
        data = self.getheader(name)
        if not data:
            return []
        return [parseaddr(part) for part in data.split(',')]

    def getdate(self, name):
        """Return header `name` parsed by the module-level parsedate().

        Returns None if the header is missing or empty.
        """
        data = self.getheader(name)
        if not data:
            return None
        return parsedate(data)

    # Access as a dictionary (only finds first header of each type):

    def __len__(self):
        """Return the number of distinct header names (case-insensitive)."""
        return len(self.keys())

    def __getitem__(self, name):
        """Return getheader(name); raise KeyError if the header is absent."""
        value = self.getheader(name)
        if value is None:
            raise KeyError(name)
        return value

    def has_key(self, name):
        """Return true if a header called `name` is present."""
        return self.getheader(name) is not None

    def __contains__(self, name):
        """Support `name in message`; same test as has_key()."""
        return self.has_key(name)

    def keys(self):
        """Return a list with one name per distinct header.

        Names that differ only in case count as one header; the
        spelling returned is that of the last occurrence.
        """
        names = {}
        for line in self.headers:
            if line[:1] in string.whitespace:
                # Continuation line.
                continue
            i = line.find(':')
            if i > 0:
                name = line[:i]
                names[name.lower()] = name
        return list(names.values())

    def values(self):
        """Return the header values in the same order as keys()."""
        return [self[name] for name in self.keys()]

    def items(self):
        """Return a list of (name, value) tuples, one per key in keys()."""
        return [(name, self[name]) for name in self.keys()]
Guido van Rossum01ca3361992-07-13 14:28:59 +0000277
278
279
# Utility functions
# -----------------

# XXX Should fix these to be really conformant.
# XXX The inverses of the parse functions may also be useful.
285
Guido van Rossum01ca3361992-07-13 14:28:59 +0000286
def unquote(str):
    """Remove quotes from a string.

    If `str` is longer than one character and is enclosed in a pair of
    double quotes or in angle brackets, return it without that pair;
    otherwise return it unchanged.
    """
    if len(str) > 1:
        for opener, closer in (('"', '"'), ('<', '>')):
            if str[0] == opener and str[-1] == closer:
                return str[1:-1]
    return str
Guido van Rossumb6775db1994-08-01 11:34:53 +0000296
297
def parseaddr(address):
    """Parse an address into a (name, address) tuple.

    Three forms are recognised:

    1. 'Real Name <user@host>' -- the text inside the angle brackets is
       the address, everything outside them is the name;
    2. 'user@host (Real Name)' -- the text inside the parentheses is
       the name, everything outside them is the address;
    3. anything else -- the whole string is the address and the name is
       the empty string.

    Both results have surrounding whitespace stripped.  This is
    probably not perfect: quoting rules are not honoured.
    """
    address = address.strip()
    # Case 1: part of the address is in <xx@xx> form.
    found = re.search('<.*>', address)
    if found:
        name = address[:found.start()] + address[found.end():]
        address = found.group(0)
    else:
        # Case 2: part of the address is in (comment) form.
        found = re.search(r'\(.*\)', address)
        if found:
            name = found.group(0)
            address = address[:found.start()] + address[found.end():]
        else:
            # Case 3: neither.  Only an address.
            name = ''
    name = name.strip()
    address = address.strip()
    # Peel off the delimiters that identified each part.
    if address and address[0] == '<' and address[-1] == '>':
        address = address[1:-1]
    if name and name[0] == '(' and name[-1] == ')':
        name = name[1:-1]
    return name, address
330
331
# Month abbreviations as they appear in a date field; the position in
# the list (plus one) is the month number.
_monthnames = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul',
               'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

def parsedate(data):
    """Parse a date field into a 9-tuple in time.mktime() layout.

    Accepts text of the form '[Dayname,] DD Mon YYYY HH:MM[:SS] [zone]',
    where the zone may also be glued to the time with a '+' or '-'
    sign.  Returns (year, month, day, hour, minute, second, 0, 1, 0),
    or None if the text cannot be parsed.  The weekday and DST fields
    are not computed, and the day-of-year field is the placeholder 1
    because 0 is outside the range documented for time tuples.
    """
    # XXX This still mostly ignores timezone matters at the moment...
    fields = data.split()
    if not fields:
        return None
    if fields[0][-1] == ',':
        # There's a dayname here.  Skip it.
        del fields[0]
    if len(fields) == 4:
        # No separate zone field: either it is glued onto the time
        # ('14:28:59+0100') or it is missing altogether.
        s = fields[3]
        cut = -1
        for sign in '+-':
            pos = s.find(sign)
            if pos > 0 and (cut < 0 or pos < cut):
                cut = pos
        if cut > 0:
            fields[3:] = [s[:cut], s[cut+1:]]
        else:
            fields.append('')   # Dummy tz
    if len(fields) < 5:
        return None
    dd, mm, yy, tm, tz = fields[:5]
    if mm not in _monthnames:
        return None
    mm = _monthnames.index(mm) + 1
    tm = tm.split(':')
    if len(tm) == 2:
        thh, tmm = tm
        tss = '0'
    elif len(tm) == 3:
        thh, tmm, tss = tm
    else:
        # Neither HH:MM nor HH:MM:SS.
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    return (yy, mm, dd, thh, tmm, tss, 0, 1, 0)
373
374
# When used as script, run a small test program.
# The first command line argument, if given, is the name of a file
# containing one message in RFC-822 format; otherwise a built-in
# default path is used.

if __name__ == '__main__':
    import sys
    file = '/ufs/guido/Mail/drafts/,1'
    if sys.argv[1:]: file = sys.argv[1]
    f = open(file, 'r')
    # Make sure the file is closed again even if parsing fails.
    try:
        m = Message(f)
        print('From: %s' % (m.getaddr('from'),))
        print('To: %s' % (m.getaddrlist('to'),))
        print('Subject: %s' % (m.getheader('subject'),))
        print('Date: %s' % (m.getheader('date'),))
        date = m.getdate('date')
        if date:
            print('ParsedDate: %s' % (time.asctime(date),))
        else:
            print('ParsedDate: %s' % (None,))
        # Count the lines of the body.
        m.rewindbody()
        n = 0
        while f.readline():
            n = n + 1
        print('Lines: %d' % n)
        print('-'*70)
        # Exercise the dictionary-style interface.
        print('len = %d' % len(m))
        if m.has_key('Date'): print('Date = %s' % (m['Date'],))
        if m.has_key('X-Nonsense'): pass
        print('keys = %s' % (m.keys(),))
        print('values = %s' % (m.values(),))
        print('items = %s' % (m.items(),))
    finally:
        f.close()
406