blob: cb994bad1bb05231b1f2ca2b5360c4d23bf077ff [file] [log] [blame]
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)

"""A parser of RFC 2822 and MIME email messages.
"""
6
Barry Warsaw7e21b672002-05-19 23:51:50 +00007import re
Barry Warsawba925802001-09-23 03:17:28 +00008from cStringIO import StringIO
Barry Warsaw15e9dc92002-01-27 06:48:02 +00009from types import ListType
Barry Warsawba925802001-09-23 03:17:28 +000010
11# Intrapackage imports
12import Errors
13import Message
14
# Shared string constants.  NL is used to glue a header's continuation lines
# back together when the header value is stored.
EMPTYSTRING = ''
NL = '\n'
17
18
Barry Warsawe968ead2001-10-04 17:05:11 +000019
class Parser:
    """Build a message object tree from RFC 2822 / MIME message text."""

    def __init__(self, _class=Message.Message):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.
        """
        self._class = _class

    def parse(self, fp):
        """Parse a message read from the file-like object fp.

        A new instance of the configured _class is created, the header block
        is read into it, and the remainder of fp is then parsed as the body.
        Returns the new root message object.
        """
        root = self._class()
        self._parseheaders(root, fp)
        self._parsebody(root, fp)
        return root

    def parsestr(self, text):
        """Parse a message held in the string text; returns what parse() does."""
        return self.parse(StringIO(text))

    def _parseheaders(self, container, fp):
        # Read the header block from fp and store each header on container
        # via item assignment.  Nothing is returned.  A leading Unix-From
        # line is not stored as a header; it is handed to
        # container.set_unixfrom() instead.  Reading stops at end of input or
        # at the first empty line.  Raises Errors.HeaderParseError on a
        # malformed header block.
        lastheader = ''
        lastvalue = []
        lineno = 0
        while 1:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                break
            # Ignore the trailing newline
            line = line.splitlines()[0]
            if not line:
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                else:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                raise Errors.HeaderParseError(
                    'Not a header, not a continuation')
            # A new header starts here, so the previous one is now complete
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)

    def _splitmultipart(self, payload, boundary, isdigest):
        # Split a multipart body into (preamble, subpart texts, epilogue).
        #
        # preamble and epilogue are None when there is no text before the
        # first boundary or after the terminating boundary, respectively.
        # Raises Errors.BoundaryError if either the starting or the
        # terminating boundary cannot be found in payload.
        preamble = epilogue = None
        # The first boundary we're looking for won't have the leading newline
        # since we're at the start of the body text.
        separator = '--' + boundary
        start = payload.find(separator)
        if start < 0:
            raise Errors.BoundaryError(
                "Couldn't find starting boundary: %s" % boundary)
        if start > 0:
            # there's some pre-MIME boundary preamble
            preamble = payload[0:start]
        # Step over the opening boundary, then over the line ending that
        # follows it.  The length of that line ending is taken from the first
        # one found at or after this position; it is counted twice for a
        # digest (see the note on digests below).
        start += len(separator)
        cre = re.compile('\r\n|\r|\n')
        mo = cre.search(payload, start)
        if mo:
            start += len(mo.group(0)) * (1 + isdigest)
        # We create a compiled regexp first because we need to be able to
        # specify the start position, and the module function doesn't
        # support this signature. :(
        cre = re.compile('(?P<sep>\r\n|\r|\n)' +
                         re.escape(separator) + '--')
        mo = cre.search(payload, start)
        if not mo:
            raise Errors.BoundaryError(
                "Couldn't find terminating boundary: %s" % boundary)
        terminator = mo.start()
        # The line ending seen in front of the terminating boundary is the
        # one used to split the subparts apart.
        linesep = mo.group('sep')
        if mo.end() < len(payload):
            # there's some post-MIME boundary epilogue
            epilogue = payload[mo.end():]
        # We split the textual payload on the boundary separator, which
        # includes the trailing newline.  If the container is a
        # multipart/digest then the subparts are by default message/rfc822
        # instead of text/plain.  In that case, they'll have an extra
        # newline before the headers to distinguish the message's headers
        # from the subpart headers.
        separator += linesep * (1 + isdigest)
        parts = payload[start:terminator].split(linesep + separator)
        return preamble, parts, epilogue

    def _parsebody(self, container, fp):
        # Parse the rest of fp as the body of container.  How the body is
        # handled depends on the headers already stored on container:
        #
        # - a boundary is defined: the body is split on it and each piece is
        #   parsed as its own message and attached to container
        # - main type multipart without a boundary: Errors.BoundaryError
        # - message/delivery-status: payload is a list of header-only
        #   message objects, one per header block
        # - any other message/* type: payload is a single parsed message
        # - everything else: payload is the remaining text, unparsed
        boundary = container.get_boundary()
        isdigest = (container.get_type() == 'multipart/digest')
        # Note: any exceptions raised in the recursive parse need to have
        # their line numbers coerced.
        if boundary:
            preamble, parts, epilogue = self._splitmultipart(
                fp.read(), boundary, isdigest)
            # These are the same for every subpart, so set them once
            container.preamble = preamble
            container.epilogue = epilogue
            for part in parts:
                msgobj = self.parsestr(part)
                container.attach(msgobj)
        elif container.get_main_type() == 'multipart':
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        elif container.get_type() == 'message/delivery-status':
            # This special kind of type contains blocks of headers separated
            # by a blank line.  We'll represent each header block as a
            # separate Message object
            blocks = []
            while 1:
                blockmsg = self._class()
                self._parseheaders(blockmsg, fp)
                if not len(blockmsg):
                    # No more header blocks left
                    break
                blocks.append(blockmsg)
            container.set_payload(blocks)
        elif container.get_main_type() == 'message':
            # Create a container for the payload, but watch out for there not
            # being any headers left
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                # Fall back to a header-less message holding what remains
                msg = self._class()
                self._parsebody(msg, fp)
            container.set_payload(msg)
        else:
            container.set_payload(fp.read())
Barry Warsawe5528822001-10-11 15:43:00 +0000182
183
184
class HeaderParser(Parser):
    """A subclass of Parser, this one only meaningfully parses message headers.

    This class can be used if all you're interested in is the headers of a
    message.  While it consumes the message body, it does not parse it, but
    simply makes it available as a string payload.

    Parsing with this subclass can be considerably faster if all you're
    interested in is the message headers.
    """
    def _parsebody(self, container, fp):
        # Consume, but do not parse, the body: whatever is left in fp becomes
        # the payload of container as-is.
        container.set_payload(fp.read())
197 container.set_payload(fp.read())