blob: 9add4425d5ce5c417e5bd8f999cdac9a1e434375 [file] [log] [blame]
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)

"""A parser of RFC 2822 and MIME email messages.
"""
6
Barry Warsaw7e21b672002-05-19 23:51:50 +00007import re
Barry Warsawba925802001-09-23 03:17:28 +00008from cStringIO import StringIO
Barry Warsaw15e9dc92002-01-27 06:48:02 +00009from types import ListType
Barry Warsawba925802001-09-23 03:17:28 +000010
Barry Warsaw69e18af2002-06-02 19:12:03 +000011from email import Errors
12from email import Message
Barry Warsawba925802001-09-23 03:17:28 +000013
# Module-level string constants.  NL is the separator used to join a
# header's first line with its continuation lines when the header is stored.
EMPTYSTRING = ''
NL = '\n'
16
17
Barry Warsawe968ead2001-10-04 17:05:11 +000018
class Parser:
    """Builds a tree of message objects from RFC 2822 / MIME message text."""

    def __init__(self, _class=Message.Message):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.
        """
        self._class = _class

    def parse(self, fp):
        """Parse a message from the file-like object fp and return it.

        fp must provide readline() (used for the header block) and read()
        (used for the body).  The returned object is a fresh instance of the
        class given to the constructor.
        """
        root = self._class()
        self._parseheaders(root, fp)
        self._parsebody(root, fp)
        return root

    def parsestr(self, text):
        """Parse a message held in the string text; see parse()."""
        return self.parse(StringIO(text))

    def _parseheaders(self, container, fp):
        """Read one header block from fp and store its headers on container.

        Lines are read up to and including the first blank line, or to the
        end of input.  Each header is stored with container[name] = value; a
        header's continuation lines are kept and joined to its first line
        with NL.  A `From ' line, if it is the very first line, is recorded
        with container.set_unixfrom() rather than stored as a header.
        Nothing is returned.

        Raises Errors.HeaderParseError when a `From ' line appears after the
        first line, when a continuation line precedes any header, or when a
        line is neither a header nor a continuation.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        while 1:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                # End of input
                break
            # Ignore the trailing newline
            line = line.splitlines()[0]
            if not line:
                # Blank line: end of this header block
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                else:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                raise Errors.HeaderParseError(
                    'Not a header, not a continuation')
            # A new header starts here, so the previous one is now complete
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)

    def _parsebody(self, container, fp):
        """Read the rest of fp and store it as container's payload.

        What is stored depends on what container reports about itself:

        - a boundary is defined: the text is split on that boundary and each
          piece is parsed and attached as a subpart (see _parsemultipart);
        - main type `multipart' without a boundary: Errors.BoundaryError;
        - type `message/delivery-status': the payload is a list of message
          objects, one per header block;
        - main type `message': the remaining text is parsed as a message
          and attached;
        - anything else: the remaining text is stored as a string payload.
        """
        boundary = container.get_boundary()
        if boundary:
            # Note from the original author: any exceptions raised in the
            # recursive parse need to have their line numbers coerced.
            self._parsemultipart(container, fp, boundary)
        elif container.get_main_type() == 'multipart':
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        elif container.get_type() == 'message/delivery-status':
            # This special kind of type contains blocks of headers separated
            # by a blank line.  We'll represent each header block as a
            # separate Message object
            blocks = []
            while 1:
                blockmsg = self._class()
                self._parseheaders(blockmsg, fp)
                if not len(blockmsg):
                    # No more header blocks left
                    break
                blocks.append(blockmsg)
            container.set_payload(blocks)
        elif container.get_main_type() == 'message':
            # Create a container for the payload, but watch out for there not
            # being any headers left
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                msg = self._class()
                self._parsebody(msg, fp)
            container.attach(msg)
        else:
            container.set_payload(fp.read())

    def _parsemultipart(self, container, fp, boundary):
        """Split the rest of fp on boundary and attach each parsed subpart.

        Text before the first boundary line is stored as container.preamble
        and text after the terminating boundary as container.epilogue; each
        is None when there is no such text.

        Raises Errors.BoundaryError when either the starting or the
        terminating boundary cannot be found.
        """
        isdigest = (container.get_type() == 'multipart/digest')
        preamble = epilogue = None
        # Split into subparts.  The first boundary we're looking for won't
        # have the leading newline since we're at the start of the body
        # text.
        separator = '--' + boundary
        payload = fp.read()
        start = payload.find(separator)
        if start < 0:
            raise Errors.BoundaryError(
                "Couldn't find starting boundary: %s" % boundary)
        if start > 0:
            # there's some pre-MIME boundary preamble
            preamble = payload[0:start]
        # Find out what kind of line endings we're using, and skip past the
        # one (two for a digest) that follows the first boundary
        start += len(separator)
        cre = re.compile('\r\n|\r|\n')
        mo = cre.search(payload, start)
        if mo:
            start += len(mo.group(0)) * (1 + isdigest)
        # We create a compiled regexp first because we need to be able to
        # specify the start position, and the module function doesn't
        # support this signature. :(
        cre = re.compile('(?P<sep>\r\n|\r|\n)' +
                         re.escape(separator) + '--')
        mo = cre.search(payload, start)
        if not mo:
            raise Errors.BoundaryError(
                "Couldn't find terminating boundary: %s" % boundary)
        terminator = mo.start()
        linesep = mo.group('sep')
        if mo.end() < len(payload):
            # there's some post-MIME boundary epilogue
            epilogue = payload[mo.end():]
        # We split the textual payload on the boundary separator, which
        # includes the trailing newline.  If the container is a
        # multipart/digest then the subparts are by default message/rfc822
        # instead of text/plain.  In that case, they'll have an extra
        # newline before the headers to distinguish the message's headers
        # from the subpart headers.
        separator += linesep * (1 + isdigest)
        parts = payload[start:terminator].split(linesep + separator)
        # These do not depend on the individual part, so set them once
        # instead of on every pass through the loop.
        container.preamble = preamble
        container.epilogue = epilogue
        for part in parts:
            container.attach(self.parsestr(part))
Barry Warsawe5528822001-10-11 15:43:00 +0000181
182
183
class HeaderParser(Parser):
    """A subclass of Parser, this one only meaningfully parses message headers.

    This class can be used if all you're interested in is the headers of a
    message.  While it consumes the message body, it does not parse it, but
    simply makes it available as a string payload.

    Parsing with this subclass can be considerably faster if all you're
    interested in is the message headers.
    """
    def _parsebody(self, container, fp):
        """Store everything left in fp on container as one unparsed payload."""
        # Consume but do not parse, the body
        container.set_payload(fp.read())