blob: 2f131d6b3a92ac43b2b263cabaf964f6e5d65b88 [file] [log] [blame]
Barry Warsaw15e9dc92002-01-27 06:48:02 +00001# Copyright (C) 2001,2002 Python Software Foundation
Barry Warsawba925802001-09-23 03:17:28 +00002# Author: barry@zope.com (Barry Warsaw)
3
4"""A parser of RFC 2822 and MIME email messages.
5"""
6
Barry Warsawba925802001-09-23 03:17:28 +00007from cStringIO import StringIO
Barry Warsaw15e9dc92002-01-27 06:48:02 +00008from types import ListType
Barry Warsawba925802001-09-23 03:17:28 +00009
10# Intrapackage imports
11import Errors
12import Message
13
# Shared string constants.  NL is the separator used when the folded
# (continuation) lines of a header are joined back into a single value;
# EMPTYSTRING is not referenced by the code visible in this part of the file.
EMPTYSTRING = ''
NL = '\n'
16
17
Barry Warsawe968ead2001-10-04 17:05:11 +000018
class Parser:
    """Parser of RFC 2822 and MIME email messages.

    parse() and parsestr() build a tree of message objects (instances of the
    class given to the constructor) from a file-like object or a string.
    """

    def __init__(self, _class=Message.Message):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.
        """
        self._class = _class

    def parse(self, fp):
        """Parse a message from fp and return the root message object.

        fp is a file-like object; readline() is used to consume the header
        block and read() to consume the body.  The root object is a fresh
        instance of the class passed to the constructor.

        Raises Errors.HeaderParseError for a malformed header block and
        Errors.BoundaryError when a multipart body lacks its starting or
        terminating boundary.
        """
        root = self._class()
        self._parseheaders(root, fp)
        self._parsebody(root, fp)
        return root

    def parsestr(self, text):
        """Parse a message held in the string text; see parse()."""
        return self.parse(StringIO(text))

    def _parseheaders(self, container, fp):
        """Read the header block from fp and store it on container.

        Lines are consumed up to and including the first blank (or
        whitespace-only) line, or up to the end of input.  Each header is
        stored with container[name] = value; continuation lines are kept as
        part of the value, joined with newlines.  A leading Unix From_ line
        is stored with container.set_unixfrom().

        Raises Errors.HeaderParseError when a From_ line appears after the
        first line, when a continuation line precedes any header, or when a
        line is neither a header nor a continuation.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        while 1:
            line = fp.readline()
            # Strip the line terminator only if there is one.  The last line
            # of the input (or of a MIME subpart) may lack a trailing newline,
            # and blindly chopping one character would corrupt its value.
            if line.endswith('\n'):
                line = line[:-1]
            # End of input, or the blank line that terminates the headers
            if not line.strip():
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                else:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
            #
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                raise Errors.HeaderParseError(
                    'Not a header, not a continuation')
            # A new header begins, so the previous one is now complete
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)

    def _parsebody(self, container, fp):
        """Read the rest of fp and attach it to container as its payload.

        How the body is handled depends on the container's headers:

        - if a boundary is present, the body is split on it and every subpart
          is parsed recursively; the text before the first boundary and after
          the terminating boundary is stored in container.preamble and
          container.epilogue (None when absent)
        - message/delivery-status: the body is parsed as a sequence of header
          blocks, each stored as its own message object
        - any other message/* type: the body is parsed as a nested message
        - anything else: the body is stored as a string

        Raises Errors.BoundaryError when a boundary is declared but the
        starting or terminating boundary line cannot be found.
        """
        # Parse the body, but first split the payload on the content-type
        # boundary if present.
        boundary = container.get_boundary()
        isdigest = (container.get_type() == 'multipart/digest')
        # If there's a boundary, split the payload text into its constituent
        # parts and parse each separately.  Otherwise, just parse the rest of
        # the body as a single message.  Note: any exceptions raised in the
        # recursive parse need to have their line numbers coerced.
        if boundary:
            preamble = epilogue = None
            # Split into subparts.  The first boundary we're looking for won't
            # have the leading newline since we're at the start of the body
            # text.
            separator = '--' + boundary
            payload = fp.read()
            start = payload.find(separator)
            if start < 0:
                raise Errors.BoundaryError(
                    "Couldn't find starting boundary: %s" % boundary)
            if start > 0:
                # there's some pre-MIME boundary preamble
                preamble = payload[0:start]
            # Skip past the boundary line and its newline; in a digest also
            # skip the extra newline that precedes each subpart.
            start += len(separator) + 1 + isdigest
            terminator = payload.find('\n' + separator + '--', start)
            if terminator < 0:
                raise Errors.BoundaryError(
                    "Couldn't find terminating boundary: %s" % boundary)
            # The terminating boundary is '\n' + separator + '--', i.e.
            # len(separator) + 3 characters long.
            if terminator+len(separator)+3 < len(payload):
                # there's some post-MIME boundary epilogue
                epilogue = payload[terminator+len(separator)+3:]
            # We split the textual payload on the boundary separator, which
            # includes the trailing newline.  If the container is a
            # multipart/digest then the subparts are by default message/rfc822
            # instead of text/plain.  In that case, they'll have an extra
            # newline before the headers to distinguish the message's headers
            # from the subpart headers.
            if isdigest:
                separator += '\n\n'
            else:
                separator += '\n'
            parts = payload[start:terminator].split('\n' + separator)
            # These do not depend on the individual subparts, so set them once
            container.preamble = preamble
            container.epilogue = epilogue
            for part in parts:
                msgobj = self.parsestr(part)
                # Ensure that the container's payload is a list
                if not isinstance(container.get_payload(), ListType):
                    container.set_payload([msgobj])
                else:
                    container.add_payload(msgobj)
        elif container.get_type() == 'message/delivery-status':
            # This special kind of type contains blocks of headers separated
            # by a blank line.  We'll represent each header block as a
            # separate Message object
            blocks = []
            while 1:
                blockmsg = self._class()
                self._parseheaders(blockmsg, fp)
                if not len(blockmsg):
                    # No more header blocks left
                    break
                blocks.append(blockmsg)
            container.set_payload(blocks)
        elif container.get_main_type() == 'message':
            # Create a container for the payload, but watch out for there not
            # being any headers left
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                msg = self._class()
                self._parsebody(msg, fp)
            container.add_payload(msg)
        else:
            container.add_payload(fp.read())
Barry Warsawe5528822001-10-11 15:43:00 +0000166
167
168
class HeaderParser(Parser):
    """A Parser variant for callers who only care about message headers.

    The headers are parsed exactly as the base class parses them.  The body
    is still read from the input, but no attempt is made to interpret it:
    everything after the header block becomes a single string payload.

    Because no body parsing is done, this can be considerably faster than
    the full Parser when only the headers are of interest.
    """
    def _parsebody(self, container, fp):
        # Read whatever follows the headers and store it verbatim, unparsed
        unparsed = fp.read()
        container.set_payload(unparsed)