blob: 44a0ca2bdd61a90877480fbf4e153e7971857c41 [file] [log] [blame]
Barry Warsawba925802001-09-23 03:17:28 +00001# Copyright (C) 2001 Python Software Foundation
2# Author: barry@zope.com (Barry Warsaw)
3
4"""A parser of RFC 2822 and MIME email messages.
5"""
6
Barry Warsawba925802001-09-23 03:17:28 +00007from cStringIO import StringIO
8
9# Intrapackage imports
10import Errors
11import Message
12
# Module-level string constants.  NL is the separator used to join the first
# line of a header value with its continuation lines.
EMPTYSTRING = ''
NL = '\n'
15
16
Barry Warsawe968ead2001-10-04 17:05:11 +000017
class Parser:
    """Build a tree of message objects from RFC 2822 / MIME message text."""

    def __init__(self, _class=Message.Message):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.
        """
        self._class = _class

    def parse(self, fp):
        """Parse a message read from the file-like object fp.

        A new instance of the configured message class is created, its
        headers and then its body are read from fp, and the instance is
        returned.
        """
        root = self._class()
        self._parseheaders(root, fp)
        self._parsebody(root, fp)
        return root

    def parsestr(self, text):
        """Parse a message held in the string text and return the result."""
        return self.parse(StringIO(text))

    def _parseheaders(self, container, fp):
        """Read the header block from fp and store each header on container.

        Reading stops at end of input or at the first empty or
        whitespace-only line, which is consumed.  A `From ' line on the first
        line is stored with container.set_unixfrom(); every other header is
        stored with container[name] = value, where the value is the first
        line's text after the colon joined by newlines to any continuation
        lines.

        Raises Errors.HeaderParseError when a `From ' line appears after the
        first line, when a continuation line appears before any header, or
        when a line is neither a header nor a continuation.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        while 1:
            line = fp.readline()
            # Drop the line terminator only when there is one.  The last line
            # of the input may not end in a newline, and unconditionally
            # slicing off the final character would truncate its value.
            if line.endswith('\n'):
                line = line[:-1]
            # End of input, an empty line, or a whitespace-only line all end
            # the header block.
            if not line.strip():
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                else:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
            #
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                raise Errors.HeaderParseError(
                    'Not a header, not a continuation')
            # A new header starts here, so the previous one is complete and
            # can be stored.
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)

    def _parsebody(self, container, fp):
        """Read the remainder of fp and attach it to container as payload.

        How the body is handled depends on the container's content type:

        - If container.get_boundary() returns a true value, the body is split
          on that boundary and each part is parsed recursively and added with
          add_payload().  Text before the first boundary and after the
          terminating boundary is stored in container.preamble and
          container.epilogue (None when absent).
        - message/delivery-status bodies are read as consecutive header
          blocks; a list with one message object per block is stored with
          set_payload().
        - Any other message/* body is parsed as a nested message.
        - Everything else is added unparsed, as a string.

        Raises Errors.BoundaryError when a boundary is declared but the
        starting or terminating boundary line cannot be found.
        """
        # Parse the body, but first split the payload on the content-type
        # boundary if present.
        boundary = container.get_boundary()
        isdigest = (container.get_type() == 'multipart/digest')
        # If there's a boundary, split the payload text into its constituent
        # parts and parse each separately.  Otherwise, just parse the rest of
        # the body as a single message.  Note: any exceptions raised in the
        # recursive parse need to have their line numbers coerced.
        if boundary:
            preamble = epilogue = None
            # Split into subparts.  The first boundary we're looking for won't
            # have the leading newline since we're at the start of the body
            # text.
            separator = '--' + boundary
            payload = fp.read()
            start = payload.find(separator)
            if start < 0:
                raise Errors.BoundaryError(
                    "Couldn't find starting boundary: %s" % boundary)
            if start > 0:
                # there's some pre-MIME boundary preamble
                preamble = payload[0:start]
            # Skip the boundary line and its newline; isdigest is used as a
            # number here, so a digest skips one more character (the extra
            # blank line described below).
            start += len(separator) + 1 + isdigest
            terminator = payload.find('\n' + separator + '--', start)
            if terminator < 0:
                raise Errors.BoundaryError(
                    "Couldn't find terminating boundary: %s" % boundary)
            # The terminating line is '\n' + separator + '--', which is
            # len(separator) + 3 characters long.
            if terminator+len(separator)+3 < len(payload):
                # there's some post-MIME boundary epilogue
                epilogue = payload[terminator+len(separator)+3:]
            # We split the textual payload on the boundary separator, which
            # includes the trailing newline.  If the container is a
            # multipart/digest then the subparts are by default message/rfc822
            # instead of text/plain.  In that case, they'll have an extra
            # newline before the headers to distinguish the message's headers
            # from the subpart headers.
            if isdigest:
                separator += '\n\n'
            else:
                separator += '\n'
            parts = payload[start:terminator].split('\n' + separator)
            # These do not depend on the individual part, so set them once
            # instead of on every pass through the loop.
            container.preamble = preamble
            container.epilogue = epilogue
            for part in parts:
                msgobj = self.parsestr(part)
                container.add_payload(msgobj)
        elif container.get_type() == 'message/delivery-status':
            # This special kind of type contains blocks of headers separated
            # by a blank line.  We'll represent each header block as a
            # separate Message object
            blocks = []
            while 1:
                blockmsg = self._class()
                self._parseheaders(blockmsg, fp)
                if not len(blockmsg):
                    # No more header blocks left
                    break
                blocks.append(blockmsg)
            container.set_payload(blocks)
        elif container.get_main_type() == 'message':
            # Create a container for the payload, but watch out for there not
            # being any headers left
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                # Fall back to a header-less message whose body is whatever
                # is still unread in fp at this point.
                msg = self._class()
                self._parsebody(msg, fp)
            container.add_payload(msg)
        else:
            container.add_payload(fp.read())
Barry Warsawe5528822001-10-11 15:43:00 +0000161
162
163
class HeaderParser(Parser):
    """Parser variant for callers that only care about message headers.

    Headers are parsed exactly as the base class parses them.  The body is
    still read from the input, but it is never split or parsed; it is stored
    on the message as one string payload.

    Skipping the body parse can make this class considerably faster than
    Parser when only the headers are of interest.
    """
    def _parsebody(self, container, fp):
        # Read whatever is left of the input and keep it verbatim, without
        # parsing it.
        remainder = fp.read()
        container.set_payload(remainder)