blob: cc23d1984ac71c64d51464c8509235f851761665 [file] [log] [blame]
Barry Warsawba925802001-09-23 03:17:28 +00001# Copyright (C) 2001 Python Software Foundation
2# Author: barry@zope.com (Barry Warsaw)
3
4"""A parser of RFC 2822 and MIME email messages.
5"""
6
7import re
8from cStringIO import StringIO
9
10# Intrapackage imports
11import Errors
12import Message
13
# Matches the boundary parameter of a Content-Type header value; group 1 is
# the boundary string with any surrounding double quotes removed.  The
# capture excludes `;' and may not end in whitespace, so an unquoted boundary
# followed by further parameters (e.g. `boundary=XYZ; charset=us-ascii')
# yields just `XYZ' rather than swallowing the rest of the header.
bcre = re.compile(r'boundary="?([^";]*[^";\s])"?', re.IGNORECASE)
EMPTYSTRING = ''
NL = '\n'
17
18
19
class Parser:
    """Parser of RFC 2822 and MIME email messages.

    parse() reads a message from a file-like object and parsestr() reads one
    from a string; both return a tree of message objects.
    """

    def __init__(self, _class=Message.Message):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.
        """
        self._class = _class

    def parse(self, fp):
        """Parse a message from the file-like object fp.

        A new _class instance is created, the header block is read from fp
        into it, and the remainder of fp becomes its body.  Returns that
        instance.
        """
        root = self._class()
        self._parseheaders(root, fp)
        self._parsebody(root, fp)
        return root

    def parsestr(self, text):
        """Parse a message held in the string text; see parse()."""
        return self.parse(StringIO(text))

    def _parseheaders(self, container, fp):
        """Read the header block from fp and store the headers on container.

        Reading stops at end of input or at the first empty or
        whitespace-only line, which is consumed.  A `From ' line is passed
        to container.set_unixfrom() if it is the first line read.  Every
        other header is stored with container[name] = value, where folded
        continuation lines are joined to the value with newlines.  Nothing
        is returned.

        Raises Errors.HeaderParseError when a `From ' line appears after the
        first line, when a continuation line precedes any header, or when a
        line is neither a continuation nor contains a colon.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        while 1:
            line = fp.readline()
            # Strip only an actual line terminator.  Blindly dropping the
            # last character would truncate a final header line that is not
            # newline terminated.
            if line.endswith('\n'):
                line = line[:-1]
            # End of input, or the blank line that ends the header block
            if not line.strip():
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                else:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
            #
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                raise Errors.HeaderParseError(
                    'Not a header, not a continuation')
            # A new header starts, so the previous one is complete
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)

    def _parsebody(self, container, fp):
        """Read the rest of fp and attach it to container as its payload.

        If the Content-Type header carries a boundary parameter, the body is
        split on that boundary, each part is parsed as a message of its own
        and added with container.add_payload(); text before the first
        boundary and after the closing boundary is stored in
        container.preamble and container.epilogue (None when absent).  If the
        content type is message/rfc822, the body is parsed as a nested
        message.  Otherwise the remaining text is added as a string.

        Raises Errors.BoundaryError when the starting or the terminating
        boundary cannot be found in the body.
        """
        # Parse the body, but first split the payload on the content-type
        # boundary if present.
        boundary = isdigest = None
        ctype = container['content-type']
        if ctype:
            mo = bcre.search(ctype)
            if mo:
                boundary = mo.group(1)
                isdigest = container.get_type() == 'multipart/digest'
        # If there's a boundary, split the payload text into its constituent
        # parts and parse each separately.  Otherwise, just parse the rest of
        # the body as a single message.  Note: any exceptions raised in the
        # recursive parse need to have their line numbers coerced.
        if boundary:
            preamble = epilogue = None
            # Split into subparts.  The first boundary we're looking for won't
            # have the leading newline since we're at the start of the body
            # text.
            separator = '--' + boundary
            payload = fp.read()
            start = payload.find(separator)
            if start < 0:
                raise Errors.BoundaryError(
                    "Couldn't find starting boundary: %s" % boundary)
            if start > 0:
                # there's some pre-MIME boundary preamble
                preamble = payload[0:start]
            # Step over the boundary line and its newline; a digest subpart
            # has one more newline before its headers.
            start += len(separator) + 1
            if isdigest:
                start += 1
            terminator = payload.find('\n' + separator + '--', start)
            if terminator < 0:
                raise Errors.BoundaryError(
                    "Couldn't find terminating boundary: %s" % boundary)
            # The closing delimiter is a newline, the separator and `--'
            end = terminator + len(separator) + 3
            if end < len(payload):
                # there's some post-MIME boundary epilogue
                epilogue = payload[end:]
            # We split the textual payload on the boundary separator, which
            # includes the trailing newline.  If the container is a
            # multipart/digest then the subparts are by default message/rfc822
            # instead of text/plain.  In that case, they'll have an extra
            # newline before the headers to distinguish the message's headers
            # from the subpart headers.
            if isdigest:
                separator += '\n\n'
            else:
                separator += '\n'
            # These do not depend on the individual part, so set them once
            container.preamble = preamble
            container.epilogue = epilogue
            for part in payload[start:terminator].split('\n' + separator):
                container.add_payload(self.parsestr(part))
        elif ctype and container.get_type() == 'message/rfc822':
            # Compare the parsed type rather than the raw header value, so
            # that parameters or different capitalization do not hide a
            # message/rfc822 part.  NOTE(review): assumes get_type() returns
            # the bare lower-cased type, as its use for multipart/digest
            # above suggests -- confirm against Message.
            #
            # Create a container for the payload, but watch out for there not
            # being any headers left
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                msg = self._class()
                self._parsebody(msg, fp)
            container.add_payload(msg)
        else:
            container.add_payload(fp.read())