Barry Warsaw | 15e9dc9 | 2002-01-27 06:48:02 +0000 | [diff] [blame] | 1 | # Copyright (C) 2001,2002 Python Software Foundation |
Barry Warsaw | ba92580 | 2001-09-23 03:17:28 +0000 | [diff] [blame] | 2 | # Author: barry@zope.com (Barry Warsaw) |
| 3 | |
| 4 | """A parser of RFC 2822 and MIME email messages. |
| 5 | """ |
| 6 | |
Barry Warsaw | 7e21b67 | 2002-05-19 23:51:50 +0000 | [diff] [blame] | 7 | import re |
Barry Warsaw | ba92580 | 2001-09-23 03:17:28 +0000 | [diff] [blame] | 8 | from cStringIO import StringIO |
Barry Warsaw | 15e9dc9 | 2002-01-27 06:48:02 +0000 | [diff] [blame] | 9 | from types import ListType |
Barry Warsaw | ba92580 | 2001-09-23 03:17:28 +0000 | [diff] [blame] | 10 | |
Barry Warsaw | 69e18af | 2002-06-02 19:12:03 +0000 | [diff] [blame] | 11 | from email import Errors |
| 12 | from email import Message |
Barry Warsaw | ba92580 | 2001-09-23 03:17:28 +0000 | [diff] [blame] | 13 | |
# Named string constants used as join separators elsewhere in this module.
EMPTYSTRING = ''
NL = '\n'

# Fall back to integer stand-ins when the running interpreter does not
# provide the True and False names.
try:
    True, False
except NameError:
    True = 1
    False = 0

# Matches a single line terminator: CRLF, a lone CR, or a lone LF.
NLCRE = re.compile('\r\n|\r|\n')
Barry Warsaw | 487fe6a | 2002-10-07 17:27:35 +0000 | [diff] [blame] | 24 | |
class TextUtil:
    """Wrap a file object, adding line push-back and pattern scanning.

    The wrapped object only needs readline() and read() methods.  Lines
    given to unreadline() are kept on a stack and are handed out again,
    most recently pushed first, before any more data is taken from the
    wrapped file.
    """

    # The forms a line takes when it holds nothing but its terminator.
    _BLANK_LINES = ('\r\n', '\r', '\n')

    def __init__(self, fp):
        self.fp = fp
        # Stack of pushed-back lines; the last element is returned next.
        self.unread = []

    def readline(self):
        """Return the next line of data, or '' at end of file.

        If data has been pushed back with unreadline(), the most recently
        pushed line is returned; otherwise the line is read from the
        wrapped file.
        """
        if self.unread:
            return self.unread.pop()
        return self.fp.readline()

    def unreadline(self, line):
        """Push a line back so that the next readline() returns it."""
        self.unread.append(line)

    def peekline(self):
        """Return the next line without consuming it."""
        line = self.readline()
        self.unreadline(line)
        return line

    def read(self):
        """Return all remaining data and clear the push-back stack.

        Pushed-back lines come first, in the same order successive
        readline() calls would have produced them, followed by whatever is
        left in the wrapped file.  The lines are concatenated unchanged:
        they are expected to carry their own terminators, so no separator
        is inserted between them.
        """
        rest = self.fp.read()
        if self.unread:
            pending = self.unread
            self.unread = []
            # The stack is popped from its end, so reading order is the
            # reverse of storage order.
            pending.reverse()
            rest = ''.join(pending) + rest
        return rest

    def readuntil(self, re, afterblank=0, includematch=0):
        """Read a line at a time until one matches the given RE.

        re is a compiled pattern object (the parameter name hides the re
        module inside this method); it is applied with match(), i.e.
        anchored at the start of each line.

        Returns a 2-tuple of the text read before the matching line (plus
        the matching line itself if includematch is true) and the RE match
        object.  If afterblank is true, a line only counts as a match when
        the line immediately before it was blank.  Every non-matching line,
        blank ones included, is kept in the returned text.  The matching
        line is consumed, so the next read starts on the line after it.

        If end of file is reached first, the text read so far is returned
        with None in place of the match object.
        """
        collected = []
        prev_blank = False
        while True:
            line = self.readline()
            if not line:
                # end of file
                return ''.join(collected), None
            found = re.match(line)
            # Decide using the state of the *previous* line, before that
            # state is updated for the current one.
            if found and (prev_blank or not afterblank):
                if includematch:
                    collected.append(line)
                return ''.join(collected), found
            collected.append(line)
            prev_blank = line in self._BLANK_LINES
Barry Warsaw | e03e8f0 | 2002-09-28 20:44:58 +0000 | [diff] [blame] | 94 | |
Barry Warsaw | e968ead | 2001-10-04 17:05:11 +0000 | [diff] [blame] | 95 | |
class Parser:
    """Build a tree of message objects from RFC 2822 / MIME text."""

    def __init__(self, _class=Message.Message, strict=False):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.

        Optional strict tells the parser to be strictly RFC compliant or to be
        more forgiving in parsing of ill-formatted MIME documents.  When
        non-strict mode is used, the parser will try to make up for missing or
        erroneous boundaries and other peculiarities seen in the wild.
        Default is non-strict parsing.
        """
        self._class = _class
        self._strict = strict

    def parse(self, fp, headersonly=False):
        """Create a message structure from the data in a file.

        Reads all the data from the file and returns the root of the message
        structure.  Optional headersonly is a flag specifying whether to stop
        parsing after reading the headers or not.  The default is False,
        meaning it parses the entire contents of the file.
        """
        root = self._class()
        fp = TextUtil(fp)
        self._parseheaders(root, fp)
        if not headersonly:
            obj = self._parsemessage(root, fp)
            trailer = fp.read()
            # Compare against None explicitly: the standard Message class
            # reports its header count through __len__, so a message without
            # headers is false in a boolean test and would lose its body.
            if obj is not None and trailer:
                self._attach_trailer(obj, trailer)
        return root

    def parsestr(self, text, headersonly=False):
        """Create a message structure from a string.

        Returns the root of the message structure.  Optional headersonly is a
        flag specifying whether to stop parsing after reading the headers or
        not.  The default is False, meaning it parses the entire contents of
        the file.
        """
        return self.parse(StringIO(text), headersonly=headersonly)

    def _parseheaders(self, container, fp):
        """Read a header block from fp and store the headers on container.

        Reading stops at end of file, at a blank line (which is consumed),
        or, in non-strict mode, at the first line that is neither a header
        nor a continuation; that line is pushed back onto fp untouched so it
        becomes the first body line.  An initial Unix-From line is recorded
        with container.set_unixfrom().

        Raises Errors.HeaderParseError for a continuation line before any
        header and, in strict mode, for a misplaced Unix-From line or a line
        that is not a header.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        while True:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            rawline = fp.readline()
            if not rawline:
                break
            # Work on the line without its trailing newline, but keep the
            # raw form in case it has to be pushed back.
            line = rawline.splitlines()[0]
            if not line:
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                elif self._strict:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
                else:
                    # ignore the weirdly placed From_ line
                    # XXX: maybe set unixfrom anyway? or only if not already?
                    continue
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                if self._strict:
                    raise Errors.HeaderParseError(
                        "Not a header, not a continuation: ``%s''" % line)
                elif lineno == 1 and line.startswith('--'):
                    # allow through duplicate boundary tags.
                    continue
                else:
                    # There was no separating blank line as mandated by RFC
                    # 2822, but we're in non-strict mode.  So just offer up
                    # this current line as the first body line.  Push back
                    # the raw line: the stripped one has lost its terminator
                    # and would run together with the following body line.
                    fp.unreadline(rawline)
                    break
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)

    def _parsemessage(self, container, fp):
        """Parse the body that follows container's headers.

        We walk through the body from top to bottom, keeping track of the
        current multipart nesting as we go.  Returns the object that should
        receive the data left over at the end of this block; the caller
        attaches that data.

        Raises Errors.BoundaryError for a multipart container without a
        boundary parameter, and when a sub-part ends at end of file without
        a final line terminator.
        """
        boundary = container.get_boundary()
        isdigest = (container.get_content_type() == 'multipart/digest')
        if boundary:
            separator = '--' + boundary
            # The line terminator is optional so that a boundary sitting on
            # an unterminated last line of the input is still recognised.
            boundaryRE = re.compile(
                r'(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            preamble, matchobj = fp.readuntil(boundaryRE)
            if not matchobj:
                # Broken - we hit the end of file.  Just set the body
                # to the text.
                container.set_payload(preamble)
                return container
            if preamble:
                container.preamble = preamble
            else:
                # The module docs specify an empty preamble is None, not ''
                container.preamble = None
            while True:
                subobj = self._class()
                if isdigest:
                    subobj.set_default_type('message/rfc822')
                    firstline = fp.peekline()
                    if firstline.strip():
                        # we have MIME headers.  all good.
                        self._parseheaders(subobj, fp)
                    else:
                        # no MIME headers.  this is allowed for
                        # multipart/digest.  Consume the extra blank line
                        fp.readline()
                else:
                    self._parseheaders(subobj, fp)
                container.attach(subobj)
                maintype = subobj.get_content_maintype()
                if maintype in ("message", "multipart"):
                    subobj = self._parsemessage(subobj, fp)

                trailer, matchobj = fp.readuntil(boundaryRE)
                if matchobj is None or trailer:
                    # The line terminator just before a boundary belongs to
                    # the boundary, not to the part, so drop exactly one
                    # terminator.  CRLF is tested first so that it is removed
                    # as a unit instead of leaving a stray CR behind.
                    if trailer.endswith('\r\n'):
                        trailer = trailer[:-2]
                    elif trailer[-1:] in ('\r', '\n'):
                        trailer = trailer[:-1]
                    else:
                        raise Errors.BoundaryError(
                            'No terminating boundary and no trailing empty line')
                if trailer:
                    self._attach_trailer(subobj, trailer)
                if matchobj is None or matchobj.group('end'):
                    # That was the last piece of data.  Let our caller attach
                    # the epilogue to us.  But before we do that, push the
                    # line ending of the match group back into the readline
                    # buffer, as it's part of the epilogue.
                    if matchobj:
                        linesep = matchobj.group('linesep')
                        # Nothing to push back when the closing boundary was
                        # not followed by a line terminator.
                        if linesep:
                            fp.unreadline(linesep)
                    return container

        elif container.get_content_maintype() == "multipart":
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        elif container.get_content_maintype() == "message":
            ct = container.get_content_type()
            if ct == "message/rfc822":
                submessage = self._class()
                self._parseheaders(submessage, fp)
                self._parsemessage(submessage, fp)
                container.attach(submessage)
                return submessage
            elif ct == "message/delivery-status":
                # This special kind of type contains blocks of headers
                # separated by a blank line.  We'll represent each header
                # block as a separate Message object
                while True:
                    nextblock = self._class()
                    self._parseheaders(nextblock, fp)
                    container.attach(nextblock)
                    # next peek ahead to see whether we've hit the end or
                    # not.  An empty string means end of file; without that
                    # test the loop would attach empty blocks forever.
                    nextline = fp.peekline()
                    if not nextline or nextline[:2] == "--":
                        break
                return container
            else:
                # Other sort of message object (e.g. external-body)
                msg = self._class()
                self._parsemessage(msg, fp)
                container.attach(msg)
                return msg
        else:
            # single body section.  We let our caller set the payload.
            return container

    def _attach_trailer(self, obj, trailer):
        """Store trailing text on obj.

        The text becomes the epilogue of a message/* or multipart/* object
        and the payload of any other object.
        """
        if obj.get_content_maintype() in ("message", "multipart"):
            obj.epilogue = trailer
        else:
            obj.set_payload(trailer)
Barry Warsaw | e552882 | 2001-10-11 15:43:00 +0000 | [diff] [blame] | 321 | |
| 322 | |
class HeaderParser(Parser):
    """A Parser subclass that only meaningfully parses message headers.

    Use this class when the headers of a message are all you need.  The
    message body is still consumed, but it is not parsed; it is stored
    as-is as a string payload.

    Skipping the body parse can make this considerably faster than the
    full parser.
    """
    def _parsemessage(self, container, fp):
        # Take everything that is left and store it unparsed as the payload.
        container.set_payload(fp.read())
        # Nothing is handed back for the caller to attach trailing data to.
        return None