| # Copyright (C) 2001,2002 Python Software Foundation |
| # Author: barry@zope.com (Barry Warsaw) |
| |
| """A parser of RFC 2822 and MIME email messages. |
| """ |
| |
| import re |
| from cStringIO import StringIO |
| from types import ListType |
| |
| from email import Errors |
| from email import Message |
| |
# Shared string constants.  NL is the separator used to join the physical
# lines of a folded (continued) header value back together; EMPTYSTRING is
# not referenced by the code visible in this module.
EMPTYSTRING = ''
NL = '\n'
| |
| |
class Parser:
    def __init__(self, _class=Message.Message, strict=0):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.

        Optional strict tells the parser to be strictly RFC compliant or to be
        more forgiving in parsing of ill-formatted MIME documents.  When
        non-strict mode is used, the parser will try to make up for missing or
        erroneous boundaries and other peculiarities seen in the wild.
        Default is non-strict parsing.
        """
        self._class = _class
        self._strict = strict

    def parse(self, fp, headersonly=0):
        """Parse a message from the file-like object fp and return its root.

        A fresh instance of the configured message class is created and the
        header block is read into it.  Unless headersonly is true the body is
        parsed as well; when headersonly is true the body is left unread on
        fp.
        """
        root = self._class()
        self._parseheaders(root, fp)
        if not headersonly:
            self._parsebody(root, fp)
        return root

    def parsestr(self, text, headersonly=0):
        """Parse a message held in the string text.

        Equivalent to parse() on a StringIO wrapped around text.
        """
        return self.parse(StringIO(text), headersonly=headersonly)

    def _parseheaders(self, container, fp):
        """Read one header block from fp and store it on container.

        Lines are consumed up to and including the first empty line, or up to
        end of input.  Each header is stored with container[name] = value; a
        folded header keeps its continuation lines, joined with NL.  A
        `From ' line that is the very first line is stored with
        container.set_unixfrom().  Nothing is returned.

        Raises Errors.HeaderParseError for a continuation line that precedes
        any header, for a line that is neither a header nor a continuation,
        and (in strict mode only) for a `From ' line after the first line.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        while 1:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                break
            # Ignore the trailing newline
            line = line.splitlines()[0]
            if not line:
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                elif self._strict:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
                else:
                    # ignore the weirdly placed From_ line
                    # XXX: maybe set unixfrom anyway? or only if not already?
                    continue
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                if (not self._strict and lineno == 1
                        and line.startswith('--')):
                    # allow through duplicate boundary tags.
                    continue
                raise Errors.HeaderParseError(
                    "Not a header, not a continuation: ``%s''"%line)
            # A new header starts here, so flush the one collected so far.
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)

    def _parsebody(self, container, fp):
        """Parse the rest of fp as the body of container.

        How the body is handled depends on what container reports:

        - a boundary is defined: the remaining text is split on the boundary
          and every part is parsed and attached as a sub-message; any text
          before the first boundary or after the terminating boundary is
          stored as container.preamble / container.epilogue.
        - main type `multipart' without a boundary: an error.
        - message/delivery-status: every header block becomes its own
          message object and the list of them is set as the payload.
        - any other message/* type: the body is parsed as a nested message.
        - anything else: the remaining text is set as the payload.

        Raises Errors.BoundaryError when the starting boundary is missing,
        when a multipart has no boundary defined, when strict mode is on and
        the terminating boundary is missing, or when non-strict mode finds
        neither a terminating boundary nor a trailing newline.
        """
        boundary = container.get_boundary()
        isdigest = (container.get_type() == 'multipart/digest')
        # If there's a boundary, split the payload text into its constituent
        # parts and parse each separately.  Otherwise, just parse the rest of
        # the body as a single message.  Note: any exceptions raised in the
        # recursive parse need to have their line numbers coerced.
        if boundary:
            preamble = epilogue = None
            # Split into subparts.  The first boundary we're looking for won't
            # always have a leading newline since we're at the start of the
            # body text, and there's not always a preamble before the first
            # boundary.
            separator = '--' + boundary
            payload = fp.read()
            # We use an RE here because boundaries can have trailing
            # whitespace.
            mo = re.search(
                r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
                payload)
            if not mo:
                raise Errors.BoundaryError(
                    "Couldn't find starting boundary: %s" % boundary)
            start = mo.start()
            if start > 0:
                # there's some pre-MIME boundary preamble
                preamble = payload[0:start]
            # Step over the boundary itself and then over the line ending
            # that follows it, so that start lands on the first part's text.
            start += len(mo.group('sep')) + len(mo.group('ws'))
            cre = re.compile('\r\n|\r|\n')
            mo = cre.search(payload, start)
            if mo:
                start += len(mo.group(0))
            # We create a compiled regexp first because we need to be able to
            # specify the start position, and the module function doesn't
            # support this signature. :(
            cre = re.compile('(?P<sep>\r\n|\r|\n)' +
                             re.escape(separator) + '--')
            mo = cre.search(payload, start)
            if mo:
                terminator = mo.start()
                # The line ending in front of the terminating boundary tells
                # us which line separator this message uses.
                linesep = mo.group('sep')
                if mo.end() < len(payload):
                    # There's some post-MIME boundary epilogue
                    epilogue = payload[mo.end():]
            elif self._strict:
                raise Errors.BoundaryError(
                    "Couldn't find terminating boundary: %s" % boundary)
            else:
                # Handle the case of no trailing boundary.  Check that it ends
                # in a blank line.  Some cases (spamspamspam) don't even have
                # that!
                mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
                if not mo:
                    mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
                    if not mo:
                        raise Errors.BoundaryError(
                            'No terminating boundary and no trailing empty line')
                linesep = mo.group('sep')
                terminator = len(payload)
            # We split the textual payload on the boundary separator, which
            # includes the trailing newline.  If the container is a
            # multipart/digest then the subparts are by default message/rfc822
            # instead of text/plain.  In that case, they'll have a optional
            # block of MIME headers, then an empty line followed by the
            # message headers.
            parts = re.split(
                linesep + re.escape(separator) + r'[ \t]*' + linesep,
                payload[start:terminator])
            # These do not depend on the individual parts, so set them once.
            container.preamble = preamble
            container.epilogue = epilogue
            for part in parts:
                if isdigest:
                    # linesep may be the two-character '\r\n', so compare and
                    # strip the whole separator rather than a single
                    # character.
                    if part.startswith(linesep):
                        # There's no header block so create an empty message
                        # object as the container, and lop off the newline so
                        # we can parse the sub-subobject
                        msgobj = self._class()
                        part = part[len(linesep):]
                    else:
                        # NOTE: a part with no blank line at all makes this
                        # unpacking fail with ValueError.
                        parthdrs, part = part.split(linesep+linesep, 1)
                        # msgobj in this case is the "message/rfc822" container
                        msgobj = self.parsestr(parthdrs, headersonly=1)
                    # while submsgobj is the message itself
                    submsgobj = self.parsestr(part)
                    msgobj.attach(submsgobj)
                    msgobj.set_default_type('message/rfc822')
                else:
                    msgobj = self.parsestr(part)
                container.attach(msgobj)
        elif container.get_main_type() == 'multipart':
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        elif container.get_type() == 'message/delivery-status':
            # This special kind of type contains blocks of headers separated
            # by a blank line.  We'll represent each header block as a
            # separate Message object
            blocks = []
            while 1:
                blockmsg = self._class()
                self._parseheaders(blockmsg, fp)
                if not len(blockmsg):
                    # No more header blocks left
                    break
                blocks.append(blockmsg)
            container.set_payload(blocks)
        elif container.get_main_type() == 'message':
            # Create a container for the payload, but watch out for there not
            # being any headers left
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                msg = self._class()
                self._parsebody(msg, fp)
            container.attach(msg)
        else:
            container.set_payload(fp.read())
| |
| |
| |
class HeaderParser(Parser):
    """Parser variant that only meaningfully parses the message headers.

    Use this class when the headers are all you care about.  The message
    body is still consumed from the input, but it is never parsed; it is
    simply stored on the message as a string payload.

    Because the body is not parsed, this subclass can be considerably faster
    than the full parser when only the headers are of interest.
    """
    def _parsebody(self, container, fp):
        # Read whatever is left of the input and keep it verbatim as the
        # payload instead of parsing it.
        remainder = fp.read()
        container.set_payload(remainder)