| # Copyright (C) 2001,2002 Python Software Foundation | 
 | # Author: barry@zope.com (Barry Warsaw) | 
 |  | 
 | """A parser of RFC 2822 and MIME email messages. | 
 | """ | 
 |  | 
 | from cStringIO import StringIO | 
 | from types import ListType | 
 |  | 
 | # Intrapackage imports | 
 | import Errors | 
 | import Message | 
 |  | 
# Module-level string constants.  NL is the separator used to rejoin the
# physical lines of a folded (multi-line) header value; EMPTYSTRING is not
# referenced by the code visible in this section.
EMPTYSTRING = ''
NL = '\n'
 |  | 
 |  | 
 |  | 
 | class Parser: | 
 |     def __init__(self, _class=Message.Message): | 
 |         """Parser of RFC 2822 and MIME email messages. | 
 |  | 
 |         Creates an in-memory object tree representing the email message, which | 
 |         can then be manipulated and turned over to a Generator to return the | 
 |         textual representation of the message. | 
 |  | 
 |         The string must be formatted as a block of RFC 2822 headers and header | 
 |         continuation lines, optionally preceeded by a `Unix-from' header.  The | 
 |         header block is terminated either by the end of the string or by a | 
 |         blank line. | 
 |  | 
 |         _class is the class to instantiate for new message objects when they | 
 |         must be created.  This class must have a constructor that can take | 
 |         zero arguments.  Default is Message.Message. | 
 |         """ | 
 |         self._class = _class | 
 |  | 
 |     def parse(self, fp): | 
 |         root = self._class() | 
 |         self._parseheaders(root, fp) | 
 |         self._parsebody(root, fp) | 
 |         return root | 
 |  | 
 |     def parsestr(self, text): | 
 |         return self.parse(StringIO(text)) | 
 |  | 
 |     def _parseheaders(self, container, fp): | 
 |         # Parse the headers, returning a list of header/value pairs.  None as | 
 |         # the header means the Unix-From header. | 
 |         lastheader = '' | 
 |         lastvalue = [] | 
 |         lineno = 0 | 
 |         while 1: | 
 |             # Don't strip the line before we test for the end condition, | 
 |             # because whitespace-only header lines are RFC compliant | 
 |             # continuation lines. | 
 |             line = fp.readline() | 
 |             if not line: | 
 |                 break | 
 |             line = line.splitlines()[0] | 
 |             if not line: | 
 |                 break | 
 |             # Ignore the trailing newline | 
 |             lineno += 1 | 
 |             # Check for initial Unix From_ line | 
 |             if line.startswith('From '): | 
 |                 if lineno == 1: | 
 |                     container.set_unixfrom(line) | 
 |                     continue | 
 |                 else: | 
 |                     raise Errors.HeaderParseError( | 
 |                         'Unix-from in headers after first rfc822 header') | 
 |             # Header continuation line | 
 |             if line[0] in ' \t': | 
 |                 if not lastheader: | 
 |                     raise Errors.HeaderParseError( | 
 |                         'Continuation line seen before first header') | 
 |                 lastvalue.append(line) | 
 |                 continue | 
 |             # Normal, non-continuation header.  BAW: this should check to make | 
 |             # sure it's a legal header, e.g. doesn't contain spaces.  Also, we | 
 |             # should expose the header matching algorithm in the API, and | 
 |             # allow for a non-strict parsing mode (that ignores the line | 
 |             # instead of raising the exception). | 
 |             i = line.find(':') | 
 |             if i < 0: | 
 |                 raise Errors.HeaderParseError( | 
 |                     'Not a header, not a continuation') | 
 |             if lastheader: | 
 |                 container[lastheader] = NL.join(lastvalue) | 
 |             lastheader = line[:i] | 
 |             lastvalue = [line[i+1:].lstrip()] | 
 |         # Make sure we retain the last header | 
 |         if lastheader: | 
 |             container[lastheader] = NL.join(lastvalue) | 
 |  | 
 |     def _parsebody(self, container, fp): | 
 |         # Parse the body, but first split the payload on the content-type | 
 |         # boundary if present. | 
 |         boundary = container.get_boundary() | 
 |         isdigest = (container.get_type() == 'multipart/digest') | 
 |         # If there's a boundary, split the payload text into its constituent | 
 |         # parts and parse each separately.  Otherwise, just parse the rest of | 
 |         # the body as a single message.  Note: any exceptions raised in the | 
 |         # recursive parse need to have their line numbers coerced. | 
 |         if boundary: | 
 |             preamble = epilogue = None | 
 |             # Split into subparts.  The first boundary we're looking for won't | 
 |             # have the leading newline since we're at the start of the body | 
 |             # text. | 
 |             separator = '--' + boundary | 
 |             payload = fp.read() | 
 |             start = payload.find(separator) | 
 |             if start < 0: | 
 |                 raise Errors.BoundaryError( | 
 |                     "Couldn't find starting boundary: %s" % boundary) | 
 |             if start > 0: | 
 |                 # there's some pre-MIME boundary preamble | 
 |                 preamble = payload[0:start] | 
 |             start += len(separator) + 1 + isdigest | 
 |             terminator = payload.find('\n' + separator + '--', start) | 
 |             if terminator < 0: | 
 |                 raise Errors.BoundaryError( | 
 |                     "Couldn't find terminating boundary: %s" % boundary) | 
 |             if terminator+len(separator)+3 < len(payload): | 
 |                 # there's some post-MIME boundary epilogue | 
 |                 epilogue = payload[terminator+len(separator)+3:] | 
 |             # We split the textual payload on the boundary separator, which | 
 |             # includes the trailing newline.  If the container is a | 
 |             # multipart/digest then the subparts are by default message/rfc822 | 
 |             # instead of text/plain.  In that case, they'll have an extra | 
 |             # newline before the headers to distinguish the message's headers | 
 |             # from the subpart headers. | 
 |             if isdigest: | 
 |                 separator += '\n\n' | 
 |             else: | 
 |                 separator += '\n' | 
 |             parts = payload[start:terminator].split('\n' + separator) | 
 |             for part in parts: | 
 |                 msgobj = self.parsestr(part) | 
 |                 container.preamble = preamble | 
 |                 container.epilogue = epilogue | 
 |                 container.attach(msgobj) | 
 |         elif container.get_main_type() == 'multipart': | 
 |             # Very bad.  A message is a multipart with no boundary! | 
 |             raise Errors.BoundaryError( | 
 |                 'multipart message with no defined boundary') | 
 |         elif container.get_type() == 'message/delivery-status': | 
 |             # This special kind of type contains blocks of headers separated | 
 |             # by a blank line.  We'll represent each header block as a | 
 |             # separate Message object | 
 |             blocks = [] | 
 |             while 1: | 
 |                 blockmsg = self._class() | 
 |                 self._parseheaders(blockmsg, fp) | 
 |                 if not len(blockmsg): | 
 |                     # No more header blocks left | 
 |                     break | 
 |                 blocks.append(blockmsg) | 
 |             container.set_payload(blocks) | 
 |         elif container.get_main_type() == 'message': | 
 |             # Create a container for the payload, but watch out for there not | 
 |             # being any headers left | 
 |             try: | 
 |                 msg = self.parse(fp) | 
 |             except Errors.HeaderParseError: | 
 |                 msg = self._class() | 
 |                 self._parsebody(msg, fp) | 
 |             container.set_payload(msg) | 
 |         else: | 
 |             container.set_payload(fp.read()) | 
 |  | 
 |  | 
 |  | 
 | class HeaderParser(Parser): | 
 |     """A subclass of Parser, this one only meaningfully parses message headers. | 
 |  | 
 |     This class can be used if all you're interested in is the headers of a | 
 |     message.  While it consumes the message body, it does not parse it, but | 
 |     simply makes it available as a string payload. | 
 |  | 
 |     Parsing with this subclass can be considerably faster if all you're | 
 |     interested in is the message headers. | 
 |     """ | 
 |     def _parsebody(self, container, fp): | 
 |         # Consume but do not parse, the body | 
 |         container.set_payload(fp.read()) |