Lib/email/Parser.py - platform/external/python/cpython3 - Gitiles

 # Copyright (C) 2001,2002 Python Software Foundation
 # Author: barry@zope.com (Barry Warsaw)

 """A parser of RFC 2822 and MIME email messages.
 """

 import re
 from cStringIO import StringIO
 from types import ListType

 from email import Errors
 from email import Message

 EMPTYSTRING = ''
 NL = '\n'


 class Parser:
     """Build a tree of message objects from RFC 2822 / MIME message text."""

     def __init__(self, _class=Message.Message, strict=0):
         """Parser of RFC 2822 and MIME email messages.

         Creates an in-memory object tree representing the email message, which
         can then be manipulated and turned over to a Generator to return the
         textual representation of the message.

         The string must be formatted as a block of RFC 2822 headers and header
         continuation lines, optionally preceded by a `Unix-from' header.  The
         header block is terminated either by the end of the string or by a
         blank line.

         _class is the class to instantiate for new message objects when they
         must be created.  This class must have a constructor that can take
         zero arguments.  Default is Message.Message.

         Optional strict tells the parser to be strictly RFC compliant or to be
         more forgiving in parsing of ill-formatted MIME documents.  When
         non-strict mode is used, the parser will try to make up for missing or
         erroneous boundaries and other peculiarities seen in the wild.
         Default is non-strict parsing.
         """
         self._class = _class
         self._strict = strict

     def parse(self, fp, headersonly=0):
         """Create a message object tree from the contents of fp.

         fp is a file-like object supporting readline() and read().  The
         header block is read into a fresh instance of the message class
         given to the constructor; unless headersonly is true, the rest of
         fp is then parsed as the message body.  Returns the root message
         object.
         """
         root = self._class()
         self._parseheaders(root, fp)
         if not headersonly:
             self._parsebody(root, fp)
         return root

     def parsestr(self, text, headersonly=0):
         """Create a message object tree from the string text.

         The string is wrapped in a StringIO object and handed to parse();
         headersonly has the same meaning as there.
         """
         return self.parse(StringIO(text), headersonly=headersonly)

     def _parseheaders(self, container, fp):
         """Read one header block from fp and store it on container.

         Lines are consumed from fp up to and including the first empty
         line, or until fp is exhausted.  Each header is stored with
         container[name] = value; continuation lines are joined onto the
         value of the header they follow, separated by newlines.  A `From '
         line on the first line is stored with container.set_unixfrom();
         anywhere else it raises Errors.HeaderParseError in strict mode and
         is skipped in non-strict mode.  Nothing is returned.

         Raises Errors.HeaderParseError for a continuation line seen before
         any header, and for a line that is neither a header nor a
         continuation.  The one exception: in non-strict mode a first line
         starting with `--' is silently skipped.
         """
         lastheader = ''
         lastvalue = []
         lineno = 0
         while 1:
             # Don't strip the line before we test for the end condition,
             # because whitespace-only header lines are RFC compliant
             # continuation lines.
             line = fp.readline()
             if not line:
                 break
             # Ignore the trailing newline
             line = line.splitlines()[0]
             if not line:
                 break
             lineno += 1
             # Check for initial Unix From_ line
             if line.startswith('From '):
                 if lineno == 1:
                     container.set_unixfrom(line)
                     continue
                 elif self._strict:
                     raise Errors.HeaderParseError(
                         'Unix-from in headers after first rfc822 header')
                 else:
                     # ignore the weirdly placed From_ line
                     # XXX: maybe set unixfrom anyway? or only if not already?
                     continue
             # Header continuation line
             if line[0] in ' \t':
                 if not lastheader:
                     raise Errors.HeaderParseError(
                         'Continuation line seen before first header')
                 lastvalue.append(line)
                 continue
             # Normal, non-continuation header.  BAW: this should check to make
             # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
             # should expose the header matching algorithm in the API, and
             # allow for a non-strict parsing mode (that ignores the line
             # instead of raising the exception).
             i = line.find(':')
             if i < 0:
                 if (not self._strict and lineno == 1
                     and line.startswith('--')):
                     # allow through duplicate boundary tags.
                     continue
                 raise Errors.HeaderParseError(
                     "Not a header, not a continuation: ``%s''" % line)
             # A new header starts here, so flush the one collected so far.
             if lastheader:
                 container[lastheader] = NL.join(lastvalue)
             lastheader = line[:i]
             lastvalue = [line[i+1:].lstrip()]
         # Make sure we retain the last header
         if lastheader:
             container[lastheader] = NL.join(lastvalue)

     def _parsebody(self, container, fp):
         """Parse the rest of fp as the body of container.

         What is done depends on container's headers:

         - If container.get_boundary() yields a boundary, the rest of fp is
           read and split on the boundary lines.  Each part is parsed
           recursively and attached to container.  Text before the first
           boundary becomes container.preamble and text after the
           terminating boundary becomes container.epilogue (None when
           absent).  For multipart/digest each part is wrapped in a
           container message whose default type is message/rfc822.
         - A multipart main type without a boundary raises
           Errors.BoundaryError.
         - message/delivery-status: the payload is set to a list of message
           objects, one per block of headers.
         - Any other message/* type: the rest of fp is parsed as a message
           and attached.
         - Anything else: the rest of fp is stored as a string payload.

         Raises Errors.BoundaryError when the starting boundary cannot be
         found, when the terminating boundary is missing in strict mode, or
         when it is missing in non-strict mode and the payload does not end
         with a line separator.
         """
         # Parse the body, but first split the payload on the content-type
         # boundary if present.
         boundary = container.get_boundary()
         isdigest = (container.get_type() == 'multipart/digest')
         # If there's a boundary, split the payload text into its constituent
         # parts and parse each separately.  Otherwise, just parse the rest of
         # the body as a single message.  Note: any exceptions raised in the
         # recursive parse need to have their line numbers coerced.
         if boundary:
             preamble = epilogue = None
             # Split into subparts.  The first boundary we're looking for won't
             # always have a leading newline since we're at the start of the
             # body text, and there's not always a preamble before the first
             # boundary.
             separator = '--' + boundary
             payload = fp.read()
             # We use an RE here because boundaries can have trailing
             # whitespace.
             mo = re.search(
                 r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
                 payload)
             if not mo:
                 raise Errors.BoundaryError(
                     "Couldn't find starting boundary: %s" % boundary)
             start = mo.start()
             if start > 0:
                 # there's some pre-MIME boundary preamble
                 preamble = payload[0:start]
             # Find out what kind of line endings we're using
             start += len(mo.group('sep')) + len(mo.group('ws'))
             cre = re.compile('\r\n|\r|\n')
             mo = cre.search(payload, start)
             if mo:
                 start += len(mo.group(0))
             # We create a compiled regexp first because we need to be able to
             # specify the start position, and the module function doesn't
             # support this signature. :(
             cre = re.compile('(?P<sep>\r\n|\r|\n)' +
                              re.escape(separator) + '--')
             mo = cre.search(payload, start)
             if mo:
                 terminator = mo.start()
                 linesep = mo.group('sep')
                 if mo.end() < len(payload):
                     # There's some post-MIME boundary epilogue
                     epilogue = payload[mo.end():]
             elif self._strict:
                 raise Errors.BoundaryError(
                         "Couldn't find terminating boundary: %s" % boundary)
             else:
                 # Handle the case of no trailing boundary.  Check that it ends
                 # in a blank line.  Some cases (spamspamspam) don't even have
                 # that!
                 mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
                 if not mo:
                     mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
                     if not mo:
                         raise Errors.BoundaryError(
                           'No terminating boundary and no trailing empty line')
                 linesep = mo.group('sep')
                 terminator = len(payload)
             # We split the textual payload on the boundary separator, which
             # includes the trailing newline. If the container is a
             # multipart/digest then the subparts are by default message/rfc822
             # instead of text/plain.  In that case, they'll have a optional
             # block of MIME headers, then an empty line followed by the
             # message headers.
             parts = re.split(
                 linesep + re.escape(separator) + r'[ \t]*' + linesep,
                 payload[start:terminator])
             # The preamble and epilogue are the same for every part, so set
             # them once instead of on each trip through the loop.
             container.preamble = preamble
             container.epilogue = epilogue
             for part in parts:
                 if isdigest:
                     # linesep may be the two-character '\r\n', so compare
                     # and strip by its full length rather than one character.
                     # startswith() also copes with an empty part.
                     if part.startswith(linesep):
                         # There's no header block so create an empty message
                         # object as the container, and lop off the newline so
                         # we can parse the sub-subobject
                         msgobj = self._class()
                         part = part[len(linesep):]
                     else:
                         parthdrs, part = part.split(linesep+linesep, 1)
                         # msgobj in this case is the "message/rfc822" container
                         msgobj = self.parsestr(parthdrs, headersonly=1)
                     # while submsgobj is the message itself
                     submsgobj = self.parsestr(part)
                     msgobj.attach(submsgobj)
                     msgobj.set_default_type('message/rfc822')
                 else:
                     msgobj = self.parsestr(part)
                 container.attach(msgobj)
         elif container.get_main_type() == 'multipart':
             # Very bad.  A message is a multipart with no boundary!
             raise Errors.BoundaryError(
                 'multipart message with no defined boundary')
         elif container.get_type() == 'message/delivery-status':
             # This special kind of type contains blocks of headers separated
             # by a blank line.  We'll represent each header block as a
             # separate Message object
             blocks = []
             while 1:
                 blockmsg = self._class()
                 self._parseheaders(blockmsg, fp)
                 if not len(blockmsg):
                     # No more header blocks left
                     break
                 blocks.append(blockmsg)
             container.set_payload(blocks)
         elif container.get_main_type() == 'message':
             # Create a container for the payload, but watch out for there not
             # being any headers left
             try:
                 msg = self.parse(fp)
             except Errors.HeaderParseError:
                 msg = self._class()
                 self._parsebody(msg, fp)
             container.attach(msg)
         else:
             container.set_payload(fp.read())


 class HeaderParser(Parser):
     """Parser variant that only parses the headers of a message.

     Use this class when the headers are all you care about.  The body is
     still read from the input, but it is never split or parsed; it is stored
     on the message object as a single string payload.

     Because no body parsing takes place, this can be considerably faster
     than the full Parser when only the headers are needed.
     """
     def _parsebody(self, container, fp):
         # Read whatever is left of the input and keep it verbatim as the
         # payload; no MIME structure is interpreted here.
         remainder = fp.read()
         container.set_payload(remainder)
	# Copyright (C) 2001,2002 Python Software Foundation
	# Author: barry@zope.com (Barry Warsaw)

	"""A parser of RFC 2822 and MIME email messages.
	"""

	import re
	from cStringIO import StringIO
	from types import ListType

	from email import Errors
	from email import Message

	EMPTYSTRING = ''
	NL = '\n'


	class Parser:
	    """Build a tree of message objects from RFC 2822 / MIME message text."""

	    def __init__(self, _class=Message.Message, strict=0):
	        """Parser of RFC 2822 and MIME email messages.

	        Creates an in-memory object tree representing the email message, which
	        can then be manipulated and turned over to a Generator to return the
	        textual representation of the message.

	        The string must be formatted as a block of RFC 2822 headers and header
	        continuation lines, optionally preceded by a `Unix-from' header.  The
	        header block is terminated either by the end of the string or by a
	        blank line.

	        _class is the class to instantiate for new message objects when they
	        must be created.  This class must have a constructor that can take
	        zero arguments.  Default is Message.Message.

	        Optional strict tells the parser to be strictly RFC compliant or to be
	        more forgiving in parsing of ill-formatted MIME documents.  When
	        non-strict mode is used, the parser will try to make up for missing or
	        erroneous boundaries and other peculiarities seen in the wild.
	        Default is non-strict parsing.
	        """
	        self._class = _class
	        self._strict = strict

	    def parse(self, fp, headersonly=0):
	        """Create a message object tree from the contents of fp.

	        fp is a file-like object supporting readline() and read().  The
	        header block is read into a fresh instance of the message class
	        given to the constructor; unless headersonly is true, the rest of
	        fp is then parsed as the message body.  Returns the root message
	        object.
	        """
	        root = self._class()
	        self._parseheaders(root, fp)
	        if not headersonly:
	            self._parsebody(root, fp)
	        return root

	    def parsestr(self, text, headersonly=0):
	        """Create a message object tree from the string text.

	        The string is wrapped in a StringIO object and handed to parse();
	        headersonly has the same meaning as there.
	        """
	        return self.parse(StringIO(text), headersonly=headersonly)

	    def _parseheaders(self, container, fp):
	        """Read one header block from fp and store it on container.

	        Lines are consumed from fp up to and including the first empty
	        line, or until fp is exhausted.  Each header is stored with
	        container[name] = value; continuation lines are joined onto the
	        value of the header they follow, separated by newlines.  A `From '
	        line on the first line is stored with container.set_unixfrom();
	        anywhere else it raises Errors.HeaderParseError in strict mode and
	        is skipped in non-strict mode.  Nothing is returned.

	        Raises Errors.HeaderParseError for a continuation line seen before
	        any header, and for a line that is neither a header nor a
	        continuation.  The one exception: in non-strict mode a first line
	        starting with `--' is silently skipped.
	        """
	        lastheader = ''
	        lastvalue = []
	        lineno = 0
	        while 1:
	            # Don't strip the line before we test for the end condition,
	            # because whitespace-only header lines are RFC compliant
	            # continuation lines.
	            line = fp.readline()
	            if not line:
	                break
	            # Ignore the trailing newline
	            line = line.splitlines()[0]
	            if not line:
	                break
	            lineno += 1
	            # Check for initial Unix From_ line
	            if line.startswith('From '):
	                if lineno == 1:
	                    container.set_unixfrom(line)
	                    continue
	                elif self._strict:
	                    raise Errors.HeaderParseError(
	                        'Unix-from in headers after first rfc822 header')
	                else:
	                    # ignore the weirdly placed From_ line
	                    # XXX: maybe set unixfrom anyway? or only if not already?
	                    continue
	            # Header continuation line
	            if line[0] in ' \t':
	                if not lastheader:
	                    raise Errors.HeaderParseError(
	                        'Continuation line seen before first header')
	                lastvalue.append(line)
	                continue
	            # Normal, non-continuation header.  BAW: this should check to make
	            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
	            # should expose the header matching algorithm in the API, and
	            # allow for a non-strict parsing mode (that ignores the line
	            # instead of raising the exception).
	            i = line.find(':')
	            if i < 0:
	                if (not self._strict and lineno == 1
	                    and line.startswith('--')):
	                    # allow through duplicate boundary tags.
	                    continue
	                raise Errors.HeaderParseError(
	                    "Not a header, not a continuation: ``%s''" % line)
	            # A new header starts here, so flush the one collected so far.
	            if lastheader:
	                container[lastheader] = NL.join(lastvalue)
	            lastheader = line[:i]
	            lastvalue = [line[i+1:].lstrip()]
	        # Make sure we retain the last header
	        if lastheader:
	            container[lastheader] = NL.join(lastvalue)

	    def _parsebody(self, container, fp):
	        """Parse the rest of fp as the body of container.

	        What is done depends on container's headers:

	        - If container.get_boundary() yields a boundary, the rest of fp is
	          read and split on the boundary lines.  Each part is parsed
	          recursively and attached to container.  Text before the first
	          boundary becomes container.preamble and text after the
	          terminating boundary becomes container.epilogue (None when
	          absent).  For multipart/digest each part is wrapped in a
	          container message whose default type is message/rfc822.
	        - A multipart main type without a boundary raises
	          Errors.BoundaryError.
	        - message/delivery-status: the payload is set to a list of message
	          objects, one per block of headers.
	        - Any other message/* type: the rest of fp is parsed as a message
	          and attached.
	        - Anything else: the rest of fp is stored as a string payload.

	        Raises Errors.BoundaryError when the starting boundary cannot be
	        found, when the terminating boundary is missing in strict mode, or
	        when it is missing in non-strict mode and the payload does not end
	        with a line separator.
	        """
	        # Parse the body, but first split the payload on the content-type
	        # boundary if present.
	        boundary = container.get_boundary()
	        isdigest = (container.get_type() == 'multipart/digest')
	        # If there's a boundary, split the payload text into its constituent
	        # parts and parse each separately.  Otherwise, just parse the rest of
	        # the body as a single message.  Note: any exceptions raised in the
	        # recursive parse need to have their line numbers coerced.
	        if boundary:
	            preamble = epilogue = None
	            # Split into subparts.  The first boundary we're looking for won't
	            # always have a leading newline since we're at the start of the
	            # body text, and there's not always a preamble before the first
	            # boundary.
	            separator = '--' + boundary
	            payload = fp.read()
	            # We use an RE here because boundaries can have trailing
	            # whitespace.
	            mo = re.search(
	                r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
	                payload)
	            if not mo:
	                raise Errors.BoundaryError(
	                    "Couldn't find starting boundary: %s" % boundary)
	            start = mo.start()
	            if start > 0:
	                # there's some pre-MIME boundary preamble
	                preamble = payload[0:start]
	            # Find out what kind of line endings we're using.  The three
	            # alternatives must be separated by a bare `|'; a backslashed
	            # pipe would match a literal `|' character instead.
	            start += len(mo.group('sep')) + len(mo.group('ws'))
	            cre = re.compile('\r\n|\r|\n')
	            mo = cre.search(payload, start)
	            if mo:
	                start += len(mo.group(0))
	            # We create a compiled regexp first because we need to be able to
	            # specify the start position, and the module function doesn't
	            # support this signature. :(
	            cre = re.compile('(?P<sep>\r\n|\r|\n)' +
	                             re.escape(separator) + '--')
	            mo = cre.search(payload, start)
	            if mo:
	                terminator = mo.start()
	                linesep = mo.group('sep')
	                if mo.end() < len(payload):
	                    # There's some post-MIME boundary epilogue
	                    epilogue = payload[mo.end():]
	            elif self._strict:
	                raise Errors.BoundaryError(
	                        "Couldn't find terminating boundary: %s" % boundary)
	            else:
	                # Handle the case of no trailing boundary.  Check that it ends
	                # in a blank line.  Some cases (spamspamspam) don't even have
	                # that!
	                mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
	                if not mo:
	                    mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
	                    if not mo:
	                        raise Errors.BoundaryError(
	                          'No terminating boundary and no trailing empty line')
	                linesep = mo.group('sep')
	                terminator = len(payload)
	            # We split the textual payload on the boundary separator, which
	            # includes the trailing newline. If the container is a
	            # multipart/digest then the subparts are by default message/rfc822
	            # instead of text/plain.  In that case, they'll have a optional
	            # block of MIME headers, then an empty line followed by the
	            # message headers.
	            parts = re.split(
	                linesep + re.escape(separator) + r'[ \t]*' + linesep,
	                payload[start:terminator])
	            # The preamble and epilogue are the same for every part, so set
	            # them once instead of on each trip through the loop.
	            container.preamble = preamble
	            container.epilogue = epilogue
	            for part in parts:
	                if isdigest:
	                    # linesep may be the two-character '\r\n', so compare
	                    # and strip by its full length rather than one character.
	                    # startswith() also copes with an empty part.
	                    if part.startswith(linesep):
	                        # There's no header block so create an empty message
	                        # object as the container, and lop off the newline so
	                        # we can parse the sub-subobject
	                        msgobj = self._class()
	                        part = part[len(linesep):]
	                    else:
	                        parthdrs, part = part.split(linesep+linesep, 1)
	                        # msgobj in this case is the "message/rfc822" container
	                        msgobj = self.parsestr(parthdrs, headersonly=1)
	                    # while submsgobj is the message itself
	                    submsgobj = self.parsestr(part)
	                    msgobj.attach(submsgobj)
	                    msgobj.set_default_type('message/rfc822')
	                else:
	                    msgobj = self.parsestr(part)
	                container.attach(msgobj)
	        elif container.get_main_type() == 'multipart':
	            # Very bad.  A message is a multipart with no boundary!
	            raise Errors.BoundaryError(
	                'multipart message with no defined boundary')
	        elif container.get_type() == 'message/delivery-status':
	            # This special kind of type contains blocks of headers separated
	            # by a blank line.  We'll represent each header block as a
	            # separate Message object
	            blocks = []
	            while 1:
	                blockmsg = self._class()
	                self._parseheaders(blockmsg, fp)
	                if not len(blockmsg):
	                    # No more header blocks left
	                    break
	                blocks.append(blockmsg)
	            container.set_payload(blocks)
	        elif container.get_main_type() == 'message':
	            # Create a container for the payload, but watch out for there not
	            # being any headers left
	            try:
	                msg = self.parse(fp)
	            except Errors.HeaderParseError:
	                msg = self._class()
	                self._parsebody(msg, fp)
	            container.attach(msg)
	        else:
	            container.set_payload(fp.read())



	class HeaderParser(Parser):
	    """A subclass of Parser, this one only meaningfully parses message headers.

	    This class can be used if all you're interested in is the headers of a
	    message.  While it consumes the message body, it does not parse it, but
	    simply makes it available as a string payload.

	    Parsing with this subclass can be considerably faster if all you're
	    interested in is the message headers.
	    """
	    def _parsebody(self, container, fp):
	        # Consume but do not parse the body: everything left in fp is
	        # stored verbatim as the container's string payload.
	        container.set_payload(fp.read())