Lib/email/Parser.py - platform/external/python/cpython3 - Gitiles

 # Copyright (C) 2001,2002 Python Software Foundation
 # Author: barry@zope.com (Barry Warsaw)

 """A parser of RFC 2822 and MIME email messages.
 """

 import re
 from cStringIO import StringIO
 from types import ListType

 from email import Errors
 from email import Message

 # String constants shared by the parser code below.
 EMPTYSTRING = ''
 NL = '\n'

 # Define True and False on interpreters where the names do not exist yet.
 try:
     True, False
 except NameError:
     True = 1
     False = 0

 # Matches one line ending: CRLF, a bare CR or a bare LF.
 NLCRE = re.compile('\r\n|\r|\n')

 class TextUtil:
     """Wrap a file object and add line push-back and scanning helpers.

     Lines handed to unreadline() are kept on a stack and are served, most
     recently pushed first, before any further data is read from the
     wrapped file object.
     """

     def __init__(self, fp):
         """Wrap fp, an object providing readline() and read()."""
         self.fp = fp
         # Stack of pushed-back lines; the last element is returned next.
         self.unread = []

     def readline(self):
         """Return the next line of data.

         If data has been pushed back with unreadline(), the most recently
         pushed line is returned; otherwise the line comes from the wrapped
         file object.
         """
         if self.unread:
             return self.unread.pop()
         return self.fp.readline()

     def unreadline(self, line):
         """Push a line back so that the next readline() returns it."""
         self.unread.append(line)

     def peekline(self):
         """Return the next line without consuming it."""
         line = self.readline()
         self.unreadline(line)
         return line

     def read(self):
         """Return all remaining data, pushed-back lines first.

         The pushed-back lines are emitted in the order readline() would
         have returned them (most recently pushed first) and exactly as
         they were pushed -- no separator is inserted between them --
         followed by whatever is left in the wrapped file object.
         """
         rest = self.fp.read()
         if self.unread:
             # self.unread is a stack, so reverse a copy to get read order.
             pending = self.unread[:]
             pending.reverse()
             self.unread = []
             rest = ''.join(pending) + rest
         return rest

     def readuntil(self, re, afterblank=0, includematch=0):
         """Read a line at a time until one matches the compiled pattern re.

         Returns a 2-tuple: the text read before the matching line (plus the
         matching line itself if includematch is true) and the match object.
         The matching line is consumed, so reading continues with the line
         that follows it.  If end-of-file is reached first, the text
         collected so far is returned with None as the match object.

         If afterblank is true, a line only counts as a match when the line
         read immediately before it was blank.  Blank lines themselves are
         never tested against the pattern in that mode, but they are kept
         in the returned text.

         Note that the parameter name shadows the re module inside this
         method; it is kept so that keyword callers continue to work.
         """
         prematch = []
         blankseen = 0
         while 1:
             line = self.readline()
             if not line:
                 # end of file
                 return EMPTYSTRING.join(prematch), None
             if afterblank and NLCRE.match(line):
                 blankseen = 1
                 prematch.append(line)
                 continue
             m = re.match(line)
             if m and (not afterblank or blankseen):
                 if includematch:
                     prematch.append(line)
                 return EMPTYSTRING.join(prematch), m
             # Only forget the blank line once it has been taken into
             # account for the current line.
             blankseen = 0
             prematch.append(line)


 class Parser:
     """Build a message object tree from RFC 2822 / MIME text."""

     def __init__(self, _class=Message.Message, strict=False):
         """Parser of RFC 2822 and MIME email messages.

         Creates an in-memory object tree representing the email message, which
         can then be manipulated and turned over to a Generator to return the
         textual representation of the message.

         The string must be formatted as a block of RFC 2822 headers and header
         continuation lines, optionally preceded by a `Unix-from' header.  The
         header block is terminated either by the end of the string or by a
         blank line.

         _class is the class to instantiate for new message objects when they
         must be created.  This class must have a constructor that can take
         zero arguments.  Default is Message.Message.

         Optional strict tells the parser to be strictly RFC compliant or to be
         more forgiving in parsing of ill-formatted MIME documents.  When
         non-strict mode is used, the parser will try to make up for missing or
         erroneous boundaries and other peculiarities seen in the wild.
         Default is non-strict parsing.
         """
         self._class = _class
         self._strict = strict

     def parse(self, fp, headersonly=False):
         """Create a message structure from the data in a file.

         Reads all the data from the file and returns the root of the message
         structure.  Optional headersonly is a flag specifying whether to stop
         parsing after reading the headers or not.  The default is False,
         meaning it parses the entire contents of the file.
         """
         root = self._class()
         fp = TextUtil(fp)
         self._parseheaders(root, fp)
         if not headersonly:
             obj = self._parsemessage(root, fp)
             trailer = fp.read()
             # Test against None explicitly.  _parsemessage() may return None
             # (HeaderParser does), but a real message object must not be
             # skipped just because it evaluates as false -- a message class
             # may define __len__, e.g. as its header count, which is zero
             # for a message without headers.
             if obj is not None and trailer:
                 self._attach_trailer(obj, trailer)
         return root

     def parsestr(self, text, headersonly=False):
         """Create a message structure from a string.

         Returns the root of the message structure.  Optional headersonly is a
         flag specifying whether to stop parsing after reading the headers or
         not.  The default is False, meaning it parses the entire contents of
         the file.
         """
         return self.parse(StringIO(text), headersonly=headersonly)

     def _parseheaders(self, container, fp):
         """Read a header block from fp and store the headers on container.

         Reading stops at end of file or at the first blank line, which is
         consumed.  A Unix-From line on the first line is recorded with
         container.set_unixfrom(); continuation lines are joined to the
         value of their header with newlines.  In non-strict mode a line
         that is not a header ends the block and is pushed back onto fp as
         the first body line.  Returns None.

         Raises Errors.HeaderParseError for a continuation line seen before
         any header and, in strict mode only, for a Unix-From line after the
         first line or a line that is neither a header nor a continuation.
         """
         lastheader = ''
         lastvalue = []
         lineno = 0
         while True:
             # Don't strip the line before we test for the end condition,
             # because whitespace-only header lines are RFC compliant
             # continuation lines.
             rawline = fp.readline()
             if not rawline:
                 break
             # Ignore the trailing newline
             line = rawline.splitlines()[0]
             if not line:
                 break
             lineno += 1
             # Check for initial Unix From_ line
             if line.startswith('From '):
                 if lineno == 1:
                     container.set_unixfrom(line)
                     continue
                 elif self._strict:
                     raise Errors.HeaderParseError(
                         'Unix-from in headers after first rfc822 header')
                 else:
                     # ignore the weirdly placed From_ line
                     # XXX: maybe set unixfrom anyway? or only if not already?
                     continue
             # Header continuation line
             if line[0] in ' \t':
                 if not lastheader:
                     raise Errors.HeaderParseError(
                         'Continuation line seen before first header')
                 lastvalue.append(line)
                 continue
             # Normal, non-continuation header.  BAW: this should check to make
             # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
             # should expose the header matching algorithm in the API, and
             # allow for a non-strict parsing mode (that ignores the line
             # instead of raising the exception).
             i = line.find(':')
             if i < 0:
                 if self._strict:
                     raise Errors.HeaderParseError(
                         "Not a header, not a continuation: ``%s''" % line)
                 elif lineno == 1 and line.startswith('--'):
                     # allow through duplicate boundary tags.
                     continue
                 else:
                     # There was no separating blank line as mandated by RFC
                     # 2822, but we're in non-strict mode.  So just offer up
                     # this current line as the first body line.  Push back
                     # the line as it was read, line ending included, so
                     # that it is not glued to the line that follows it.
                     fp.unreadline(rawline)
                     break
             if lastheader:
                 container[lastheader] = NL.join(lastvalue)
             lastheader = line[:i]
             lastvalue = [line[i+1:].lstrip()]
         # Make sure we retain the last header
         if lastheader:
             container[lastheader] = NL.join(lastvalue)
         return

     def _parsemessage(self, container, fp):
         """Parse the body that follows container's headers.

         Walks the body from top to bottom, recursing for nested multipart
         and message/* parts.  Returns the object that should receive the
         data that follows this block; the caller attaches that data.

         Raises Errors.BoundaryError if a multipart container has no
         boundary, or if the data of a part is not followed by a boundary
         and does not end with a line separator.
         """
         boundary = container.get_boundary()
         isdigest = (container.get_content_type() == 'multipart/digest')
         if boundary:
             separator = '--' + boundary
             boundaryRE = re.compile(
                     r'(?P<sep>' + re.escape(separator) +
                     r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
             preamble, matchobj = fp.readuntil(boundaryRE)
             if not matchobj:
                 # Broken - we hit the end of file. Just set the body
                 # to the text.
                 container.set_payload(preamble)
                 return container
             if preamble:
                 container.preamble = preamble
             else:
                 # The module docs specify an empty preamble is None, not ''
                 container.preamble = None
             while True:
                 subobj = self._class()
                 if isdigest:
                     subobj.set_default_type('message/rfc822')
                     firstline = fp.peekline()
                     if firstline.strip():
                         # we have MIME headers. all good.
                         self._parseheaders(subobj, fp)
                     else:
                         # no MIME headers. this is allowed for
                         # multipart/digest.  Consume the extra blank line
                         fp.readline()
                 else:
                     self._parseheaders(subobj, fp)
                 container.attach(subobj)
                 maintype = subobj.get_content_maintype()
                 if maintype in ("message", "multipart"):
                     subobj = self._parsemessage(subobj, fp)

                 trailer, matchobj = fp.readuntil(boundaryRE)
                 if matchobj is None or trailer:
                     # Drop the final line separator from the data of the
                     # part.
                     mo = re.search('(?P<sep>\r\n|\r|\n){2}$', trailer)
                     if not mo:
                         mo = re.search('(?P<sep>\r\n|\r|\n)$', trailer)
                         if not mo:
                             raise Errors.BoundaryError(
                           'No terminating boundary and no trailing empty line')
                     linesep = mo.group('sep')
                     trailer = trailer[:-len(linesep)]
                 if trailer:
                     self._attach_trailer(subobj, trailer)
                 if matchobj is None or matchobj.group('end'):
                     # That was the last piece of data. Let our caller attach
                     # the epilogue to us. But before we do that, push the
                     # line ending of the match group back into the readline
                     # buffer, as it's part of the epilogue.
                     if matchobj:
                         fp.unreadline(matchobj.group('linesep'))
                     return container

         elif container.get_content_maintype() == "multipart":
             # Very bad.  A message is a multipart with no boundary!
             raise Errors.BoundaryError(
                     'multipart message with no defined boundary')
         elif container.get_content_maintype() == "message":
             ct = container.get_content_type()
             if ct == "message/rfc822":
                 submessage = self._class()
                 self._parseheaders(submessage, fp)
                 self._parsemessage(submessage, fp)
                 container.attach(submessage)
                 return submessage
             elif ct == "message/delivery-status":
                 # This special kind of type contains blocks of headers
                 # separated by a blank line.  We'll represent each header
                 # block as a separate Message object
                 while True:
                     nextblock = self._class()
                     self._parseheaders(nextblock, fp)
                     container.attach(nextblock)
                     # next peek ahead to see whether we've hit the end or
                     # not.  An empty string means end of input; without
                     # that check the loop would never terminate there.
                     nextline = fp.peekline()
                     if not nextline or nextline[:2] == "--":
                         break
                 return container
             else:
                 # Other sort of message object (e.g. external-body)
                 msg = self._class()
                 self._parsemessage(msg, fp)
                 container.attach(msg)
                 return msg
         else:
             # single body section. We let our caller set the payload.
             return container

     def _attach_trailer(self, obj, trailer):
         """Store trailer on obj: as the epilogue of a message/* or
         multipart/* object, as the payload of anything else.
         """
         if obj.get_content_maintype() in ("message", "multipart"):
             obj.epilogue = trailer
         else:
             obj.set_payload(trailer)


 class HeaderParser(Parser):
     """Parser variant that only parses the headers of a message.

     Use it when the headers are all you need.  The body is still read
     from the input, but it is not parsed: it is stored unchanged as the
     string payload of the message.

     Skipping the body parse can make this class considerably faster than
     Parser when only the headers are of interest.
     """
     def _parsemessage(self, container, fp):
         # Take everything that is left as one opaque payload.  There is
         # no object left to receive trailing data, hence the None.
         container.set_payload(fp.read())
         return None
	# Copyright (C) 2001,2002 Python Software Foundation
	# Author: barry@zope.com (Barry Warsaw)

	"""A parser of RFC 2822 and MIME email messages.
	"""

	import re
	from cStringIO import StringIO
	from types import ListType

	from email import Errors
	from email import Message

	EMPTYSTRING = ''
	NL = '\n'

	try:
	True, False
	except NameError:
	True = 1
	False = 0

	# Matches one line ending: CRLF, a bare CR or a bare LF.
	NLCRE = re.compile('\r\n|\r|\n')

	class TextUtil:
	    """Wrap a file object and add line push-back and scanning helpers.

	    Lines handed to unreadline() are kept on a stack and are served, most
	    recently pushed first, before any further data is read from the
	    wrapped file object.
	    """

	    def __init__(self, fp):
	        """Wrap fp, an object providing readline() and read()."""
	        self.fp = fp
	        # Stack of pushed-back lines; the last element is returned next.
	        self.unread = []

	    def readline(self):
	        """Return the next line of data.

	        If data has been pushed back with unreadline(), the most recently
	        pushed line is returned; otherwise the line comes from the wrapped
	        file object.
	        """
	        if self.unread:
	            return self.unread.pop()
	        return self.fp.readline()

	    def unreadline(self, line):
	        """Push a line back so that the next readline() returns it."""
	        self.unread.append(line)

	    def peekline(self):
	        """Return the next line without consuming it."""
	        line = self.readline()
	        self.unreadline(line)
	        return line

	    def read(self):
	        """Return all remaining data, pushed-back lines first.

	        The pushed-back lines are emitted in the order readline() would
	        have returned them (most recently pushed first) and exactly as
	        they were pushed -- no separator is inserted between them --
	        followed by whatever is left in the wrapped file object.
	        """
	        rest = self.fp.read()
	        if self.unread:
	            # self.unread is a stack, so reverse a copy to get read order.
	            pending = self.unread[:]
	            pending.reverse()
	            self.unread = []
	            rest = ''.join(pending) + rest
	        return rest

	    def readuntil(self, re, afterblank=0, includematch=0):
	        """Read a line at a time until one matches the compiled pattern re.

	        Returns a 2-tuple: the text read before the matching line (plus the
	        matching line itself if includematch is true) and the match object.
	        The matching line is consumed, so reading continues with the line
	        that follows it.  If end-of-file is reached first, the text
	        collected so far is returned with None as the match object.

	        If afterblank is true, a line only counts as a match when the line
	        read immediately before it was blank.  Blank lines themselves are
	        never tested against the pattern in that mode, but they are kept
	        in the returned text.

	        Note that the parameter name shadows the re module inside this
	        method; it is kept so that keyword callers continue to work.
	        """
	        prematch = []
	        blankseen = 0
	        while 1:
	            line = self.readline()
	            if not line:
	                # end of file
	                return EMPTYSTRING.join(prematch), None
	            if afterblank and NLCRE.match(line):
	                blankseen = 1
	                prematch.append(line)
	                continue
	            m = re.match(line)
	            if m and (not afterblank or blankseen):
	                if includematch:
	                    prematch.append(line)
	                return EMPTYSTRING.join(prematch), m
	            # Only forget the blank line once it has been taken into
	            # account for the current line.
	            blankseen = 0
	            prematch.append(line)


	class Parser:
	    """Build a message object tree from RFC 2822 / MIME text."""

	    def __init__(self, _class=Message.Message, strict=False):
	        """Parser of RFC 2822 and MIME email messages.

	        Creates an in-memory object tree representing the email message, which
	        can then be manipulated and turned over to a Generator to return the
	        textual representation of the message.

	        The string must be formatted as a block of RFC 2822 headers and header
	        continuation lines, optionally preceded by a `Unix-from' header.  The
	        header block is terminated either by the end of the string or by a
	        blank line.

	        _class is the class to instantiate for new message objects when they
	        must be created.  This class must have a constructor that can take
	        zero arguments.  Default is Message.Message.

	        Optional strict tells the parser to be strictly RFC compliant or to be
	        more forgiving in parsing of ill-formatted MIME documents.  When
	        non-strict mode is used, the parser will try to make up for missing or
	        erroneous boundaries and other peculiarities seen in the wild.
	        Default is non-strict parsing.
	        """
	        self._class = _class
	        self._strict = strict

	    def parse(self, fp, headersonly=False):
	        """Create a message structure from the data in a file.

	        Reads all the data from the file and returns the root of the message
	        structure.  Optional headersonly is a flag specifying whether to stop
	        parsing after reading the headers or not.  The default is False,
	        meaning it parses the entire contents of the file.
	        """
	        root = self._class()
	        fp = TextUtil(fp)
	        self._parseheaders(root, fp)
	        if not headersonly:
	            obj = self._parsemessage(root, fp)
	            trailer = fp.read()
	            # Test against None explicitly.  _parsemessage() may return None
	            # (HeaderParser does), but a real message object must not be
	            # skipped just because it evaluates as false -- a message class
	            # may define __len__, e.g. as its header count, which is zero
	            # for a message without headers.
	            if obj is not None and trailer:
	                self._attach_trailer(obj, trailer)
	        return root

	    def parsestr(self, text, headersonly=False):
	        """Create a message structure from a string.

	        Returns the root of the message structure.  Optional headersonly is a
	        flag specifying whether to stop parsing after reading the headers or
	        not.  The default is False, meaning it parses the entire contents of
	        the file.
	        """
	        return self.parse(StringIO(text), headersonly=headersonly)

	    def _parseheaders(self, container, fp):
	        """Read a header block from fp and store the headers on container.

	        Reading stops at end of file or at the first blank line, which is
	        consumed.  A Unix-From line on the first line is recorded with
	        container.set_unixfrom(); continuation lines are joined to the
	        value of their header with newlines.  In non-strict mode a line
	        that is not a header ends the block and is pushed back onto fp as
	        the first body line.  Returns None.

	        Raises Errors.HeaderParseError for a continuation line seen before
	        any header and, in strict mode only, for a Unix-From line after the
	        first line or a line that is neither a header nor a continuation.
	        """
	        lastheader = ''
	        lastvalue = []
	        lineno = 0
	        while True:
	            # Don't strip the line before we test for the end condition,
	            # because whitespace-only header lines are RFC compliant
	            # continuation lines.
	            rawline = fp.readline()
	            if not rawline:
	                break
	            # Ignore the trailing newline
	            line = rawline.splitlines()[0]
	            if not line:
	                break
	            lineno += 1
	            # Check for initial Unix From_ line
	            if line.startswith('From '):
	                if lineno == 1:
	                    container.set_unixfrom(line)
	                    continue
	                elif self._strict:
	                    raise Errors.HeaderParseError(
	                        'Unix-from in headers after first rfc822 header')
	                else:
	                    # ignore the weirdly placed From_ line
	                    # XXX: maybe set unixfrom anyway? or only if not already?
	                    continue
	            # Header continuation line
	            if line[0] in ' \t':
	                if not lastheader:
	                    raise Errors.HeaderParseError(
	                        'Continuation line seen before first header')
	                lastvalue.append(line)
	                continue
	            # Normal, non-continuation header.  BAW: this should check to make
	            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
	            # should expose the header matching algorithm in the API, and
	            # allow for a non-strict parsing mode (that ignores the line
	            # instead of raising the exception).
	            i = line.find(':')
	            if i < 0:
	                if self._strict:
	                    raise Errors.HeaderParseError(
	                        "Not a header, not a continuation: ``%s''" % line)
	                elif lineno == 1 and line.startswith('--'):
	                    # allow through duplicate boundary tags.
	                    continue
	                else:
	                    # There was no separating blank line as mandated by RFC
	                    # 2822, but we're in non-strict mode.  So just offer up
	                    # this current line as the first body line.  Push back
	                    # the line as it was read, line ending included, so
	                    # that it is not glued to the line that follows it.
	                    fp.unreadline(rawline)
	                    break
	            if lastheader:
	                container[lastheader] = NL.join(lastvalue)
	            lastheader = line[:i]
	            lastvalue = [line[i+1:].lstrip()]
	        # Make sure we retain the last header
	        if lastheader:
	            container[lastheader] = NL.join(lastvalue)
	        return

	    def _parsemessage(self, container, fp):
	        """Parse the body that follows container's headers.

	        Walks the body from top to bottom, recursing for nested multipart
	        and message/* parts.  Returns the object that should receive the
	        data that follows this block; the caller attaches that data.

	        Raises Errors.BoundaryError if a multipart container has no
	        boundary, or if the data of a part is not followed by a boundary
	        and does not end with a line separator.
	        """
	        boundary = container.get_boundary()
	        isdigest = (container.get_content_type() == 'multipart/digest')
	        if boundary:
	            separator = '--' + boundary
	            boundaryRE = re.compile(
	                    r'(?P<sep>' + re.escape(separator) +
	                    r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
	            preamble, matchobj = fp.readuntil(boundaryRE)
	            if not matchobj:
	                # Broken - we hit the end of file. Just set the body
	                # to the text.
	                container.set_payload(preamble)
	                return container
	            if preamble:
	                container.preamble = preamble
	            else:
	                # The module docs specify an empty preamble is None, not ''
	                container.preamble = None
	            while True:
	                subobj = self._class()
	                if isdigest:
	                    subobj.set_default_type('message/rfc822')
	                    firstline = fp.peekline()
	                    if firstline.strip():
	                        # we have MIME headers. all good.
	                        self._parseheaders(subobj, fp)
	                    else:
	                        # no MIME headers. this is allowed for
	                        # multipart/digest.  Consume the extra blank line
	                        fp.readline()
	                else:
	                    self._parseheaders(subobj, fp)
	                container.attach(subobj)
	                maintype = subobj.get_content_maintype()
	                if maintype in ("message", "multipart"):
	                    subobj = self._parsemessage(subobj, fp)

	                trailer, matchobj = fp.readuntil(boundaryRE)
	                if matchobj is None or trailer:
	                    # Drop the final line separator from the data of the
	                    # part.
	                    mo = re.search('(?P<sep>\r\n|\r|\n){2}$', trailer)
	                    if not mo:
	                        mo = re.search('(?P<sep>\r\n|\r|\n)$', trailer)
	                        if not mo:
	                            raise Errors.BoundaryError(
	                          'No terminating boundary and no trailing empty line')
	                    linesep = mo.group('sep')
	                    trailer = trailer[:-len(linesep)]
	                if trailer:
	                    self._attach_trailer(subobj, trailer)
	                if matchobj is None or matchobj.group('end'):
	                    # That was the last piece of data. Let our caller attach
	                    # the epilogue to us. But before we do that, push the
	                    # line ending of the match group back into the readline
	                    # buffer, as it's part of the epilogue.
	                    if matchobj:
	                        fp.unreadline(matchobj.group('linesep'))
	                    return container

	        elif container.get_content_maintype() == "multipart":
	            # Very bad.  A message is a multipart with no boundary!
	            raise Errors.BoundaryError(
	                    'multipart message with no defined boundary')
	        elif container.get_content_maintype() == "message":
	            ct = container.get_content_type()
	            if ct == "message/rfc822":
	                submessage = self._class()
	                self._parseheaders(submessage, fp)
	                self._parsemessage(submessage, fp)
	                container.attach(submessage)
	                return submessage
	            elif ct == "message/delivery-status":
	                # This special kind of type contains blocks of headers
	                # separated by a blank line.  We'll represent each header
	                # block as a separate Message object
	                while True:
	                    nextblock = self._class()
	                    self._parseheaders(nextblock, fp)
	                    container.attach(nextblock)
	                    # next peek ahead to see whether we've hit the end or
	                    # not.  An empty string means end of input; without
	                    # that check the loop would never terminate there.
	                    nextline = fp.peekline()
	                    if not nextline or nextline[:2] == "--":
	                        break
	                return container
	            else:
	                # Other sort of message object (e.g. external-body)
	                msg = self._class()
	                self._parsemessage(msg, fp)
	                container.attach(msg)
	                return msg
	        else:
	            # single body section. We let our caller set the payload.
	            return container

	    def _attach_trailer(self, obj, trailer):
	        """Store trailer on obj: as the epilogue of a message/* or
	        multipart/* object, as the payload of anything else.
	        """
	        if obj.get_content_maintype() in ("message", "multipart"):
	            obj.epilogue = trailer
	        else:
	            obj.set_payload(trailer)


	class HeaderParser(Parser):
	    """A subclass of Parser, this one only meaningfully parses message headers.

	    This class can be used if all you're interested in is the headers of a
	    message.  While it consumes the message body, it does not parse it, but
	    simply makes it available as a string payload.

	    Parsing with this subclass can be considerably faster if all you're
	    interested in is the message headers.
	    """
	    def _parsemessage(self, container, fp):
	        # Consume but do not parse the body: everything that is left
	        # becomes one opaque payload.  There is no object left to
	        # receive trailing data, hence the None.
	        text = fp.read()
	        container.set_payload(text)
	        return None