Blame - Lib/email/Parser.py - platform/external/python/cpython2

blob: cc23d1984ac71c64d51464c8509235f851761665 [file] [log] [blame]

Barry Warsaw	ba92580	2001-09-23 03:17:28 +0000	[diff] [blame]	1	# Copyright (C) 2001 Python Software Foundation
				2	# Author: barry@zope.com (Barry Warsaw)
				3
				4	"""A parser of RFC 2822 and MIME email messages.
				5	"""
				6
				7	import re
				8	from cStringIO import StringIO
				9
				10	# Intrapackage imports
				11	import Errors
				12	import Message
				13
				14	bcre = re.compile('boundary="?([^"]+)"?', re.IGNORECASE)
				15	EMPTYSTRING = ''
				16	NL = '\n'
				17
				18
				19
				20	class Parser:
				21	def __init__(self, _class=Message.Message):
				22	"""Parser of RFC 2822 and MIME email messages.
				23
				24	Creates an in-memory object tree representing the email message, which
				25	can then be manipulated and turned over to a Generator to return the
				26	textual representation of the message.
				27
				28	The string must be formatted as a block of RFC 2822 headers and header
				29	continuation lines, optionally preceeded by a `Unix-from' header. The
				30	header block is terminated either by the end of the string or by a
				31	blank line.
				32
				33	_class is the class to instantiate for new message objects when they
				34	must be created. This class must have a constructor that can take
				35	zero arguments. Default is Message.Message.
				36	"""
				37	self._class = _class
				38
				39	def parse(self, fp):
				40	root = self._class()
				41	self._parseheaders(root, fp)
				42	self._parsebody(root, fp)
				43	return root
				44
				45	def parsestr(self, text):
				46	return self.parse(StringIO(text))
				47
				48	def _parseheaders(self, container, fp):
				49	# Parse the headers, returning a list of header/value pairs. None as
				50	# the header means the Unix-From header.
				51	lastheader = ''
				52	lastvalue = []
				53	lineno = 0
				54	while 1:
				55	line = fp.readline()[:-1]
				56	if not line or not line.strip():
				57	break
				58	lineno += 1
				59	# Check for initial Unix From_ line
				60	if line.startswith('From '):
				61	if lineno == 1:
				62	container.set_unixfrom(line)
				63	continue
				64	else:
				65	raise Errors.HeaderParseError(
				66	'Unix-from in headers after first rfc822 header')
				67	#
				68	# Header continuation line
				69	if line[0] in ' \t':
				70	if not lastheader:
				71	raise Errors.HeaderParseError(
				72	'Continuation line seen before first header')
				73	lastvalue.append(line)
				74	continue
				75	# Normal, non-continuation header. BAW: this should check to make
				76	# sure it's a legal header, e.g. doesn't contain spaces. Also, we
				77	# should expose the header matching algorithm in the API, and
				78	# allow for a non-strict parsing mode (that ignores the line
				79	# instead of raising the exception).
				80	i = line.find(':')
				81	if i < 0:
				82	raise Errors.HeaderParseError(
				83	'Not a header, not a continuation')
				84	if lastheader:
				85	container[lastheader] = NL.join(lastvalue)
				86	lastheader = line[:i]
				87	lastvalue = [line[i+1:].lstrip()]
				88	# Make sure we retain the last header
				89	if lastheader:
				90	container[lastheader] = NL.join(lastvalue)
				91
				92	def _parsebody(self, container, fp):
				93	# Parse the body, but first split the payload on the content-type
				94	# boundary if present.
				95	boundary = isdigest = None
				96	ctype = container['content-type']
				97	if ctype:
				98	mo = bcre.search(ctype)
				99	if mo:
				100	boundary = mo.group(1)
				101	isdigest = container.get_type() == 'multipart/digest'
				102	# If there's a boundary, split the payload text into its constituent
				103	# parts and parse each separately. Otherwise, just parse the rest of
				104	# the body as a single message. Note: any exceptions raised in the
				105	# recursive parse need to have their line numbers coerced.
				106	if boundary:
				107	preamble = epilogue = None
				108	# Split into subparts. The first boundary we're looking for won't
				109	# have the leading newline since we're at the start of the body
				110	# text.
				111	separator = '--' + boundary
				112	payload = fp.read()
				113	start = payload.find(separator)
				114	if start < 0:
				115	raise Errors.BoundaryError(
				116	"Couldn't find starting boundary: %s" % boundary)
				117	if start > 0:
				118	# there's some pre-MIME boundary preamble
				119	preamble = payload[0:start]
				120	start += len(separator) + 1 + isdigest
				121	terminator = payload.find('\n' + separator + '--', start)
				122	if terminator < 0:
				123	raise Errors.BoundaryError(
				124	"Couldn't find terminating boundary: %s" % boundary)
				125	if terminator+len(separator)+3 < len(payload):
				126	# there's some post-MIME boundary epilogue
				127	epilogue = payload[terminator+len(separator)+3:]
				128	# We split the textual payload on the boundary separator, which
				129	# includes the trailing newline. If the container is a
				130	# multipart/digest then the subparts are by default message/rfc822
				131	# instead of text/plain. In that case, they'll have an extra
				132	# newline before the headers to distinguish the message's headers
				133	# from the subpart headers.
				134	if isdigest:
				135	separator += '\n\n'
				136	else:
				137	separator += '\n'
				138	parts = payload[start:terminator].split('\n' + separator)
				139	for part in parts:
				140	msgobj = self.parsestr(part)
				141	container.preamble = preamble
				142	container.epilogue = epilogue
				143	container.add_payload(msgobj)
				144	elif ctype == 'message/rfc822':
				145	# Create a container for the payload, but watch out for there not
				146	# being any headers left
				147	try:
				148	msg = self.parse(fp)
				149	except Errors.HeaderParseError:
				150	msg = self._class()
				151	self._parsebody(msg, fp)
				152	container.add_payload(msg)
				153	else:
				154	container.add_payload(fp.read())