Blame - Lib/email/Parser.py - platform/external/python/cpython3

2001-09-23 03:17:28 +0000

[diff] [blame]

2

# Author: barry@zope.com (Barry Warsaw)

3

4

"""A parser of RFC 2822 and MIME email messages.

5

"""

6

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

7

import re

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

8

from cStringIO import StringIO

Barry Warsaw

15e9dc9

2002-01-27 06:48:02 +0000

[diff] [blame]

9

from types import ListType

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

10

Barry Warsaw

69e18af

2002-06-02 19:12:03 +0000

[diff] [blame]

11

from email import Errors

12

from email import Message

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

13

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

EMPTYSTRING = ''

NL = '\n'

Barry Warsaw

2001-10-04 17:05:11 +0000

[diff] [blame]

18

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

19

class Parser:

20

def __init__(self, _class=Message.Message):
    """Parser of RFC 2822 and MIME email messages.

    Builds an in-memory object tree representing the email message, which
    can then be manipulated and handed to a Generator to get back the
    textual representation of the message.

    The input must be formatted as a block of RFC 2822 headers and header
    continuation lines, optionally preceded by a `Unix-from' header.  The
    header block is terminated either by the end of the input or by a
    blank line.

    _class is the class to instantiate whenever a new message object is
    needed.  It is called with zero arguments, so its constructor must
    accept that.  Defaults to Message.Message.
    """
    # Remember the factory; parse() calls it to create the root message.
    self._class = _class

def parse(self, fp):
    """Build a message object tree from the file-like object fp.

    A new instance of the configured message class becomes the root.  The
    header block is read from fp first, then whatever remains in fp is
    handed to the body parser.  The populated root object is returned.
    """
    message = self._class()
    # Headers must be consumed before the body: the body parser reads the
    # rest of fp and consults the container (boundary, content type).
    self._parseheaders(message, fp)
    self._parsebody(message, fp)
    return message

43

44

def parsestr(self, text):
    """Parse a message held in the string text.

    The string is wrapped in an in-memory file object and passed to
    parse(); whatever parse() returns is returned unchanged.
    """
    source = StringIO(text)
    return self.parse(source)

46

47

def _parseheaders(self, container, fp):
    """Read one header block from fp and store it on container.

    Lines are read from fp until end of input or the first empty line
    (that empty line is consumed).  Each header is stored with
    ``container[name] = value``; nothing is returned.

    * A line starting with 'From ' is accepted only as the very first
      line and is passed to container.set_unixfrom().
    * A line starting with a space or tab continues the previous header;
      it is kept verbatim (leading whitespace included) and joined to the
      earlier value lines with NL.
    * Any other line is split at its first ':'.  The text before it is
      the header name; the text after it, with leading whitespace
      stripped, starts the value.  A header whose name is empty is never
      stored.

    Raises Errors.HeaderParseError for a 'From ' line after the first
    line, for a continuation line with no preceding header, and for a
    line that has no ':' and is not a continuation.
    """
    name = ''       # header currently being accumulated ('' = none yet)
    chunks = []     # value lines collected for that header
    count = 0       # number of non-empty lines seen so far
    while 1:
        raw = fp.readline()
        # Only the line ending is dropped before testing for the end of
        # the block; the line is deliberately not stripped, because
        # whitespace-only header lines are RFC compliant continuation
        # lines rather than terminators.
        text = raw and raw.splitlines()[0]
        if not text:
            # Either end of input or the blank line closing the headers.
            break
        count += 1
        # Check for initial Unix From_ line
        if text.startswith('From '):
            if count != 1:
                raise Errors.HeaderParseError(
                    'Unix-from in headers after first rfc822 header')
            container.set_unixfrom(text)
            continue
        # Header continuation line
        if text[0] in ' \t':
            if not name:
                raise Errors.HeaderParseError(
                    'Continuation line seen before first header')
            chunks.append(text)
            continue
        # Normal, non-continuation header.  BAW: this should check to make
        # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
        # should expose the header matching algorithm in the API, and
        # allow for a non-strict parsing mode (that ignores the line
        # instead of raising the exception).
        colon = text.find(':')
        if colon < 0:
            raise Errors.HeaderParseError(
                'Not a header, not a continuation')
        # A new header begins, so the one collected so far is complete.
        if name:
            container[name] = NL.join(chunks)
        name = text[:colon]
        chunks = [text[colon+1:].lstrip()]
    # Make sure we retain the last header
    if name:
        container[name] = NL.join(chunks)

96

97

def _parsebody(self, container, fp):

98

# Parse the body, but first split the payload on the content-type

99

# boundary if present.

Barry Warsaw

66971fb

2001-09-26 05:44:09 +0000

[diff] [blame]

100

boundary = container.get_boundary()

101

isdigest = (container.get_type() == 'multipart/digest')

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

102

# If there's a boundary, split the payload text into its constituent

103

# parts and parse each separately. Otherwise, just parse the rest of

104

# the body as a single message. Note: any exceptions raised in the

105

# recursive parse need to have their line numbers coerced.

106

if boundary:

107

preamble = epilogue = None

108

# Split into subparts. The first boundary we're looking for won't

109

# have the leading newline since we're at the start of the body

110

# text.

111

separator = '--' + boundary

112

payload = fp.read()

113

start = payload.find(separator)

114

if start < 0:

115

raise Errors.BoundaryError(

116

"Couldn't find starting boundary: %s" % boundary)

117

if start > 0:

118

# there's some pre-MIME boundary preamble

119

preamble = payload[0:start]

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

120

# Find out what kind of line endings we're using

121

start += len(separator)

122

cre = re.compile('\r\n|\r|\n')

123

mo = cre.search(payload, start)

124

if mo:

125

start += len(mo.group(0)) * (1 + isdigest)

126

# We create a compiled regexp first because we need to be able to

127

# specify the start position, and the module function doesn't

128

# support this signature. :(

129

cre = re.compile('(?P<sep>\r\n|\r|\n)' +

130

re.escape(separator) + '--')

131

mo = cre.search(payload, start)

132

if not mo:

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

133

raise Errors.BoundaryError(

134

"Couldn't find terminating boundary: %s" % boundary)

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

135

terminator = mo.start()

136

linesep = mo.group('sep')

137

if mo.end() < len(payload):

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

138

# there's some post-MIME boundary epilogue

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

139

epilogue = payload[mo.end():]

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

140

# We split the textual payload on the boundary separator, which

141

# includes the trailing newline. If the container is a

142

# multipart/digest then the subparts are by default message/rfc822

143

# instead of text/plain. In that case, they'll have an extra

144

# newline before the headers to distinguish the message's headers

145

# from the subpart headers.

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

146

separator += linesep * (1 + isdigest)

147

parts = payload[start:terminator].split(linesep + separator)

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

148

for part in parts:

149

msgobj = self.parsestr(part)

150

container.preamble = preamble

151

container.epilogue = epilogue

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

152

container.attach(msgobj)

153

elif container.get_main_type() == 'multipart':

154

# Very bad. A message is a multipart with no boundary!

155

raise Errors.BoundaryError(

156

'multipart message with no defined boundary')

Barry Warsaw

66971fb

2001-09-26 05:44:09 +0000

[diff] [blame]

157

elif container.get_type() == 'message/delivery-status':

158

# This special kind of type contains blocks of headers separated

159

# by a blank line. We'll represent each header block as a

160

# separate Message object

161

blocks = []

162

while 1:

163

blockmsg = self._class()

164

self._parseheaders(blockmsg, fp)

165

if not len(blockmsg):

166

# No more header blocks left

167

break

168

blocks.append(blockmsg)

169

container.set_payload(blocks)

170

elif container.get_main_type() == 'message':

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

171

# Create a container for the payload, but watch out for there not

172

# being any headers left

173

try:

174

msg = self.parse(fp)

175

except Errors.HeaderParseError:

176

msg = self._class()

177

self._parsebody(msg, fp)

Barry Warsaw

69e18af

2002-06-02 19:12:03 +0000

[diff] [blame]

178

container.attach(msg)

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

179

else:

Barry Warsaw