Blame - Lib/email/Parser.py - platform/external/python/cpython2

2001-09-23 03:17:28 +0000

[diff] [blame]

2

# Author: barry@zope.com (Barry Warsaw)

3

4

"""A parser of RFC 2822 and MIME email messages.

5

"""

6

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

7

import re

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

8

from cStringIO import StringIO

Barry Warsaw

15e9dc9

2002-01-27 06:48:02 +0000

[diff] [blame]

9

from types import ListType

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

10

Barry Warsaw

69e18af

2002-06-02 19:12:03 +0000

[diff] [blame]

11

from email import Errors

12

from email import Message

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

13

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

# The empty string, bound to a module-level name.
EMPTYSTRING = ''

# Newline; joins a header's first line with its continuation lines.
NL = '\n'

Barry Warsaw

2001-10-04 17:05:11 +0000

[diff] [blame]

17

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

18

class Parser:

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

19

def __init__(self, _class=Message.Message, strict=1):
    """Set up a parser for RFC 2822 and MIME email messages.

    The parser builds an in-memory tree of message objects from the text
    of a message; that tree can be manipulated and later handed to a
    Generator to produce the textual form again.

    The input must be a block of RFC 2822 headers and header continuation
    lines, optionally preceded by a `Unix-from' header.  The header block
    ends at the first blank line or at the end of the input.

    _class is the class instantiated whenever a new message object is
    needed.  It is called with no arguments, so its constructor must
    accept zero arguments.  The default is Message.Message.

    strict selects between strictly RFC-compliant parsing (true, the
    default) and a forgiving mode (false) in which the parser tries to
    cope with missing or erroneous boundaries and other oddities found
    in real-world MIME documents.
    """
    self._class, self._strict = _class, strict

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

43

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

44

def parse(self, fp, headersonly=0):
    """Build a message object tree from the file-like object fp.

    A fresh self._class() instance is created and filled with the headers
    read from fp.  The body is parsed as well unless headersonly is true,
    in which case only the header block is processed.  Returns the root
    message object.
    """
    msg = self._class()
    self._parseheaders(msg, fp)
    if headersonly:
        # Caller only wants the headers; leave the rest of fp unread.
        return msg
    self._parsebody(msg, fp)
    return msg

50

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

51

def parsestr(self, text, headersonly=0):
    """Parse a message held in the string text.

    The text is wrapped in a StringIO object and passed to parse();
    headersonly is forwarded unchanged.  Returns whatever parse()
    returns.
    """
    fp = StringIO(text)
    return self.parse(fp, headersonly=headersonly)

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

53

54

def _parseheaders(self, container, fp):
    """Read one header block from fp and store its headers on container.

    Lines are read with fp.readline() up to and including the first empty
    line, or until fp is exhausted.  A Unix-From line seen as the very
    first line is recorded with container.set_unixfrom().  Every other
    header is stored as container[name] = value, where the value is the
    text after the colon (leading whitespace stripped) joined by NL with
    any continuation lines that follow it, kept as they were read.

    Raises Errors.HeaderParseError when a continuation line appears
    before any header, when a line is neither a header nor a
    continuation, and, in strict mode, when a Unix-From line appears
    anywhere but the first line.
    """
    name = ''
    chunks = []
    count = 0
    while 1:
        # Test the raw line before removing anything from it: a
        # whitespace-only line is an RFC compliant continuation line,
        # not the end of the header block.
        raw = fp.readline()
        if not raw:
            break
        # Drop the line terminator.  Nothing is left only for the blank
        # line that closes the header block.
        line = raw.splitlines()[0]
        if not line:
            break
        count += 1
        # Unix From_ envelope line
        if line.startswith('From '):
            if count == 1:
                container.set_unixfrom(line)
            elif self._strict:
                raise Errors.HeaderParseError(
                    'Unix-from in headers after first rfc822 header')
            # Either the envelope line was just recorded, or this is a
            # weirdly placed From_ line that non-strict mode ignores.
            # XXX: maybe set unixfrom anyway? or only if not already?
            continue
        # Header continuation line
        if line[0] in ' \t':
            if not name:
                raise Errors.HeaderParseError(
                    'Continuation line seen before first header')
            chunks.append(line)
            continue
        # Normal, non-continuation header.  BAW: this should check to
        # make sure it's a legal header, e.g. doesn't contain spaces.
        # Also, we should expose the header matching algorithm in the
        # API.
        colon = line.find(':')
        if colon < 0:
            if not self._strict and count == 1 and line.startswith('--'):
                # Non-strict mode lets a duplicate boundary tag on the
                # first line through.
                continue
            raise Errors.HeaderParseError(
                "Not a header, not a continuation: ``%s''"%line)
        # A new header starts here, so the previous one is complete.
        if name:
            container[name] = NL.join(chunks)
        name = line[:colon]
        chunks = [line[colon+1:].lstrip()]
    # Make sure the last header seen is stored too
    if name:
        container[name] = NL.join(chunks)

114

115

def _parsebody(self, container, fp):

116

# Parse the body, but first split the payload on the content-type

117

# boundary if present.

Barry Warsaw

66971fb

2001-09-26 05:44:09 +0000

[diff] [blame]

118

boundary = container.get_boundary()

119

isdigest = (container.get_type() == 'multipart/digest')

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

120

# If there's a boundary, split the payload text into its constituent

121

# parts and parse each separately. Otherwise, just parse the rest of

122

# the body as a single message. Note: any exceptions raised in the

123

# recursive parse need to have their line numbers coerced.

124

if boundary:

125

preamble = epilogue = None

126

# Split into subparts. The first boundary we're looking for won't

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

127

# always have a leading newline since we're at the start of the

128

# body text, and there's not always a preamble before the first

129

# boundary.

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

130

separator = '--' + boundary

131

payload = fp.read()

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

132

# We use an RE here because boundaries can have trailing

133

# whitespace.

134

mo = re.search(

135

r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',

136

payload)

137

if not mo:

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

138

raise Errors.BoundaryError(

139

"Couldn't find starting boundary: %s" % boundary)

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

140

start = mo.start()

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

141

if start > 0:

142

# there's some pre-MIME boundary preamble

143

preamble = payload[0:start]

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

144

# Find out what kind of line endings we're using

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

145

start += len(mo.group('sep')) + len(mo.group('ws'))

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

146

cre = re.compile('\r\n|\r|\n')

147

mo = cre.search(payload, start)

148

if mo:

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

149

start += len(mo.group(0))

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

150

# We create a compiled regexp first because we need to be able to

151

# specify the start position, and the module function doesn't

152

# support this signature. :(

153

cre = re.compile('(?P<sep>\r\n|\r|\n)' +

154

re.escape(separator) + '--')

155

mo = cre.search(payload, start)

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

156

if mo:

157

terminator = mo.start()

158

linesep = mo.group('sep')

159

if mo.end() < len(payload):

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

160

# There's some post-MIME boundary epilogue

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

161

epilogue = payload[mo.end():]

162

elif self._strict:

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

163

raise Errors.BoundaryError(

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

164

"Couldn't find terminating boundary: %s" % boundary)

165

else:

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

166

# Handle the case of no trailing boundary. Check that it ends

167

# in a blank line. Some cases (spamspamspam) don't even have

168

# that!

169

mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

170

if not mo:

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

171

mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)

172

if not mo:

173

raise Errors.BoundaryError(

174

'No terminating boundary and no trailing empty line')

175

linesep = mo.group('sep')

176

terminator = len(payload)

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

177

# We split the textual payload on the boundary separator, which

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

178

# includes the trailing newline. If the container is a

179

# multipart/digest then the subparts are by default message/rfc822

180

# instead of text/plain. In that case, they'll have a optional

181

# block of MIME headers, then an empty line followed by the

182

# message headers.

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

183

parts = re.split(

184

linesep + re.escape(separator) + r'[ \t]*' + linesep,

185

payload[start:terminator])

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

186

for part in parts:

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

187

if isdigest:

188

if part[0] == linesep:

189

# There's no header block so create an empty message

190

# object as the container, and lop off the newline so

191

# we can parse the sub-subobject

192

msgobj = self._class()

193

part = part[1:]

194

else:

195

parthdrs, part = part.split(linesep+linesep, 1)

196

# msgobj in this case is the "message/rfc822" container

197

msgobj = self.parsestr(parthdrs, headersonly=1)

198

# while submsgobj is the message itself

199

submsgobj = self.parsestr(part)

200

msgobj.attach(submsgobj)

201

msgobj.set_default_type('message/rfc822')

202

else:

203

msgobj = self.parsestr(part)

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

204

container.preamble = preamble

205

container.epilogue = epilogue

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

206

container.attach(msgobj)

207

elif container.get_main_type() == 'multipart':

208

# Very bad. A message is a multipart with no boundary!

209

raise Errors.BoundaryError(

210

'multipart message with no defined boundary')

Barry Warsaw

66971fb

2001-09-26 05:44:09 +0000

[diff] [blame]

211

elif container.get_type() == 'message/delivery-status':

212

# This special kind of type contains blocks of headers separated

213

# by a blank line. We'll represent each header block as a

214

# separate Message object

215

blocks = []

216

while 1:

217

blockmsg = self._class()

218

self._parseheaders(blockmsg, fp)

219

if not len(blockmsg):

220

# No more header blocks left

221

break

222

blocks.append(blockmsg)

223

container.set_payload(blocks)

224

elif container.get_main_type() == 'message':

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

225

# Create a container for the payload, but watch out for there not

226

# being any headers left

227

try:

228

msg = self.parse(fp)

229

except Errors.HeaderParseError:

230

msg = self._class()

231

self._parsebody(msg, fp)

Barry Warsaw

69e18af

2002-06-02 19:12:03 +0000

[diff] [blame]

232

container.attach(msg)

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

233

else:

Barry Warsaw