Blame - Lib/email/Parser.py - platform/external/python/cpython3

2001-09-23 03:17:28 +0000

[diff] [blame]

2

# Author: barry@zope.com (Barry Warsaw)

3

4

"""A parser of RFC 2822 and MIME email messages.

5

"""

6

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

7

import re

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

8

from cStringIO import StringIO

Barry Warsaw

15e9dc9

2002-01-27 06:48:02 +0000

[diff] [blame]

9

from types import ListType

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

10

Barry Warsaw

69e18af

2002-06-02 19:12:03 +0000

[diff] [blame]

11

from email import Errors

12

from email import Message

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

13

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

# Module-level string constants.  NL is the separator used when a header's
# first line and its continuation lines are joined back into one value.
EMPTYSTRING = ''

NL = '\n'

Barry Warsaw

2002-09-28 20:44:58 +0000

[diff] [blame]

# Compatibility shim: if the interpreter does not define the names True and
# False, the bare reference below raises NameError and integer stand-ins are
# bound at module level instead.
try:
    True, False
except NameError:
    True = 1
    False = 0

Barry Warsaw

2001-10-04 17:05:11 +0000

[diff] [blame]

24

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

25

class Parser:

Barry Warsaw

2002-09-28 20:44:58 +0000

[diff] [blame]

26

def __init__(self, _class=Message.Message, strict=False):

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

27

"""Parser of RFC 2822 and MIME email messages.

28

29

Creates an in-memory object tree representing the email message, which

30

can then be manipulated and turned over to a Generator to return the

31

textual representation of the message.

32

33

The string must be formatted as a block of RFC 2822 headers and header

34

continuation lines, optionally preceeded by a `Unix-from' header. The

35

header block is terminated either by the end of the string or by a

36

blank line.

37

38

_class is the class to instantiate for new message objects when they

39

must be created. This class must have a constructor that can take

40

zero arguments. Default is Message.Message.

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

41

42

Optional strict tells the parser to be strictly RFC compliant or to be

43

more forgiving in parsing of ill-formatted MIME documents. When

44

non-strict mode is used, the parser will try to make up for missing or

45

erroneous boundaries and other peculiarities seen in the wild.

Barry Warsaw

bb26b45

2002-07-19 22:25:34 +0000

[diff] [blame]

46

Default is non-strict parsing.

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

47

"""

48

self._class = _class

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

49

self._strict = strict

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

50

Barry Warsaw

2002-09-28 20:44:58 +0000

[diff] [blame]

51

def parse(self, fp, headersonly=False):
    """Build a message structure from the contents of a file object.

    A fresh message object is created with self._class(), the header
    block is read from fp into it, and, unless headersonly is true, the
    remainder of fp is parsed as the body.  The root message object is
    returned.

    headersonly defaults to False, meaning the whole file is parsed.
    """
    msg = self._class()
    self._parseheaders(msg, fp)
    if headersonly:
        # Caller only wants the headers; leave the rest of fp unread.
        return msg
    self._parsebody(msg, fp)
    return msg

64

Barry Warsaw

2002-09-28 20:44:58 +0000

[diff] [blame]

65

def parsestr(self, text, headersonly=False):
    """Build a message structure from a string.

    The text is wrapped in a StringIO object and handed to parse(); the
    root message object that parse() produces is returned.

    headersonly is passed straight through to parse().  It defaults to
    False, meaning the whole text is parsed rather than just the headers.
    """
    stream = StringIO(text)
    return self.parse(stream, headersonly=headersonly)

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

74

75

def _parseheaders(self, container, fp):
    """Read one header block from fp and store it on container.

    Lines are consumed from fp until end of input or the first empty
    line.  Each header is stored with container[name] = value, where the
    value is the header's first line (text after the colon, with leading
    whitespace stripped) joined by NL with any continuation lines.  A
    `From ' line seen as the very first line is recorded through
    container.set_unixfrom().  Nothing is returned.

    Raises Errors.HeaderParseError when:
      * a `From ' line appears after the first line (strict mode only;
        otherwise the line is skipped),
      * a continuation line appears before any header, or
      * a line is neither a header nor a continuation.  In non-strict
        mode a first line starting with `--' is skipped instead, so a
        duplicate boundary tag is tolerated.
    """
    name = ''
    chunks = []
    count = 0
    while True:
        # The raw line must not be stripped before the end-of-block test:
        # a whitespace-only line is a legal continuation line, not a
        # terminator.
        raw = fp.readline()
        if not raw:
            break
        # Drop the trailing line ending; an empty result is the blank line
        # that closes the header block.
        text = raw.splitlines()[0]
        if not text:
            break
        count += 1
        # Unix From_ line: only legitimate as the very first line.
        if text.startswith('From '):
            if count == 1:
                container.set_unixfrom(text)
            elif self._strict:
                raise Errors.HeaderParseError(
                    'Unix-from in headers after first rfc822 header')
            # Otherwise the weirdly placed From_ line is ignored.
            # XXX: maybe set unixfrom anyway? or only if not already?
            continue
        # Continuation of the previous header.
        if text[0] in ' \t':
            if not name:
                raise Errors.HeaderParseError(
                    'Continuation line seen before first header')
            chunks.append(text)
            continue
        # Ordinary header line.  BAW: this should check to make sure it's
        # a legal header, e.g. doesn't contain spaces.  Also, the header
        # matching algorithm should be exposed in the API.
        colon = text.find(':')
        if colon < 0:
            if not self._strict and count == 1 and text.startswith('--'):
                # Allow through duplicate boundary tags.
                continue
            raise Errors.HeaderParseError(
                "Not a header, not a continuation: ``%s''"%text)
        # A new header starts here, so flush the one collected so far.
        if name:
            container[name] = NL.join(chunks)
        name = text[:colon]
        chunks = [text[colon+1:].lstrip()]
    # Make sure the final header is not lost.
    if name:
        container[name] = NL.join(chunks)

135

136

def _parsebody(self, container, fp):

137

# Parse the body, but first split the payload on the content-type

138

# boundary if present.

Barry Warsaw

66971fb

2001-09-26 05:44:09 +0000

[diff] [blame]

139

boundary = container.get_boundary()

140

isdigest = (container.get_type() == 'multipart/digest')

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

141

# If there's a boundary, split the payload text into its constituent

142

# parts and parse each separately. Otherwise, just parse the rest of

143

# the body as a single message. Note: any exceptions raised in the

144

# recursive parse need to have their line numbers coerced.

145

if boundary:

146

preamble = epilogue = None

147

# Split into subparts. The first boundary we're looking for won't

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

148

# always have a leading newline since we're at the start of the

149

# body text, and there's not always a preamble before the first

150

# boundary.

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

151

separator = '--' + boundary

152

payload = fp.read()

Tim Peters

280488b

2002-08-23 18:19:30 +0000

[diff] [blame]

153

# We use an RE here because boundaries can have trailing

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

154

# whitespace.

155

mo = re.search(

156

r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',

157

payload)

158

if not mo:

Barry Warsaw

034b47a

2002-09-10 16:14:56 +0000

[diff] [blame]

159

if self._strict:

160

raise Errors.BoundaryError(

161

"Couldn't find starting boundary: %s" % boundary)

162

container.set_payload(payload)

163

return

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

164

start = mo.start()

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

165

if start > 0:

166

# there's some pre-MIME boundary preamble

167

preamble = payload[0:start]

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

168

# Find out what kind of line endings we're using

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

169

start += len(mo.group('sep')) + len(mo.group('ws'))

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

170

cre = re.compile('\r\n|\r|\n')

171

mo = cre.search(payload, start)

172

if mo:

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

173

start += len(mo.group(0))

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

174

# We create a compiled regexp first because we need to be able to

175

# specify the start position, and the module function doesn't

176

# support this signature. :(

177

cre = re.compile('(?P<sep>\r\n|\r|\n)' +

178

re.escape(separator) + '--')

179

mo = cre.search(payload, start)

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

180

if mo:

181

terminator = mo.start()

182

linesep = mo.group('sep')

183

if mo.end() < len(payload):

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

184

# There's some post-MIME boundary epilogue

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

185

epilogue = payload[mo.end():]

186

elif self._strict:

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

187

raise Errors.BoundaryError(

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

188

"Couldn't find terminating boundary: %s" % boundary)

189

else:

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

190

# Handle the case of no trailing boundary. Check that it ends

191

# in a blank line. Some cases (spamspamspam) don't even have

192

# that!

193

mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

194

if not mo:

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

195

mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)

196

if not mo:

197

raise Errors.BoundaryError(

198

'No terminating boundary and no trailing empty line')

199

linesep = mo.group('sep')

200

terminator = len(payload)

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

201

# We split the textual payload on the boundary separator, which

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

202

# includes the trailing newline. If the container is a

Tim Peters

280488b

2002-08-23 18:19:30 +0000

[diff] [blame]

203

# multipart/digest then the subparts are by default message/rfc822

204

# instead of text/plain. In that case, they'll have an optional

205

# block of MIME headers, then an empty line followed by the

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

206

# message headers.

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

207

parts = re.split(

208

linesep + re.escape(separator) + r'[ \t]*' + linesep,

209

payload[start:terminator])

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

210

for part in parts:

Tim Peters

280488b

2002-08-23 18:19:30 +0000

[diff] [blame]

211

if isdigest:

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

212

if part[0] == linesep:

213

# There's no header block so create an empty message

214

# object as the container, and lop off the newline so

215

# we can parse the sub-subobject

216

msgobj = self._class()

217

part = part[1:]

218

else:

219

parthdrs, part = part.split(linesep+linesep, 1)

220

# msgobj in this case is the "message/rfc822" container

221

msgobj = self.parsestr(parthdrs, headersonly=1)

222

# while submsgobj is the message itself

223

submsgobj = self.parsestr(part)

224

msgobj.attach(submsgobj)

225

msgobj.set_default_type('message/rfc822')

226

else:

227

msgobj = self.parsestr(part)

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

228

container.preamble = preamble

229

container.epilogue = epilogue

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

230

container.attach(msgobj)

231

elif container.get_main_type() == 'multipart':

232

# Very bad. A message is a multipart with no boundary!

233

raise Errors.BoundaryError(

234

'multipart message with no defined boundary')

Barry Warsaw

66971fb

2001-09-26 05:44:09 +0000

[diff] [blame]

235

elif container.get_type() == 'message/delivery-status':

236

# This special kind of type contains blocks of headers separated

237

# by a blank line. We'll represent each header block as a

238

# separate Message object

239

blocks = []

Barry Warsaw

2002-09-28 20:44:58 +0000

[diff] [blame]

240

while True:

Barry Warsaw

66971fb

2001-09-26 05:44:09 +0000

[diff] [blame]

241

blockmsg = self._class()

242

self._parseheaders(blockmsg, fp)

243

if not len(blockmsg):

244

# No more header blocks left

245

break

246

blocks.append(blockmsg)

247

container.set_payload(blocks)

248

elif container.get_main_type() == 'message':

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

249

# Create a container for the payload, but watch out for there not

250

# being any headers left

251

try:

252

msg = self.parse(fp)

253

except Errors.HeaderParseError:

254

msg = self._class()

255

self._parsebody(msg, fp)

Barry Warsaw

69e18af

2002-06-02 19:12:03 +0000

[diff] [blame]

256

container.attach(msg)

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

257

else:

Barry Warsaw