Blame - Lib/email/Parser.py - platform/external/python/cpython2

2001-09-23 03:17:28 +0000

[diff] [blame]

2

# Author: barry@zope.com (Barry Warsaw)

3

4

"""A parser of RFC 2822 and MIME email messages.

5

"""

6

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

7

import re

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

8

from cStringIO import StringIO

Barry Warsaw

15e9dc9

2002-01-27 06:48:02 +0000

[diff] [blame]

9

from types import ListType

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

10

Barry Warsaw

69e18af

2002-06-02 19:12:03 +0000

[diff] [blame]

11

from email import Errors

12

from email import Message

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

13

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

# Module-level string constants; NL re-joins the lines of a folded header value.
EMPTYSTRING = ''

NL = '\n'

Barry Warsaw

2002-09-28 20:44:58 +0000

[diff] [blame]

# Fall back to integer stand-ins when the True/False names are not defined.
try:

True, False

except NameError:

True = 1

False = 0

Barry Warsaw

2003-03-06 05:25:35 +0000

[diff] [blame]

23

# Matches one line ending in any of the three conventions: CRLF, CR or LF.
NLCRE = re.compile('\r\n|\r|\n')

Barry Warsaw

2002-10-07 17:27:35 +0000

[diff] [blame]

24

Barry Warsaw

2002-09-28 20:44:58 +0000

[diff] [blame]

25

Barry Warsaw

e968ead

2001-10-04 17:05:11 +0000

[diff] [blame]

26

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

27

class Parser:
    """Build a tree of message objects from RFC 2822 / MIME message text."""

    def __init__(self, _class=Message.Message, strict=False):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.

        Optional strict tells the parser to be strictly RFC compliant or to be
        more forgiving in parsing of ill-formatted MIME documents.  When
        non-strict mode is used, the parser will try to make up for missing or
        erroneous boundaries and other peculiarities seen in the wild.
        Default is non-strict parsing.
        """
        self._class = _class
        self._strict = strict

    def parse(self, fp, headersonly=False):
        """Create a message structure from the data in a file.

        Reads all the data from the file and returns the root of the message
        structure.  Optional headersonly is a flag specifying whether to stop
        parsing after reading the headers or not.  The default is False,
        meaning it parses the entire contents of the file.
        """
        root = self._class()
        firstbodyline = self._parseheaders(root, fp)
        if not headersonly:
            self._parsebody(root, fp, firstbodyline)
        return root

    def parsestr(self, text, headersonly=False):
        """Create a message structure from a string.

        Returns the root of the message structure.  Optional headersonly is a
        flag specifying whether to stop parsing after reading the headers or
        not.  The default is False, meaning it parses the entire contents of
        the string.
        """
        return self.parse(StringIO(text), headersonly=headersonly)

    def _parseheaders(self, container, fp):
        """Read a header block from fp and store each header on container.

        Reading stops at the end of the input, at the first empty line, or --
        in non-strict mode only -- at the first line that is neither a header
        nor a continuation line.

        Returns None, except in the last case above, where the offending line
        (without its line ending) is returned so that the caller can treat it
        as the first line of the body.

        Raises Errors.HeaderParseError when a continuation line appears before
        any header, and, in strict mode, when a Unix-from line appears after
        the first line or when a line is neither header nor continuation.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        firstbodyline = None
        while True:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                # End of input
                break
            # Ignore the trailing newline
            line = line.splitlines()[0]
            if not line:
                # Empty line: end of the header block
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                elif self._strict:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
                else:
                    # ignore the weirdly placed From_ line
                    # XXX: maybe set unixfrom anyway? or only if not already?
                    continue
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                if self._strict:
                    raise Errors.HeaderParseError(
                        "Not a header, not a continuation: ``%s''" % line)
                elif lineno == 1 and line.startswith('--'):
                    # allow through duplicate boundary tags.
                    continue
                else:
                    # There was no separating blank line as mandated by RFC
                    # 2822, but we're in non-strict mode.  So just offer up
                    # this current line as the first body line.
                    firstbodyline = line
                    break
            # A new header starts here, so flush the one collected so far.
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)
        return firstbodyline

    def _parsebody(self, container, fp, firstbodyline=None):
        """Parse the rest of fp as the body of container.

        firstbodyline, when not None, is a body line that _parseheaders() has
        already consumed from fp; it is put back in front of the text read
        from fp before that text is used.

        What is stored on container depends on its headers:

        - If it has a boundary parameter, the text is split on that boundary
          and every piece is parsed and attached as a subpart; text before
          the first boundary and after the terminating boundary is stored as
          the preamble and epilogue attributes.
        - If it is a message/delivery-status, the payload becomes a list with
          one message object per header block.
        - If it is any other message/* type, the nested message is parsed and
          attached.
        - Otherwise the remaining text becomes the payload as is.

        Raises Errors.BoundaryError for a multipart container without a
        boundary parameter, for a body that does not end in a line ending
        when the terminating boundary is missing, and, in strict mode, for a
        missing starting or terminating boundary.
        """
        boundary = container.get_boundary()
        isdigest = (container.get_content_type() == 'multipart/digest')
        # If there's a boundary, split the payload text into its constituent
        # parts and parse each separately.  Otherwise, just parse the rest of
        # the body as a single message.  Note: any exceptions raised in the
        # recursive parse need to have their line numbers coerced.
        if boundary:
            preamble = epilogue = None
            # Split into subparts.  The first boundary we're looking for won't
            # always have a leading newline since we're at the start of the
            # body text, and there's not always a preamble before the first
            # boundary.
            separator = '--' + boundary
            payload = fp.read()
            if firstbodyline is not None:
                payload = firstbodyline + '\n' + payload
            # We use an RE here because boundaries can have trailing
            # whitespace.
            mo = re.search(
                r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
                payload)
            if not mo:
                if self._strict:
                    raise Errors.BoundaryError(
                        "Couldn't find starting boundary: %s" % boundary)
                # Non-strict: keep the whole text as an unparsed payload.
                container.set_payload(payload)
                return
            start = mo.start()
            if start > 0:
                # there's some pre-MIME boundary preamble
                preamble = payload[0:start]
            # Step over the boundary and its trailing whitespace, then over
            # the line ending that follows it.  The newline search is not
            # anchored at start, so this assumes the boundary line ends there.
            start += len(mo.group('sep')) + len(mo.group('ws'))
            mo = NLCRE.search(payload, start)
            if mo:
                start += len(mo.group(0))
            # We create a compiled regexp first because we need to be able to
            # specify the start position, and the module function doesn't
            # support this signature. :(
            cre = re.compile('(?P<sep>\r\n|\r|\n)' +
                             re.escape(separator) + '--')
            mo = cre.search(payload, start)
            if mo:
                terminator = mo.start()
                # The line ending in front of the terminating boundary tells
                # us what kind of line endings the message uses.
                linesep = mo.group('sep')
                if mo.end() < len(payload):
                    # There's some post-MIME boundary epilogue
                    epilogue = payload[mo.end():]
            elif self._strict:
                raise Errors.BoundaryError(
                    "Couldn't find terminating boundary: %s" % boundary)
            else:
                # Handle the case of no trailing boundary.  Check that it ends
                # in a blank line.  Some cases (spamspamspam) don't even have
                # that!
                mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
                if not mo:
                    mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
                    if not mo:
                        raise Errors.BoundaryError(
                          'No terminating boundary and no trailing empty line')
                linesep = mo.group('sep')
                terminator = len(payload)
            # We split the textual payload on the boundary separator, which
            # includes the trailing newline.  If the container is a
            # multipart/digest then the subparts are by default message/rfc822
            # instead of text/plain.  In that case, they'll have an optional
            # block of MIME headers, then an empty line followed by the
            # message headers.
            parts = re.split(
                linesep + re.escape(separator) + r'[ \t]*' + linesep,
                payload[start:terminator])
            for part in parts:
                if isdigest:
                    if part.startswith(linesep):
                        # There's no header block so create an empty message
                        # object as the container, and lop off the newline so
                        # we can parse the sub-subobject
                        msgobj = self._class()
                        part = part[len(linesep):]
                    else:
                        # A header block is terminated by a blank line or by
                        # the end of the text.  When the blank line is
                        # missing, the whole part is headers and the body is
                        # empty (unpacking the split result directly would
                        # raise ValueError here).
                        pieces = part.split(linesep + linesep, 1)
                        parthdrs = pieces[0]
                        if len(pieces) == 2:
                            part = pieces[1]
                        else:
                            part = ''
                        # msgobj in this case is the "message/rfc822" container
                        msgobj = self.parsestr(parthdrs, headersonly=True)
                    msgobj.set_default_type('message/rfc822')
                    maintype = msgobj.get_content_maintype()
                    if maintype in ('message', 'multipart'):
                        # while submsgobj is the message itself
                        submsgobj = self.parsestr(part)
                        msgobj.attach(submsgobj)
                    else:
                        msgobj.set_payload(part)
                else:
                    msgobj = self.parsestr(part)
                container.preamble = preamble
                container.epilogue = epilogue
                container.attach(msgobj)
        elif container.get_main_type() == 'multipart':
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        elif container.get_type() == 'message/delivery-status':
            # This special kind of type contains blocks of headers separated
            # by a blank line.  We'll represent each header block as a
            # separate Message object
            # NOTE(review): firstbodyline is not used in this branch, so a
            # line handed over by _parseheaders() is dropped -- confirm that
            # this is intended.
            blocks = []
            while True:
                blockmsg = self._class()
                self._parseheaders(blockmsg, fp)
                if not len(blockmsg):
                    # No more header blocks left
                    break
                blocks.append(blockmsg)
            container.set_payload(blocks)
        elif container.get_main_type() == 'message':
            # Create a container for the payload, but watch out for there not
            # being any headers left
            # NOTE(review): firstbodyline is not used in this branch either.
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                msg = self._class()
                self._parsebody(msg, fp)
            container.attach(msg)
        else:
            text = fp.read()
            if firstbodyline is not None:
                text = firstbodyline + '\n' + text
            container.set_payload(text)

Barry Warsaw

e552882

2001-10-11 15:43:00 +0000

[diff] [blame]

class HeaderParser(Parser):

278

"""A subclass of Parser, this one only meaningfully parses message headers.

279

280

This class can be used if all you're interested in is the headers of a

281

message. While it consumes the message body, it does not parse it, but

282

simply makes it available as a string payload.

283

284

Parsing with this subclass can be considerably faster if all you're

285

interested in is the message headers.

286

"""

Barry Warsaw

2002-11-05 21:44:06 +0000

[diff] [blame]

287

def _parsebody(self, container, fp, firstbodyline=None):

Barry Warsaw

e552882

2001-10-11 15:43:00 +0000

[diff] [blame]

288

# Consume but do not parse, the body

Barry Warsaw