Blame - Lib/email/Parser.py - platform/external/python/cpython3

2001-09-23 03:17:28 +0000

[diff] [blame]

2

# Author: barry@zope.com (Barry Warsaw)

3

4

"""A parser of RFC 2822 and MIME email messages.

5

"""

6

Barry Warsaw

7e21b67

2002-05-19 23:51:50 +0000

[diff] [blame]

7

import re

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

8

from cStringIO import StringIO

Barry Warsaw

15e9dc9

2002-01-27 06:48:02 +0000

[diff] [blame]

9

from types import ListType

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

10

Barry Warsaw

69e18af

2002-06-02 19:12:03 +0000

[diff] [blame]

11

from email import Errors

12

from email import Message

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

13

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

EMPTYSTRING = ''

NL = '\n'

Barry Warsaw

2002-09-28 20:44:58 +0000

[diff] [blame]

try:

True, False

except NameError:

True = 1

False = 0

Barry Warsaw

2003-03-06 05:25:35 +0000

[diff] [blame]

23

NLCRE = re.compile('\r\n|\r|\n')

Barry Warsaw

487fe6a

2002-10-07 17:27:35 +0000

[diff] [blame]

24

Thomas Wouters

2004-03-20 17:31:29 +0000

[diff] [blame]

25

class TextUtil:
    """A utility class for wrapping a file object and providing a
    couple of additional useful functions.

    Lines handed to unreadline() are kept on a stack and are returned, most
    recently pushed first, before any further data is taken from the wrapped
    file object.
    """

    def __init__(self, fp):
        # The wrapped object; this class calls its readline() and read().
        self.fp = fp
        # Stack of pushed-back lines; the last element is returned first.
        self.unread = []

    def readline(self):
        """Return a line of data.

        If data has been pushed back with unreadline(), the most recently
        unreadline()d data will be returned.  Otherwise the result of the
        wrapped object's readline() is returned.
        """
        if self.unread:
            return self.unread.pop()
        return self.fp.readline()

    def unreadline(self, line):
        """Push a line back into the object.

        The line is returned unchanged by a later readline() or read(), so
        it should still carry its own line ending.
        """
        self.unread.append(line)

    def peekline(self):
        """Non-destructively look at the next line."""
        line = self.readline()
        self.unreadline(line)
        return line

    def read(self):
        """Return the remaining data.

        Pushed-back lines come first, in the same order readline() would
        have returned them (most recently pushed first), followed by
        whatever the wrapped object's read() returns.  No separator is
        inserted between the pieces.  The push-back stack is emptied.
        """
        rest = self.fp.read()
        if self.unread:
            pending = self.unread
            self.unread = []
            # The stack is popped from the end, so the oldest push is the
            # last piece of text, not the first.
            pending.reverse()
            rest = ''.join(pending) + rest
        return rest

    def readuntil(self, re, afterblank=0, includematch=0):
        """Read a line at a time until we get the specified RE.

        `re' is a compiled regular expression object; its match() method is
        applied to every line.  (Inside this method the name shadows the
        `re' module; it is kept because it is part of the signature.)

        Returns the text up to (and including, if includematch is true) the
        matched text, and the RE match object.  If afterblank is true,
        a line only counts as a match when the line immediately before it
        was blank, i.e. started with a line terminator.  Blank lines are
        kept in the returned text.  Moves current filepointer to the line
        following the matched line.  If we reach end-of-file, return what
        we've got so far, and return None as the RE match object.
        """
        prematch = []
        blankseen = 0
        while 1:
            line = self.readline()
            if not line:
                # end of file
                return EMPTYSTRING.join(prematch), None
            m = re.match(line)
            # blankseen still describes the *previous* line at this point.
            if m and (blankseen or not afterblank):
                if includematch:
                    prematch.append(line)
                return EMPTYSTRING.join(prematch), m
            if afterblank:
                # Record whether this line is blank for the next iteration.
                if NLCRE.match(line):
                    blankseen = 1
                else:
                    blankseen = 0
            prematch.append(line)

Barry Warsaw

2002-09-28 20:44:58 +0000

[diff] [blame]

94

Barry Warsaw

e968ead

2001-10-04 17:05:11 +0000

[diff] [blame]

95

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

96

class Parser:
    def __init__(self, _class=Message.Message, strict=False):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.

        Optional strict tells the parser to be strictly RFC compliant or to be
        more forgiving in parsing of ill-formatted MIME documents.  When
        non-strict mode is used, the parser will try to make up for missing or
        erroneous boundaries and other peculiarities seen in the wild.
        Default is non-strict parsing.
        """
        self._class = _class
        self._strict = strict

    def parse(self, fp, headersonly=False):
        """Create a message structure from the data in a file.

        Reads all the data from the file and returns the root of the message
        structure.  Optional headersonly is a flag specifying whether to stop
        parsing after reading the headers or not.  The default is False,
        meaning it parses the entire contents of the file.
        """
        root = self._class()
        fp = TextUtil(fp)
        self._parseheaders(root, fp)
        if not headersonly:
            obj = self._parsemessage(root, fp)
            trailer = fp.read()
            # Compare against None explicitly instead of testing truth:
            # Message objects define __len__ as their header count, so a
            # message without headers is false and would lose its body.
            if obj is not None and trailer:
                self._attach_trailer(obj, trailer)
        return root

    def parsestr(self, text, headersonly=False):
        """Create a message structure from a string.

        Returns the root of the message structure.  Optional headersonly is a
        flag specifying whether to stop parsing after reading the headers or
        not.  The default is False, meaning it parses the entire contents of
        the file.
        """
        return self.parse(StringIO(text), headersonly=headersonly)

    def _parseheaders(self, container, fp):
        """Read a header block from fp and store the headers on container.

        fp must provide readline() and unreadline().  Reading stops at end
        of file, at a blank line (which is consumed), or, in non-strict
        mode, at the first line that is neither a header nor a continuation
        line; that line is pushed back onto fp for the body parser.

        A leading `From ' line is stored with container.set_unixfrom().
        Raises Errors.HeaderParseError for a continuation line before the
        first header and, in strict mode, for a misplaced Unix-from line or
        a line that is not a header.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        while True:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            rawline = fp.readline()
            if not rawline:
                break
            # Ignore the trailing newline
            line = rawline.splitlines()[0]
            if not line:
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                elif self._strict:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
                else:
                    # ignore the weirdly placed From_ line
                    # XXX: maybe set unixfrom anyway? or only if not already?
                    continue
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                if self._strict:
                    raise Errors.HeaderParseError(
                        "Not a header, not a continuation: ``%s''" % line)
                elif lineno == 1 and line.startswith('--'):
                    # allow through duplicate boundary tags.
                    continue
                else:
                    # There was no separating blank line as mandated by RFC
                    # 2822, but we're in non-strict mode.  So just offer up
                    # this current line as the first body line.  Push back
                    # the line as it was read, line ending included, so it
                    # is not glued to the following line and can still be
                    # recognized as a boundary line.
                    fp.unreadline(rawline)
                    break
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)
        return

    def _parsemessage(self, container, fp):
        """Parse the body that follows container's headers.

        We walk through the body from top to bottom, keeping track of the
        current multipart nesting as we go.  Returns the object that gets
        the data left at the end of this block; the caller attaches that
        data.  Raises Errors.BoundaryError for a multipart container with
        no boundary, or for a part that has neither a terminating boundary
        nor a trailing line ending.
        """
        boundary = container.get_boundary()
        isdigest = (container.get_content_type() == 'multipart/digest')
        if boundary:
            separator = '--' + boundary
            boundaryRE = re.compile(
                r'(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
            preamble, matchobj = fp.readuntil(boundaryRE)
            if not matchobj:
                # Broken - we hit the end of file.  Just set the body
                # to the text.
                container.set_payload(preamble)
                return container
            if preamble:
                container.preamble = preamble
            else:
                # The module docs specify an empty preamble is None, not ''
                container.preamble = None
            while 1:
                subobj = self._class()
                if isdigest:
                    subobj.set_default_type('message/rfc822')
                    firstline = fp.peekline()
                    if firstline.strip():
                        # we have MIME headers.  all good.
                        self._parseheaders(subobj, fp)
                    else:
                        # no MIME headers.  this is allowed for
                        # multipart/digest.  Consume the extra blank line
                        fp.readline()
                else:
                    self._parseheaders(subobj, fp)
                container.attach(subobj)
                maintype = subobj.get_content_maintype()
                if maintype in ("message", "multipart"):
                    subobj = self._parsemessage(subobj, fp)

                trailer, matchobj = fp.readuntil(boundaryRE)
                if matchobj is None or trailer:
                    # Strip the line ending that precedes the boundary (or
                    # end of file); it is not part of the part's text.
                    mo = re.search('(?P<sep>\r\n|\r|\n){2}$', trailer)
                    if not mo:
                        mo = re.search('(?P<sep>\r\n|\r|\n)$', trailer)
                        if not mo:
                            raise Errors.BoundaryError(
                                'No terminating boundary and no trailing empty line')
                    linesep = mo.group('sep')
                    trailer = trailer[:-len(linesep)]
                if trailer:
                    self._attach_trailer(subobj, trailer)
                if matchobj is None or matchobj.group('end'):
                    # That was the last piece of data.  Let our caller attach
                    # the epilogue to us.  But before we do that, push the
                    # line ending of the match group back into the readline
                    # buffer, as it's part of the epilogue.
                    if matchobj:
                        fp.unreadline(matchobj.group('linesep'))
                    return container

        elif container.get_content_maintype() == "multipart":
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        elif container.get_content_maintype() == "message":
            ct = container.get_content_type()
            if ct == "message/rfc822":
                submessage = self._class()
                self._parseheaders(submessage, fp)
                self._parsemessage(submessage, fp)
                container.attach(submessage)
                return submessage
            elif ct == "message/delivery-status":
                # This special kind of type contains blocks of headers
                # separated by a blank line.  We'll represent each header
                # block as a separate Message object
                while 1:
                    nextblock = self._class()
                    self._parseheaders(nextblock, fp)
                    container.attach(nextblock)
                    # next peek ahead to see whether we've hit the end or not.
                    # An empty string means end of file; without that test
                    # this loop would never terminate there.
                    nextline = fp.peekline()
                    if not nextline or nextline[:2] == "--":
                        break
                return container
            else:
                # Other sort of message object (e.g. external-body)
                msg = self._class()
                self._parsemessage(msg, fp)
                container.attach(msg)
                return msg
        else:
            # single body section.  We let our caller set the payload.
            return container

    def _attach_trailer(self, obj, trailer):
        """Store trailing text on obj.

        For objects whose main content type is "message" or "multipart" the
        text becomes the epilogue; for anything else it becomes the payload.
        """
        if obj.get_content_maintype() in ("message", "multipart"):
            obj.epilogue = trailer
        else:
            obj.set_payload(trailer)

Barry Warsaw

e552882

2001-10-11 15:43:00 +0000

[diff] [blame]

321

322

323

class HeaderParser(Parser):

324

"""A subclass of Parser, this one only meaningfully parses message headers.

325

326

This class can be used if all you're interested in is the headers of a

327

message. While it consumes the message body, it does not parse it, but

328

simply makes it available as a string payload.

329

330

Parsing with this subclass can be considerably faster if all you're

331

interested in is the message headers.

332

"""

Thomas Wouters

2004-03-20 17:31:29 +0000

[diff] [blame]

333

def _parsemessage(self, container, fp):

Barry Warsaw

e552882

2001-10-11 15:43:00 +0000

[diff] [blame]

334

# Consume but do not parse, the body

Barry Warsaw

2002-11-05 21:44:06 +0000

[diff] [blame]

335

text = fp.read()

Barry Warsaw

2002-11-05 21:44:06 +0000

[diff] [blame]

336

container.set_payload(text)

Thomas Wouters