Blame - Lib/email/FeedParser.py - platform/external/python/cpython3

Barry Warsaw

418101f

2004-05-09 03:29:23 +0000

[diff] [blame]

1

2

# Authors: Baxter, Wouters and Warsaw

Anthony Baxter

39a0f04

2004-03-22 00:33:28 +0000

[diff] [blame]

3

Barry Warsaw

418101f

2004-05-09 03:29:23 +0000

[diff] [blame]

4

"""FeedParser - An email feed parser.

5

6

The feed parser implements an interface for incrementally parsing an email

7

message, line by line. This has advantages for certain applications, such as

8

those reading email messages off a socket.

9

10

FeedParser.feed() is the primary interface for pushing new data into the

11

parser. It returns when there's nothing more it can do with the available

12

data. When you have no more data to push into the parser, call .close().

13

This completes the parsing and returns the root message object.

14

15

The other advantage of this parser is that it will never throw a parsing

16

exception. Instead, when it finds something unexpected, it adds a 'defect' to

17

the current message. Defects are just instances that live on the message

18

object's .defects attribute.

19

"""

20

Anthony Baxter

39a0f04

2004-03-22 00:33:28 +0000

[diff] [blame]

21

import re

Barry Warsaw

418101f

2004-05-09 03:29:23 +0000

[diff] [blame]

22

from email import Errors

23

from email import Message

Anthony Baxter

39a0f04

2004-03-22 00:33:28 +0000

[diff] [blame]

24

25

# Matches one line terminator of any flavour: CRLF, bare CR, or bare LF.
NLCRE = re.compile('\r\n|\r|\n')
# The same terminator wrapped in a capturing group.  This one is applied with
# .match(), i.e. at the beginning of a string.
NLCRE_bol = re.compile('(' + NLCRE.pattern + ')')
# A terminator anchored at the end of the string.
NLCRE_eol = re.compile('(' + NLCRE.pattern + ')$')
# Used with .split(); the capturing group makes split() keep the terminators.
NLCRE_crack = re.compile(NLCRE_bol.pattern)
# A line that can belong to a header block: a unix-from line, a "name:" field
# whose name is at least two word/hyphen characters, or a continuation line
# starting with a tab or a space.
headerRE = re.compile(r'^(From |[-\w]{2,}:|[\t ])')

EMPTYSTRING = ''
NL = '\n'

# Unique sentinel meaning "no complete line is buffered yet".
NeedMoreData = object()

34

Anthony Baxter

39a0f04

2004-03-22 00:33:28 +0000

[diff] [blame]

35

Barry Warsaw

418101f

2004-05-09 03:29:23 +0000

[diff] [blame]

36

37

class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """

    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order (the next line to
        # be read is the last element).
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line for which it is true reads as EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as complete, queueing any unterminated last line."""
        # Don't forget any trailing partial line.  Because _lines is kept in
        # reverse order, the final line of the input belongs at index 0 so
        # that it is read *after* every line that is still queued.  An empty
        # partial is not queued at all: it would read as a premature EOF.
        if self._partial:
            self._lines.insert(0, self._partial)
            self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when nothing is buffered and the object has
        not been closed yet.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in self._eofstack[::-1]:
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Let the consumer push a line back so it is the next one read."""
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # If the data ends in a bare '\r' we cannot yet know whether a '\n'
        # will follow in the next chunk.  Keep that line as the partial line
        # so that a CRLF split across two pushes is not seen as two line
        # endings.
        if not self._partial and parts and parts[-1].endswith('\r'):
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.  Stepping by two avoids dividing the
        # length, which only yields an int under classic division.
        lines = [parts[i] + parts[i + 1]
                 for i in range(0, len(parts) - 1, 2)]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines behind everything already buffered."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Return the next line; stop iterating at a real or false EOF.

        NeedMoreData is passed through to the caller as an ordinary item.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

    # Same method under the name used by the Python 3 iterator protocol.
    __next__ = next

Anthony Baxter

39a0f04

2004-03-22 00:33:28 +0000

[diff] [blame]

122

Barry Warsaw

418101f

2004-05-09 03:29:23 +0000

[diff] [blame]

123

124

Anthony Baxter

39a0f04

2004-03-22 00:33:28 +0000

[diff] [blame]

125

class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=Message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed, outermost first.
        self._msgstack = []
        # Bound method that resumes the parsing generator by one step.
        self._parse = self._parsegen().next
        # Message currently being filled in.
        self._cur = None
        # Message whose trailing newline may still have to be trimmed.
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator, ignoring its normal completion."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._cur.defects = []
        self._last = msg

    def _pop_message(self):
        """Pop and return the innermost message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message, recursing for its subparts.

        It yields NeedMoreData every time the input buffer has no complete
        line available and finishes when the message has been fully parsed.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.  Either read may run out of
                # buffered data, so wait for more instead of treating the
                # NeedMoreData sentinel as a line.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(Errors.NoBoundaryInMultipart())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            # The buffer ran dry right after a boundary; the
                            # sentinel is not a string and must not reach the
                            # regular expression.
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Otherwise everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(Errors.StartBoundaryNotFound())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header block given as a list of lines on self._cur.

        Continuation lines are joined onto the preceding field.  Lines that
        cannot be interpreted are recorded in self._cur.defects rather than
        raising an exception.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = Errors.FirstHeaderLineIsContinuation(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                # Strip the whole line terminator; slicing off one character
                # would leave the '\r' of a CRLF ending in the value.
                self._cur[lastheader] = \
                    EMPTYSTRING.join(lastvalue).rstrip('\r\n')
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = Errors.MisplacedEnvelopeHeader(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = Errors.MalformedHeader(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            # Trim exactly like the headers handled inside the loop, so the
            # final header is not treated differently from the others.
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')