Blame - Lib/email/feedparser.py - platform/external/python/cpython3

2007-08-30 01:15:14 +0000

[diff] [blame]

1

2

# Authors: Baxter, Wouters and Warsaw

3

# Contact: email-sig@python.org

4

5

"""FeedParser - An email feed parser.

6

7

The feed parser implements an interface for incrementally parsing an email

8

message, line by line. This has advantages for certain applications, such as

9

those reading email messages off a socket.

10

11

FeedParser.feed() is the primary interface for pushing new data into the

12

parser. It returns when there's nothing more it can do with the available

13

data. When you have no more data to push into the parser, call .close().

14

This completes the parsing and returns the root message object.

15

Andrew Svetlov

737fb89

2012-12-18 21:14:22 +0200

[diff] [blame]

16

The other advantage of this parser is that it will never raise a parsing

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

17

exception. Instead, when it finds something unexpected, it adds a 'defect' to

18

the current message. Defects are just instances that live on the message

19

object's .defects attribute.

20

"""

21

R David Murray

1b6c724

2012-03-16 22:43:05 -0400

[diff] [blame]

22

__all__ = ['FeedParser', 'BytesFeedParser']

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

import re

from email import errors

27

from email import message

R David Murray

2012-05-25 15:01:48 -0400

[diff] [blame]

28

from email._policybase import compat32

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

29

30

# Line-ending patterns (CRLF, lone CR, lone LF).  They are written as raw
# strings so that every backslash sequence -- in particular the regex-only
# anchor \Z -- is handed to the regex engine verbatim instead of relying on
# Python passing an unrecognised string escape through unchanged.
NLCRE = re.compile(r'\r\n|\r|\n')
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel returned by BufferedSubFile.readline() (and yielded by the
# parser generator) when no complete line is buffered and the input has not
# been closed yet.  It is compared by identity.
NeedMoreData = object()

class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    Data arrives in arbitrary chunks through push(); complete lines are
    handed back one at a time by readline().  Only CR, LF and CRLF terminate
    a line.  The other characters that str.splitlines() treats as line
    boundaries (vertical tab, form feed, U+2028, ...) are ordinary data and
    never split a line.

    You can also push and pop line-matching predicates onto a stack.  When
    any stacked predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to
    a simple abstraction -- it parses until EOF closes the current message.
    """

    # Matches one line together with its CR/LF/CRLF terminator, or a final
    # run of characters that has no terminator.
    _line_re = re.compile(r'[^\r\n]*(?:\r\n|\r|\n)|[^\r\n]+')

    def __init__(self):
        # Chunks of the last partial line pushed into this object.
        self._partial = []
        # The list of full, pushed lines, in reverse order
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line it accepts is reported as a false EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as finished, flushing any trailing partial line."""
        # Don't forget any trailing partial line.
        self.pushlines(self._line_re.findall(''.join(self._partial)))
        self._partial = []
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when nothing is buffered and close() has
        not been called.  '' is returned when nothing is buffered after
        close(), or when a stacked predicate matches the next line; in the
        latter case the line stays in the buffer.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Let the consumer push a line back; it is the next one read."""
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Crack into lines on CR/LF/CRLF only, keeping the terminators.
        parts = self._line_re.findall(data)

        if not parts or not parts[0].endswith(('\n', '\r')):
            # No new complete lines, so just accumulate partials
            self._partial += parts
            return

        if self._partial:
            # If there are previous leftovers, complete them now.  The join
            # is re-split because a leftover ending in '\r' may or may not
            # combine with the new data into a single '\r\n' terminator.
            self._partial.append(parts[0])
            parts[0:1] = self._line_re.findall(''.join(self._partial))
            del self._partial[:]

        # If the last element of the list does not end in a newline, then
        # treat it as a partial line.  We only check for '\n' here because a
        # line ending with '\r' might be a line that was split in the middle
        # of a '\r\n' sequence (see bugs 1555570 and 1721862).
        if not parts[-1].endswith('\n'):
            self._partial = [parts.pop()]
        self.pushlines(parts)

    def pushlines(self, lines):
        """Queue complete lines behind the ones already buffered."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration stops at a real or false EOF; NeedMoreData is passed
        # through to the caller like an ordinary item.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

class FeedParser:

"""A feed-style parser of email."""

138

R David Murray

aa21297

2014-02-07 10:44:16 -0500

[diff] [blame]

139

def __init__(self, _factory=None, *, policy=compat32):
    """Set up a parser that builds messages with _factory under policy.

    _factory, when given, is the callable used to create every message
    object.  It is probed once here with a ``policy`` keyword argument; if
    that probe raises TypeError it is treated as an old-style factory and
    is subsequently called with no arguments.  When _factory is omitted,
    message.Message is used for the compat32 policy and
    message.EmailMessage for any other policy.

    The policy keyword specifies a policy object that controls a number of
    aspects of the parser's operation.  The default policy maintains
    backward compatibility.
    """
    self.policy = policy
    # Keyword arguments handed to the factory for each new message.
    self._factory_kwds = lambda: {'policy': self.policy}
    if _factory is not None:
        self._factory = _factory
        try:
            # Probe call only; the object it returns is discarded.
            _factory(policy=self.policy)
        except TypeError:
            # Assume this is an old-style factory
            self._factory_kwds = lambda: {}
    elif self.policy is compat32:
        # What this should be:
        #self._factory = policy.default_message_factory
        # but, because we are post 3.4 feature freeze, fix with temp hack:
        self._factory = message.Message
    else:
        self._factory = message.EmailMessage
    self._input = BufferedSubFile()
    self._msgstack = []
    # Each call to self._parse() advances the parser generator one step.
    self._parse = self._parsegen().__next__
    self._cur = None
    self._last = None
    self._headersonly = False

170

171

# Non-public interface for supporting Parser's headersonly flag

172

def _set_headersonly(self):

173

self._headersonly = True

174

175

def feed(self, data):
    """Push more data into the parser."""
    source = self._input
    # Buffer the new chunk first, then let the parser consume what it can.
    source.push(data)
    self._call_parse()

179

180

def _call_parse(self):

181

try:

182

self._parse()

183

except StopIteration:

pass

def close(self):
    """Parse all remaining data and return the root message object."""
    # Flush the input buffer and let the parser run on what is left.
    self._input.close()
    self._call_parse()
    message_root = self._pop_message()
    assert not self._msgstack
    # Final defect check: the content type says multipart, yet the parsed
    # object does not report itself as multipart.
    claims_multipart = message_root.get_content_maintype() == 'multipart'
    if claims_multipart and not message_root.is_multipart():
        self.policy.handle_defect(
            message_root, errors.MultipartInvariantViolationDefect())
    return message_root

198

199

def _new_message(self):

R David Murray

2012-05-25 15:01:48 -0400

[diff] [blame]

200

msg = self._factory(**self._factory_kwds())

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

201

if self._cur and self._cur.get_content_type() == 'multipart/digest':

202

msg.set_default_type('message/rfc822')

203

if self._msgstack:

204

self._msgstack[-1].attach(msg)

205

self._msgstack.append(msg)

self._cur = msg

self._last = msg

def _pop_message(self):

210

retval = self._msgstack.pop()

211

if self._msgstack:

212

self._cur = self._msgstack[-1]

else:

self._cur = None

return retval

def _parsegen(self):
    """Generator that parses one message (and, recursively, its subparts).

    It yields NeedMoreData every time the input buffer cannot supply a line
    yet; resuming the generator retries the read.  Nested messages are
    parsed by a recursive _parsegen() whose NeedMoreData requests are
    relayed to the caller.
    """
    # Start a fresh message object; _new_message() makes it self._cur.
    self._new_message()
    headers = []
    # Gather header lines until one fails the RFC 2822 header/continuation
    # pattern (an empty line fails it as well).
    for line in self._input:
        if line is NeedMoreData:
            yield NeedMoreData
            continue
        if headerRE.match(line):
            headers.append(line)
            continue
        # A bare newline is the header/body separator and is dropped.
        # Anything else already belongs to the body: note the missing
        # separator and put the line back.
        if not NLCRE.match(line):
            defect = errors.MissingHeaderBodySeparatorDefect()
            self.policy.handle_defect(self._cur, defect)
            self._input.unreadline(line)
        break
    # The headers decide how the body below is interpreted.
    self._parse_headers(headers)
    # Headers-only parsing is a backwards compatibility hack: every
    # remaining input line goes into the payload unparsed.
    if self._headersonly:
        body = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            body.append(line)
        self._cur.set_payload(EMPTYSTRING.join(body))
        return
    if self._cur.get_content_type() == 'message/delivery-status':
        # message/delivery-status holds blocks of headers separated by
        # blank lines.  Each block becomes its own nested message object;
        # those nested messages have no body.
        while True:
            self._input.push_eof_matcher(NLCRE.match)
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            # Drop the blank-line matcher so the reads below can tell the
            # end of the enclosing input from the end of a header block.
            self._input.pop_eof_matcher()
            # The stream now sits at the blank line or at EOF.  Consume
            # one line (the separator), then look at the following one.
            line = self._input.readline()
            while line is NeedMoreData:
                yield NeedMoreData
                line = self._input.readline()
            line = self._input.readline()
            while line is NeedMoreData:
                yield NeedMoreData
                line = self._input.readline()
            if line == '':
                break
            # Not at EOF, so this line starts the next header block.
            self._input.unreadline(line)
        return
    if self._cur.get_content_maintype() == 'message':
        # Any other message/* type: what follows is another RFC 2822
        # message, parsed recursively.
        for retval in self._parsegen():
            if retval is NeedMoreData:
                yield NeedMoreData
                continue
            break
        self._pop_message()
        return
    if self._cur.get_content_maintype() == 'multipart':
        boundary = self._cur.get_boundary()
        if boundary is None:
            # Claims to be multipart but defines no boundary: flag the
            # defect and store everything up to EOF as the payload.
            defect = errors.NoBoundaryInMultipartDefect()
            self.policy.handle_defect(self._cur, defect)
            body = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                body.append(line)
            self._cur.set_payload(EMPTYSTRING.join(body))
            return
        # Make sure a valid content type was specified per RFC 2045:6.4.
        transfer_encoding = self._cur.get(
            'content-transfer-encoding', '8bit').lower()
        if transfer_encoding not in ('7bit', '8bit', 'binary'):
            defect = errors.InvalidMultipartContentTransferEncodingDefect()
            self.policy.handle_defect(self._cur, defect)
        # Build a predicate matching both the inter-part boundary and the
        # closing boundary.  It is pushed onto the input only once the
        # preamble has been scanned.
        separator = '--' + boundary
        boundaryre = re.compile(
            '(?P<sep>' + re.escape(separator) +
            r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
        capturing_preamble = True
        preamble = []
        linesep = False
        close_boundary_seen = False
        while True:
            line = self._input.readline()
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if line == '':
                break
            boundary_mo = boundaryre.match(line)
            if not boundary_mo:
                # I think we must be in the preamble
                assert capturing_preamble
                preamble.append(line)
                continue
            # Closing boundary: this multipart is finished.  Remember
            # whether the boundary line carried a line ending; the
            # epilogue handling after the loop depends on it.
            if boundary_mo.group('end'):
                close_boundary_seen = True
                linesep = boundary_mo.group('linesep')
                break
            # An inter-part boundary.  If it is the first one, finish the
            # preamble and revisit the same line on the next iteration.
            if capturing_preamble:
                if preamble:
                    # According to RFC 2046, the last newline belongs to
                    # the boundary.
                    tail = preamble[-1]
                    tail_eol = NLCRE_eol.search(tail)
                    if tail_eol:
                        preamble[-1] = tail[:-len(tail_eol.group(0))]
                    self._cur.preamble = EMPTYSTRING.join(preamble)
                capturing_preamble = False
                self._input.unreadline(line)
                continue
            # A boundary between two parts.  Swallow any further boundary
            # lines directly after it; our interpretation of the RFC 2046
            # BNF grammar does not produce body parts inside such double
            # boundaries.
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if not boundaryre.match(line):
                    self._input.unreadline(line)
                    break
            # Recurse to parse this subpart; the input stream points at
            # the subpart's first line.
            self._input.push_eof_matcher(boundaryre.match)
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            # Because of RFC 2046, the newline preceding the boundary
            # separator belongs to the boundary, not to the previous
            # subpart's payload (or epilogue, if that part is itself a
            # multipart).
            if self._last.get_content_maintype() == 'multipart':
                epilogue = self._last.epilogue
                if epilogue == '':
                    self._last.epilogue = None
                elif epilogue is not None:
                    epilogue_eol = NLCRE_eol.search(epilogue)
                    if epilogue_eol:
                        cut = len(epilogue_eol.group(0))
                        self._last.epilogue = epilogue[:-cut]
            else:
                payload = self._last._payload
                if isinstance(payload, str):
                    payload_eol = NLCRE_eol.search(payload)
                    if payload_eol:
                        cut = len(payload_eol.group(0))
                        self._last._payload = payload[:-cut]
            self._input.pop_eof_matcher()
            self._pop_message()
            # Set the multipart up for newline cleansing, which will
            # happen if we're in a nested multipart.
            self._last = self._cur
        # The loop ended at EOF or at the closing boundary.  Still being
        # in the preamble means no start boundary was ever seen: flag the
        # defect and keep the captured text as the payload.
        if capturing_preamble:
            defect = errors.StartBoundaryNotFoundDefect()
            self.policy.handle_defect(self._cur, defect)
            self._cur.set_payload(EMPTYSTRING.join(preamble))
            epilogue = []
            # Remaining input is consumed but not collected.
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Past the preamble but EOF came before a closing boundary: that
        # is also a defect.
        if not close_boundary_seen:
            defect = errors.CloseBoundaryNotFoundDefect()
            self.policy.handle_defect(self._cur, defect)
            return
        # Everything from here to EOF is epilogue.  If the closing
        # boundary ended in a newline, seed the list so that the epilogue
        # does not end up as None.
        epilogue = [''] if linesep else []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            epilogue.append(line)
        # A line ending at the very front is not technically part of the
        # epilogue.  Also watch out for an empty-string epilogue, which
        # means a single newline.
        if epilogue:
            head = epilogue[0]
            head_eol = NLCRE_bol.match(head)
            if head_eol:
                epilogue[0] = head[len(head_eol.group(0)):]
        self._cur.epilogue = EMPTYSTRING.join(epilogue)
        return
    # Otherwise, it's some non-multipart type, so the entire rest of the
    # file contents becomes the payload.
    body = []
    for line in self._input:
        if line is NeedMoreData:
            yield NeedMoreData
            continue
        body.append(line)
    self._cur.set_payload(EMPTYSTRING.join(body))

469

470

def _parse_headers(self, lines):

471

# Passed a list of lines that make up the headers for the current msg

472

lastheader = ''

473

lastvalue = []

474

for lineno, line in enumerate(lines):

475

# Check for continuation

476

if line[0] in ' \t':

477

if not lastheader:

478

# The first line of the headers was a continuation. This

479

# is illegal, so let's note the defect, store the illegal

480

# line, and ignore it for purposes of headers.

481

defect = errors.FirstHeaderLineIsContinuationDefect(line)

R David Murray

2011-04-18 13:59:37 -0400

[diff] [blame]

482

self.policy.handle_defect(self._cur, defect)

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

483

continue

484

lastvalue.append(line)

485

continue

486

if lastheader:

R David Murray

2012-05-25 15:01:48 -0400

[diff] [blame]

487

self._cur.set_raw(*self.policy.header_source_parse(lastvalue))

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

488

lastheader, lastvalue = '', []

489

# Check for envelope header, i.e. unix-from

490

if line.startswith('From '):

491

if lineno == 0:

492

# Strip off the trailing newline

493

mo = NLCRE_eol.search(line)

494

if mo:

495

line = line[:-len(mo.group(0))]

496

self._cur.set_unixfrom(line)

497

continue

498

elif lineno == len(lines) - 1:

499

# Something looking like a unix-from at the end - it's

500

# probably the first line of the body, so push back the

501

# line and stop.

502

self._input.unreadline(line)

503

return

504

else:

505

# Weirdly placed unix-from line. Note this as a defect

506

# and ignore it.

507

defect = errors.MisplacedEnvelopeHeaderDefect(line)

508

self._cur.defects.append(defect)

509

continue

510

# Split the line on the colon separating field name from value.

R David Murray

adbdcdb

2012-05-27 20:45:01 -0400

[diff] [blame]

511

# There will always be a colon, because if there wasn't the part of

512

# the parser that calls us would have started parsing the body.

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

513

i = line.find(':')

Benjamin Peterson

155ceaa

2015-01-25 23:30:30 -0500

[diff] [blame]

514

515

# If the colon is on the start of the line the header is clearly

516

# malformed, but we might be able to salvage the rest of the

517

# message. Track the error but keep going.

518

if i == 0:

519

defect = errors.InvalidHeaderDefect("Missing header name.")

520

self._cur.defects.append(defect)

521

continue

522

R David Murray

adbdcdb

2012-05-27 20:45:01 -0400

[diff] [blame]

523

assert i>0, "_parse_headers fed line with no : and no leading WS"

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

524

lastheader = line[:i]

R David Murray

2012-05-25 15:01:48 -0400

[diff] [blame]

525

lastvalue = [line]

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

526

# Done with all the lines, so handle the last header.

527

if lastheader:

R David Murray

2012-05-25 15:01:48 -0400

[diff] [blame]

528

self._cur.set_raw(*self.policy.header_source_parse(lastvalue))

R. David Murray

96fd54e

2010-10-08 15:55:28 +0000

[diff] [blame]

529

R David Murray