Blame - Lib/email/feedparser.py - platform/external/python/cpython3

2007-08-30 01:15:14 +0000

[diff] [blame]

1

2

# Authors: Baxter, Wouters and Warsaw

3

# Contact: email-sig@python.org

4

5

"""FeedParser - An email feed parser.

6

7

The feed parser implements an interface for incrementally parsing an email

8

message, line by line. This has advantages for certain applications, such as

9

those reading email messages off a socket.

10

11

FeedParser.feed() is the primary interface for pushing new data into the

12

parser. It returns when there's nothing more it can do with the available

13

data. When you have no more data to push into the parser, call .close().

14

This completes the parsing and returns the root message object.

15

Andrew Svetlov

737fb89

2012-12-18 21:14:22 +0200

[diff] [blame]

16

The other advantage of this parser is that it will never raise a parsing

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

17

exception. Instead, when it finds something unexpected, it adds a 'defect' to

18

the current message. Defects are just instances that live on the message

19

object's .defects attribute.

20

"""

21

R David Murray

1b6c724

2012-03-16 22:43:05 -0400

[diff] [blame]

22

__all__ = ['FeedParser', 'BytesFeedParser']

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

import re

from email import errors

27

from email import message

R David Murray

2012-05-25 15:01:48 -0400

[diff] [blame]

28

from email._policybase import compat32

Raymond Hettinger

2015-05-22 17:23:28 -0700

[diff] [blame]

29

from collections import deque

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

30

31

# Line-ending patterns.  They are written as raw strings so that every escape
# (including \Z, which is not a valid Python string escape) is interpreted by
# the regular expression engine rather than by the string literal parser.

# A single line terminator: CRLF, bare CR or bare LF.
NLCRE = re.compile(r'\r\n|\r|\n')
# Same terminator as a capturing group; used with .match() to find a
# terminator at the beginning of a string.
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# A terminator that is the very last thing in the string; used with .search()
# to strip one trailing line ending.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# A terminator as a capturing group, for cracking text into lines.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 section 3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, any
# character except controls, SP, and ":".  A line starts a header block entry
# if it begins with "From " (envelope header), with a (possibly empty) field
# name followed by a colon, or with whitespace (continuation line).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel meaning "no complete line is available yet".
NeedMoreData = object()

class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.

    Only CRLF, a bare CR and a bare LF terminate a line.  Every other
    character, including the additional "line boundary" characters that
    str.splitlines() recognises, is treated as ordinary line content.
    """

    # Matches one line together with its terminator, or a final fragment that
    # has no terminator.  str.splitlines() is deliberately not used because it
    # also breaks on VT, FF, FS, GS, RS, NEL, LS and PS, which would cut a
    # header or body line in two.
    _line_re = re.compile(r'[^\r\n]*(?:\r\n|\r|\n)|[^\r\n]+')

    def __init__(self):
        # Chunks of the last partial line pushed into this object.
        self._partial = []
        # A deque of full, pushed lines
        self._lines = deque()
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    @classmethod
    def _crack(cls, text):
        """Return text split into lines, keeping each line's terminator.

        An empty string yields an empty list; a trailing fragment without a
        terminator is returned as the last element.
        """
        return cls._line_re.findall(text)

    def push_eof_matcher(self, pred):
        """Push pred, a callable taking a line, onto the false-EOF stack."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any buffered partial line and mark the object as closed."""
        # Don't forget any trailing partial line.
        self.pushlines(self._crack(''.join(self._partial)))
        self._partial = []
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        object has not been closed.  '' is returned when the object is closed
        and drained, or when any pushed predicate matches the next line; in
        the latter case the line stays in the buffer.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.popleft()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.appendleft(line)
                return ''
        return line

    def unreadline(self, line):
        """Put line back so that it is the next one readline() examines."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.appendleft(line)

    def push(self, data):
        """Push some new data into this object."""
        # Crack into lines, but preserve the linesep characters on the end of
        # each.  Only CR, LF and CRLF count as line separators.
        parts = self._crack(data)

        if not parts or not parts[0].endswith(('\n', '\r')):
            # No new complete lines, so just accumulate partials
            self._partial += parts
            return

        if self._partial:
            # If there are previous leftovers, complete them now
            self._partial.append(parts[0])
            parts[0:1] = self._crack(''.join(self._partial))
            del self._partial[:]

        # If the last element of the list does not end in a newline, then treat
        # it as a partial line.  We only check for '\n' here because a line
        # ending with '\r' might be a line that was split in the middle of a
        # '\r\n' sequence (see bugs 1555570 and 1721862).
        if not parts[-1].endswith('\n'):
            self._partial = [parts.pop()]
        self.pushlines(parts)

    def pushlines(self, lines):
        """Append already-split lines to the end of the line buffer."""
        self._lines.extend(lines)

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration ends on '' (real or false EOF); NeedMoreData is passed
        # through to the caller as an ordinary item.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

class FeedParser:
    """A feed-style parser of email.

    Text is supplied incrementally with feed(); close() finishes parsing and
    returns the root message object.  Problems found in the input are reported
    through the policy's handle_defect() method.
    """

    def __init__(self, _factory=None, *, policy=compat32):
        """Create a parser.

        _factory is a callable used to create each new message object.  It is
        probed once with a ``policy`` keyword argument; if that call raises
        TypeError it is assumed to be an old-style factory and is called with
        no arguments from then on.  When _factory is None, message.Message is
        used for the compat32 policy and message.EmailMessage otherwise.

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation.  The default policy maintains
        backward compatibility.

        """
        self.policy = policy
        self._old_style_factory = False
        if _factory is None:
            # What this should be:
            #self._factory = policy.default_message_factory
            # but, because we are post 3.4 feature freeze, fix with temp hack:
            if self.policy is compat32:
                self._factory = message.Message
            else:
                self._factory = message.EmailMessage
        else:
            self._factory = _factory
            try:
                _factory(policy=self.policy)
            except TypeError:
                # Assume this is an old-style factory
                self._old_style_factory = True
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed, outermost first.
        self._msgstack = []
        # Each call advances the parsing generator until it needs more data.
        self._parse = self._parsegen().__next__
        # Message on top of the stack, and the most recently created message.
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parsing generator by one step."""
        try:
            self._parse()
        except StopIteration:
            # The generator has finished; there is nothing left to do.
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if (root.get_content_maintype() == 'multipart'
                and not root.is_multipart()):
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        if self._old_style_factory:
            msg = self._factory()
        else:
            msg = self._factory(policy=self.policy)
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message, recursing for nested parts.

        Yields NeedMoreData whenever the input buffer has no complete line
        available; it is resumed after more data has been pushed.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (self._cur.get('content-transfer-encoding', '8bit').lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # Drain whatever input remains; these lines are not stored,
                # so the epilogue is always set to the empty string here.
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue.  If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header block given by lines on the current message.

        lines is the list of raw header lines collected by _parsegen().
        Every defect found is reported through self.policy.handle_defect(), so
        the policy decides whether it is recorded or raised.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.  Routed through the policy like every
                    # other defect so raise_on_defect is honoured.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')

            # If the colon is on the start of the line the header is clearly
            # malformed, but we might be able to salvage the rest of the
            # message. Track the error but keep going.
            if i == 0:
                defect = errors.InvalidHeaderDefect("Missing header name.")
                self.policy.handle_defect(self._cur, defect)
                continue

            assert i>0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))

R. David Murray

96fd54e

2010-10-08 15:55:28 +0000

[diff] [blame]

532

R David Murray