Blame - Lib/email/feedparser.py - platform/external/python/cpython3

2007-08-30 01:15:14 +0000

[diff] [blame]

1

2

# Authors: Baxter, Wouters and Warsaw

3

# Contact: email-sig@python.org

4

5

"""FeedParser - An email feed parser.

6

7

The feed parser implements an interface for incrementally parsing an email

8

message, line by line. This has advantages for certain applications, such as

9

those reading email messages off a socket.

10

11

FeedParser.feed() is the primary interface for pushing new data into the

12

parser. It returns when there's nothing more it can do with the available

13

data. When you have no more data to push into the parser, call .close().

14

This completes the parsing and returns the root message object.

15

Andrew Svetlov

737fb89

2012-12-18 21:14:22 +0200

[diff] [blame]

16

The other advantage of this parser is that it will never raise a parsing

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

17

exception. Instead, when it finds something unexpected, it adds a 'defect' to

18

the current message. Defects are just instances that live on the message

19

object's .defects attribute.

20

"""

21

R David Murray

1b6c724

2012-03-16 22:43:05 -0400

[diff] [blame]

22

# Public names exported by a star-import of this module.
__all__ = ['FeedParser', 'BytesFeedParser']

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

import re

from email import errors

27

from email import message

R David Murray

2012-05-25 15:01:48 -0400

[diff] [blame]

28

from email._policybase import compat32

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

29

30

# Line-ending patterns.  Raw strings are used so that regex escapes such as
# \Z reach the re module untouched instead of being (mis)interpreted as
# string-literal escapes.
# Matches one line ending (CRLF, CR or LF) at the current position.
NLCRE = re.compile(r'\r\n|\r|\n')
# Same as NLCRE but captures the line ending (used at the start of a string).
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# Captures a line ending that sits at the very end of the string.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# Capturing line-ending pattern, suitable for splitting text into lines.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 section 3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, any
# character except controls, SP, and ":".  The pattern also accepts a
# unix-from ("From ") line and a continuation line (leading tab or space).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel meaning "no complete line is available yet"; always compare
# against it with ``is``.
NeedMoreData = object()

class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.

    Iterating over the object yields lines (or the NeedMoreData sentinel when
    the buffer is temporarily empty) until a real or false EOF is reached.
    """
    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order: the next line to
        # be read is the *last* element, so reading is a cheap list.pop().
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line for which it is true reads as EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the end of input, queueing any trailing partial line."""
        # Don't forget any trailing partial line.  It has to be queued
        # *behind* the lines that are still unread; because _lines is kept in
        # reverse order, a plain append() would hand it out first (and an
        # empty partial would read as a premature EOF).
        if self._partial:
            self.pushlines([self._partial])
            self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData."""
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push *line* back so that it is the next line returned."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the linesep characters on the end of
        # each
        parts = data.splitlines(True)
        # If the last element of the list does not end in a newline, then treat
        # it as a partial line.  We only check for '\n' here because a line
        # ending with '\r' might be a line that was split in the middle of a
        # '\r\n' sequence (see bugs 1555570 and 1721862).
        if parts and not parts[-1].endswith('\n'):
            self._partial = parts.pop()
        self.pushlines(parts)

    def pushlines(self, lines):
        """Queue *lines* (in reading order) after all currently unread lines."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def __iter__(self):
        return self

    def __next__(self):
        line = self.readline()
        # Only the empty string ends iteration; NeedMoreData is passed through
        # to the consumer.
        if line == '':
            raise StopIteration
        return line

class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=message.Message, *, policy=compat32):
        """Create a parser that builds messages with _factory.

        _factory is first probed with a ``policy`` keyword argument; if that
        call raises TypeError it is treated as an old-style factory and is
        called with no arguments to create each new message object.

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation.  The default policy maintains
        backward compatibility.

        """
        self._factory = _factory
        self.policy = policy
        try:
            _factory(policy=self.policy)
            # Lazily built so that a later change of self.policy is honoured.
            self._factory_kwds = lambda: {'policy': self.policy}
        except TypeError:
            # Assume this is an old-style factory
            self._factory_kwds = lambda: {}
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at index 0.
        self._msgstack = []
        # Each call advances the parsing generator until it needs more data.
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parser; a finished generator is not an error."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory(**self._factory_kwds())
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursing for subparts).

        Yields NeedMoreData whenever the input buffer runs dry and returns
        when the current message has been fully parsed.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (self._cur.get('content-transfer-encoding', '8bit').lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Re-read the boundary line on the next pass, this
                        # time as the start of the first subpart.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # Drain whatever input is left; nothing is collected here, so
                # the epilogue ends up as the empty string.
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue.  If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header block *lines* on the current message.

        Malformed input is reported through the policy's defect handling
        rather than by a parsing exception of its own.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            # A new header (or unix-from line) starts here, so flush the
            # header collected so far.
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.  The defect goes through the policy like
                    # every other defect so that a policy which raises on
                    # defects gets to see this one as well.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')
            assert i>0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))

R. David Murray

96fd54e

2010-10-08 15:55:28 +0000

[diff] [blame]

501

R David Murray