Blame - Lib/email/feedparser.py - platform/external/python/cpython3

2007-08-30 01:15:14 +0000

[diff] [blame]

1

2

# Authors: Baxter, Wouters and Warsaw

3

# Contact: email-sig@python.org

4

5

"""FeedParser - An email feed parser.

6

7

The feed parser implements an interface for incrementally parsing an email

8

message, line by line. This has advantages for certain applications, such as

9

those reading email messages off a socket.

10

11

FeedParser.feed() is the primary interface for pushing new data into the

12

parser. It returns when there's nothing more it can do with the available

13

data. When you have no more data to push into the parser, call .close().

14

This completes the parsing and returns the root message object.

15

16

The other advantage of this parser is that it will never throw a parsing

17

exception. Instead, when it finds something unexpected, it adds a 'defect' to

18

the current message. Defects are just instances that live on the message

19

object's .defects attribute.

20

"""

21

R David Murray

1b6c724

2012-03-16 22:43:05 -0400

[diff] [blame]

22

# Names exported by a star-import of this module.
__all__ = ['FeedParser', 'BytesFeedParser']

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

import re

from email import errors

27

from email import message

R David Murray

2012-05-25 15:01:48 -0400

[diff] [blame]

28

from email._policybase import compat32

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

29

30

# Line-ending patterns. They are written as raw strings so that regex-only
# escapes such as \Z are handed to the re module untouched instead of being
# treated as (invalid) string-literal escapes; \r and \n are interpreted by
# the regex engine itself, so the patterns match exactly what they did before.
#
# A single line ending (CRLF, CR or LF).
NLCRE = re.compile(r'\r\n|\r|\n')
# A line ending captured as a group; used with match() to test the start of a
# string.
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# A line ending captured as a group and anchored to the very end of the string.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# A line ending captured as a group; used with split() so that the separators
# are kept in the result.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 section 3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any
# character except controls, SP, and ":".
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel meaning "no complete line is available yet, feed more data".
NeedMoreData = object()

class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack. When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead. This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """

    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order: the next line to
        # be read is the last element.
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate that makes readline() report a false EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as complete, flushing any trailing partial line."""
        # Don't forget any trailing partial line. It is the newest data, so
        # it has to be queued behind every line that is still unread. Since
        # _lines is stored in reverse order that means inserting it at the
        # front (which is what pushlines() does), not appending it. An empty
        # partial is not queued at all: it would read back as an EOF.
        if self._partial:
            self.pushlines([self._partial])
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no line is buffered and the object has
        not been closed yet.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting. Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF. But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push `line` back so that it is the next one readline() returns."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE. In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        #GAN 29Mar09 bugs 1555570, 1721862 Confusion at 8K boundary ending with \r:
        # is there a \n to follow later?
        if not self._partial and parts and parts[-1].endswith('\r'):
            # Hold the whole last line back until we know whether the lone
            # \r is the first half of a \r\n pair.
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s). Gather up a list of lines after
        # re-attaching the newlines.
        lines = [text + eol for text, eol in zip(parts[::2], parts[1::2])]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete `lines` behind everything that is still unread."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration stops at a real or false EOF; NeedMoreData is passed
        # through to the caller like any other value.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=message.Message, *, policy=compat32):
        """Create a parser that builds messages with _factory.

        _factory is called to create each new message object. It is called
        with a `policy` keyword argument if it accepts one, and with no
        arguments otherwise.

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation. The default policy maintains
        backward compatibility.

        """
        self._factory = _factory
        self.policy = policy
        try:
            # Probe once to find out whether the factory takes a policy.
            _factory(policy=self.policy)
            self._factory_kwds = lambda: {'policy': self.policy}
        except TypeError:
            # Assume this is an old-style factory
            self._factory_kwds = lambda: {}
        self._input = BufferedSubFile()
        # Stack of messages being built; the root message is at the bottom.
        self._msgstack = []
        # Each call advances the parsing generator until it needs more data.
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # The generator finishing simply means parsing is complete.
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory(**self._factory_kwds())
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursing for subparts).

        Yields NeedMoreData whenever the input buffer runs dry and returns
        once the current message has been completely parsed.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors. All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line. We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages. A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF. We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary. That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (self._cur.get('content-transfer-encoding', '8bit').lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary. Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart. If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary. Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts. Consume any
                    # multiple boundary lines that may be following. Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary. If we're still
            # capturing the preamble, we never saw the start boundary. Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): this loop drains the input without collecting
                # the lines, so the epilogue stored below is always ''.
                # Left as is; confirm whether discarding is intended.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue. Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header block `lines` on the current message.

        Continuation lines are gathered with the header they belong to, a
        leading 'From ' line is stored as the unix-from, and irregularities
        are reported through the policy's handle_defect().
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation. This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line. Note this as a defect
                    # and ignore it. The defect goes through the policy, like
                    # every other defect in this class, so that a policy with
                    # raise_on_defect set is honoured here too.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')
            assert i>0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))

R. David Murray

96fd54e

2010-10-08 15:55:28 +0000

[diff] [blame]

502

R David Murray