Blame - Lib/email/FeedParser.py - platform/external/python/cpython3

2004-05-09 03:29:23 +0000

[diff] [blame]

1

2

# Authors: Baxter, Wouters and Warsaw

Barry Warsaw

2004-10-03 03:16:19 +0000

[diff] [blame]

3

# Contact: email-sig@python.org

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

4

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

5

"""FeedParser - An email feed parser.

6

7

The feed parser implements an interface for incrementally parsing an email

8

message, line by line. This has advantages for certain applications, such as

9

those reading email messages off a socket.

10

11

FeedParser.feed() is the primary interface for pushing new data into the

12

parser. It returns when there's nothing more it can do with the available

13

data. When you have no more data to push into the parser, call .close().

14

This completes the parsing and returns the root message object.

15

16

The other advantage of this parser is that it will never throw a parsing

17

exception. Instead, when it finds something unexpected, it adds a 'defect' to

18

the current message. Defects are just instances that live on the message

Barry Warsaw

2004-10-03 03:16:19 +0000

[diff] [blame]

19

object's .defects attribute.

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

20

"""

21

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

22

import re

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

23

from email import Errors

24

from email import Message

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

25

26

# The line endings recognised by the parser.  CRLF is listed first so that it
# is matched as one terminator rather than as a CR followed by a separate LF.
_EOL_ALTERNATIVES = '\r\n|\r|\n'
_EOL_GROUP = '(' + _EOL_ALTERNATIVES + ')'

# A line ending (no capturing group).
NLCRE = re.compile(_EOL_ALTERNATIVES)
# A captured line ending; used with .match() to test the start of a string.
NLCRE_bol = re.compile(_EOL_GROUP)
# A captured line ending anchored at the end of the string.
NLCRE_eol = re.compile(_EOL_GROUP + '$')
# A captured line ending; used with .split() so the terminators are kept.
NLCRE_crack = re.compile(_EOL_GROUP)
# Start of a header-section line: a "From " envelope line, a field name of at
# least two characters followed by a colon, or leading whitespace.
headerRE = re.compile(r'^(From |[-\w]{2,}:|[\t ])')

EMPTYSTRING = ''
NL = '\n'

# Unique sentinel; callers compare against it with "is".
NeedMoreData = object()

35

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

36

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

37

38

class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When
    any predicate on the stack matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to
    a simple abstraction -- it parses until EOF closes the current message.

    Lines are kept on an internal stack in reverse order, so the next line to
    be read is always the last element of the list.
    """

    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line for which it is true reads as EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the end of input, keeping any trailing partial line."""
        # Don't forget any trailing partial line.  It is the newest data, so
        # it has to be read *after* every line still pending; pushlines()
        # puts it at the far end of the reversed stack.  Nothing is queued
        # when there is no partial line, because an empty string placed in
        # the stack would read as EOF ahead of the pending lines.
        if self._partial:
            self.pushlines([self._partial])
            self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (real or false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        object has not been closed yet.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push a line back so that it is the next one returned."""
        # Let the consumer push a line back into the buffer.
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = [text + eol for text, eol in zip(parts[::2], parts[1::2])]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines so they are read after all pending ones."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Return the next line; stop at (real or false) EOF.

        NeedMoreData is passed through as an ordinary item, so callers that
        iterate must check for it.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

    # Same method under the name used by the newer iterator protocol.
    __next__ = next

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

124

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

125

126

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

127

class FeedParser:

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

128

"""A feed-style parser of email."""

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

129

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

130

def __init__(self, _factory=Message.Message):
    """Set up an empty parser.

    _factory is called with no arguments to create a new message obj
    """
    self._factory = _factory
    self._headersonly = False
    # Stack of messages currently being built; the last entry is innermost.
    self._msgstack = []
    self._cur = self._last = None
    self._input = BufferedSubFile()
    # Keep the bound next() of the parse generator so that each call resumes
    # the parse where it last stopped.
    self._parse = self._parsegen().next

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

139

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

140

# Non-public interface for supporting Parser's headersonly flag

141

def _set_headersonly(self):

142

self._headersonly = True

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

143

144

def feed(self, data):
    """Push more data into the parser."""
    buffered = self._input
    buffered.push(data)
    # Let the parse advance as far as the buffered data allows.
    self._call_parse()

148

149

def _call_parse(self):

150

try:

151

self._parse()

152

except StopIteration:

153

pass

154

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

155

def close(self):
    """Parse all remaining data and return the root message object."""
    self._input.close()
    self._call_parse()
    root = self._pop_message()
    # The root message must have been the only one left on the stack.
    assert not self._msgstack
    # Look for final set of defects: the message says it is a multipart but
    # does not report itself as one.
    claims_multipart = root.get_content_maintype() == 'multipart'
    if claims_multipart and not root.is_multipart():
        root.defects.append(Errors.MultipartInvariantViolationDefect())
    return root

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

166

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

167

def _new_message(self):

168

msg = self._factory()

169

if self._cur and self._cur.get_content_type() == 'multipart/digest':

170

msg.set_default_type('message/rfc822')

171

if self._msgstack:

172

self._msgstack[-1].attach(msg)

173

self._msgstack.append(msg)

174

self._cur = msg

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

175

self._last = msg

176

177

def _pop_message(self):

178

retval = self._msgstack.pop()

179

if self._msgstack:

180

self._cur = self._msgstack[-1]

else:

self._cur = None

return retval

def _parsegen(self):
    """Generator that parses one message, recursing for nested parts.

    Yields NeedMoreData each time the input buffer has no complete line
    available, and finishes once the message has been parsed up to the
    (real or false) EOF of the input.  The parsed message is left on the
    message stack for the caller to pop.
    """
    # Create a new message and start by parsing headers.
    self._new_message()
    headers = []
    # Collect the headers, searching for a line that doesn't match the RFC
    # 2822 header or continuation pattern (including an empty line).
    for line in self._input:
        if line is NeedMoreData:
            yield NeedMoreData
            continue
        if not headerRE.match(line):
            # If we saw the RFC defined header/body separator
            # (i.e. newline), just throw it away.  Otherwise the line is
            # part of the body so push it back.
            if not NLCRE.match(line):
                self._input.unreadline(line)
            break
        headers.append(line)
    # Done with the headers, so parse them and figure out what we're
    # supposed to see in the body of the message.
    self._parse_headers(headers)
    # Headers-only parsing is a backwards compatibility hack, which was
    # necessary in the older parser, which could throw errors.  All
    # remaining lines in the input are thrown into the message body.
    if self._headersonly:
        lines = []
        while True:
            line = self._input.readline()
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if line == '':
                break
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))
        return
    if self._cur.get_content_type() == 'message/delivery-status':
        # message/delivery-status contains blocks of headers separated by
        # a blank line.  We'll represent each header block as a separate
        # nested message object, but the processing is a bit different
        # than standard message/* types because there is no body for the
        # nested messages.  A blank line separates the subparts.
        while True:
            self._input.push_eof_matcher(NLCRE.match)
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            # We need to pop the EOF matcher in order to tell if we're at
            # the end of the current file, not the end of the last block
            # of message headers.
            self._input.pop_eof_matcher()
            # The input stream must be sitting at the newline or at the
            # EOF.  We want to see if we're at the end of this subpart, so
            # first consume the blank line, then test the next line to see
            # if we're at this subpart's EOF.
            #
            # Both reads must wait for data: treating the NeedMoreData
            # sentinel as a line would push a non-string back into the
            # input, where the next false-EOF check would fail on it.
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            if line == '':
                break
            # Not at EOF so this is a line we're going to need.
            self._input.unreadline(line)
        return
    if self._cur.get_content_maintype() == 'message':
        # The message claims to be a message/* type, then what follows is
        # another RFC 2822 message.
        for retval in self._parsegen():
            if retval is NeedMoreData:
                yield NeedMoreData
                continue
            break
        self._pop_message()
        return
    if self._cur.get_content_maintype() == 'multipart':
        boundary = self._cur.get_boundary()
        if boundary is None:
            # The message /claims/ to be a multipart but it has not
            # defined a boundary.  That's a problem which we'll handle by
            # reading everything until the EOF and marking the message as
            # defective.
            self._cur.defects.append(Errors.NoBoundaryInMultipartDefect())
            lines = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        # Create a line match predicate which matches the inter-part
        # boundary as well as the end-of-multipart boundary.  Don't push
        # this onto the input stream until we've scanned past the
        # preamble.
        separator = '--' + boundary
        boundaryre = re.compile(
            '(?P<sep>' + re.escape(separator) +
            r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
        capturing_preamble = True
        preamble = []
        linesep = False
        while True:
            line = self._input.readline()
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if line == '':
                break
            mo = boundaryre.match(line)
            if mo:
                # If we're looking at the end boundary, we're done with
                # this multipart.  If there was a newline at the end of
                # the closing boundary, then we need to initialize the
                # epilogue with the empty string (see below).
                if mo.group('end'):
                    linesep = mo.group('linesep')
                    break
                # We saw an inter-part boundary.  Were we in the preamble?
                if capturing_preamble:
                    if preamble:
                        # According to RFC 2046, the last newline belongs
                        # to the boundary.
                        lastline = preamble[-1]
                        eolmo = NLCRE_eol.search(lastline)
                        if eolmo:
                            preamble[-1] = lastline[:-len(eolmo.group(0))]
                        self._cur.preamble = EMPTYSTRING.join(preamble)
                    capturing_preamble = False
                    # Re-read the boundary line on the next pass, now that
                    # the preamble has been dealt with.
                    self._input.unreadline(line)
                    continue
                # We saw a boundary separating two parts.  Consume any
                # multiple boundary lines that may be following.  Our
                # interpretation of RFC 2046 BNF grammar does not produce
                # body parts within such double boundaries.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    mo = boundaryre.match(line)
                    if not mo:
                        self._input.unreadline(line)
                        break
                # Recurse to parse this subpart; the input stream points
                # at the subpart's first line.
                self._input.push_eof_matcher(boundaryre.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                # Because of RFC 2046, the newline preceding the boundary
                # separator actually belongs to the boundary, not the
                # previous subpart's payload (or epilogue if the previous
                # part is a multipart).
                if self._last.get_content_maintype() == 'multipart':
                    epilogue = self._last.epilogue
                    if epilogue == '':
                        self._last.epilogue = None
                    elif epilogue is not None:
                        mo = NLCRE_eol.search(epilogue)
                        if mo:
                            end = len(mo.group(0))
                            self._last.epilogue = epilogue[:-end]
                else:
                    payload = self._last.get_payload()
                    if isinstance(payload, basestring):
                        mo = NLCRE_eol.search(payload)
                        if mo:
                            payload = payload[:-len(mo.group(0))]
                            self._last.set_payload(payload)
                self._input.pop_eof_matcher()
                self._pop_message()
                # Set the multipart up for newline cleansing, which will
                # happen if we're in a nested multipart.
                self._last = self._cur
            else:
                # I think we must be in the preamble
                assert capturing_preamble
                preamble.append(line)
        # We've seen either the EOF or the end boundary.  If we're still
        # capturing the preamble, we never saw the start boundary.  Note
        # that as a defect and store the captured text as the payload.
        # Everything from here to the EOF is epilogue.
        if capturing_preamble:
            self._cur.defects.append(Errors.StartBoundaryNotFoundDefect())
            self._cur.set_payload(EMPTYSTRING.join(preamble))
            epilogue = []
            # NOTE(review): the lines read here are consumed but never
            # added to the list, so the epilogue is always set to ''.
            # Kept as is to preserve existing behaviour.
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # If the end boundary ended in a newline, we'll need to make sure
        # the epilogue isn't None
        if linesep:
            epilogue = ['']
        else:
            epilogue = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            epilogue.append(line)
        # Any CRLF at the front of the epilogue is not technically part of
        # the epilogue.  Also, watch out for an empty string epilogue,
        # which means a single newline.
        if epilogue:
            firstline = epilogue[0]
            bolmo = NLCRE_bol.match(firstline)
            if bolmo:
                epilogue[0] = firstline[len(bolmo.group(0)):]
        self._cur.epilogue = EMPTYSTRING.join(epilogue)
        return
    # Otherwise, it's some non-multipart type, so the entire rest of the
    # file contents becomes the payload.
    lines = []
    for line in self._input:
        if line is NeedMoreData:
            yield NeedMoreData
            continue
        lines.append(line)
    self._cur.set_payload(EMPTYSTRING.join(lines))

410

411

def _parse_headers(self, lines):

412

# Passed a list of lines that make up the headers for the current msg

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

413

lastheader = ''

414

lastvalue = []

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

415

for lineno, line in enumerate(lines):

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

416

# Check for continuation

417

if line[0] in ' \t':

418

if not lastheader:

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

419

# The first line of the headers was a continuation. This

420

# is illegal, so let's note the defect, store the illegal

421

# line, and ignore it for purposes of headers.

Barry Warsaw

2004-10-03 03:16:19 +0000

[diff] [blame]

422

defect = Errors.FirstHeaderLineIsContinuationDefect(line)

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

423

self._cur.defects.append(defect)

424

continue

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

425

lastvalue.append(line)

426

continue

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

427

if lastheader:

428

# XXX reconsider the joining of folded lines

Barry Warsaw

8896bf5

2004-08-07 15:57:52 +0000

[diff] [blame]

429

lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')

430

self._cur[lastheader] = lhdr

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

431

lastheader, lastvalue = '', []

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

432

# Check for envelope header, i.e. unix-from

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

433

if line.startswith('From '):

434

if lineno == 0:

Barry Warsaw

c29db26

2004-05-10 14:48:30 +0000

[diff] [blame]

435

# Strip off the trailing newline

436

mo = NLCRE_eol.search(line)

437

if mo:

438

line = line[:-len(mo.group(0))]

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

439

self._cur.set_unixfrom(line)

440

continue

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

441

elif lineno == len(lines) - 1:

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

442

# Something looking like a unix-from at the end - it's

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

443

# probably the first line of the body, so push back the

444

# line and stop.

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

445

self._input.unreadline(line)

446

return

447

else:

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

448

# Weirdly placed unix-from line. Note this as a defect

449

# and ignore it.

Barry Warsaw

2004-10-03 03:16:19 +0000

[diff] [blame]

450

defect = Errors.MisplacedEnvelopeHeaderDefect(line)

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

451

self._cur.defects.append(defect)

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

452

continue

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

453

# Split the line on the colon separating field name from value.

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

454

i = line.find(':')

455

if i < 0:

Barry Warsaw

2004-10-03 03:16:19 +0000

[diff] [blame]

456

defect = Errors.MalformedHeaderDefect(line)

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

457

self._cur.defects.append(defect)

458

continue

Anthony Baxter

2004-03-22 00:33:28 +0000

[diff] [blame]

459

lastheader = line[:i]

460

lastvalue = [line[i+1:].lstrip()]

Barry Warsaw

2004-05-09 03:29:23 +0000

[diff] [blame]

461

# Done with all the lines, so handle the last header.

Anthony Baxter