Blame - Lib/email/FeedParser.py - platform/external/python/cpython3

Barry Warsaw

418101f

2004-05-09 03:29:23 +0000

[diff] [blame^]

1

2

# Authors: Baxter, Wouters and Warsaw

Anthony Baxter

39a0f04

2004-03-22 00:33:28 +0000

[diff] [blame]

3

Barry Warsaw

418101f

2004-05-09 03:29:23 +0000

[diff] [blame^]

4

"""FeedParser - An email feed parser.

5

6

The feed parser implements an interface for incrementally parsing an email

7

message, line by line. This has advantages for certain applications, such as

8

those reading email messages off a socket.

9

10

FeedParser.feed() is the primary interface for pushing new data into the

11

parser. It returns when there's nothing more it can do with the available

12

data. When you have no more data to push into the parser, call .close().

13

This completes the parsing and returns the root message object.

14

15

The other advantage of this parser is that it will never throw a parsing

16

exception. Instead, when it finds something unexpected, it adds a 'defect' to

17

the current message. Defects are just instances that live on the message

18

object's .defects attribute.

19

"""

20

Anthony Baxter

39a0f04

2004-03-22 00:33:28 +0000

[diff] [blame]

21

import re

Barry Warsaw

418101f

2004-05-09 03:29:23 +0000

[diff] [blame^]

22

from email import Errors

23

from email import Message

Anthony Baxter

39a0f04

2004-03-22 00:33:28 +0000

[diff] [blame]

24

25

# Line-ending patterns.  Every one recognizes the same three terminators
# (CRLF, bare CR, bare LF); they differ only in grouping and anchoring.
#
# NLCRE       - a terminator, tested at the start of a line via .match()
# NLCRE_bol   - same, but captured in a group
# NLCRE_eol   - a captured terminator anchored at the end of the string
# NLCRE_crack - captured terminator used with .split() so that the
#               terminators are kept in the resulting list
NLCRE = re.compile('\r\n|\r|\n')
NLCRE_bol = re.compile('(\r\n|\r|\n)')
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
NLCRE_crack = re.compile('(\r\n|\r|\n)')

# A line belongs to the header block when it starts with a unix-from
# ("From "), a field name of at least two characters followed by a colon,
# or continuation whitespace.
headerRE = re.compile(r'^(From |[-\w]{2,}:|[\t ])')

EMPTYSTRING = ''
NL = '\n'

# Unique sentinel returned when no complete line is available yet and the
# input has not been closed.
NeedMoreData = object()

34

Anthony Baxter

39a0f04

2004-03-22 00:33:28 +0000

[diff] [blame]

35

Barry Warsaw

418101f

2004-05-09 03:29:23 +0000

[diff] [blame^]

36

37

class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """

    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order (the next line to
        # be read is the last element).
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Make `pred` the active false-EOF predicate."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the active false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as finished, flushing any trailing partial line."""
        # Don't forget any trailing partial line.  It is the textually last
        # line, so it goes to the *front* of the reversed stack, behind any
        # lines still waiting to be read.  An empty partial is not queued,
        # because readline() would hand it out as a premature EOF.
        if self._partial:
            self._lines.insert(0, self._partial)
        # Clear it so that closing twice does not queue the text twice.
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData."""
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        if self._eofstack:
            matches = self._eofstack[-1]
            if matches(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push `line` back so that it is the next line read."""
        # Let the consumer push a line back into the buffer.
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # If the data ends in a bare '\r' we cannot tell yet whether a '\n'
        # follows in the next chunk.  Hold that line back as the partial line
        # so a CRLF split across two pushes is not read as two line endings.
        if not self._partial and parts and parts[-1].endswith('\r'):
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.  Floor division keeps the count an
        # integer even when true division is in effect.
        lines = []
        for i in range(len(parts) // 2):
            lines.append(parts[i*2] + parts[i*2+1])
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue `lines` (in reading order) behind the lines already queued."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Return the next line; stop at (false) EOF.

        NeedMoreData is passed through as an ordinary item, so callers
        iterating over this object must check for it.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

    # Alias so the object is an iterator under either spelling of the
    # iterator protocol.
    __next__ = next

Anthony Baxter

39a0f04

2004-03-22 00:33:28 +0000

[diff] [blame]

120

Barry Warsaw

418101f

2004-05-09 03:29:23 +0000

[diff] [blame^]

121

122

Anthony Baxter

39a0f04

2004-03-22 00:33:28 +0000

[diff] [blame]

123

class FeedParser:
    """A feed-style parser of email.

    Data is handed over with feed(); close() finishes the parse and returns
    the root message object.  Problems found while parsing are recorded on
    the affected message's `defects` list.
    """

    # Types treated as a string payload.  `basestring` only exists on older
    # interpreters; fall back to `str` where it is missing.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self, _factory=Message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        self._msgstack = []
        # The parser is a generator; self._parse resumes it.  Bind whichever
        # spelling of the "advance" method the generator object provides.
        parsegen = self._parsegen()
        try:
            self._parse = parsegen.__next__
        except AttributeError:
            self._parse = parsegen.next
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parse generator until it needs data or has finished."""
        try:
            self._parse()
        except StopIteration:
            # The generator has run to completion; nothing more to do.
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._cur.defects = []
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursively, its subparts).

        Yields NeedMoreData whenever the input runs dry before EOF and is
        resumed once more data has been fed.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        # So now the input is sitting at the first body line.  If the message
        # claims to be a message/rfc822 type, then what follows is another RFC
        # 2822 message.
        if self._cur.get_content_type() == 'message/rfc822':
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.  Either read may find the
                # buffer empty, so wait for more data instead of mistaking
                # the NeedMoreData sentinel for a line.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(Errors.NoBoundaryInMultipart())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Recurse to
                    # parse this subpart; the input stream points at the
                    # subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, self._string_types):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Otherwise everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(Errors.StartBoundaryNotFound())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.  The list is empty when the input
            # ended without a closing boundary, so guard the index.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header lines in `lines` on the current message.

        Malformed input is recorded on the current message's defects list
        rather than raised.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = Errors.FirstHeaderLineIsContinuation(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                # Strip the whole line terminator; chopping one character
                # would leave a stray '\r' behind on CRLF-terminated input.
                self._cur[lastheader] = \
                    EMPTYSTRING.join(lastvalue).rstrip('\r\n')
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = Errors.MisplacedEnvelopeHeader(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = Errors.MalformedHeader(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            # NOTE(review): this strips all trailing whitespace, whereas
            # headers stored inside the loop lose only their line terminator.
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()