Blame - Lib/email/Parser.py - platform/external/python/cpython2

2001-09-23 03:17:28 +0000

[diff] [blame]

2

# Author: barry@zope.com (Barry Warsaw)

3

4

"""A parser of RFC 2822 and MIME email messages.

5

"""

6

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

7

import re

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

8

from cStringIO import StringIO

Barry Warsaw

15e9dc9

2002-01-27 06:48:02 +0000

[diff] [blame]

9

from types import ListType

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

10

Barry Warsaw

69e18af

2002-06-02 19:12:03 +0000

[diff] [blame]

11

from email import Errors

12

from email import Message

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

13

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

EMPTYSTRING = ''

NL = '\n'

Barry Warsaw

2002-09-28 20:44:58 +0000

[diff] [blame]

try:

True, False

except NameError:

True = 1

False = 0

Barry Warsaw

2001-10-04 17:05:11 +0000

[diff] [blame]

24

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

25

class Parser:

Barry Warsaw

2002-09-28 20:44:58 +0000

[diff] [blame]

26

def __init__(self, _class=Message.Message, strict=False):

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

27

"""Parser of RFC 2822 and MIME email messages.

28

29

Creates an in-memory object tree representing the email message, which

30

can then be manipulated and turned over to a Generator to return the

31

textual representation of the message.

32

33

The string must be formatted as a block of RFC 2822 headers and header

34

continuation lines, optionally preceded by a `Unix-from' header. The

35

header block is terminated either by the end of the string or by a

36

blank line.

37

38

_class is the class to instantiate for new message objects when they

39

must be created. This class must have a constructor that can take

40

zero arguments. Default is Message.Message.

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

41

42

Optional strict tells the parser to be strictly RFC compliant or to be

43

more forgiving in parsing of ill-formatted MIME documents. When

44

non-strict mode is used, the parser will try to make up for missing or

45

erroneous boundaries and other peculiarities seen in the wild.

Barry Warsaw

bb26b45

2002-07-19 22:25:34 +0000

[diff] [blame]

46

Default is non-strict parsing.

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

47

"""

48

self._class = _class

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

49

self._strict = strict

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

50

Barry Warsaw

2002-09-28 20:44:58 +0000

[diff] [blame]

51

def parse(self, fp, headersonly=False):

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

52

root = self._class()

53

self._parseheaders(root, fp)

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

54

if not headersonly:

55

self._parsebody(root, fp)

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

56

return root

57

Barry Warsaw

2002-09-28 20:44:58 +0000

[diff] [blame]

58

def parsestr(self, text, headersonly=False):

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

59

return self.parse(StringIO(text), headersonly=headersonly)

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

60

61

def _parseheaders(self, container, fp):

62

# Parse the headers, storing each header/value pair on the container.

63

# A leading Unix-From line is recorded via container.set_unixfrom().

64

lastheader = ''

65

lastvalue = []

66

lineno = 0

Barry Warsaw

2002-09-28 20:44:58 +0000

[diff] [blame]

67

while True:

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

68

# Don't strip the line before we test for the end condition,

69

# because whitespace-only header lines are RFC compliant

70

# continuation lines.

71

line = fp.readline()

72

if not line:

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

73

break

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

74

line = line.splitlines()[0]

75

if not line:

76

break

77

# Ignore the trailing newline

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

78

lineno += 1

79

# Check for initial Unix From_ line

80

if line.startswith('From '):

81

if lineno == 1:

82

container.set_unixfrom(line)

83

continue

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

84

elif self._strict:

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

85

raise Errors.HeaderParseError(

86

'Unix-from in headers after first rfc822 header')

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

87

else:

88

# ignore the weirdly placed From_ line

89

# XXX: maybe set unixfrom anyway? or only if not already?

90

continue

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

91

# Header continuation line

92

if line[0] in ' \t':

93

if not lastheader:

94

raise Errors.HeaderParseError(

95

'Continuation line seen before first header')

96

lastvalue.append(line)

97

continue

98

# Normal, non-continuation header. BAW: this should check to make

99

# sure it's a legal header, e.g. doesn't contain spaces. Also, we

100

# should expose the header matching algorithm in the API, and

101

# allow for a non-strict parsing mode (that ignores the line

102

# instead of raising the exception).

103

i = line.find(':')

104

if i < 0:

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

105

if self._strict:

106

raise Errors.HeaderParseError(

107

"Not a header, not a continuation: ``%s''"%line)

108

elif lineno == 1 and line.startswith('--'):

109

# allow through duplicate boundary tags.

110

continue

111

else:

112

raise Errors.HeaderParseError(

113

"Not a header, not a continuation: ``%s''"%line)

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

114

if lastheader:

115

container[lastheader] = NL.join(lastvalue)

116

lastheader = line[:i]

117

lastvalue = [line[i+1:].lstrip()]

118

# Make sure we retain the last header

119

if lastheader:

120

container[lastheader] = NL.join(lastvalue)

121

122

def _parsebody(self, container, fp):

123

# Parse the body, but first split the payload on the content-type

124

# boundary if present.

Barry Warsaw

66971fb

2001-09-26 05:44:09 +0000

[diff] [blame]

125

boundary = container.get_boundary()

126

isdigest = (container.get_type() == 'multipart/digest')

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

127

# If there's a boundary, split the payload text into its constituent

128

# parts and parse each separately. Otherwise, just parse the rest of

129

# the body as a single message. Note: any exceptions raised in the

130

# recursive parse need to have their line numbers coerced.

131

if boundary:

132

preamble = epilogue = None

133

# Split into subparts. The first boundary we're looking for won't

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

134

# always have a leading newline since we're at the start of the

135

# body text, and there's not always a preamble before the first

136

# boundary.

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

137

separator = '--' + boundary

138

payload = fp.read()

Tim Peters

280488b

2002-08-23 18:19:30 +0000

[diff] [blame]

139

# We use an RE here because boundaries can have trailing

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

140

# whitespace.

141

mo = re.search(

142

r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',

143

payload)

144

if not mo:

Barry Warsaw

034b47a

2002-09-10 16:14:56 +0000

[diff] [blame]

145

if self._strict:

146

raise Errors.BoundaryError(

147

"Couldn't find starting boundary: %s" % boundary)

148

container.set_payload(payload)

149

return

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

150

start = mo.start()

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

151

if start > 0:

152

# there's some pre-MIME boundary preamble

153

preamble = payload[0:start]

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

154

# Find out what kind of line endings we're using

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

155

start += len(mo.group('sep')) + len(mo.group('ws'))

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

156

cre = re.compile('\r\n|\r|\n')

157

mo = cre.search(payload, start)

158

if mo:

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

159

start += len(mo.group(0))

Barry Warsaw

2002-05-19 23:51:50 +0000

[diff] [blame]

160

# We create a compiled regexp first because we need to be able to

161

# specify the start position, and the module function doesn't

162

# support this signature. :(

163

cre = re.compile('(?P<sep>\r\n|\r|\n)' +

164

re.escape(separator) + '--')

165

mo = cre.search(payload, start)

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

166

if mo:

167

terminator = mo.start()

168

linesep = mo.group('sep')

169

if mo.end() < len(payload):

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

170

# There's some post-MIME boundary epilogue

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

171

epilogue = payload[mo.end():]

172

elif self._strict:

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

173

raise Errors.BoundaryError(

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

174

"Couldn't find terminating boundary: %s" % boundary)

175

else:

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

176

# Handle the case of no trailing boundary. Check that it ends

177

# in a blank line. Some cases (spamspamspam) don't even have

178

# that!

179

mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

180

if not mo:

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

181

mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)

182

if not mo:

183

raise Errors.BoundaryError(

184

'No terminating boundary and no trailing empty line')

185

linesep = mo.group('sep')

186

terminator = len(payload)

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

187

# We split the textual payload on the boundary separator, which

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

188

# includes the trailing newline. If the container is a

Tim Peters

280488b

2002-08-23 18:19:30 +0000

[diff] [blame]

189

# multipart/digest then the subparts are by default message/rfc822

190

# instead of text/plain. In that case, they'll have an optional

191

# block of MIME headers, then an empty line followed by the

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

192

# message headers.

Barry Warsaw

2002-07-18 23:09:09 +0000

[diff] [blame]

193

parts = re.split(

194

linesep + re.escape(separator) + r'[ \t]*' + linesep,

195

payload[start:terminator])

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

196

for part in parts:

Tim Peters

280488b

2002-08-23 18:19:30 +0000

[diff] [blame]

197

if isdigest:

Barry Warsaw

2002-07-09 02:50:02 +0000

[diff] [blame]

198

if part[0] == linesep:

199

# There's no header block so create an empty message

200

# object as the container, and lop off the newline so

201

# we can parse the sub-subobject

202

msgobj = self._class()

203

part = part[1:]

204

else:

205

parthdrs, part = part.split(linesep+linesep, 1)

206

# msgobj in this case is the "message/rfc822" container

207

msgobj = self.parsestr(parthdrs, headersonly=1)

208

# while submsgobj is the message itself

209

submsgobj = self.parsestr(part)

210

msgobj.attach(submsgobj)

211

msgobj.set_default_type('message/rfc822')

212

else:

213

msgobj = self.parsestr(part)

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

214

container.preamble = preamble

215

container.epilogue = epilogue

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

216

container.attach(msgobj)

217

elif container.get_main_type() == 'multipart':

218

# Very bad. A message is a multipart with no boundary!

219

raise Errors.BoundaryError(

220

'multipart message with no defined boundary')

Barry Warsaw

66971fb

2001-09-26 05:44:09 +0000

[diff] [blame]

221

elif container.get_type() == 'message/delivery-status':

222

# This special kind of type contains blocks of headers separated

223

# by a blank line. We'll represent each header block as a

224

# separate Message object

225

blocks = []

Barry Warsaw

2002-09-28 20:44:58 +0000

[diff] [blame]

226

while True:

Barry Warsaw

66971fb

2001-09-26 05:44:09 +0000

[diff] [blame]

227

blockmsg = self._class()

228

self._parseheaders(blockmsg, fp)

229

if not len(blockmsg):

230

# No more header blocks left

231

break

232

blocks.append(blockmsg)

233

container.set_payload(blocks)

234

elif container.get_main_type() == 'message':

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

235

# Create a container for the payload, but watch out for there not

236

# being any headers left

237

try:

238

msg = self.parse(fp)

239

except Errors.HeaderParseError:

240

msg = self._class()

241

self._parsebody(msg, fp)

Barry Warsaw

69e18af

2002-06-02 19:12:03 +0000

[diff] [blame]

242

container.attach(msg)

Barry Warsaw

2001-09-23 03:17:28 +0000

[diff] [blame]

243

else:

Barry Warsaw