# blob: a82d305ba529679fd512f3fbbe9b38b9f7604f5c
# Anthony Baxter, 39a0f04, 2004-03-22 00:33:28 +0000
# A new Feed-style Parser
2
3from email import Errors, Message
4import re
5
# Matches one line terminator at the current position.  CRLF is listed
# first so that it is preferred over a bare CR.
NLCRE = re.compile('\r\n|\r|\n')

# String constants used when joining pieces of text.
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel handed out by the input buffer when no complete line is
# pending and the end of the input has not been signalled yet.  Always
# compare against it with ``is``.
NeedMoreData = object()
12
class FeedableLumpOfText:
    """A file-like object that can have new data loaded into it.

    Text is fed in with push() in chunks of any size and handed back one
    line at a time, with the line terminator still attached.  While no
    complete line is pending and end() has not been called, the reading
    methods hand out the module-level NeedMoreData sentinel instead of a
    line; after end() they return '' once everything has been consumed.
    """

    # Matcher for a line terminator at the very end of a string.
    NLatend = re.compile('(\r\n|\r|\n)$').match
    # Splitter for line terminators.  The capturing group makes re.split()
    # keep the terminators in its result.
    NLCRE_crack = re.compile('(\r\n|\r|\n)')

    def __init__(self):
        # Text received after the last complete line.
        self._partial = ''
        # Set once end() has been called.
        self._done = False
        # _pending is a list of lines, in reverse order: the next line to
        # be read is the last element.
        self._pending = []

    def readline(self):
        """Return a line of data.

        If data has been pushed back with unreadline(), the most recently
        pushed-back line is returned first.  Returns NeedMoreData when no
        line is pending and the input is still open, and '' when no line
        is pending and end() has been called.
        """
        if self._pending:
            return self._pending.pop()
        if self._done:
            return ''
        return NeedMoreData

    def unreadline(self, line):
        """Push a line back into the object; it is the next one read."""
        self._pending.append(line)

    def peekline(self):
        """Non-destructively look at the next line.

        Returns the same value readline() would, without consuming it.
        """
        if self._pending:
            return self._pending[-1]
        if self._done:
            return ''
        return NeedMoreData

    def readuntil(self, matchre, afterblank=False, includematch=False):
        """Read a line at a time until we get the specified RE.

        This is a generator.  It yields NeedMoreData each time it runs out
        of pending lines before the input is finished, then yields exactly
        one (text, matchobj) tuple and stops.  Typical use:

            for r in buf.readuntil(regexp):
                if r is NeedMoreData:
                    yield NeedMoreData
                    continue
                preamble, matchobj = r
                break

        The text is everything read up to (and including, if includematch
        is true) the matched line.  If afterblank is true, a line only
        counts as a match when the line immediately before it was blank.
        The matched line is consumed, so the next read returns the line
        following it.  If the end of the input is reached first, the text
        read so far is yielded with None as the match object.
        """
        prematch = []
        prev_blank = False
        while True:
            if not self._pending:
                if self._done:
                    # End of file: hand back what we have, then finish.
                    yield ''.join(prematch), None
                    return
                yield NeedMoreData
                continue
            line = self._pending.pop()
            m = matchre.match(line)
            if m and (prev_blank or not afterblank):
                if includematch:
                    prematch.append(line)
                yield ''.join(prematch), m
                return
            # A line that starts with a terminator is a blank line.  Blank
            # lines are still part of the text that precedes the match.
            prev_blank = self.NLCRE_crack.match(line) is not None
            prematch.append(line)

    def push(self, data):
        """Push some new data into this object.

        Complete lines become available to the reading methods; any text
        after the last line terminator is kept until more data arrives or
        end() is called.
        """
        # Handle any previous leftovers
        data = self._partial + data
        # Crack into lines, but leave the newlines on the end of each.
        # Because the pattern has a group, re.split() returns
        # [text, sep, text, sep, ..., tail]; tail is the data after the
        # final terminator (the empty string if data ends with one).
        parts = self.NLCRE_crack.split(data)
        self._partial = parts.pop()
        if not self._partial and parts and parts[-1] == '\r':
            # The chunk ends in a bare CR, which may be the first half of
            # a CRLF whose LF arrives with the next chunk.  Hold the line
            # back so the pair is not split into a line plus a blank line.
            self._partial = parts[-2] + parts[-1]
            del parts[-2:]
        self.pushlines(
            [text + sep for text, sep in zip(parts[0::2], parts[1::2])])

    def pushlines(self, lines):
        """Push a list of new lines into the object."""
        # Reverse and insert at the front of _pending, so that they are
        # read after everything already pending and in their given order.
        self._pending[:0] = lines[::-1]

    def end(self):
        """There is no more data."""
        if self._partial:
            # Don't lose a final line that has no terminator.
            self.pushlines([self._partial])
            self._partial = ''
        self._done = True

    def is_done(self):
        """Return true once end() has been called."""
        return self._done

    def __iter__(self):
        return self

    def next(self):
        """Return the next line (or NeedMoreData); stop at end of input."""
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

    # Iterator protocol spelling used by Python 3.
    __next__ = next
126
class FeedParser:
    """A feed-style parser of email.

    Text is supplied incrementally with feed(); every call parses as much
    as the data received so far allows.  end() signals the end of the
    input, finishes parsing and returns the root message object.
    """

    # A line is treated as part of a header block when it starts with
    # "From ", with a field name of two or more word/dash characters
    # followed by a colon, or with a space or tab (a continuation line).
    headerRE = re.compile(r'^(From |[-\w]{2,}:|[\t ])')

    def __init__(self, _class=Message.Message):
        """Create a parser.

        _class is called with no arguments to create the root message
        object and every sub-object.
        """
        self._class = _class
        self._input = FeedableLumpOfText()
        self._root = None
        # Message objects currently being built, outermost first.
        self._objectstack = []
        step = self._parsegen()
        # Generators spell their "advance" method next() on Python 2 and
        # __next__() on Python 3; bind whichever one exists.
        self._parse = getattr(step, '__next__', None) or step.next

    def end(self):
        """Signal the end of the input and return the root message object."""
        self._input.end()
        self._call_parse()
        return self._root

    def feed(self, data):
        """Push more text into the parser and parse what is available."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Advance the parsing generator one step.  Once the generator has
        # run to completion there is nothing left to do, so the resulting
        # StopIteration is deliberately ignored.
        try:
            self._parse()
        except StopIteration:
            pass

    def _parse_headers(self, headerlist):
        """Store the given header lines on the current message object.

        headerlist is a list of strings that are the headers for the
        current object.  Continuation lines (starting with a space or tab)
        are joined onto the preceding header.  A "From " line is recorded
        as the Unix-From line when it comes first, pushed back onto the
        input when it comes last, and ignored anywhere else.

        Raises Errors.HeaderParseError for a continuation line with no
        header before it and for a line that has no colon.
        """
        lastheader = ''
        lastvalue = []
        final = len(headerlist) - 1

        for lineno, line in enumerate(headerlist):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'First line must not be a continuation')
                lastvalue.append(line)
                continue

            if lastheader:
                # XXX reconsider the joining of folded lines
                self._cur[lastheader] = ''.join(lastvalue).rstrip()
                lastheader, lastvalue = '', []

            # Check for Unix-From
            if line.startswith('From '):
                if lineno == 0:
                    self._cur.set_unixfrom(line)
                elif lineno == final:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body
                    self._input.unreadline(line)
                    return
                # Otherwise it is a weirdly placed unix-from line, which
                # is ignored.
                continue

            i = line.find(':')
            if i < 0:
                # The older parser had various special-cases here.  We've
                # already handled them
                raise Errors.HeaderParseError(
                    "Not a header, not a continuation: ``%s''" % line)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]

        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = ''.join(lastvalue).rstrip()

    def _parsegen(self):
        """Generator that performs the actual parsing.

        It yields whenever the input buffer reports that it needs more
        data, and is resumed by _call_parse() after more text has been
        fed in or the end of the input has been signalled.  The yielded
        values themselves are not used by _call_parse().
        """
        # Parse any currently available text
        self._new_sub_object()
        self._root = self._cur
        # True while we are looking at the text that follows a finished
        # part rather than at the headers of a new one.
        completing = False
        # The most recently finished message object, if any.
        last = None

        for line in self._input:
            if line is NeedMoreData:
                yield None  # Need More Data
                continue
            self._input.unreadline(line)
            if not completing:
                headers = []
                # Now collect all headers.
                for line in self._input:
                    if line is NeedMoreData:
                        yield None  # Need More Data
                        continue
                    if not self.headerRE.match(line):
                        self._parse_headers(headers)
                        # A message/rfc822 has no body and no internal
                        # boundary.
                        if self._cur.get_content_maintype() == "message":
                            self._new_sub_object()
                            completing = False
                            headers = []
                            continue
                        if line.strip():
                            # No blank line between headers and body.
                            # Push this line back, it's the first line of
                            # the body.
                            self._input.unreadline(line)
                        break
                    else:
                        headers.append(line)
                else:
                    # We're done with the data and are still inside the
                    # headers
                    self._parse_headers(headers)

            # Now we're dealing with the body
            boundary = self._cur.get_boundary()
            isdigest = (self._cur.get_content_type() == 'multipart/digest')
            if boundary and not self._cur._finishing:
                separator = '--' + boundary
                self._cur._boundaryRE = re.compile(
                    r'(?P<sep>' + re.escape(separator) +
                    r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
                for r in self._input.readuntil(self._cur._boundaryRE):
                    if r is NeedMoreData:
                        yield NeedMoreData
                    else:
                        preamble, matchobj = r
                        break
                if not matchobj:
                    # Broken - we hit the end of file.  Just set the body
                    # to the text.
                    if completing:
                        self._attach_trailer(last, preamble)
                    else:
                        self._attach_preamble(self._cur, preamble)
                    # XXX move back to the parent container.
                    self._pop_container()
                    completing = True
                    continue
                if preamble:
                    if completing:
                        # Drop as many trailing characters as the boundary
                        # line's own line separator is long.
                        preamble = preamble[:-len(matchobj.group('linesep'))]
                        self._attach_trailer(last, preamble)
                    else:
                        self._attach_preamble(self._cur, preamble)
                elif not completing:
                    # The module docs specify an empty preamble is None,
                    # not ''
                    self._cur.preamble = None
                # If we _are_ completing, the last object gets no payload

                if matchobj.group('end'):
                    # That was the end boundary tag.  Bounce back to the
                    # parent container
                    last = self._pop_container()
                    self._input.unreadline(matchobj.group('linesep'))
                    completing = True
                    continue

                # A number of MTAs produced by a nameless large company
                # we shall call "SicroMoft" produce repeated boundary
                # lines.
                while True:
                    line = self._input.peekline()
                    if line is NeedMoreData:
                        yield None
                        continue
                    if self._cur._boundaryRE.match(line):
                        self._input.readline()
                    else:
                        break

                self._new_sub_object()

                completing = False
                if isdigest:
                    self._cur.set_default_type('message/rfc822')
                continue
            else:
                # non-multipart or after end-boundary
                if last is not self._root:
                    last = self._pop_container()
                if self._cur.get_content_maintype() == "message":
                    # We double-pop to leave the RFC822 object
                    self._pop_container()
                    completing = True
                elif self._cur._boundaryRE and last is not self._root:
                    completing = True
                else:
                    # Non-multipart top level, or in the trailer of the
                    # top level multipart
                    while not self._input.is_done():
                        yield None
                    data = list(self._input)
                    body = ''.join(data)
                    self._attach_trailer(last, body)

    def _attach_trailer(self, obj, trailer):
        # Text following a container (multipart or message) is stored as
        # its epilogue; for any other object it is the payload.
        if obj.get_content_maintype() in ("multipart", "message"):
            obj.epilogue = trailer
        else:
            obj.set_payload(trailer)

    def _attach_preamble(self, obj, trailer):
        # Text preceding the first boundary of a container (multipart or
        # message) is stored as its preamble; for any other object it is
        # the payload.
        if obj.get_content_maintype() in ("multipart", "message"):
            obj.preamble = trailer
        else:
            obj.set_payload(trailer)

    def _new_sub_object(self):
        # Create a new message object, attach it to the object on top of
        # the stack (if there is one), push it and make it current.
        new = self._class()
        if self._objectstack:
            self._objectstack[-1].attach(new)
        self._objectstack.append(new)
        # Parser bookkeeping kept on the message object itself.
        new._boundaryRE = None
        new._finishing = False
        self._cur = new

    def _pop_container(self):
        # Move the pointer to the container of the current object.
        # Returns the (old) current object.  When the stack becomes empty
        # the current object is left in place and marked as finishing.
        last = self._objectstack.pop()
        if self._objectstack:
            self._cur = self._objectstack[-1]
        else:
            self._cur._finishing = True
        return last
361
362