# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)

"""Classes to generate plain text from a message object tree.
"""
6
Barry Warsawba925802001-09-23 03:17:28 +00007import re
Barry Warsaw5d384ef2003-03-06 05:22:02 +00008import time
9import locale
Barry Warsawba925802001-09-23 03:17:28 +000010import random
11
Barry Warsaw6c2bc462002-10-14 15:09:30 +000012from types import ListType, StringType
Barry Warsawba925802001-09-23 03:17:28 +000013from cStringIO import StringIO
14
Barry Warsaw062749a2002-06-28 23:41:42 +000015from email.Header import Header
Barry Warsaw5d384ef2003-03-06 05:22:02 +000016from email.Parser import NLCRE
Barry Warsaw062749a2002-06-28 23:41:42 +000017
Barry Warsawb1c1de32002-09-10 16:13:45 +000018try:
19 from email._compat22 import _isstring
20except SyntaxError:
21 from email._compat21 import _isstring
22
Barry Warsaw56835dd2002-09-28 18:04:55 +000023try:
24 True, False
25except NameError:
26 True = 1
27 False = 0
Barry Warsawb1c1de32002-09-10 16:13:45 +000028
Barry Warsawd1eeecb2001-10-17 20:51:42 +000029EMPTYSTRING = ''
Barry Warsawba925802001-09-23 03:17:28 +000030SEMISPACE = '; '
31BAR = '|'
32UNDERSCORE = '_'
33NL = '\n'
Barry Warsawd1eeecb2001-10-17 20:51:42 +000034NLTAB = '\n\t'
Barry Warsawba925802001-09-23 03:17:28 +000035SEMINLTAB = ';\n\t'
36SPACE8 = ' ' * 8
37
38fcre = re.compile(r'^From ', re.MULTILINE)
39
Barry Warsaw6c2bc462002-10-14 15:09:30 +000040def _is8bitstring(s):
41 if isinstance(s, StringType):
42 try:
43 unicode(s, 'us-ascii')
44 except UnicodeError:
45 return True
46 return False
47
Barry Warsawba925802001-09-23 03:17:28 +000048
Barry Warsawe968ead2001-10-04 17:05:11 +000049
class Generator:
    """Generates output from a Message object tree.

    This basic generator writes the message to the given file object as plain
    text.
    """
    #
    # Public interface
    #

    def __init__(self, outfp, mangle_from_=True, maxheaderlen=78):
        """Create the generator for message flattening.

        outfp is the output file-like object for writing the message to.  It
        must have a write() method.

        Optional mangle_from_ is a flag that, when True (the default), escapes
        From_ lines in the body of the message by putting a `>' in front of
        them.

        Optional maxheaderlen specifies the longest length for a non-continued
        header.  When a header line is longer (in characters, with tabs
        expanded to 8 spaces) than maxheaderlen, the header is re-folded by
        email.Header.Header onto continuation lines that start with a hard
        tab.  Headers containing raw 8-bit data in a byte string are left
        alone.  Set to zero to disable wrapping headers.  Default is 78, as
        recommended (but not required) by RFC 2822.
        """
        self._fp = outfp
        self._mangle_from_ = mangle_from_
        self.__maxheaderlen = maxheaderlen

    def write(self, s):
        """Write the string s to the underlying output file object."""
        # Just delegate to the file object
        self._fp.write(s)

    def flatten(self, msg, unixfrom=False):
        """Print the message object tree rooted at msg to the output file
        specified when the Generator instance was created.

        unixfrom is a flag that forces the printing of a Unix From_ delimiter
        before the first object in the message tree.  If the original message
        has no From_ delimiter, a `standard' one is crafted.  By default, this
        is False to inhibit the printing of any From_ delimiter.

        Note that for subobjects, no From_ line is printed.
        """
        if unixfrom:
            ufrom = msg.get_unixfrom()
            if not ufrom:
                # No envelope header recorded on the message: craft one
                # stamped with the current time.
                ufrom = 'From nobody ' + time.ctime(time.time())
            print >> self._fp, ufrom
        self._write(msg)

    # For backwards compatibility, but this is slower
    __call__ = flatten

    def clone(self, fp):
        """Clone this generator with the exact same options.

        The clone is an instance of self's class that writes to fp and
        receives this generator's mangle_from_ and maxheaderlen settings.
        Only those two options are forwarded to the constructor.
        """
        return self.__class__(fp, self._mangle_from_, self.__maxheaderlen)

    #
    # Protected interface - undocumented ;/
    #

    def _write(self, msg):
        """Write msg's headers followed by its body to the output file."""
        # We can't write the headers yet because of the following scenario:
        # say a multipart message includes the boundary string somewhere in
        # its body.  We'd have to calculate the new boundary /before/ we write
        # the headers so that we can write the correct Content-Type:
        # parameter.
        #
        # The way we do this, so as to make the _handle_*() methods simpler,
        # is to cache any subpart writes into a StringIO.  Then we write the
        # headers and the StringIO contents.  That way, subpart handlers can
        # Do The Right Thing, and can still modify the Content-Type: header if
        # necessary.
        oldfp = self._fp
        try:
            self._fp = sfp = StringIO()
            self._dispatch(msg)
        finally:
            # Always restore the real output file, even if a handler raised.
            self._fp = oldfp
        # Write the headers.  First we see if the message object wants to
        # handle that itself.  If not, we'll do it generically.
        meth = getattr(msg, '_write_headers', None)
        if meth is None:
            self._write_headers(msg)
        else:
            meth(self)
        self._fp.write(sfp.getvalue())

    def _dispatch(self, msg):
        """Call the most specific body handler available for msg."""
        # Get the Content-Type: for the message, then try to dispatch to
        # self._handle_<maintype>_<subtype>().  If there's no handler for the
        # full MIME type, then dispatch to self._handle_<maintype>().  If
        # that's missing too, then dispatch to self._writeBody().
        main = msg.get_content_maintype()
        sub = msg.get_content_subtype()
        # Dashes are not legal in method names, so they map to underscores.
        specific = UNDERSCORE.join((main, sub)).replace('-', '_')
        meth = getattr(self, '_handle_' + specific, None)
        if meth is None:
            generic = main.replace('-', '_')
            meth = getattr(self, '_handle_' + generic, None)
            if meth is None:
                meth = self._writeBody
        meth(msg)

    #
    # Default handlers
    #

    def _write_headers(self, msg):
        """Write every header of msg, then the blank separator line."""
        for h, v in msg.items():
            # RFC 2822 says that lines SHOULD be no more than maxheaderlen
            # characters wide, so we're well within our rights to split long
            # headers.
            text = '%s: %s' % (h, v)
            if self.__maxheaderlen > 0 and len(text) > self.__maxheaderlen:
                text = self._split_header(text)
            print >> self._fp, text
        # A blank line always separates headers from body
        print >> self._fp

    def _split_header(self, text):
        """Return text (a complete `Name: value' header) folded if needed.

        The text is returned unchanged when none of its physical lines
        exceeds maxheaderlen once tabs are expanded, or when it is a byte
        string containing non-ascii data.
        """
        maxheaderlen = self.__maxheaderlen
        # Find out whether any lines in the header are really longer than
        # maxheaderlen characters wide.  There could be continuation lines
        # that actually shorten it.  Also, replace hard tabs with 8 spaces.
        lines = [s.replace('\t', SPACE8) for s in text.splitlines()]
        for line in lines:
            if len(line) > maxheaderlen:
                break
        else:
            # No line was actually longer than maxheaderlen characters, so
            # just return the original unchanged.
            return text
        # If we have raw 8bit data in a byte string, we have no idea what the
        # encoding is.  I think there is no safe way to split this string.  If
        # it's ascii-subset, then we could do a normal ascii split, but if
        # it's multibyte then we could break the string.  There's no way to
        # know so the least harm seems to be to not split the string and risk
        # it being too long.
        if _is8bitstring(text):
            return text
        # The `text' argument already has the field name prepended, so don't
        # provide it here or the first line will get folded too short.
        h = Header(text, maxlinelen=maxheaderlen,
                   # For backwards compatibility, we use a hard tab here
                   continuation_ws='\t')
        return h.encode()

    #
    # Handlers for writing types and subtypes
    #

    def _handle_text(self, msg):
        """Write the string payload of msg, applying From_ mangling.

        Nothing is written when the payload is None.  Raises TypeError when
        the payload (after any charset body encoding) is not a string.
        """
        payload = msg.get_payload()
        if payload is None:
            return
        cset = msg.get_charset()
        if cset is not None:
            payload = cset.body_encode(payload)
        if not _isstring(payload):
            raise TypeError('string payload expected: %s' % type(payload))
        if self._mangle_from_:
            payload = fcre.sub('>From ', payload)
        self._fp.write(payload)

    # Default body handler
    _writeBody = _handle_text

    def _handle_multipart(self, msg):
        """Write the preamble, boundary-delimited subparts and epilogue."""
        # The trick here is to write out each part separately, merge them all
        # together, and then make sure that the boundary we've chosen isn't
        # present in the payload.
        msgtexts = []
        subparts = msg.get_payload()
        if subparts is None:
            # Nothing has ever been attached
            boundary = msg.get_boundary(failobj=_make_boundary())
            print >> self._fp, '--' + boundary
            print >> self._fp, '\n'
            print >> self._fp, '--' + boundary + '--'
            return
        elif _isstring(subparts):
            # e.g. a non-strict parse of a message with no starting boundary.
            self._fp.write(subparts)
            return
        elif not isinstance(subparts, ListType):
            # Scalar payload
            subparts = [subparts]
        for part in subparts:
            s = StringIO()
            g = self.clone(s)
            g.flatten(part, unixfrom=False)
            msgtexts.append(s.getvalue())
        # Keep the boundary the message already carries.  Only when it has
        # none do we pay for joining and scanning the part texts to craft one
        # that doesn't appear in any of them.
        # BAW: What about boundaries that are wrapped in double-quotes?
        boundary = msg.get_boundary(failobj=None)
        if boundary is None:
            boundary = _make_boundary(NL.join(msgtexts))
            # We don't call set_boundary() unconditionally because, while it
            # preserves order, it doesn't preserve newlines/continuations in
            # headers.  This is no big deal in practice, but turns out to be
            # inconvenient for the unittest suite.
            msg.set_boundary(boundary)
        # Write out any preamble
        if msg.preamble is not None:
            self._fp.write(msg.preamble)
            # If preamble is the empty string, the length of the split will be
            # 1, but the last element will be the empty string.  If it's
            # anything else but does not end in a line separator, the length
            # will be > 1 and not end in an empty string.  We need to
            # guarantee a newline after the preamble, but don't add too many.
            plines = NLCRE.split(msg.preamble)
            if plines != [''] and plines[-1] != '':
                self._fp.write('\n')
        # First boundary is a bit different; it doesn't have a leading extra
        # newline.
        print >> self._fp, '--' + boundary
        # Join and write the individual parts
        joiner = '\n--' + boundary + '\n'
        self._fp.write(joiner.join(msgtexts))
        # The closing delimiter gets no trailing newline of its own; write()
        # is used so nothing depends on print's softspace bookkeeping.
        self._fp.write('\n--' + boundary + '--')
        # Write out any epilogue
        if msg.epilogue is not None:
            if not msg.epilogue.startswith('\n'):
                print >> self._fp
            self._fp.write(msg.epilogue)

    def _handle_message_delivery_status(self, msg):
        """Write each header block of a message/delivery-status payload."""
        # We can't just write the headers directly to self's file object
        # because this will leave an extra newline between the last header
        # block and the boundary.  Sigh.
        blocks = []
        for part in msg.get_payload():
            s = StringIO()
            g = self.clone(s)
            g.flatten(part, unixfrom=False)
            text = s.getvalue()
            lines = text.split('\n')
            # Strip off the unnecessary trailing empty line
            if lines and lines[-1] == '':
                blocks.append(NL.join(lines[:-1]))
            else:
                blocks.append(text)
        # Now join all the blocks with an empty line.  This has the lovely
        # effect of separating each block with an empty line, but not adding
        # an extra one after the last one.
        self._fp.write(NL.join(blocks))

    def _handle_message(self, msg):
        """Write the single message object contained in msg's payload."""
        s = StringIO()
        g = self.clone(s)
        # The payload of a message/rfc822 part should be a multipart sequence
        # of length 1.  The zeroth element of the list should be the Message
        # object for the subpart.  Extract that object, stringify it, and
        # write it out.
        g.flatten(msg.get_payload(0), unixfrom=False)
        self._fp.write(s.getvalue())
314
315
Barry Warsawe968ead2001-10-04 17:05:11 +0000316
class DecodedGenerator(Generator):
    """Generate a text representation of a message.

    Behaves like the Generator base class, except that every non-text part is
    replaced by a format string describing that part.
    """
    def __init__(self, outfp, mangle_from_=True, maxheaderlen=78, fmt=None):
        """Like Generator.__init__(), plus one extra optional argument.

        The generator walks through all subparts of a message.  For a subpart
        whose main type is `text', the decoded payload of the subpart is
        printed.  Subparts whose main type is `multipart' produce no output
        of their own.

        For any other subpart, fmt is printed in place of the payload.  fmt
        is a format string expanded with these keywords (written in
        %(keyword)s form):

        type       : Full MIME type of the non-text part
        maintype   : Main MIME type of the non-text part
        subtype    : Sub-MIME type of the non-text part
        filename   : Filename of the non-text part
        description: Description associated with the non-text part
        encoding   : Content transfer encoding of the non-text part

        When fmt is None (the default) this format is used:

        [Non-text (%(type)s) part of message omitted, filename %(filename)s]
        """
        Generator.__init__(self, outfp, mangle_from_, maxheaderlen)
        default_fmt = ('[Non-text (%(type)s) part of message omitted, '
                       'filename %(filename)s]')
        if fmt is None:
            self._fmt = default_fmt
        else:
            self._fmt = fmt

    def _dispatch(self, msg):
        """Print a decoded or substituted rendering of each part of msg."""
        for subpart in msg.walk():
            maintype = subpart.get_main_type('text')
            if maintype == 'multipart':
                # Containers contribute nothing themselves.
                continue
            if maintype == 'text':
                print >> self, subpart.get_payload(decode=True)
                continue
            # Any other part: describe it with the format string, using a
            # bracketed placeholder for each missing piece of information.
            substitutions = {
                'type'       : subpart.get_type('[no MIME type]'),
                'maintype'   : subpart.get_main_type('[no main MIME type]'),
                'subtype'    : subpart.get_subtype('[no sub-MIME type]'),
                'filename'   : subpart.get_filename('[no filename]'),
                'description': subpart.get('Content-Description',
                                           '[no description]'),
                'encoding'   : subpart.get('Content-Transfer-Encoding',
                                           '[no encoding]'),
                }
            print >> self, self._fmt % substitutions
370
371
Barry Warsawe968ead2001-10-04 17:05:11 +0000372
Barry Warsawba925802001-09-23 03:17:28 +0000373# Helper
Barry Warsaw409a4c02002-04-10 21:01:31 +0000374def _make_boundary(text=None):
Barry Warsawba925802001-09-23 03:17:28 +0000375 # Craft a random boundary. If text is given, ensure that the chosen
376 # boundary doesn't appear in the text.
Barry Warsaw5d384ef2003-03-06 05:22:02 +0000377 dp = locale.localeconv().get('decimal_point', '.')
378 boundary = ('=' * 15) + repr(random.random()).split(dp)[1] + '=='
Barry Warsawba925802001-09-23 03:17:28 +0000379 if text is None:
380 return boundary
381 b = boundary
382 counter = 0
Barry Warsaw56835dd2002-09-28 18:04:55 +0000383 while True:
Barry Warsawba925802001-09-23 03:17:28 +0000384 cre = re.compile('^--' + re.escape(b) + '(--)?$', re.MULTILINE)
385 if not cre.search(text):
386 break
387 b = boundary + '.' + str(counter)
388 counter += 1
389 return b