blob: 46fe9ebb2034731bef2ebece3936231e86839eee [file] [log] [blame]
"""Generic MIME parser.
Classes:
MimeParser - Generic MIME parser.
Exceptions:
MimeError - Exception raised by MimeParser class.
XXX To do:
- Content-transfer-encoding issues
- Use Content-length header in rawbody()?
- Cache parts instead of reparsing each time
- The message strings in exceptions could use some work
"""
from types import * # Python types, not MIME types :-)
import string
import regex
import SubFile
import mimetools
# String object used as the exception in the `raise MimeError, msg'
# statements of the MimeParser class below.
MimeError = "MimeParser.MimeError" # Exception raised by this class
class MimeParser:
"""Generic MIME parser.
This requires a seekable file.
"""
def __init__(self, fp):
"""Constructor: store the file pointer and parse the headers."""
self._fp = fp
self._start = fp.tell()
self._headers = h = mimetools.Message(fp)
self._bodystart = fp.tell()
self._multipart = h.getmaintype() == 'multipart'
def multipart(self):
"""Return whether this is a multipart message."""
return self._multipart
def headers(self):
"""Return the headers of the MIME message, as a Message object."""
return self._headers
def rawbody(self):
"""Return the raw body of the MIME message, as a file-like object.
This is a fairly low-level interface -- for a multipart
message, you'd have to parse the body yourself, and it doesn't
translate the Content-transfer-encoding.
"""
# XXX Use Content-length to set end if it exists?
return SubFile.SubFile(self._fp, self._bodystart)
def body(self):
"""Return the body of a 1-part MIME message, as a file-like object.
This should interpret the Content-transfer-encoding, if any
(XXX currently it doesn't).
"""
if self._multipart:
raise MimeError, "body() only works for 1-part messages"
return self.rawbody()
_re_content_length = regex.compile('content-length:[ \t]*\([0-9]+\)',
regex.casefold)
def rawparts(self):
"""Return the raw body parts of a multipart MIME message.
This returns a list of SubFile() objects corresponding to the
parts. Note that the phantom part before the first separator
is returned too, as list item 0. If the final part is not
followed by a terminator, it is ignored, and this error is not
reported. (XXX: the error should be raised).
"""
if not self._multipart:
raise MimeError, "[raw]parts() only works for multipart messages"
h = self._headers
separator = h.getparam('boundary')
if not separator:
raise MimeError, "multipart boundary not specified"
separator = "--" + separator
terminator = separator + "--"
ns = len(separator)
list = []
f = self._fp
start = f.tell()
clength = -1
bodystart = -1
inheaders = 0
while 1:
end = f.tell()
line = f.readline()
if not line:
break
if line[:2] != "--" or line[:ns] != separator:
if inheaders:
re = self._re_content_length
if re.match(line) > 0:
try:
clength = string.atoi(re.group(1))
except string.atoi_error:
pass
if not string.strip(line):
inheaders = 0
bodystart = f.tell()
if clength > 0:
# Skip binary data
f.read(clength)
continue
line = string.strip(line)
if line == terminator or line == separator:
if clength >= 0:
# The Content-length header determines the subfile size
end = bodystart + clength
else:
# The final newline is not part of the content
end = end-1
list.append(SubFile.SubFile(f, start, end))
start = f.tell()
clength = -1
inheaders = 1
if line == terminator:
break
return list
def parts(self):
"""Return the parsed body parts of a multipart MIME message.
This returns a list of MimeParser() instances corresponding to
the parts. The phantom part before the first separator is not
included.
"""
return map(MimeParser, self.rawparts()[1:])
def getsubpartbyposition(self, indices):
part = self
for i in indices:
part = part.parts()[i]
return part
def getsubpartbyid(self, id):
h = self._headers
cid = h.getheader('content-id')
if cid and cid == id:
return self
if self._multipart:
for part in self.parts():
parser = MimeParser(part)
hit = parser.getsubpartbyid(id)
if hit:
return hit
return None
def index(self):
"""Return an index of the MIME file.
This parses the entire file and returns index information
about it, in the form of a tuple
(ctype, headers, body)
where 'ctype' is the content type string of the message
(e.g. `text/plain' or `multipart/mixed') and 'headers' is a
Message instance containing the message headers (which should
be treated as read-only).
The 'body' item depends on the content type:
- If it is an atomic message (anything except for content type
multipart/*), it is the file-like object returned by
self.body().
- For a content type of multipart/*, it is the list of
MimeParser() objects returned by self.parts().
"""
if self._multipart:
body = self.parts()
else:
body = self.body()
return self._headers.gettype(), self._headers, body
def _show(parser, level=0):
"""Helper for _test()."""
ctype, headers, body = parser.index()
print ctype,
if type(body) == ListType:
nparts = len(body)
print "(%d part%s):" % (nparts, nparts != 1 and "s" or "")
n = 0
for part in body:
n = n+1
print "%*d." % (4*level+2, n),
_show(part, level+1)
else:
bodylines = body.readlines()
print "(%d header lines, %d body lines)" % (
len(headers.headers), len(bodylines))
for line in headers.headers + ['\n'] + bodylines:
if line[-1:] == '\n': line = line[:-1]
print " "*level + line
def _test(args=None):
    """Test program invoked when run as a script.

    When a filename argument is specified, it reads from that file.
    When no arguments are present, it defaults to 'testkp.txt' if it
    exists, else it defaults to stdin.  The filename '-' also means
    stdin.

    'args' is a list of command line arguments; when it is empty or
    omitted, sys.argv[1:] is used.
    """
    # Import unconditionally: an import inside a function binds a local
    # name, so importing on only one branch leaves it unbound on the
    # others (e.g. _test(['-']) needs sys.stdin).
    import os
    import sys
    if not args:
        args = sys.argv[1:]
    if args:
        fn = args[0]
    else:
        fn = 'testkp.txt'
        if not os.path.exists(fn):
            fn = '-'
    if fn == '-':
        fp = sys.stdin
    else:
        fp = open(fn)
    try:
        mp = MimeParser(fp)
        _show(mp)
    finally:
        # Close only what was opened here; leave stdin alone.
        if fp is not sys.stdin:
            fp.close()
# Run the self-test when this file is executed as a script.
if __name__ == '__main__':
    import sys
    _test()