blob: 46fe9ebb2034731bef2ebece3936231e86839eee [file] [log] [blame]
"""Generic MIME parser.

Classes:

  MimeParser - Generic MIME parser.

Exceptions:

  MimeError - Exception raised by MimeParser class.

XXX To do:

- Content-transfer-encoding issues
- Use Content-length header in rawbody()?
- Cache parts instead of reparsing each time
- The message strings in exceptions could use some work

"""
19
20from types import * # Python types, not MIME types :-)
21import string
22import regex
23import SubFile
24import mimetools
25
26
class MimeError(Exception):
    """Exception raised by the MimeParser class.

    This is a class-based exception rather than a string exception, so
    it is matched by type instead of by object identity.  Existing
    ``raise MimeError, "message"`` and ``except MimeError`` statements
    keep working unchanged; the message is available via str().

    """
28
29
30class MimeParser:
31
32 """Generic MIME parser.
33
34 This requires a seekable file.
35
36 """
37
38 def __init__(self, fp):
39 """Constructor: store the file pointer and parse the headers."""
40 self._fp = fp
41 self._start = fp.tell()
42 self._headers = h = mimetools.Message(fp)
43 self._bodystart = fp.tell()
44 self._multipart = h.getmaintype() == 'multipart'
45
46 def multipart(self):
47 """Return whether this is a multipart message."""
48 return self._multipart
49
50 def headers(self):
51 """Return the headers of the MIME message, as a Message object."""
52 return self._headers
53
54 def rawbody(self):
55 """Return the raw body of the MIME message, as a file-like object.
56
57 This is a fairly low-level interface -- for a multipart
58 message, you'd have to parse the body yourself, and it doesn't
59 translate the Content-transfer-encoding.
60
61 """
62 # XXX Use Content-length to set end if it exists?
63 return SubFile.SubFile(self._fp, self._bodystart)
64
65 def body(self):
66 """Return the body of a 1-part MIME message, as a file-like object.
67
68 This should interpret the Content-transfer-encoding, if any
69 (XXX currently it doesn't).
70
71 """
72 if self._multipart:
73 raise MimeError, "body() only works for 1-part messages"
74 return self.rawbody()
75
76 _re_content_length = regex.compile('content-length:[ \t]*\([0-9]+\)',
77 regex.casefold)
78
79 def rawparts(self):
80 """Return the raw body parts of a multipart MIME message.
81
82 This returns a list of SubFile() objects corresponding to the
83 parts. Note that the phantom part before the first separator
84 is returned too, as list item 0. If the final part is not
85 followed by a terminator, it is ignored, and this error is not
86 reported. (XXX: the error should be raised).
87
88 """
89 if not self._multipart:
90 raise MimeError, "[raw]parts() only works for multipart messages"
91 h = self._headers
92 separator = h.getparam('boundary')
93 if not separator:
94 raise MimeError, "multipart boundary not specified"
95 separator = "--" + separator
96 terminator = separator + "--"
97 ns = len(separator)
98 list = []
99 f = self._fp
100 start = f.tell()
101 clength = -1
102 bodystart = -1
103 inheaders = 0
104 while 1:
105 end = f.tell()
106 line = f.readline()
107 if not line:
108 break
109 if line[:2] != "--" or line[:ns] != separator:
110 if inheaders:
111 re = self._re_content_length
112 if re.match(line) > 0:
113 try:
114 clength = string.atoi(re.group(1))
115 except string.atoi_error:
116 pass
117 if not string.strip(line):
118 inheaders = 0
119 bodystart = f.tell()
120 if clength > 0:
121 # Skip binary data
122 f.read(clength)
123 continue
124 line = string.strip(line)
125 if line == terminator or line == separator:
126 if clength >= 0:
127 # The Content-length header determines the subfile size
128 end = bodystart + clength
129 else:
130 # The final newline is not part of the content
131 end = end-1
132 list.append(SubFile.SubFile(f, start, end))
133 start = f.tell()
134 clength = -1
135 inheaders = 1
136 if line == terminator:
137 break
138 return list
139
140 def parts(self):
141 """Return the parsed body parts of a multipart MIME message.
142
143 This returns a list of MimeParser() instances corresponding to
144 the parts. The phantom part before the first separator is not
145 included.
146
147 """
148 return map(MimeParser, self.rawparts()[1:])
149
150 def getsubpartbyposition(self, indices):
151 part = self
152 for i in indices:
153 part = part.parts()[i]
154 return part
155
156 def getsubpartbyid(self, id):
157 h = self._headers
158 cid = h.getheader('content-id')
159 if cid and cid == id:
160 return self
161 if self._multipart:
162 for part in self.parts():
163 parser = MimeParser(part)
164 hit = parser.getsubpartbyid(id)
165 if hit:
166 return hit
167 return None
168
169 def index(self):
170 """Return an index of the MIME file.
171
172 This parses the entire file and returns index information
173 about it, in the form of a tuple
174
175 (ctype, headers, body)
176
177 where 'ctype' is the content type string of the message
178 (e.g. `text/plain' or `multipart/mixed') and 'headers' is a
179 Message instance containing the message headers (which should
180 be treated as read-only).
181
182 The 'body' item depends on the content type:
183
184 - If it is an atomic message (anything except for content type
185 multipart/*), it is the file-like object returned by
186 self.body().
187
188 - For a content type of multipart/*, it is the list of
189 MimeParser() objects returned by self.parts().
190
191 """
192 if self._multipart:
193 body = self.parts()
194 else:
195 body = self.body()
196 return self._headers.gettype(), self._headers, body
197
198
199def _show(parser, level=0):
200 """Helper for _test()."""
201 ctype, headers, body = parser.index()
202 print ctype,
203 if type(body) == ListType:
204 nparts = len(body)
205 print "(%d part%s):" % (nparts, nparts != 1 and "s" or "")
206 n = 0
207 for part in body:
208 n = n+1
209 print "%*d." % (4*level+2, n),
210 _show(part, level+1)
211 else:
212 bodylines = body.readlines()
213 print "(%d header lines, %d body lines)" % (
214 len(headers.headers), len(bodylines))
215 for line in headers.headers + ['\n'] + bodylines:
216 if line[-1:] == '\n': line = line[:-1]
217 print " "*level + line
218
def _test(args = None):
    """Test program invoked when run as a script.

    When a filename argument is specified, it reads from that file.
    When no arguments are present, it defaults to 'testkp.txt' if it
    exists, else it defaults to stdin.  The filename '-' also selects
    stdin.

    args is an optional list of arguments; when it is empty or
    omitted, sys.argv[1:] is used instead.

    """
    # Import unconditionally: sys is needed for stdin even when the
    # caller passes args explicitly.
    import os
    import sys
    if not args:
        args = sys.argv[1:]
    if args:
        fn = args[0]
    else:
        fn = 'testkp.txt'
        if not os.path.exists(fn):
            fn = '-'
    if fn == '-':
        _show(MimeParser(sys.stdin))
        return
    fp = open(fn)
    try:
        _show(MimeParser(fp))
    finally:
        # Don't leak the file handle, even if parsing fails
        fp.close()
243
# Script entry point: run the self-test when this file is executed
# directly rather than imported.
if __name__ == '__main__':
    # Importing sys here binds it as a module-level name before the
    # self-test runs.
    import sys
    _test()