# blob: 402a1d08ab9e11773f2d5048cf8383da49c2094e [file] [log] [blame]
"""Guess the MIME type of a file.

This module defines two useful functions:

guess_type(url) -- guess the MIME type and encoding of a URL.

guess_extension(type) -- guess the extension for a given MIME type.

It also contains the following, for tuning the behavior:

Data:

knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffix_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types

Functions:

init([files]) -- parse a list of files, default knownfiles
read_mime_types(file) -- parse one file, return a dictionary or None

"""

import os
import posixpath
import urllib

# Names exported by "from mimetypes import *".
__all__ = ["guess_type", "guess_extension", "read_mime_types", "init"]

# Default mime.types files that init() parses when it is called without
# an explicit list.  The httpd/conf path appears twice; the repeat is
# kept so the list's contents stay exactly as callers have always seen it.
knownfiles = [
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

# Set to 1 by init(); MimeTypes.__init__ checks it to decide whether the
# module-level tables still need to be initialised.
inited = 0


class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Each instance works on its own copies of the module-level tables,
    stored in the encodings_map, suffix_map and types_map attributes.
    """

    def __init__(self, filenames=()):
        """Copy the module-level tables, then read every file in filenames.

        If the module has not been initialised yet (the global `inited'
        flag is false), init() is called first.
        """
        if not inited:
            init()
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        self.types_map = types_map.copy()
        for name in filenames:
            self.read(name)

    def guess_type(self, url):
        """Guess the type of a file based on its URL.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)
        """
        scheme, url = urllib.splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                mtype = url[:semi]
            else:
                mtype = url[:comma]
            # A leading "attr=value" parameter, or anything without a
            # slash, means no media type was given: use the default.
            if '=' in mtype or '/' not in mtype:
                mtype = 'text/plain'
            return mtype, None          # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. .tgz -> .tar.gz) until none is left.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            # Strip the encoding suffix so the type suffix becomes visible.
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        types_map = self.types_map
        if ext in types_map:
            return types_map[ext], encoding
        elif ext.lower() in types_map:
            return types_map[ext.lower()], encoding
        else:
            return None, encoding

    def guess_extension(self, type):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.
        """
        type = type.lower()
        for ext, stype in self.types_map.items():
            if type == stype:
                return ext
        return None

    def read(self, filename):
        """Read a single mime.types-format file, specified by pathname.

        Errors from open() propagate to the caller.  The file is closed
        even if parsing raises.
        """
        fp = open(filename)
        try:
            self.readfp(fp)
        finally:
            fp.close()

    def readfp(self, fp):
        """Read a single mime.types-format file.

        fp is any object with a readline() method.  Each non-empty line
        is "type suffix [suffix ...]"; a word starting with '#' begins a
        comment that runs to the end of the line.  Every suffix is
        stored in self.types_map with a leading dot.  The caller remains
        responsible for closing fp.
        """
        table = self.types_map
        while 1:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop the first comment word and everything after it.
            for i in range(len(words)):
                if words[i][0] == '#':
                    del words[i:]
                    break
            if not words:
                continue
            mtype, suffixes = words[0], words[1:]
            for suff in suffixes:
                table['.' + suff] = mtype


def guess_type(url):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    first tried case sensitive, then case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).
    """
    # init() rebinds the module-level name guess_type to a bound method,
    # so the lookup below resolves to that method rather than recursing.
    init()
    return guess_type(url)


def guess_extension(type):
    """Guess the extension for a file based on its MIME type.

    Return value is a string giving a filename extension, including the
    leading dot ('.').  The extension is not guaranteed to have been
    associated with any particular data stream, but would be mapped to the
    MIME type `type' by guess_type().  If no extension can be guessed for
    `type', None is returned.
    """
    # init() rebinds the module-level name guess_extension to a bound
    # method, so the lookup below resolves to that method, not this stub.
    init()
    return guess_extension(type)


def init(files=None):
    """Build the module-level tables and rebind the module-level functions.

    files is a list of mime.types-style pathnames; when it is None,
    knownfiles is used.  Entries that are not existing regular files are
    skipped.  Afterwards the globals suffix_map, encodings_map and
    types_map refer to the tables of a fresh MimeTypes instance, and
    guess_type / guess_extension are bound to that instance's methods.
    """
    global guess_extension, guess_type
    global suffix_map, types_map, encodings_map
    global inited
    # Set the flag before constructing MimeTypes: its __init__ calls
    # init() whenever the flag is still false.
    inited = 1
    db = MimeTypes()
    if files is None:
        files = knownfiles
    for file in files:
        if os.path.isfile(file):
            fp = open(file)
            try:
                db.readfp(fp)
            finally:
                fp.close()
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map
    guess_extension = db.guess_extension
    guess_type = db.guess_type


def read_mime_types(file):
    """Parse one mime.types-style file and return the resulting type table.

    file is a pathname.  Returns None if the file cannot be opened
    (IOError); otherwise returns the types_map dictionary of a fresh
    MimeTypes instance after the file has been read into it.
    """
    try:
        f = open(file)
    except IOError:
        return None
    try:
        db = MimeTypes()
        db.readfp(f)
    finally:
        # Close the file whether or not parsing succeeded.
        f.close()
    return db.types_map


212suffix_map = {
213 '.tgz': '.tar.gz',
214 '.taz': '.tar.gz',
215 '.tz': '.tar.gz',
Fred Drakeeeee4ec2001-08-03 21:01:44 +0000216 }
Guido van Rossumac8a9f31997-09-30 19:05:50 +0000217
# Suffixes that name an encoding (keys are case sensitive).
encodings_map = {
    '.gz': 'gzip',
    '.Z': 'compress',
    }

# Built-in table mapping a filename suffix (with leading dot) to its MIME type.
types_map = {
    '.a': 'application/octet-stream',
    '.ai': 'application/postscript',
    '.aif': 'audio/x-aiff',
    '.aifc': 'audio/x-aiff',
    '.aiff': 'audio/x-aiff',
    '.au': 'audio/basic',
    '.avi': 'video/x-msvideo',
    '.bcpio': 'application/x-bcpio',
    '.bin': 'application/octet-stream',
    '.cdf': 'application/x-netcdf',
    '.cpio': 'application/x-cpio',
    '.csh': 'application/x-csh',
    '.dll': 'application/octet-stream',
    '.dvi': 'application/x-dvi',
    '.exe': 'application/octet-stream',
    '.eps': 'application/postscript',
    '.etx': 'text/x-setext',
    '.gif': 'image/gif',
    '.gtar': 'application/x-gtar',
    '.hdf': 'application/x-hdf',
    '.htm': 'text/html',
    '.html': 'text/html',
    '.ief': 'image/ief',
    '.jpe': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.jpg': 'image/jpeg',
    '.js': 'application/x-javascript',
    '.latex': 'application/x-latex',
    '.man': 'application/x-troff-man',
    '.me': 'application/x-troff-me',
    '.mif': 'application/x-mif',
    '.mov': 'video/quicktime',
    '.movie': 'video/x-sgi-movie',
    '.mpe': 'video/mpeg',
    '.mpeg': 'video/mpeg',
    '.mpg': 'video/mpeg',
    '.ms': 'application/x-troff-ms',
    '.nc': 'application/x-netcdf',
    '.o': 'application/octet-stream',
    '.obj': 'application/octet-stream',
    '.oda': 'application/oda',
    '.pbm': 'image/x-portable-bitmap',
    '.pdf': 'application/pdf',
    '.pgm': 'image/x-portable-graymap',
    '.pnm': 'image/x-portable-anymap',
    '.png': 'image/png',
    '.ppm': 'image/x-portable-pixmap',
    '.ps': 'application/postscript',
    '.py': 'text/x-python',
    '.pyc': 'application/x-python-code',
    '.pyo': 'application/x-python-code',
    '.qt': 'video/quicktime',
    '.ras': 'image/x-cmu-raster',
    '.rgb': 'image/x-rgb',
    '.rdf': 'application/xml',
    '.roff': 'application/x-troff',
    '.rtf': 'application/rtf',
    '.rtx': 'text/richtext',
    '.sgm': 'text/x-sgml',
    '.sgml': 'text/x-sgml',
    '.sh': 'application/x-sh',
    '.shar': 'application/x-shar',
    '.snd': 'audio/basic',
    '.so': 'application/octet-stream',
    '.src': 'application/x-wais-source',
    '.sv4cpio': 'application/x-sv4cpio',
    '.sv4crc': 'application/x-sv4crc',
    '.t': 'application/x-troff',
    '.tar': 'application/x-tar',
    '.tcl': 'application/x-tcl',
    '.tex': 'application/x-tex',
    '.texi': 'application/x-texinfo',
    '.texinfo': 'application/x-texinfo',
    '.tif': 'image/tiff',
    '.tiff': 'image/tiff',
    '.tr': 'application/x-troff',
    '.tsv': 'text/tab-separated-values',
    '.txt': 'text/plain',
    '.ustar': 'application/x-ustar',
    '.wav': 'audio/x-wav',
    '.xbm': 'image/x-xbitmap',
    '.xml': 'text/xml',
    '.xsl': 'application/xml',
    '.xpm': 'image/x-xpixmap',
    '.xwd': 'image/x-xwindowdump',
    '.zip': 'application/zip',
    }

if __name__ == '__main__':
    import sys
    # Command-line use: print the (type, encoding) guess for the first
    # argument.  A single parenthesized argument prints the same tuple
    # under both the print statement and the print() function.
    print(guess_type(sys.argv[1]))