blob: ad13be374edeb75e4f29677bb6a961303e0d55d0 [file] [log] [blame]
"""Guess the MIME type of a file.

This module defines two useful functions:

guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.

guess_extension(type, strict=1) -- guess the extension for a given MIME type.

It also contains the following, for tuning the behavior:

Data:

knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffix_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types

Functions:

init([files]) -- parse a list of files, default knownfiles
read_mime_types(file) -- parse one file, return a dictionary or None
"""
24
Fred Drakeeeee4ec2001-08-03 21:01:44 +000025import os
Guido van Rossumac8a9f31997-09-30 19:05:50 +000026import posixpath
Guido van Rossum1c5fb1c1998-10-12 15:12:28 +000027import urllib
Guido van Rossumac8a9f31997-09-30 19:05:50 +000028
Skip Montanaro03d90142001-01-25 15:29:22 +000029__all__ = ["guess_type","guess_extension","read_mime_types","init"]
30
# mime.types files that init() parses by default.  Files are read in
# list order, so an entry from a later file overrides the same suffix
# from an earlier one.  Each path is listed exactly once; the Apache 1.2
# file comes after the Netscape file so that it still takes precedence
# over it.
knownfiles = [
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

# Flag set to a true value by init() once it has been called.
inited = 0
39
Fred Drakeeeee4ec2001-08-03 21:01:44 +000040
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Every instance starts from copies of the module-level tables
    (encodings_map, suffix_map, types_map and common_types) and keeps
    them as attributes of the same names.
    """

    def __init__(self, filenames=()):
        """Copy the module-level tables, then read each file in `filenames'.

        If the module has not been initialised yet, init() is called
        first so that the tables being copied are the initialised ones.
        """
        if not inited:
            init()
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        self.types_map = types_map.copy()
        self.common_types = common_types.copy()
        for name in filenames:
            self.read(name)

    def guess_type(self, url, strict=1):
        """Guess the type of a file based on its URL.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        For a `data:' URL the type is taken from the URL itself
        (defaulting to 'text/plain'); (None, None) is returned when
        the URL contains no comma.

        Optional `strict' argument when false adds a bunch of commonly
        found, but non-standard types.
        """
        scheme, url = urllib.splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                type = url[:semi]
            else:
                type = url[:comma]
            # A leading "attribute=value" means the media type was
            # omitted and only parameters were given.
            if '=' in type or '/' not in type:
                type = 'text/plain'
            return type, None           # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. .tgz -> .tar.gz) before looking
        # for an encoding suffix.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        types_map = self.types_map
        common_types = self.common_types
        if ext in types_map:
            return types_map[ext], encoding
        elif ext.lower() in types_map:
            return types_map[ext.lower()], encoding
        elif strict:
            return None, encoding
        elif ext in common_types:
            return common_types[ext], encoding
        elif ext.lower() in common_types:
            return common_types[ext.lower()], encoding
        else:
            return None, encoding

    def guess_extension(self, type, strict=1):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        `type' is lower-cased before it is compared with the tables.

        Optional `strict' argument when false adds a bunch of commonly
        found, but non-standard types.
        """
        type = type.lower()
        for ext, stype in self.types_map.items():
            if type == stype:
                return ext
        if not strict:
            # Consult this instance's own table, not the module-level
            # one, so per-instance additions are honoured.
            for ext, stype in self.common_types.items():
                if type == stype:
                    return ext
        return None

    def read(self, filename):
        """Read a single mime.types-format file, specified by pathname."""
        fp = open(filename)
        try:
            self.readfp(fp)
        finally:
            # Close the file even when parsing raises.
            fp.close()

    def readfp(self, fp):
        """Read a single mime.types-format file.

        `fp' only needs a readline() method.  Each line has the form
        "type suffix suffix ..."; every suffix is stored in
        self.types_map with a leading dot.  A word starting with '#'
        begins a comment that runs to the end of the line.
        """
        types_map = self.types_map
        while 1:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop the first word that starts a comment and everything
            # after it.
            for i in range(len(words)):
                if words[i][0] == '#':
                    del words[i:]
                    break
            if not words:
                continue
            type, suffixes = words[0], words[1:]
            for suff in suffixes:
                types_map['.' + suff] = type
167
168
def guess_type(url, strict=1):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    first tried case sensitive, then case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.
    """
    # init() rebinds the module-global name `guess_type' to a bound
    # method of the shared MimeTypes instance, so the call below reaches
    # that method rather than recursing.  Only initialise once: callers
    # that imported this function by name keep calling this wrapper, and
    # must not rebuild the database on every call.
    if not inited:
        init()
    return guess_type(url, strict)
Fred Drakeeeee4ec2001-08-03 21:01:44 +0000189
Guido van Rossumac8a9f31997-09-30 19:05:50 +0000190
def guess_extension(type, strict=1):
    """Guess the extension for a file based on its MIME type.

    Return value is a string giving a filename extension, including the
    leading dot ('.').  The extension is not guaranteed to have been
    associated with any particular data stream, but would be mapped to the
    MIME type `type' by guess_type().  If no extension can be guessed for
    `type', None is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    # init() rebinds the module-global name `guess_extension' to a bound
    # method of the shared MimeTypes instance, so the call below reaches
    # that method rather than recursing.  Only initialise once: callers
    # that imported this function by name keep calling this wrapper, and
    # must not rebuild the database on every call.
    if not inited:
        init()
    return guess_extension(type, strict)
Fred Drakeeeee4ec2001-08-03 21:01:44 +0000205
Fred Drake5109ffd1998-05-18 16:27:20 +0000206
def init(files=None):
    """Initialise the module-level tables and lookup functions.

    A MimeTypes instance is created and every entry of `files' that
    names an existing file is read into it; `files' defaults to
    knownfiles.  The module globals encodings_map, suffix_map,
    types_map and common_types are then rebound to that instance's
    tables, and guess_type / guess_extension are rebound to its
    methods.
    """
    global guess_extension, guess_type
    global suffix_map, types_map, encodings_map, common_types
    global inited
    # Set the flag before creating the instance: MimeTypes.__init__ calls
    # init() whenever `inited' is false.
    inited = 1
    db = MimeTypes()
    if files is None:
        files = knownfiles
    for file in files:
        if os.path.isfile(file):
            # read() closes the file once it has been parsed.
            db.read(file)
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map
    guess_extension = db.guess_extension
    guess_type = db.guess_type
    common_types = db.common_types
Fred Drakeeeee4ec2001-08-03 21:01:44 +0000224
Guido van Rossumac8a9f31997-09-30 19:05:50 +0000225
def read_mime_types(file):
    """Parse one mime.types-format file and return its type table.

    `file' is a pathname.  Returns None if the file cannot be opened;
    otherwise returns the types_map dictionary of a fresh MimeTypes
    instance after the file has been read into it.
    """
    try:
        f = open(file)
    except IOError:
        return None
    try:
        db = MimeTypes()
        db.readfp(f)
    finally:
        # Always release the file handle, even when parsing fails.
        f.close()
    return db.types_map
234
Guido van Rossumac8a9f31997-09-30 19:05:50 +0000235
# Shorthand suffixes and the suffix sequence each one stands for.
suffix_map = {
    '.taz': '.tar.gz',
    '.tgz': '.tar.gz',
    '.tz': '.tar.gz',
}

# Suffixes that name an encoding rather than a content type.
encodings_map = {
    '.Z': 'compress',
    '.gz': 'gzip',
}
246
Martin v. Löwisa3689fe2001-09-07 16:49:12 +0000247# Before adding new types, make sure they are either registered with IANA, at
248# http://www.isi.edu/in-notes/iana/assignments/media-types
249# or extensions, i.e. using the x- prefix
Barry Warsaw107771a2001-10-25 21:49:18 +0000250
251# If you add to these, please keep them sorted!
# Suffix -> MIME type table used for strict lookups.  Each suffix appears
# exactly once: a dictionary literal silently keeps only the last of two
# equal keys, so alternative types for the same suffix are recorded in
# comments instead of as dead entries.
types_map = {
    '.a'      : 'application/octet-stream',
    '.ai'     : 'application/postscript',
    '.aif'    : 'audio/x-aiff',
    '.aifc'   : 'audio/x-aiff',
    '.aiff'   : 'audio/x-aiff',
    '.au'     : 'audio/basic',
    '.avi'    : 'video/x-msvideo',
    '.bat'    : 'text/plain',
    '.bcpio'  : 'application/x-bcpio',
    '.bin'    : 'application/octet-stream',
    '.bmp'    : 'image/x-ms-bmp',
    '.c'      : 'text/plain',
    # '.cdf' is also seen as 'application/x-cdf'
    '.cdf'    : 'application/x-netcdf',
    '.cpio'   : 'application/x-cpio',
    '.csh'    : 'application/x-csh',
    '.css'    : 'text/css',
    '.dll'    : 'application/octet-stream',
    '.doc'    : 'application/msword',
    '.dot'    : 'application/msword',
    '.dvi'    : 'application/x-dvi',
    '.eml'    : 'message/rfc822',
    '.eps'    : 'application/postscript',
    '.etx'    : 'text/x-setext',
    '.exe'    : 'application/octet-stream',
    '.gif'    : 'image/gif',
    '.gtar'   : 'application/x-gtar',
    '.h'      : 'text/plain',
    '.hdf'    : 'application/x-hdf',
    '.htm'    : 'text/html',
    '.html'   : 'text/html',
    '.ief'    : 'image/ief',
    '.jpe'    : 'image/jpeg',
    '.jpeg'   : 'image/jpeg',
    '.jpg'    : 'image/jpeg',
    '.js'     : 'application/x-javascript',
    '.ksh'    : 'text/plain',
    '.latex'  : 'application/x-latex',
    '.m1v'    : 'video/mpeg',
    '.man'    : 'application/x-troff-man',
    '.me'     : 'application/x-troff-me',
    '.mht'    : 'message/rfc822',
    '.mhtml'  : 'message/rfc822',
    '.mif'    : 'application/x-mif',
    '.mov'    : 'video/quicktime',
    '.movie'  : 'video/x-sgi-movie',
    '.mp2'    : 'audio/mpeg',
    '.mp3'    : 'audio/mpeg',
    '.mpa'    : 'video/mpeg',
    '.mpe'    : 'video/mpeg',
    '.mpeg'   : 'video/mpeg',
    '.mpg'    : 'video/mpeg',
    '.ms'     : 'application/x-troff-ms',
    '.nc'     : 'application/x-netcdf',
    '.nws'    : 'message/rfc822',
    '.o'      : 'application/octet-stream',
    '.obj'    : 'application/octet-stream',
    '.oda'    : 'application/oda',
    '.p12'    : 'application/x-pkcs12',
    '.p7c'    : 'application/pkcs7-mime',
    '.pbm'    : 'image/x-portable-bitmap',
    '.pdf'    : 'application/pdf',
    '.pfx'    : 'application/x-pkcs12',
    '.pgm'    : 'image/x-portable-graymap',
    '.pl'     : 'text/plain',
    '.png'    : 'image/png',
    '.pnm'    : 'image/x-portable-anymap',
    '.pot'    : 'application/vnd.ms-powerpoint',
    '.ppa'    : 'application/vnd.ms-powerpoint',
    '.ppm'    : 'image/x-portable-pixmap',
    '.pps'    : 'application/vnd.ms-powerpoint',
    '.ppt'    : 'application/vnd.ms-powerpoint',
    '.ps'     : 'application/postscript',
    '.pwz'    : 'application/vnd.ms-powerpoint',
    '.py'     : 'text/x-python',
    '.pyc'    : 'application/x-python-code',
    '.pyo'    : 'application/x-python-code',
    '.qt'     : 'video/quicktime',
    '.ra'     : 'audio/x-pn-realaudio',
    '.ram'    : 'application/x-pn-realaudio',
    '.ras'    : 'image/x-cmu-raster',
    '.rdf'    : 'application/xml',
    '.rgb'    : 'image/x-rgb',
    '.roff'   : 'application/x-troff',
    '.rtx'    : 'text/richtext',
    '.sgm'    : 'text/x-sgml',
    '.sgml'   : 'text/x-sgml',
    '.sh'     : 'application/x-sh',
    '.shar'   : 'application/x-shar',
    '.snd'    : 'audio/basic',
    '.so'     : 'application/octet-stream',
    '.src'    : 'application/x-wais-source',
    '.sv4cpio': 'application/x-sv4cpio',
    '.sv4crc' : 'application/x-sv4crc',
    '.t'      : 'application/x-troff',
    '.tar'    : 'application/x-tar',
    '.tcl'    : 'application/x-tcl',
    '.tex'    : 'application/x-tex',
    '.texi'   : 'application/x-texinfo',
    '.texinfo': 'application/x-texinfo',
    '.tif'    : 'image/tiff',
    '.tiff'   : 'image/tiff',
    '.tr'     : 'application/x-troff',
    '.tsv'    : 'text/tab-separated-values',
    '.txt'    : 'text/plain',
    '.ustar'  : 'application/x-ustar',
    '.vcf'    : 'text/x-vcard',
    '.wav'    : 'audio/x-wav',
    '.wiz'    : 'application/msword',
    '.xbm'    : 'image/x-xbitmap',
    '.xlb'    : 'application/vnd.ms-excel',
    # '.xls' is also seen as 'application/excel'
    '.xls'    : 'application/vnd.ms-excel',
    '.xml'    : 'text/xml',
    '.xpm'    : 'image/x-xpixmap',
    '.xsl'    : 'application/xml',
    '.xwd'    : 'image/x-xwindowdump',
    '.zip'    : 'application/zip',
    }
Eric S. Raymond51cc3bc2001-02-09 09:44:47 +0000374
Barry Warsaw107771a2001-10-25 21:49:18 +0000375# These are non-standard types, commonly found in the wild. They will only
376# match if strict=0 flag is given to the API methods.
377
378# Please sort these too
# Non-standard suffix -> type pairs; the lookup methods consult this table
# only when called with a false `strict' argument.
common_types = {
    '.jpg': 'image/jpg',
    '.mid': 'audio/midi',
    '.midi': 'audio/midi',
    '.pct': 'image/pict',
    '.pic': 'image/pict',
    '.pict': 'image/pict',
    '.rtf': 'application/rtf',
    '.xul': 'text/xul',
}
389
390
if __name__ == '__main__':
    # Command-line driver: guess a type (default) or an extension (-e)
    # for every positional argument.
    import sys
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        """Print the usage text, then `msg' if given, and exit with `code'."""
        # A single parenthesised argument prints identically whether
        # `print' is a statement or a function.
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error:
        # Fetch the exception through sys.exc_info() so that neither the
        # old "except X, name" nor the newer "except X as name" spelling
        # is required.
        usage(1, sys.exc_info()[1])

    strict = 1
    extension = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1
    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if not guess:
                print("I don't know anything about type %s" % (gtype,))
            else:
                print(guess)
        else:
            guess, encoding = guess_type(gtype, strict)
            if not guess:
                print("I don't know anything about type %s" % (gtype,))
            else:
                print('type: %s encoding: %s' % (guess, encoding))
435 else: print 'type:', guess, 'encoding:', encoding