blob: 8f5c076c0525c5a71173643e7076305c8748f5c0 [file] [log] [blame]
"""Guess the MIME type of a file.

This module defines two useful functions:

guess_type(url) -- guess the MIME type and encoding of a URL.

guess_extension(type) -- guess the extension for a given MIME type.

It also contains the following, for tuning the behavior:

Data:

knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffix_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types

Functions:

init([files]) -- parse a list of files, default knownfiles
read_mime_types(file) -- parse one file, return a dictionary or None

"""
25
Guido van Rossumac8a9f31997-09-30 19:05:50 +000026import posixpath
Guido van Rossum1c5fb1c1998-10-12 15:12:28 +000027import urllib
Guido van Rossumac8a9f31997-09-30 19:05:50 +000028
# Names exported by "from <this module> import *".
__all__ = [
    "guess_type",
    "guess_extension",
    "read_mime_types",
    "init",
]
30
# Files that init() parses when it is not given an explicit list.  They
# are read in order and entries from later files replace entries from
# earlier ones.
#
# The Apache 1.2 path used to be listed twice (first and third).  Only
# the later position is kept: because later files win, the resulting
# type table is the same, and the file is no longer parsed twice.
knownfiles = [
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

# Set to 1 by init(); guess_type() and guess_extension() call init()
# themselves while this is still 0.
inited = 0
39
def guess_type(url):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    first tried case sensitive, then case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).

    URLs with the "data" scheme are not looked up in the tables: the
    media type is read from the URL itself, defaulting to text/plain,
    and the encoding is always None.  A data URL without a comma is
    malformed and yields (None, None).

    """
    if not inited:
        init()
    scheme, url = urllib.splittype(url)
    if scheme == 'data':
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        # type/subtype defaults to "text/plain"
        comma = url.find(',')
        if comma < 0:
            # bad data URL
            return None, None
        # Only a ';' that appears before the comma belongs to the header.
        semi = url.find(';', 0, comma)
        if semi >= 0:
            mediatype = url[:semi]
        else:
            mediatype = url[:comma]
        # An '=' means the first field is a parameter, not type/subtype;
        # a missing '/' means no usable type/subtype was given at all.
        if '=' in mediatype or '/' not in mediatype:
            mediatype = 'text/plain'
        return mediatype, None  # never compressed, so encoding is None
    base, ext = posixpath.splitext(url)
    # Expand shorthand suffixes (e.g. ".tgz" -> ".tar.gz") and re-split.
    while ext in suffix_map:
        base, ext = posixpath.splitext(base + suffix_map[ext])
    if ext in encodings_map:
        encoding = encodings_map[ext]
        # Strip the encoding suffix so the type suffix is examined next.
        base, ext = posixpath.splitext(base)
    else:
        encoding = None
    if ext in types_map:
        return types_map[ext], encoding
    lowered = ext.lower()
    if lowered in types_map:
        return types_map[lowered], encoding
    return None, encoding
Guido van Rossumac8a9f31997-09-30 19:05:50 +000092
def guess_extension(type):
    """Guess the extension for a file based on its MIME type.

    The result is a filename extension string that includes the leading
    dot ('.').  It is not guaranteed to have been associated with any
    particular data stream, but guess_type() would map it to the MIME
    type `type'.  The given type is lower-cased before it is compared
    with the entries of types_map.  When several extensions share the
    type, the first one met while iterating over types_map is returned.
    None is returned when no extension can be guessed for `type'.
    """
    if not inited:
        init()
    wanted = type.lower()
    for suffix, known_type in types_map.items():
        if known_type == wanted:
            return suffix
    return None
110
def init(files=None):
    """Merge the type definitions found in a list of files into types_map.

    files is a sequence of file names; when it is omitted or empty the
    module-level list knownfiles is used instead.  Files are processed
    in order, so an entry from a later file replaces the same suffix
    from an earlier one.  Files for which read_mime_types() returns
    nothing are skipped.  The inited flag is set to 1 afterwards.
    """
    global inited
    if not files:
        files = knownfiles
    for path in files:
        parsed = read_mime_types(path)
        if parsed:
            types_map.update(parsed)
    inited = 1
119
def read_mime_types(file):
    """Parse one mime.types-style file.

    file is the name of the file to read.  Each line is split on
    whitespace; the first word is a MIME type and the remaining words
    are the suffixes (without the leading dot) that map to it.  A word
    starting with '#' begins a comment that runs to the end of the line.

    Returns a dictionary mapping '.' + suffix to the MIME type, or None
    if the file cannot be opened.
    """
    try:
        f = open(file)
    except IOError:
        return None
    mapping = {}
    # try/finally so the file is closed even if reading fails part way.
    try:
        for line in f.readlines():
            words = line.split()
            # Drop the first word that starts a comment and everything
            # after it.  A '#' inside a word is left alone.
            for i in range(len(words)):
                if words[i][0] == '#':
                    del words[i:]
                    break
            if not words:
                continue
            mime_type, suffixes = words[0], words[1:]
            for suff in suffixes:
                mapping['.' + suff] = mime_type
    finally:
        f.close()
    return mapping
140
# Shorthand suffixes and the suffix sequence each one stands for.
# guess_type() substitutes the expansion before looking at the
# encoding and type tables.  Keys are matched case sensitively.
suffix_map = {
    ".tgz": ".tar.gz",
    ".taz": ".tar.gz",
    ".tz": ".tar.gz",
    }
146
# Suffixes that name an encoding rather than a type, mapped to the name
# reported as the encoding by guess_type().  Keys are matched case
# sensitively.
encodings_map = {
    ".gz": "gzip",
    ".Z": "compress",
    }
151
152types_map = {
153 '.a': 'application/octet-stream',
154 '.ai': 'application/postscript',
155 '.aif': 'audio/x-aiff',
156 '.aifc': 'audio/x-aiff',
157 '.aiff': 'audio/x-aiff',
158 '.au': 'audio/basic',
159 '.avi': 'video/x-msvideo',
160 '.bcpio': 'application/x-bcpio',
161 '.bin': 'application/octet-stream',
162 '.cdf': 'application/x-netcdf',
163 '.cpio': 'application/x-cpio',
164 '.csh': 'application/x-csh',
165 '.dll': 'application/octet-stream',
166 '.dvi': 'application/x-dvi',
167 '.exe': 'application/octet-stream',
168 '.eps': 'application/postscript',
169 '.etx': 'text/x-setext',
170 '.gif': 'image/gif',
171 '.gtar': 'application/x-gtar',
172 '.hdf': 'application/x-hdf',
173 '.htm': 'text/html',
174 '.html': 'text/html',
175 '.ief': 'image/ief',
176 '.jpe': 'image/jpeg',
177 '.jpeg': 'image/jpeg',
178 '.jpg': 'image/jpeg',
Fred Drakec40c5471999-05-20 12:52:04 +0000179 '.js': 'application/x-javascript',
Guido van Rossumac8a9f31997-09-30 19:05:50 +0000180 '.latex': 'application/x-latex',
181 '.man': 'application/x-troff-man',
182 '.me': 'application/x-troff-me',
183 '.mif': 'application/x-mif',
184 '.mov': 'video/quicktime',
185 '.movie': 'video/x-sgi-movie',
186 '.mpe': 'video/mpeg',
187 '.mpeg': 'video/mpeg',
188 '.mpg': 'video/mpeg',
189 '.ms': 'application/x-troff-ms',
190 '.nc': 'application/x-netcdf',
191 '.o': 'application/octet-stream',
192 '.obj': 'application/octet-stream',
193 '.oda': 'application/oda',
194 '.pbm': 'image/x-portable-bitmap',
195 '.pdf': 'application/pdf',
196 '.pgm': 'image/x-portable-graymap',
197 '.pnm': 'image/x-portable-anymap',
198 '.png': 'image/png',
199 '.ppm': 'image/x-portable-pixmap',
Martin v. Löwis2750bcc2001-06-05 05:17:00 +0000200 '.ps': 'application/postscript',
Guido van Rossumac8a9f31997-09-30 19:05:50 +0000201 '.py': 'text/x-python',
202 '.pyc': 'application/x-python-code',
Martin v. Löwis2750bcc2001-06-05 05:17:00 +0000203 '.pyo': 'application/x-python-code',
Guido van Rossumac8a9f31997-09-30 19:05:50 +0000204 '.qt': 'video/quicktime',
205 '.ras': 'image/x-cmu-raster',
206 '.rgb': 'image/x-rgb',
Fred Drakecbd98701999-03-11 16:04:04 +0000207 '.rdf': 'application/xml',
Guido van Rossumac8a9f31997-09-30 19:05:50 +0000208 '.roff': 'application/x-troff',
209 '.rtf': 'application/rtf',
210 '.rtx': 'text/richtext',
211 '.sgm': 'text/x-sgml',
212 '.sgml': 'text/x-sgml',
213 '.sh': 'application/x-sh',
214 '.shar': 'application/x-shar',
215 '.snd': 'audio/basic',
216 '.so': 'application/octet-stream',
217 '.src': 'application/x-wais-source',
218 '.sv4cpio': 'application/x-sv4cpio',
219 '.sv4crc': 'application/x-sv4crc',
220 '.t': 'application/x-troff',
221 '.tar': 'application/x-tar',
222 '.tcl': 'application/x-tcl',
223 '.tex': 'application/x-tex',
224 '.texi': 'application/x-texinfo',
225 '.texinfo': 'application/x-texinfo',
226 '.tif': 'image/tiff',
227 '.tiff': 'image/tiff',
228 '.tr': 'application/x-troff',
229 '.tsv': 'text/tab-separated-values',
230 '.txt': 'text/plain',
231 '.ustar': 'application/x-ustar',
232 '.wav': 'audio/x-wav',
233 '.xbm': 'image/x-xbitmap',
Guido van Rossum7beaad41998-05-18 14:25:08 +0000234 '.xml': 'text/xml',
Fred Drakecbd98701999-03-11 16:04:04 +0000235 '.xsl': 'application/xml',
Guido van Rossumac8a9f31997-09-30 19:05:50 +0000236 '.xpm': 'image/x-xpixmap',
237 '.xwd': 'image/x-xwindowdump',
238 '.zip': 'application/zip',
239 }
Eric S. Raymond51cc3bc2001-02-09 09:44:47 +0000240
if __name__ == '__main__':
    # Command-line use: print the (type, encoding) guess for one URL or
    # file name given as the first argument.
    import sys
    if len(sys.argv) < 2:
        # Report the missing argument instead of failing with IndexError.
        sys.stderr.write("usage: %s url\n" % sys.argv[0])
        sys.exit(2)
    # Parenthesised single-argument form: prints the same tuple whether
    # print is a statement or a function.
    print(guess_type(sys.argv[1]))