| """Guess the MIME type of a file. |
| |
| This module defines two useful functions: |
| |
| guess_type(url, strict=True) -- guess the MIME type and encoding of a URL. |
| |
| guess_extension(type, strict=True) -- guess the extension for a given MIME type. |
| |
| It also contains the following, for tuning the behavior: |
| |
| Data: |
| |
| knownfiles -- list of files to parse |
| inited -- flag set when init() has been called |
| suffix_map -- dictionary mapping suffixes to suffixes |
| encodings_map -- dictionary mapping suffixes to encodings |
| types_map -- dictionary mapping suffixes to types |
| |
| Functions: |
| |
| init([files]) -- parse a list of files, default knownfiles (on Windows, the |
| default values are taken from the registry) |
| read_mime_types(file) -- parse one file, return a dictionary or None |
| """ |
| |
| import os |
| import sys |
| import posixpath |
| import urllib.parse |
| try: |
| import winreg as _winreg |
| except ImportError: |
| _winreg = None |
| |
# Names exported by ``from mimetypes import *``.
__all__ = [
    # module state
    "knownfiles", "inited",
    # datastore class
    "MimeTypes",
    # lookup helpers
    "guess_type", "guess_all_extensions", "guess_extension",
    # database maintenance
    "add_type", "init", "read_mime_types",
    # lookup tables
    "suffix_map", "encodings_map", "types_map", "common_types",
]
| |
# mime.types-style files consulted by init() when it is called without an
# explicit file list.  They are read in this order, so a later file
# overrides the extension -> type mappings of an earlier one.
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    # NOTE(review): same path as two entries above; it is read a second
    # time here.  Kept as-is because dropping either occurrence would
    # change the order in which extensions are registered.
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
]

# Set to True by init(); MimeTypes.__init__() checks it to decide whether
# init() still has to run.
inited = False
# Shared MimeTypes instance behind the module-level helper functions;
# assigned by init().
_db = None
| |
| |
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- dict mapping an encoding suffix to an encoding name
    suffix_map    -- dict mapping a shorthand suffix to the suffixes it
                     stands for
    types_map     -- pair of dicts (non-strict, strict), each mapping an
                     extension to a MIME type
    types_map_inv -- pair of dicts (non-strict, strict), each mapping a
                     MIME type to the list of its extensions
    """

    def __init__(self, filenames=(), strict=True):
        """Seed the datastore from the module-level tables, then read files.

        filenames -- iterable of mime.types-style file paths to read
        strict    -- passed to read() for every entry of `filenames`
        """
        if not inited:
            init()
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        # Both pairs are indexed with the boolean `strict` flag, so index 0
        # holds the non-strict table and index 1 the strict one.
        self.types_map = ({}, {})
        self.types_map_inv = ({}, {})
        for ext, mimetype in types_map.items():
            self.add_type(mimetype, ext, True)
        for ext, mimetype in common_types.items():
            self.add_type(mimetype, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        self.types_map[strict][ext] = type
        known_exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in known_exts:
            known_exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file based on its URL.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        # Split off a leading "scheme:" prefix.  A scheme is a non-empty
        # run of characters before the first ':' that contains no '/'.
        # This is done inline rather than through the deprecated
        # urllib.parse.splittype().
        scheme = None
        colon = url.find(':')
        if colon > 0 and '/' not in url[:colon]:
            scheme, url = url[:colon].lower(), url[colon + 1:]
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                mimetype = url[:semi]
            else:
                mimetype = url[:comma]
            if '=' in mimetype or '/' not in mimetype:
                mimetype = 'text/plain'
            return mimetype, None  # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') until the
        # trailing suffix is no longer a shorthand.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        # Standard types first: exact match, then lower-cased match.
        strict_types = self.types_map[True]
        if ext in strict_types:
            return strict_types[ext], encoding
        if ext.lower() in strict_types:
            return strict_types[ext.lower()], encoding
        if strict:
            return None, encoding
        # Fall back to the non-standard table with the same two attempts.
        common = self.types_map[False]
        if ext in common:
            return common[ext], encoding
        if ext.lower() in common:
            return common[ext.lower()], encoding
        return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().  The
        list is empty when no extension is known for `type'.

        The returned list is a fresh copy; changing it does not affect
        the datastore.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Copy the stored list: appending the non-strict extensions to it
        # directly would add them to the strict table.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        The file is opened as UTF-8 text.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename, encoding='utf-8') as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp' must provide readline().  Each line holds a MIME type
        followed by whitespace-separated suffixes (without the leading
        dot); a word starting with '#' begins a comment that runs to the
        end of the line.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while True:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop the first comment word and everything after it.
            for i, word in enumerate(words):
                if word[0] == '#':
                    del words[i:]
                    break
            if not words:
                continue
            mimetype, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(mimetype, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        Does nothing when the winreg module is not available.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """

        # Windows only
        if not _winreg:
            return

        def enum_types(mimedb):
            # Yield the subkey names of `mimedb' until EnumKey reports an
            # error, skipping names that contain a NUL character.
            i = 0
            while True:
                try:
                    ctype = _winreg.EnumKey(mimedb, i)
                except OSError:
                    break
                else:
                    if '\0' not in ctype:
                        yield ctype
                i += 1

        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
            for subkeyname in enum_types(hkcr):
                # Only check file extensions; other keys are not opened.
                if not subkeyname.startswith("."):
                    continue
                try:
                    with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                        # raises OSError if no 'Content Type' value
                        mimetype, datatype = _winreg.QueryValueEx(
                            subkey, 'Content Type')
                        if datatype != _winreg.REG_SZ:
                            continue
                        self.add_type(mimetype, subkeyname, strict)
                except OSError:
                    continue
| |
def guess_type(url, strict=True):
    """Guess the MIME type and encoding of a file from its URL.

    Module-level convenience wrapper: the lookup is delegated to the
    shared database, which is created with init() on first use.

    Returns a tuple (type, encoding).  `type' is a string of the form
    type/subtype, usable for a MIME Content-type header, or None when
    the suffix is missing or unknown.  `encoding' is the name of the
    program used to encode the data (e.g. compress or gzip), or None
    for no encoding.

    The mappings are table driven.  Encoding suffixes are matched case
    sensitively; type suffixes are tried case sensitively first and
    case insensitively second.  The suffixes .tgz, .taz and .tz (case
    sensitive!) are all mapped to ".tar.gz" through the dictionary
    suffix_map.

    When the optional `strict' argument is false, a bunch of commonly
    found but non-standard types are considered as well.
    """
    # Build the shared database lazily.
    if _db is None:
        init()
    return _db.guess_type(url, strict)
| |
| |
def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Module-level convenience wrapper: the lookup is delegated to the
    shared database, which is created with init() on first use.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', the
    list is empty.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    # Build the shared database lazily.
    if _db is None:
        init()
    return _db.guess_all_extensions(type, strict)
| |
def guess_extension(type, strict=True):
    """Guess a single filename extension for a MIME type.

    Module-level convenience wrapper: the lookup is delegated to the
    shared database, which is created with init() on first use.

    Returns the extension as a string that includes the leading dot
    ('.'), or None when no extension can be guessed for `type'.  The
    extension is not guaranteed to have been associated with any
    particular data stream, but guess_type() would map it to the MIME
    type `type'.

    When the optional `strict' argument is false, a bunch of commonly
    found but non-standard types are considered as well.
    """
    # Build the shared database lazily.
    if _db is None:
        init()
    return _db.guess_extension(type, strict)
| |
def add_type(type, ext, strict=True):
    """Register a mapping between a MIME type and an extension.

    Module-level convenience wrapper: the mapping is stored in the
    shared database, which is created with init() on first use.

    An extension that is already known gets `type' as its new type,
    replacing the old one.  A type that is already known gets `ext'
    added to its list of known extensions.

    The mapping goes to the list of standard types when `strict' is
    true and to the list of non-standard types otherwise.
    """
    # Build the shared database lazily.
    if _db is None:
        init()
    return _db.add_type(type, ext, strict)
| |
| |
def init(files=None):
    """Build the shared MIME database and publish its tables.

    files -- iterable of mime.types-style file paths to load.  When it
             is None, the Windows registry is read first (only if the
             winreg module is available) and then the paths listed in
             `knownfiles' are used.

    Paths that are not existing regular files are skipped.  Afterwards
    the module globals encodings_map, suffix_map, types_map and
    common_types refer to the tables of the new database.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    inited = True  # so that MimeTypes.__init__() doesn't call us again
    db = MimeTypes()
    if files is None:
        if _winreg:
            db.read_windows_registry()
        files = knownfiles
    for path in files:
        if not os.path.isfile(path):
            continue
        db.read(path)
    encodings_map, suffix_map = db.encodings_map, db.suffix_map
    # db.types_map is the pair (non-strict, strict).
    types_map = db.types_map[True]
    common_types = db.types_map[False]
    # Make the DB a global variable now that it is fully initialized
    _db = db
| |
| |
def read_mime_types(file):
    """Parse one mime.types-format file.

    file -- path of the file to read; it is opened as UTF-8 text, the
            same encoding MimeTypes.read() uses.

    Returns the strict extension -> MIME type dictionary of a fresh
    MimeTypes datastore after the file has been read into it, so the
    mappings that MimeTypes() starts with are included.  Returns None
    when the file cannot be opened.
    """
    try:
        f = open(file, encoding='utf-8')
    except OSError:
        return None
    with f:
        db = MimeTypes()
        db.readfp(f, True)
        return db.types_map[True]
| |
| |
def _default_mime_types():
    """(Re)create the built-in lookup tables as module globals.

    Rebinds suffix_map, encodings_map, types_map and common_types to
    freshly built dictionaries.
    """
    global suffix_map, encodings_map, types_map, common_types

    # Shorthand suffix -> the suffixes it stands for.
    suffix_map = {
        '.svgz': '.svg.gz',
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        '.txz': '.tar.xz',
    }

    # Encoding suffix -> encoding name.
    encodings_map = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        '.xz': 'xz',
    }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.iana.org/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted!
    types_map = {
        '.a'      : 'application/octet-stream',
        '.ai'     : 'application/postscript',
        '.aif'    : 'audio/x-aiff',
        '.aifc'   : 'audio/x-aiff',
        '.aiff'   : 'audio/x-aiff',
        '.au'     : 'audio/basic',
        '.avi'    : 'video/x-msvideo',
        '.bat'    : 'text/plain',
        '.bcpio'  : 'application/x-bcpio',
        '.bin'    : 'application/octet-stream',
        '.bmp'    : 'image/x-ms-bmp',
        '.c'      : 'text/plain',
        # '.cdf' is also used for 'application/x-cdf'.  A dict holds one
        # value per key, so only the NetCDF mapping is listed.
        '.cdf'    : 'application/x-netcdf',
        '.cpio'   : 'application/x-cpio',
        '.csh'    : 'application/x-csh',
        '.css'    : 'text/css',
        '.csv'    : 'text/csv',
        '.dll'    : 'application/octet-stream',
        '.doc'    : 'application/msword',
        '.dot'    : 'application/msword',
        '.dvi'    : 'application/x-dvi',
        '.eml'    : 'message/rfc822',
        '.eps'    : 'application/postscript',
        '.etx'    : 'text/x-setext',
        '.exe'    : 'application/octet-stream',
        '.gif'    : 'image/gif',
        '.gtar'   : 'application/x-gtar',
        '.h'      : 'text/plain',
        '.hdf'    : 'application/x-hdf',
        '.htm'    : 'text/html',
        '.html'   : 'text/html',
        '.ico'    : 'image/vnd.microsoft.icon',
        '.ief'    : 'image/ief',
        '.jpe'    : 'image/jpeg',
        '.jpeg'   : 'image/jpeg',
        '.jpg'    : 'image/jpeg',
        '.js'     : 'application/javascript',
        '.ksh'    : 'text/plain',
        '.latex'  : 'application/x-latex',
        '.m1v'    : 'video/mpeg',
        '.m3u'    : 'application/vnd.apple.mpegurl',
        '.m3u8'   : 'application/vnd.apple.mpegurl',
        '.man'    : 'application/x-troff-man',
        '.me'     : 'application/x-troff-me',
        '.mht'    : 'message/rfc822',
        '.mhtml'  : 'message/rfc822',
        '.mif'    : 'application/x-mif',
        '.mov'    : 'video/quicktime',
        '.movie'  : 'video/x-sgi-movie',
        '.mp2'    : 'audio/mpeg',
        '.mp3'    : 'audio/mpeg',
        '.mp4'    : 'video/mp4',
        '.mpa'    : 'video/mpeg',
        '.mpe'    : 'video/mpeg',
        '.mpeg'   : 'video/mpeg',
        '.mpg'    : 'video/mpeg',
        '.ms'     : 'application/x-troff-ms',
        '.nc'     : 'application/x-netcdf',
        '.nws'    : 'message/rfc822',
        '.o'      : 'application/octet-stream',
        '.obj'    : 'application/octet-stream',
        '.oda'    : 'application/oda',
        '.p12'    : 'application/x-pkcs12',
        '.p7c'    : 'application/pkcs7-mime',
        '.pbm'    : 'image/x-portable-bitmap',
        '.pdf'    : 'application/pdf',
        '.pfx'    : 'application/x-pkcs12',
        '.pgm'    : 'image/x-portable-graymap',
        '.pl'     : 'text/plain',
        '.png'    : 'image/png',
        '.pnm'    : 'image/x-portable-anymap',
        '.pot'    : 'application/vnd.ms-powerpoint',
        '.ppa'    : 'application/vnd.ms-powerpoint',
        '.ppm'    : 'image/x-portable-pixmap',
        '.pps'    : 'application/vnd.ms-powerpoint',
        '.ppt'    : 'application/vnd.ms-powerpoint',
        '.ps'     : 'application/postscript',
        '.pwz'    : 'application/vnd.ms-powerpoint',
        '.py'     : 'text/x-python',
        '.pyc'    : 'application/x-python-code',
        '.pyo'    : 'application/x-python-code',
        '.qt'     : 'video/quicktime',
        '.ra'     : 'audio/x-pn-realaudio',
        '.ram'    : 'application/x-pn-realaudio',
        '.ras'    : 'image/x-cmu-raster',
        '.rdf'    : 'application/xml',
        '.rgb'    : 'image/x-rgb',
        '.roff'   : 'application/x-troff',
        '.rtx'    : 'text/richtext',
        '.sgm'    : 'text/x-sgml',
        '.sgml'   : 'text/x-sgml',
        '.sh'     : 'application/x-sh',
        '.shar'   : 'application/x-shar',
        '.snd'    : 'audio/basic',
        '.so'     : 'application/octet-stream',
        '.src'    : 'application/x-wais-source',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc' : 'application/x-sv4crc',
        '.svg'    : 'image/svg+xml',
        '.swf'    : 'application/x-shockwave-flash',
        '.t'      : 'application/x-troff',
        '.tar'    : 'application/x-tar',
        '.tcl'    : 'application/x-tcl',
        '.tex'    : 'application/x-tex',
        '.texi'   : 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.tif'    : 'image/tiff',
        '.tiff'   : 'image/tiff',
        '.tr'     : 'application/x-troff',
        '.tsv'    : 'text/tab-separated-values',
        '.txt'    : 'text/plain',
        '.ustar'  : 'application/x-ustar',
        '.vcf'    : 'text/x-vcard',
        '.wav'    : 'audio/x-wav',
        '.webm'   : 'video/webm',
        '.wiz'    : 'application/msword',
        '.wsdl'   : 'application/xml',
        '.xbm'    : 'image/x-xbitmap',
        '.xlb'    : 'application/vnd.ms-excel',
        # '.xls' is also used for 'application/excel'.  A dict holds one
        # value per key, so only the vnd.ms-excel mapping is listed.
        '.xls'    : 'application/vnd.ms-excel',
        '.xml'    : 'text/xml',
        '.xpdl'   : 'application/xml',
        '.xpm'    : 'image/x-xpixmap',
        '.xsl'    : 'application/xml',
        '.xwd'    : 'image/x-xwindowdump',
        '.zip'    : 'application/zip',
    }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = {
        '.jpg' : 'image/jpg',
        '.mid' : 'audio/midi',
        '.midi': 'audio/midi',
        '.pct' : 'image/pict',
        '.pic' : 'image/pict',
        '.pict': 'image/pict',
        '.rtf' : 'application/rtf',
        '.xul' : 'text/xul',
    }


# Populate the tables at import time.
_default_mime_types()
| |
| |
# Command-line entry point: guess a type (or, with -e, an extension) for
# each positional argument.
if __name__ == '__main__':
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
--help / -h -- print this message and exit
--lenient / -l -- additionally search of some common, but non-standard
types.
--extension / -e -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        # Print the usage text, then the optional message, and exit with
        # `code'.
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(
            sys.argv[1:], 'hle', ['help', 'lenient', 'extension'])
    except getopt.GetoptError as msg:
        usage(1, msg)

    strict = 1
    extension = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1

    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if guess:
                print(guess)
            else:
                print("I don't know anything about type", gtype)
        else:
            guess, encoding = guess_type(gtype, strict)
            if guess:
                print('type:', guess, 'encoding:', encoding)
            else:
                print("I don't know anything about type", gtype)