Guido van Rossum | ab096c9 | 1997-04-02 05:47:11 +0000 | [diff] [blame] | 1 | """Filename globbing utility.""" |
Guido van Rossum | 65a9620 | 1991-01-01 18:17:49 +0000 | [diff] [blame] | 2 | |
Miss Islington (bot) | 38e021a | 2021-06-23 03:28:08 -0700 | [diff] [blame] | 3 | import contextlib |
Guido van Rossum | bba77af | 1992-01-12 23:26:24 +0000 | [diff] [blame] | 4 | import os |
Guido van Rossum | 9694fca | 1997-10-22 21:00:49 +0000 | [diff] [blame] | 5 | import re |
Guido van Rossum | d8faa36 | 2007-04-27 19:54:29 +0000 | [diff] [blame] | 6 | import fnmatch |
Serhiy Storchaka | 8a64cea | 2020-06-18 22:08:27 +0300 | [diff] [blame] | 7 | import itertools |
| 8 | import stat |
Steve Dower | 60419a7 | 2019-06-24 08:42:54 -0700 | [diff] [blame] | 9 | import sys |
Guido van Rossum | 65a9620 | 1991-01-01 18:17:49 +0000 | [diff] [blame] | 10 | |
Serhiy Storchaka | 04b5700 | 2015-11-09 23:18:19 +0200 | [diff] [blame] | 11 | __all__ = ["glob", "iglob", "escape"] |
Guido van Rossum | bba77af | 1992-01-12 23:26:24 +0000 | [diff] [blame] | 12 | |
def glob(pathname, *, root_dir=None, dir_fd=None, recursive=False):
    """Return a list of paths matching a pathname pattern.

    The pattern may contain simple shell-style wildcards a la
    fnmatch. However, unlike fnmatch, filenames starting with a
    dot are special cases that are not matched by '*' and '?'
    patterns.

    If recursive is true, the pattern '**' will match any files and
    zero or more directories and subdirectories.

    root_dir, dir_fd and recursive are keyword-only and are forwarded
    unchanged to iglob(); this function simply collects everything that
    iglob() yields into a list.
    """
    matches = iglob(pathname, root_dir=root_dir, dir_fd=dir_fd,
                    recursive=recursive)
    return list(matches)
Johannes Gijsbers | 836f543 | 2005-01-08 13:13:19 +0000 | [diff] [blame] | 25 | |
def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False):
    """Return an iterator which yields the paths matching a pathname pattern.

    The pattern may contain simple shell-style wildcards a la
    fnmatch. However, unlike fnmatch, filenames starting with a
    dot are special cases that are not matched by '*' and '?'
    patterns.

    If recursive is true, the pattern '**' will match any files and
    zero or more directories and subdirectories.

    If root_dir is given it is converted with os.fspath() and joined in
    front of the pattern for the filesystem checks; the yielded paths are
    built from the pattern only. dir_fd is handed on to the os-level
    calls that accept a dir_fd argument.

    Two auditing events, "glob.glob" and "glob.glob/2", are raised on
    every call.
    """
    sys.audit("glob.glob", pathname, recursive)
    sys.audit("glob.glob/2", pathname, recursive, root_dir, dir_fd)

    # An empty slice of the pattern gives an empty root of the same type
    # (str or bytes) as the pattern itself.
    root_dir = pathname[:0] if root_dir is None else os.fspath(root_dir)

    results = _iglob(pathname, root_dir, dir_fd, recursive, False)

    # Only an empty pattern, or a recursive pattern starting with '**', can
    # produce an empty string as its first result.
    if pathname and not (recursive and _isrecursive(pathname[:2])):
        return results

    try:
        first = next(results)  # skip empty string
    except StopIteration:
        return results
    if first:
        # The first result was a real path: put it back in front.
        return itertools.chain((first,), results)
    return results
| 52 | |
def _iglob(pathname, root_dir, dir_fd, recursive, dironly):
    """Yield the paths matching pathname, resolved against root_dir.

    The pattern is split into a directory part and a final component.
    The directory part is expanded first (recursively, when it contains
    wildcards) and the final component is then matched inside every
    resulting directory. When dironly is true the helpers are asked to
    report directories only.
    """
    head, tail = os.path.split(pathname)

    # No wildcard anywhere: this is a plain existence test.
    if not has_magic(pathname):
        assert not dironly
        if tail:
            present = _lexists(_join(root_dir, pathname), dir_fd)
        else:
            # Patterns ending with a slash should match only directories
            present = _isdir(_join(root_dir, head), dir_fd)
        if present:
            yield pathname
        return

    # Wildcards but no directory part: match directly inside root_dir.
    if not head:
        matcher = _glob2 if recursive and _isrecursive(tail) else _glob1
        yield from matcher(root_dir, tail, dir_fd, dironly)
        return

    # `os.path.split()` returns the argument itself as a dirname if it is a
    # drive or UNC path.  Prevent an infinite recursion if a drive or UNC path
    # contains magic characters (i.e. r'\\?\C:').
    if head != pathname and has_magic(head):
        parents = _iglob(head, root_dir, dir_fd, recursive, True)
    else:
        parents = [head]

    # Pick the helper that suits the final path component.
    if not has_magic(tail):
        matcher = _glob0
    elif recursive and _isrecursive(tail):
        matcher = _glob2
    else:
        matcher = _glob1

    for parent in parents:
        found = matcher(_join(root_dir, parent), tail, dir_fd, dironly)
        for name in found:
            yield os.path.join(parent, name)
Johannes Gijsbers | 836f543 | 2005-01-08 13:13:19 +0000 | [diff] [blame] | 88 | |
| 89 | # These 2 helper functions non-recursively glob inside a literal directory. |
Serhiy Storchaka | 28ab634 | 2016-09-06 22:33:41 +0300 | [diff] [blame] | 90 | # They return a list of basenames. _glob1 accepts a pattern while _glob0 |
Johannes Gijsbers | 836f543 | 2005-01-08 13:13:19 +0000 | [diff] [blame] | 91 | # takes a literal basename (so it only has to check for its existence). |
Guido van Rossum | 65a9620 | 1991-01-01 18:17:49 +0000 | [diff] [blame] | 92 | |
def _glob1(dirname, pattern, dir_fd, dironly):
    """Return the entries of dirname whose names match pattern.

    Entries starting with a dot are dropped before matching unless the
    pattern itself starts with a dot. The result is the list produced by
    fnmatch.filter().
    """
    entries = _listdir(dirname, dir_fd, dironly)
    if _ishidden(pattern):
        candidates = entries
    else:
        candidates = itertools.filterfalse(_ishidden, entries)
    return fnmatch.filter(candidates, pattern)
Guido van Rossum | 65a9620 | 1991-01-01 18:17:49 +0000 | [diff] [blame] | 98 | |
def _glob0(dirname, basename, dir_fd, dironly):
    """Return [basename] if the literal name exists in dirname, else [].

    dironly is accepted so that the signature lines up with _glob1 and
    _glob2; it is not consulted here.
    """
    if basename:
        found = _lexists(_join(dirname, basename), dir_fd)
    else:
        # `os.path.split()` returns an empty basename for paths ending with a
        # directory separator.  'q*x/' should match only directories.
        found = _isdir(dirname, dir_fd)
    return [basename] if found else []
| 109 | |
Serhiy Storchaka | 28ab634 | 2016-09-06 22:33:41 +0300 | [diff] [blame] | 110 | # Following functions are not public but can be used by third-party code. |
| 111 | |
def glob0(dirname, pattern):
    """Call _glob0() for a literal name, with no dir_fd and dironly off."""
    return _glob0(dirname, pattern, dir_fd=None, dironly=False)
Serhiy Storchaka | 28ab634 | 2016-09-06 22:33:41 +0300 | [diff] [blame] | 114 | |
def glob1(dirname, pattern):
    """Call _glob1() for a wildcard pattern, with no dir_fd and dironly off."""
    return _glob1(dirname, pattern, dir_fd=None, dironly=False)
Serhiy Storchaka | 28ab634 | 2016-09-06 22:33:41 +0300 | [diff] [blame] | 117 | |
Serhiy Storchaka | c2edcdd | 2014-09-11 12:17:37 +0300 | [diff] [blame] | 118 | # This helper function recursively yields relative pathnames inside a literal |
| 119 | # directory. |
| 120 | |
def _glob2(dirname, pattern, dir_fd, dironly):
    """Yield an empty name, then every relative path found below dirname.

    pattern must be the recursive marker '**' (str or bytes).
    """
    assert _isrecursive(pattern)
    # The empty slice has the same type as the pattern (str or bytes), so
    # the first item yielded is an empty name of the matching type.
    empty_name = pattern[:0]
    yield empty_name
    yield from _rlistdir(dirname, dir_fd, dironly)
Serhiy Storchaka | c2edcdd | 2014-09-11 12:17:37 +0300 | [diff] [blame] | 125 | |
Serhiy Storchaka | 28ab634 | 2016-09-06 22:33:41 +0300 | [diff] [blame] | 126 | # If dironly is false, yields all file names inside a directory. |
| 127 | # If dironly is true, yields only directory names. |
def _iterdir(dirname, dir_fd, dironly):
    """Yield the entry names found by scanning one directory.

    With dir_fd set, a non-empty dirname is opened relative to dir_fd and
    the resulting descriptor is scanned (and closed afterwards); an empty
    dirname means dir_fd itself is scanned. In that mode names are passed
    through os.fsencode() when dirname is bytes. Without dir_fd, dirname
    (or the current directory when it is empty) is scanned directly.

    Any OSError raised while opening or scanning ends the iteration
    quietly; an OSError raised for a single entry skips that entry.
    """
    try:
        opened_fd = None
        encode_name = None
        if dir_fd is None:
            if dirname:
                target = dirname
            elif isinstance(dirname, bytes):
                target = bytes(os.curdir, 'ASCII')
            else:
                target = os.curdir
        else:
            if dirname:
                opened_fd = target = os.open(dirname, _dir_open_flags,
                                             dir_fd=dir_fd)
            else:
                target = dir_fd
            if isinstance(dirname, bytes):
                encode_name = os.fsencode
        try:
            with os.scandir(target) as entries:
                for entry in entries:
                    try:
                        if dironly and not entry.is_dir():
                            continue
                        name = entry.name
                        if encode_name is not None:
                            name = encode_name(name)
                        yield name
                    except OSError:
                        pass
        finally:
            # Only a descriptor opened here is closed; a caller-supplied
            # dir_fd is left alone.
            if opened_fd is not None:
                os.close(opened_fd)
    except OSError:
        return
Serhiy Storchaka | 28ab634 | 2016-09-06 22:33:41 +0300 | [diff] [blame] | 161 | |
def _listdir(dirname, dir_fd, dironly):
    """Return everything _iterdir() yields as a list.

    The generator is closed explicitly once the list has been built, or
    when building it fails, rather than being left to garbage collection.
    """
    entries = _iterdir(dirname, dir_fd, dironly)
    try:
        return list(entries)
    finally:
        entries.close()
| 165 | |
Serhiy Storchaka | 28ab634 | 2016-09-06 22:33:41 +0300 | [diff] [blame] | 166 | # Recursively yields relative pathnames inside a literal directory. |
def _rlistdir(dirname, dir_fd, dironly):
    """Yield relative paths of all non-hidden entries below dirname.

    Each entry is yielded before the entries found beneath it. Names
    starting with a dot are skipped and not descended into.
    """
    for name in _listdir(dirname, dir_fd, dironly):
        if _ishidden(name):
            continue
        yield name
        if dirname:
            child = _join(dirname, name)
        else:
            child = name
        for descendant in _rlistdir(child, dir_fd, dironly):
            yield _join(name, descendant)
Serhiy Storchaka | c2edcdd | 2014-09-11 12:17:37 +0300 | [diff] [blame] | 175 | |
Guido van Rossum | c2ef5c2 | 1992-01-12 23:32:11 +0000 | [diff] [blame] | 176 | |
def _lexists(pathname, dir_fd):
    """Same as os.path.lexists(), but with dir_fd.

    When dir_fd is given, pathname is checked with os.lstat() relative to
    that descriptor; OSError and ValueError both count as "does not exist".
    """
    if dir_fd is not None:
        try:
            os.lstat(pathname, dir_fd=dir_fd)
        except (OSError, ValueError):
            return False
        return True
    return os.path.lexists(pathname)
| 187 | |
def _isdir(pathname, dir_fd):
    """Same as os.path.isdir(), but with dir_fd.

    When dir_fd is given, pathname is examined with os.stat() relative to
    that descriptor; OSError and ValueError both count as "not a directory".
    """
    if dir_fd is not None:
        try:
            info = os.stat(pathname, dir_fd=dir_fd)
        except (OSError, ValueError):
            return False
        return stat.S_ISDIR(info.st_mode)
    return os.path.isdir(pathname)
| 198 | |
def _join(dirname, basename):
    """Join two path parts, returning the other part when one is empty.

    os.path.join() is only called when both parts are non-empty; an empty
    dirname or basename is common here.
    """
    if dirname and basename:
        return os.path.join(dirname, basename)
    return dirname or basename
| 204 | |
# Matches any one of the wildcard characters '*', '?' and '['; a str and a
# bytes flavour are kept so that both kinds of pattern can be searched.
magic_check = re.compile('([*?[])')
magic_check_bytes = re.compile(b'([*?[])')


def has_magic(s):
    """Return True if s (str or bytes) contains '*', '?' or '['."""
    checker = magic_check_bytes if isinstance(s, bytes) else magic_check
    return checker.search(s) is not None
Hynek Schlawack | e26568f | 2012-12-27 10:10:11 +0100 | [diff] [blame] | 214 | |
def _ishidden(path):
    """Return True if path (str or bytes) starts with a dot.

    An empty path is not hidden and yields False instead of raising
    IndexError.
    """
    # Indexing bytes gives an int, so the first element is compared against
    # both the str '.' and the integer value of b'.'; str and bytes objects
    # are never compared with each other.
    return bool(path) and path[0] in ('.', b'.'[0])
Serhiy Storchaka | fd32fff | 2013-11-18 13:06:43 +0200 | [diff] [blame] | 217 | |
def _isrecursive(pattern):
    """Return True if pattern is exactly '**' (or b'**' for bytes)."""
    marker = b'**' if isinstance(pattern, bytes) else '**'
    return pattern == marker
| 223 | |
def escape(pathname):
    """Escape all special characters.

    Each of '*', '?' and '[' is wrapped in square brackets. The drive part
    reported by os.path.splitdrive() is returned unchanged.
    """
    # Metacharacters do not work in the drive part and shouldn't be escaped.
    drive, rest = os.path.splitdrive(pathname)
    if isinstance(rest, bytes):
        checker, wrapped = magic_check_bytes, br'[\1]'
    else:
        checker, wrapped = magic_check, r'[\1]'
    return drive + checker.sub(wrapped, rest)
Serhiy Storchaka | 8a64cea | 2020-06-18 22:08:27 +0300 | [diff] [blame] | 235 | |
| 236 | |
# Flags passed to os.open() when _iterdir() opens a directory relative to a
# dir_fd.  O_DIRECTORY is OR-ed in only when the os module provides it.
_dir_open_flags = os.O_RDONLY
if hasattr(os, 'O_DIRECTORY'):
    _dir_open_flags |= os.O_DIRECTORY