====== pathlib, glob.iglob(), os.walk() ====== ===== os.walk() の実装 ===== os.walk(top, topdown=True, onerror=None, followlinks=False) は、内部的には os.scandir(path='.') によって処理される。\\ ===== glob.iglob() の実装 ===== glob.iglob(pathname, *, recursive=False) は、内部的には os.scandir(path='.') によって処理される。\\ os.scandir(path='.') によって取得された内容は **list()** 化されて fnmatch.filter(names, pattern) によってフィルター処理される。\\ ここで注意が必要なのは、 **pattern** は **__Unix Shell Style__** のパターンであって、正規表現は利用できないという点である。\\ fnmatch.filter(names, pattern) の **pattern** は fnmatch.translate(pattern) で正規表現に変換してから re.compile() される。\\ \\ **OK パターン**\\ >>> import fnmatch >>> fnmatch.translate('*.txt') '(?s:.*\\.txt)\\Z' **NG パターン** (正規表現のメタ文字である丸括弧と縦棒はエスケープされてリテラル文字として扱われるため、選択としては機能しない)\\ >>> import fnmatch >>> fnmatch.translate('*.(bat|com|exe)') '(?s:.*\\.\\(bat\\|com\\|exe\\))\\Z' """Filename globbing utility.""" import os import re import fnmatch import sys __all__ = ["glob", "iglob", "escape"] def glob(pathname, *, recursive=False): """Return a list of paths matching a pathname pattern. The pattern may contain simple shell-style wildcards a la fnmatch. However, unlike fnmatch, filenames starting with a dot are special cases that are not matched by '*' and '?' patterns. If recursive is true, the pattern '**' will match any files and zero or more directories and subdirectories. """ return list(iglob(pathname, recursive=recursive)) def iglob(pathname, *, recursive=False): """Return an iterator which yields the paths matching a pathname pattern. The pattern may contain simple shell-style wildcards a la fnmatch. However, unlike fnmatch, filenames starting with a dot are special cases that are not matched by '*' and '?' patterns. If recursive is true, the pattern '**' will match any files and zero or more directories and subdirectories. 
""" it = _iglob(pathname, recursive, False) if recursive and _isrecursive(pathname): s = next(it) # skip empty string assert not s return it def _iglob(pathname, recursive, dironly): sys.audit("glob.glob", pathname, recursive) dirname, basename = os.path.split(pathname) if not has_magic(pathname): assert not dironly if basename: if os.path.lexists(pathname): yield pathname else: # Patterns ending with a slash should match only directories if os.path.isdir(dirname): yield pathname return if not dirname: if recursive and _isrecursive(basename): yield from _glob2(dirname, basename, dironly) else: yield from _glob1(dirname, basename, dironly) return # `os.path.split()` returns the argument itself as a dirname if it is a # drive or UNC path. Prevent an infinite recursion if a drive or UNC path # contains magic characters (i.e. r'\\?\C:'). if dirname != pathname and has_magic(dirname): dirs = _iglob(dirname, recursive, True) else: dirs = [dirname] if has_magic(basename): if recursive and _isrecursive(basename): glob_in_dir = _glob2 else: glob_in_dir = _glob1 else: glob_in_dir = _glob0 for dirname in dirs: for name in glob_in_dir(dirname, basename, dironly): yield os.path.join(dirname, name) # These 2 helper functions non-recursively glob inside a literal directory. # They return a list of basenames. _glob1 accepts a pattern while _glob0 # takes a literal basename (so it only has to check for its existence). def _glob1(dirname, pattern, dironly): names = list(_iterdir(dirname, dironly)) if not _ishidden(pattern): names = (x for x in names if not _ishidden(x)) return fnmatch.filter(names, pattern) def _glob0(dirname, basename, dironly): if not basename: # `os.path.split()` returns an empty basename for paths ending with a # directory separator. 'q*x/' should match only directories. 
if os.path.isdir(dirname): return [basename] else: if os.path.lexists(os.path.join(dirname, basename)): return [basename] return [] # Following functions are not public but can be used by third-party code. def glob0(dirname, pattern): return _glob0(dirname, pattern, False) def glob1(dirname, pattern): return _glob1(dirname, pattern, False) # This helper function recursively yields relative pathnames inside a literal # directory. def _glob2(dirname, pattern, dironly): assert _isrecursive(pattern) yield pattern[:0] yield from _rlistdir(dirname, dironly) # If dironly is false, yields all file names inside a directory. # If dironly is true, yields only directory names. def _iterdir(dirname, dironly): if not dirname: if isinstance(dirname, bytes): dirname = bytes(os.curdir, 'ASCII') else: dirname = os.curdir try: with os.scandir(dirname) as it: for entry in it: try: if not dironly or entry.is_dir(): yield entry.name except OSError: pass except OSError: return # Recursively yields relative pathnames inside a literal directory. def _rlistdir(dirname, dironly): names = list(_iterdir(dirname, dironly)) for x in names: if not _ishidden(x): yield x path = os.path.join(dirname, x) if dirname else x for y in _rlistdir(path, dironly): yield os.path.join(x, y) magic_check = re.compile('([*?[])') magic_check_bytes = re.compile(b'([*?[])') def has_magic(s): if isinstance(s, bytes): match = magic_check_bytes.search(s) else: match = magic_check.search(s) return match is not None def _ishidden(path): return path[0] in ('.', b'.'[0]) def _isrecursive(pattern): if isinstance(pattern, bytes): return pattern == b'**' else: return pattern == '**' def escape(pathname): """Escape all special characters. """ # Escaping is done by wrapping any of "*?[" between square brackets. # Metacharacters do not work in the drive part and shouldn't be escaped. 
drive, pathname = os.path.splitdrive(pathname) if isinstance(pathname, bytes): pathname = magic_check_bytes.sub(br'[\1]', pathname) else: pathname = magic_check.sub(r'[\1]', pathname) return drive + pathname ===== os.scandir() の実装 ===== [[https://docs.python.org/3/library/os.html#os.scandir|os — Miscellaneous operating system interfaces — Python 3.8.2 documentation]]\\ Note On Unix-based systems, scandir() uses the system’s opendir() and readdir() functions. On Windows, it uses the Win32 FindFirstFileW and FindNextFileW functions.\\ \\ ノート(翻訳) Unix ベースのシステムでは、scandir() はシステムの opendir() 関数と readdir() 関数を使用します。 Windows では、Win32 FindFirstFileW 関数と FindNextFileW 関数を使用します。 /*[clinic input] os.scandir path : path_t(nullable=True, allow_fd='PATH_HAVE_FDOPENDIR') = None Return an iterator of DirEntry objects for given path. path can be specified as either str, bytes, or a path-like object. If path is bytes, the names of yielded DirEntry objects will also be bytes; in all other circumstances they will be str. If path is None, uses the path='.'. [clinic start generated code]*/ static PyObject * os_scandir_impl(PyObject *module, path_t *path) /*[clinic end generated code: output=6eb2668b675ca89e input=6bdd312708fc3bb0]*/ { ScandirIterator *iterator; #ifdef MS_WINDOWS wchar_t *path_strW; #else const char *path_str; #ifdef HAVE_FDOPENDIR int fd = -1; #endif #endif if (PySys_Audit("os.scandir", "O", path->object ? 
path->object : Py_None) < 0) { return NULL; } PyObject *ScandirIteratorType = get_posix_state(module)->ScandirIteratorType; iterator = PyObject_New(ScandirIterator, (PyTypeObject *)ScandirIteratorType); if (!iterator) return NULL; #ifdef MS_WINDOWS iterator->handle = INVALID_HANDLE_VALUE; #else iterator->dirp = NULL; #endif memcpy(&iterator->path, path, sizeof(path_t)); /* Move the ownership to iterator->path */ path->object = NULL; path->cleanup = NULL; #ifdef MS_WINDOWS iterator->first_time = 1; path_strW = join_path_filenameW(iterator->path.wide, L"*.*"); if (!path_strW) goto error; Py_BEGIN_ALLOW_THREADS iterator->handle = FindFirstFileW(path_strW, &iterator->file_data); Py_END_ALLOW_THREADS PyMem_Free(path_strW); if (iterator->handle == INVALID_HANDLE_VALUE) { path_error(&iterator->path); goto error; } #else /* POSIX */ errno = 0; #ifdef HAVE_FDOPENDIR if (path->fd != -1) { /* closedir() closes the FD, so we duplicate it */ fd = _Py_dup(path->fd); if (fd == -1) goto error; Py_BEGIN_ALLOW_THREADS iterator->dirp = fdopendir(fd); Py_END_ALLOW_THREADS } else #endif { if (iterator->path.narrow) path_str = iterator->path.narrow; else path_str = "."; Py_BEGIN_ALLOW_THREADS iterator->dirp = opendir(path_str); Py_END_ALLOW_THREADS } if (!iterator->dirp) { path_error(&iterator->path); #ifdef HAVE_FDOPENDIR if (fd != -1) { Py_BEGIN_ALLOW_THREADS close(fd); Py_END_ALLOW_THREADS } #endif goto error; } #endif return (PyObject *)iterator; error: Py_DECREF(iterator); return NULL; } [[https://github.com/python/cpython/blob/master/Modules/posixmodule.c|cpython/Modules/posixmodule.c - github.com]] より\\ ===== パフォーマンス比較 ===== #!/usr/bin/env python3 # -*- coding: utf-8 -*- import os import glob import timeit def walk(): #result = [[file for file in files if file.endswith('.py')] for path, dirs, files in os.walk('.')] result = [] for path, dirs, files in os.walk('.'): for file in files: if file.endswith('.py'): result.append(file) return result def iglob(): result = [] for file in 
glob.iglob('**/*', recursive=True): if file.endswith('.py'): result.append(file) return result def iglob2(): result = [] for file in glob.iglob('**/*.py', recursive=True): result.append(file) return result def main(): # os.chdir('./') num_of_exec = 10 print(timeit.timeit(walk, number=num_of_exec) / num_of_exec) print(timeit.timeit(iglob, number=num_of_exec) / num_of_exec) print(timeit.timeit(iglob2, number=num_of_exec) / num_of_exec) if __name__ == '__main__': main() $ ./dir_search.py 1.3243966199999704 1.4991581099999167 1.61260242999997 $ time find . -name '*.py' > /dev/null real 0m0.523s user 0m0.020s sys 0m0.077s ===== 参考文献 ===== [[https://docs.python.org/ja/3/library/glob.html|glob --- Unix 形式のパス名のパターン展開 — Python ドキュメント]]\\ [[https://stackoverflow.com/questions/51167093/how-can-i-find-the-source-code-of-os-scandir-in-python3|python - How can I find the source code of os.scandir in Python3? - Stack Overflow]]\\ [[https://stackoverflow.com/questions/20638040/glob-exclude-pattern|python - glob exclude pattern - Stack Overflow]]\\ [[https://github.com/python/cpython/blob/master/Lib/pathlib.py|cpython/pathlib.py at master · python/cpython]]\\ [[https://stackoverflow.com/questions/50948391/whats-the-fastest-way-to-recursively-search-for-files-in-python/50950952|What's the fastest way to recursively search for files in python? - Stack Overflow]]\\ [[https://qiita.com/amowwee/items/e63b3610ea750f7dba1b|Pythonでフォルダ内のファイルリストを取得する - Qiita]]\\ [[https://stackoverflow.com/questions/24812253/how-can-i-capture-return-value-with-python-timeit-module|How can I capture return value with Python timeit module? - Stack Overflow]]\\ [[https://living-sun.com/ja/python/705207-quicker-to-oswalk-or-glob-python-traversal-glob-oswalk-directory-walk.html|os.walkやglobにすばやく - Python、トラバーサル、グロブ、os.walk、ディレクトリウォーク]]\\ [[https://note.dokeep.jp/post/csharp-fast-enumerate-file/|[C#] 高速でファイルとフォルダを列挙する - ざこノート]]\\