====== pathlib, glob.iglob(), os.walk() ======
===== os.walk() の実装 =====
os.walk(top, topdown=True, onerror=None, followlinks=False)
は、内部的には os.scandir(path='.')
によって処理される。\\
===== glob.iglob() の実装 =====
glob.iglob(pathname, *, recursive=False)
は、内部的には os.scandir(path='.')
によって処理される。\\
os.scandir(path='.')
によって取得された内容は **list()** 化されて fnmatch.filter(names, pattern)
によってフィルター処理される。\\
ここで注意が必要なのは、 **pattern** は **__Unix Shell Style__** のワイルドカードパターンであり、正規表現は利用できないという点である。\\
fnmatch.filter(names, pattern)
の **pattern** は fnmatch.translate(pattern)
で正規表現に変換してから re.compile()
される。\\
\\
**OK パターン**\\
>>> import fnmatch
>>> fnmatch.translate('*.txt')
'(?s:.*\\.txt)\\Z'
**NG パターン** (「(」「|」「)」はエスケープされて文字そのものとして扱われるため、正規表現の OR 指定にはならない)\\
>>> import fnmatch
>>> fnmatch.translate('*.(bat|com|exe)')
'(?s:.*\\.\\(bat\\|com\\|exe\\))\\Z'
"""Filename globbing utility."""
import os
import re
import fnmatch
import sys
__all__ = ["glob", "iglob", "escape"]
def glob(pathname, *, recursive=False):
    """Return a list of paths matching a pathname pattern.

    Shell-style wildcards in the manner of fnmatch are supported, with one
    difference: a filename beginning with a dot is a special case and is
    not matched by the '*' and '?' wildcards.

    When recursive is true, the pattern '**' matches any files and zero or
    more directories and subdirectories.
    """
    matches = iglob(pathname, recursive=recursive)
    return list(matches)
def iglob(pathname, *, recursive=False):
    """Return an iterator which yields the paths matching a pathname pattern.

    Shell-style wildcards in the manner of fnmatch are supported, with one
    difference: a filename beginning with a dot is a special case and is
    not matched by the '*' and '?' wildcards.

    When recursive is true, the pattern '**' matches any files and zero or
    more directories and subdirectories.
    """
    matches = _iglob(pathname, recursive, False)
    if not (recursive and _isrecursive(pathname)):
        return matches
    # For a bare recursive pattern the first item produced is an empty
    # string; consume it here so callers never see it.
    first = next(matches)  # skip empty string
    assert not first
    return matches
def _iglob(pathname, recursive, dironly):
    """Generate the paths matching pathname.

    recursive enables the special handling of a '**' component; dironly is
    forwarded to the directory-listing helpers.
    """
    sys.audit("glob.glob", pathname, recursive)
    dirname, basename = os.path.split(pathname)

    # No wildcard anywhere: the only possible match is the path itself.
    if not has_magic(pathname):
        assert not dironly
        if basename:
            exists = os.path.lexists(pathname)
        else:
            # Patterns ending with a slash should match only directories
            exists = os.path.isdir(dirname)
        if exists:
            yield pathname
        return

    # No directory part: glob directly, with dirname being the empty string.
    if not dirname:
        if recursive and _isrecursive(basename):
            matcher = _glob2
        else:
            matcher = _glob1
        yield from matcher(dirname, basename, dironly)
        return

    # `os.path.split()` returns the argument itself as a dirname if it is a
    # drive or UNC path.  Prevent an infinite recursion if a drive or UNC path
    # contains magic characters (i.e. r'\\?\C:').
    if dirname != pathname and has_magic(dirname):
        parents = _iglob(dirname, recursive, True)
    else:
        parents = [dirname]

    # Choose how the last component is matched inside each parent directory.
    if not has_magic(basename):
        glob_in_dir = _glob0
    elif recursive and _isrecursive(basename):
        glob_in_dir = _glob2
    else:
        glob_in_dir = _glob1

    for parent in parents:
        for name in glob_in_dir(parent, basename, dironly):
            yield os.path.join(parent, name)
# The next two helpers glob non-recursively inside a literal directory and
# return a list of basenames.  _glob1 takes a pattern, whereas _glob0 takes a
# literal basename (so it only needs to check that it exists).
def _glob1(dirname, pattern, dironly):
    entries = list(_iterdir(dirname, dironly))
    if _ishidden(pattern):
        # A pattern that itself starts with a dot may match hidden names.
        candidates = entries
    else:
        candidates = (entry for entry in entries if not _ishidden(entry))
    return fnmatch.filter(candidates, pattern)
def _glob0(dirname, basename, dironly):
    """Return [basename] if the literal basename exists in dirname, else []."""
    if basename:
        found = os.path.lexists(os.path.join(dirname, basename))
    else:
        # `os.path.split()` returns an empty basename for paths ending with a
        # directory separator.  'q*x/' should match only directories.
        found = os.path.isdir(dirname)
    return [basename] if found else []
# The following functions are not public, but third-party code may use them.
def glob0(dirname, pattern):
    """Look up a literal basename in dirname via _glob0, with dironly off."""
    return _glob0(dirname, pattern, dironly=False)
def glob1(dirname, pattern):
    """Match pattern against the entries of dirname via _glob1, with dironly off."""
    return _glob1(dirname, pattern, dironly=False)
# Recursive counterpart of the helpers above: yields relative pathnames found
# anywhere inside a literal directory.
def _glob2(dirname, pattern, dironly):
    assert _isrecursive(pattern)
    # The first item is an empty string of the same type (str or bytes) as
    # the pattern.
    empty = pattern[:0]
    yield empty
    yield from _rlistdir(dirname, dironly)
# With dironly false, yields the name of every entry in a directory.
# With dironly true, yields directory names only.
def _iterdir(dirname, dironly):
    if not dirname:
        # An empty dirname means the current directory, expressed in the same
        # string type (bytes or str) as the argument.
        dirname = bytes(os.curdir, 'ASCII') if isinstance(dirname, bytes) else os.curdir
    try:
        with os.scandir(dirname) as entries:
            for entry in entries:
                try:
                    if dironly and not entry.is_dir():
                        continue
                    yield entry.name
                except OSError:
                    # An entry that cannot be examined is skipped.
                    pass
    except OSError:
        # A directory that cannot be scanned yields nothing.
        return
# Walks a literal directory depth-first, yielding pathnames relative to it.
def _rlistdir(dirname, dironly):
    # The whole listing is read before descending into any subdirectory.
    for name in list(_iterdir(dirname, dironly)):
        if _ishidden(name):
            continue
        yield name
        subdir = os.path.join(dirname, name) if dirname else name
        for rest in _rlistdir(subdir, dironly):
            yield os.path.join(name, rest)
# Matches any one of the wildcard characters '*', '?' and '['; one compiled
# pattern for str input and one for bytes input.
magic_check = re.compile('([*?[])')
magic_check_bytes = re.compile(b'([*?[])')


def has_magic(s):
    """Return True if s (str or bytes) contains '*', '?' or '['."""
    checker = magic_check_bytes if isinstance(s, bytes) else magic_check
    return checker.search(s) is not None
def _ishidden(path):
    """Return True if path (str or bytes) starts with a dot.

    An empty path is reported as not hidden instead of raising IndexError.
    """
    if not path:
        return False
    # Indexing bytes gives an int, so the tuple holds both the str '.' and the
    # int value of b'.'; str and bytes are never compared with each other.
    return path[0] in ('.', b'.'[0])
def _isrecursive(pattern):
    """Return True if pattern is exactly '**' (or b'**' for bytes input)."""
    double_star = b'**' if isinstance(pattern, bytes) else '**'
    return pattern == double_star
def escape(pathname):
    """Escape all special characters.
    """
    # Each of "*?[" is escaped by enclosing it in square brackets.  The drive
    # part is left alone: metacharacters do not work there and shouldn't be
    # escaped.
    drive, rest = os.path.splitdrive(pathname)
    if isinstance(rest, bytes):
        checker, bracketed = magic_check_bytes, br'[\1]'
    else:
        checker, bracketed = magic_check, r'[\1]'
    return drive + checker.sub(bracketed, rest)
===== os.scandir() の実装 =====
[[https://docs.python.org/3/library/os.html#os.scandir|os — Miscellaneous operating system interfaces — Python 3.8.2 documentation]]\\
Note On Unix-based systems, scandir() uses the system’s opendir() and readdir() functions. On Windows, it uses the Win32 FindFirstFileW and FindNextFileW functions.\\
\\
ノート(翻訳) Unix ベースのシステムでは、scandir() はシステムの opendir() 関数と readdir() 関数を使用します。 Windows では、Win32 FindFirstFileW 関数と FindNextFileW 関数を使用します。
/*
 * os.scandir(): build and return a ScandirIterator for `path`.
 *
 * The directory is opened here: FindFirstFileW() on Windows, and
 * fdopendir()/opendir() on other platforms.  On any failure the partially
 * built iterator is released and NULL is returned.
 */
/*[clinic input]
os.scandir
path : path_t(nullable=True, allow_fd='PATH_HAVE_FDOPENDIR') = None
Return an iterator of DirEntry objects for given path.
path can be specified as either str, bytes, or a path-like object. If path
is bytes, the names of yielded DirEntry objects will also be bytes; in
all other circumstances they will be str.
If path is None, uses the path='.'.
[clinic start generated code]*/
static PyObject *
os_scandir_impl(PyObject *module, path_t *path)
/*[clinic end generated code: output=6eb2668b675ca89e input=6bdd312708fc3bb0]*/
{
ScandirIterator *iterator;
/* Windows searches with a wide-character pattern string; other platforms
   use a narrow path string and, when available, a duplicated directory fd. */
#ifdef MS_WINDOWS
wchar_t *path_strW;
#else
const char *path_str;
#ifdef HAVE_FDOPENDIR
/* -1 means no descriptor has been duplicated. */
int fd = -1;
#endif
#endif
/* Raise the "os.scandir" audit event with the path object (or None);
   a negative result aborts the call. */
if (PySys_Audit("os.scandir", "O",
path->object ? path->object : Py_None) < 0) {
return NULL;
}
/* Allocate the iterator from the type stored in the module state. */
PyObject *ScandirIteratorType = get_posix_state(module)->ScandirIteratorType;
iterator = PyObject_New(ScandirIterator, (PyTypeObject *)ScandirIteratorType);
if (!iterator)
return NULL;
/* Start with no open handle/stream.
   NOTE(review): presumably this lets the cleanup triggered by Py_DECREF on
   the error path see that nothing is open -- confirm against the
   ScandirIterator deallocator. */
#ifdef MS_WINDOWS
iterator->handle = INVALID_HANDLE_VALUE;
#else
iterator->dirp = NULL;
#endif
/* Copy the parsed path structure into the iterator, then clear the
   object/cleanup members of the argument.
   NOTE(review): presumably so the caller's path cleanup does not release
   what the iterator now holds -- confirm. */
memcpy(&iterator->path, path, sizeof(path_t));
/* Move the ownership to iterator->path */
path->object = NULL;
path->cleanup = NULL;
#ifdef MS_WINDOWS
/* NOTE(review): FindFirstFileW() below already stores the first entry in
   file_data; first_time presumably tells the iteration code to use it
   before calling FindNextFileW() -- confirm. */
iterator->first_time = 1;
/* Build the search pattern from the directory path and "*.*". */
path_strW = join_path_filenameW(iterator->path.wide, L"*.*");
if (!path_strW)
goto error;
/* The Win32 call is made between Py_BEGIN/END_ALLOW_THREADS. */
Py_BEGIN_ALLOW_THREADS
iterator->handle = FindFirstFileW(path_strW, &iterator->file_data);
Py_END_ALLOW_THREADS
PyMem_Free(path_strW);
if (iterator->handle == INVALID_HANDLE_VALUE) {
path_error(&iterator->path);
goto error;
}
#else /* POSIX */
errno = 0;
#ifdef HAVE_FDOPENDIR
/* The path was given as a file descriptor. */
if (path->fd != -1) {
/* closedir() closes the FD, so we duplicate it */
fd = _Py_dup(path->fd);
/* NOTE(review): no path_error() here; presumably _Py_dup() sets the
   exception itself -- confirm. */
if (fd == -1)
goto error;
Py_BEGIN_ALLOW_THREADS
iterator->dirp = fdopendir(fd);
Py_END_ALLOW_THREADS
}
else
#endif
{
/* Open by name; with no narrow path string, scan ".". */
if (iterator->path.narrow)
path_str = iterator->path.narrow;
else
path_str = ".";
Py_BEGIN_ALLOW_THREADS
iterator->dirp = opendir(path_str);
Py_END_ALLOW_THREADS
}
if (!iterator->dirp) {
path_error(&iterator->path);
#ifdef HAVE_FDOPENDIR
/* No directory stream was created, so the duplicated descriptor must be
   closed here. */
if (fd != -1) {
Py_BEGIN_ALLOW_THREADS
close(fd);
Py_END_ALLOW_THREADS
}
#endif
goto error;
}
#endif
return (PyObject *)iterator;
/* Common failure exit: drop the reference to the iterator and return NULL. */
error:
Py_DECREF(iterator);
return NULL;
}
[[https://github.com/python/cpython/blob/master/Modules/posixmodule.c|cpython/Modules/posixmodule.c - github.com]] より\\
===== パフォーマンス比較 =====
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import glob
import timeit
def walk():
    """Return the names of all '.py' files found by os.walk() below '.'."""
    return [
        name
        for _root, _dirs, names in os.walk('.')
        for name in names
        if name.endswith('.py')
    ]
def iglob():
    """Return the '.py' paths among everything glob.iglob('**/*') yields below '.'."""
    return [
        path
        for path in glob.iglob('**/*', recursive=True)
        if path.endswith('.py')
    ]
def iglob2():
    """Return every path glob.iglob('**/*.py') yields below '.'."""
    return list(glob.iglob('**/*.py', recursive=True))
def main():
    """Time each search function and print its mean seconds per run."""
    # All three functions search from '.', so run this script from the
    # directory tree to be measured.
    num_of_exec = 10
    for bench in (walk, iglob, iglob2):
        print(timeit.timeit(bench, number=num_of_exec) / num_of_exec)


if __name__ == '__main__':
    main()
$ ./dir_search.py
1.3243966199999704
1.4991581099999167
1.61260242999997
$ time find . -name '*.py' > /dev/null
real 0m0.523s
user 0m0.020s
sys 0m0.077s
===== 参考文献 =====
[[https://docs.python.org/ja/3/library/glob.html|glob --- Unix 形式のパス名のパターン展開 — Python ドキュメント]]\\
[[https://stackoverflow.com/questions/51167093/how-can-i-find-the-source-code-of-os-scandir-in-python3|python - How can I find the source code of os.scandir in Python3? - Stack Overflow]]\\
[[https://stackoverflow.com/questions/20638040/glob-exclude-pattern|python - glob exclude pattern - Stack Overflow]]\\
[[https://github.com/python/cpython/blob/master/Lib/pathlib.py|cpython/pathlib.py at master · python/cpython]]\\
[[https://stackoverflow.com/questions/50948391/whats-the-fastest-way-to-recursively-search-for-files-in-python/50950952|What's the fastest way to recursively search for files in python? - Stack Overflow]]\\
[[https://qiita.com/amowwee/items/e63b3610ea750f7dba1b|Pythonでフォルダ内のファイルリストを取得する - Qiita]]\\
[[https://stackoverflow.com/questions/24812253/how-can-i-capture-return-value-with-python-timeit-module|How can I capture return value with Python timeit module? - Stack Overflow]]\\
[[https://living-sun.com/ja/python/705207-quicker-to-oswalk-or-glob-python-traversal-glob-oswalk-directory-walk.html|os.walkやglobにすばやく - Python、トラバーサル、グロブ、os.walk、ディレクトリウォーク]]\\
[[https://note.dokeep.jp/post/csharp-fast-enumerate-file/|[C#] 高速でファイルとフォルダを列挙する - ざこノート]]\\