python:pathlib

pathlib, glob.iglob(), os.walk()

os.walk(top, topdown=True, onerror=None, followlinks=False) は、内部的には os.scandir(path='.') によって処理される。

glob.iglob(pathname, *, recursive=False) は、内部的には os.scandir(path='.') によって処理される。
os.scandir(path='.') によって取得された内容は list() 化されて fnmatch.filter(names, pattern) によってフィルター処理される。
ここで注意が必要なのは、 pattern は Unix Shell Style のパターンであって、正規表現は利用できないという点である。
fnmatch.filter(names, pattern) の pattern は、 fnmatch.translate(pattern) で正規表現に変換されてから re.compile() される。

OK パターン

>>> import fnmatch
>>> fnmatch.translate('*.txt')
'(?s:.*\\.txt)\\Z'

NG パターン（正規表現の ( | ) はエスケープされてリテラル文字として扱われるため、選択としては機能しない）

>>> import fnmatch
>>> fnmatch.translate('*.(bat|com|exe)')
'(?s:.*\\.\\(bat\\|com\\|exe\\))\\Z'
python38/Lib/glob.py
"""Filename globbing utility."""
 
import os
import re
import fnmatch
import sys
 
# Names exported by `from glob import *`; every other helper defined in this
# module is left out of the list.
__all__ = ["glob", "iglob", "escape"]
 
def glob(pathname, *, recursive=False):
    """Collect every path that matches *pathname* into a list.

    Shell-style wildcards in the manner of fnmatch are supported, with
    one difference from fnmatch: file names that start with a dot are a
    special case and are not matched by '*' and '?' patterns.

    When *recursive* is true, the pattern '**' matches any files and
    zero or more directories and subdirectories.
    """
    matches = iglob(pathname, recursive=recursive)
    return [*matches]
 
def iglob(pathname, *, recursive=False):
    """Return an iterator over the paths that match *pathname*.

    Shell-style wildcards in the manner of fnmatch are supported, with
    one difference from fnmatch: file names that start with a dot are a
    special case and are not matched by '*' and '?' patterns.

    When *recursive* is true, the pattern '**' matches any files and
    zero or more directories and subdirectories.
    """
    matches = _iglob(pathname, recursive, False)
    if not (recursive and _isrecursive(pathname)):
        return matches
    # A bare '**' makes the underlying generator emit an empty string
    # first; consume it here so callers never see it.
    leading = next(matches)
    assert not leading
    return matches
 
def _iglob(pathname, recursive, dironly):
    """Yield the paths matching *pathname* (generator behind iglob()).

    *recursive* enables the special handling of a '**' component.
    *dironly* is forwarded unchanged to the per-directory helpers.
    The "glob.glob" audit event is raised when iteration starts.
    """
    sys.audit("glob.glob", pathname, recursive)
    head, tail = os.path.split(pathname)

    # No wildcard anywhere: the only question is whether the path exists.
    if not has_magic(pathname):
        assert not dironly
        # Patterns ending with a slash (empty tail) should match only
        # directories.
        found = os.path.lexists(pathname) if tail else os.path.isdir(head)
        if found:
            yield pathname
        return

    # Pattern without a directory part.
    if not head:
        if recursive and _isrecursive(tail):
            expand = _glob2
        else:
            expand = _glob1
        yield from expand(head, tail, dironly)
        return

    # `os.path.split()` returns the argument itself as the directory part if
    # it is a drive or UNC path.  The `head != pathname` test prevents an
    # infinite recursion when such a path contains magic characters
    # (i.e. r'\\?\C:').
    if head != pathname and has_magic(head):
        parents = _iglob(head, recursive, True)
    else:
        parents = [head]

    # Pick the helper that lists matching names inside one directory.
    if not has_magic(tail):
        expand = _glob0
    elif recursive and _isrecursive(tail):
        expand = _glob2
    else:
        expand = _glob1

    for parent in parents:
        for name in expand(parent, tail, dironly):
            yield os.path.join(parent, name)
 
# These 2 helper functions non-recursively glob inside a literal directory.
# They return a list of basenames.  _glob1 accepts a pattern while _glob0
# takes a literal basename (so it only has to check for its existence).
 
def _glob1(dirname, pattern, dironly):
    """Return the names in *dirname* that match the wildcard *pattern*.

    Names starting with a dot are dropped unless the pattern itself
    starts with a dot.
    """
    entries = list(_iterdir(dirname, dironly))
    if _ishidden(pattern):
        candidates = entries
    else:
        candidates = [entry for entry in entries if not _ishidden(entry)]
    return fnmatch.filter(candidates, pattern)
 
def _glob0(dirname, basename, dironly):
    """Return [basename] if the literal name exists in *dirname*, else [].

    *dironly* is accepted for signature parity with the other helpers and
    is not consulted here.
    """
    if basename:
        found = os.path.lexists(os.path.join(dirname, basename))
    else:
        # `os.path.split()` returns an empty basename for paths ending with a
        # directory separator.  'q*x/' should match only directories.
        found = os.path.isdir(dirname)
    return [basename] if found else []
 
# Following functions are not public but can be used by third-party code.
 
def glob0(dirname, pattern):
    """Check a literal name inside *dirname*; thin wrapper over _glob0()."""
    return _glob0(dirname, pattern, dironly=False)
 
def glob1(dirname, pattern):
    """Match a wildcard pattern inside *dirname*; thin wrapper over _glob1()."""
    return _glob1(dirname, pattern, dironly=False)
 
# This helper function recursively yields relative pathnames inside a literal
# directory.
 
def _glob2(dirname, pattern, dironly):
    """Yield an empty name, then every relative path below *dirname*.

    *pattern* must be the recursive wildcard; it is only used to produce
    an empty value of the same type (str or bytes) as the first item.
    """
    assert _isrecursive(pattern)
    empty = pattern[:0]
    yield empty
    yield from _rlistdir(dirname, dironly)
 
# If dironly is false, yields all file names inside a directory.
# If dironly is true, yields only directory names.
def _iterdir(dirname, dironly):
    """Yield the entry names found directly inside *dirname*.

    An empty *dirname* means the current directory (as bytes when the
    argument is bytes).  With *dironly* true, only entries for which
    ``is_dir()`` holds are yielded.  Any OSError, whether from opening
    the directory or from inspecting an entry, is swallowed.
    """
    if not dirname:
        dirname = (bytes(os.curdir, 'ASCII')
                   if isinstance(dirname, bytes) else os.curdir)
    try:
        with os.scandir(dirname) as entries:
            for entry in entries:
                try:
                    if dironly and not entry.is_dir():
                        continue
                    yield entry.name
                except OSError:
                    # Best effort: skip an entry that cannot be inspected.
                    pass
    except OSError:
        # The directory could not be scanned: yield nothing.
        return
 
# Recursively yields relative pathnames inside a literal directory.
def _rlistdir(dirname, dironly):
    """Yield, depth first, the relative paths of non-hidden entries below *dirname*.

    Each entry is yielded before the entries found underneath it; names
    starting with a dot are skipped and not descended into.
    """
    for name in list(_iterdir(dirname, dironly)):
        if _ishidden(name):
            continue
        yield name
        # With an empty dirname the bare name is used, so no join is needed.
        child = os.path.join(dirname, name) if dirname else name
        for below in _rlistdir(child, dironly):
            yield os.path.join(name, below)
 
 
# Patterns that find one glob metacharacter ('*', '?' or '[').  The capture
# group exists so that escape() can refer to the matched character.
magic_check = re.compile('([*?[])')
magic_check_bytes = re.compile(b'([*?[])')


def has_magic(s):
    """Return True if *s* (str or bytes) contains '*', '?' or '['."""
    checker = magic_check_bytes if isinstance(s, bytes) else magic_check
    return checker.search(s) is not None
 
def _ishidden(path):
    """Return True if *path* starts with a dot.

    Indexing a str gives a one-character str while indexing bytes gives
    an int, so the first element is compared against both forms.
    """
    lead = path[0]
    return lead == '.' or lead == b'.'[0]
 
def _isrecursive(pattern):
    """Return True if *pattern* is exactly the recursive wildcard '**'.

    The comparison is made against the str or bytes form to match the
    type of *pattern*.
    """
    double_star = b'**' if isinstance(pattern, bytes) else '**'
    return pattern == double_star
 
def escape(pathname):
    """Return *pathname* with every glob metacharacter escaped.

    Each of "*?[" is wrapped in square brackets.  The drive part is left
    untouched: metacharacters do not work there and shouldn't be escaped.
    """
    drive, rest = os.path.splitdrive(pathname)
    if isinstance(rest, bytes):
        escaped = magic_check_bytes.sub(br'[\1]', rest)
    else:
        escaped = magic_check.sub(r'[\1]', rest)
    return drive + escaped

os — Miscellaneous operating system interfaces — Python 3.8.2 documentation

Note On Unix-based systems, scandir() uses the system’s opendir() and readdir() functions. On Windows, it uses the Win32 FindFirstFileW and FindNextFileW functions.

ノート(翻訳) Unix ベースのシステムでは、scandir() はシステムの opendir() 関数と readdir() 関数を使用します。 Windows では、Win32 FindFirstFileW 関数と FindNextFileW 関数を使用します。
cpython/Modules/posixmodule.c
/*[clinic input]
os.scandir
 
    path : path_t(nullable=True, allow_fd='PATH_HAVE_FDOPENDIR') = None
 
Return an iterator of DirEntry objects for given path.
 
path can be specified as either str, bytes, or a path-like object.  If path
is bytes, the names of yielded DirEntry objects will also be bytes; in
all other circumstances they will be str.
 
If path is None, uses the path='.'.
[clinic start generated code]*/
 
static PyObject *
os_scandir_impl(PyObject *module, path_t *path)
/*[clinic end generated code: output=6eb2668b675ca89e input=6bdd312708fc3bb0]*/
{
    /* Create a ScandirIterator for `path` and open the underlying directory:
     * FindFirstFileW() on Windows, fdopendir()/opendir() otherwise.
     * Returns the iterator cast to PyObject *, or NULL on any failure
     * (audit hook, allocation, or opening the directory). */
    ScandirIterator *iterator;
#ifdef MS_WINDOWS
    wchar_t *path_strW;
#else
    const char *path_str;
#ifdef HAVE_FDOPENDIR
    /* -1 means "no duplicated descriptor"; checked on the failure path
     * below before calling close(). */
    int fd = -1;
#endif
#endif

    /* Raise the "os.scandir" audit event before touching the filesystem;
     * Py_None stands in when no path object was supplied. */
    if (PySys_Audit("os.scandir", "O",
                    path->object ? path->object : Py_None) < 0) {
        return NULL;
    }

    /* The iterator type is looked up in the per-module state. */
    PyObject *ScandirIteratorType = get_posix_state(module)->ScandirIteratorType;
    iterator = PyObject_New(ScandirIterator, (PyTypeObject *)ScandirIteratorType);
    if (!iterator)
        return NULL;

    /* Start from a "nothing opened yet" state so that the error path can
     * release the iterator before a handle/DIR* has been obtained. */
#ifdef MS_WINDOWS
    iterator->handle = INVALID_HANDLE_VALUE;
#else
    iterator->dirp = NULL;
#endif

    /* Copy the whole path_t by value into the iterator. */
    memcpy(&iterator->path, path, sizeof(path_t));
    /* Move the ownership to iterator->path */
    /* NOTE(review): clearing these presumably keeps the caller's path_t
     * cleanup from releasing what the iterator now holds -- confirm. */
    path->object = NULL;
    path->cleanup = NULL;

#ifdef MS_WINDOWS
    /* NOTE(review): presumably tells the iteration code that file_data
     * already holds the first entry -- confirm in the iterator's next(). */
    iterator->first_time = 1;

    /* Build the search string "<path>\*.*" for FindFirstFileW(). */
    path_strW = join_path_filenameW(iterator->path.wide, L"*.*");
    if (!path_strW)
        goto error;

    /* Release the GIL around the blocking Win32 call. */
    Py_BEGIN_ALLOW_THREADS
    iterator->handle = FindFirstFileW(path_strW, &iterator->file_data);
    Py_END_ALLOW_THREADS

    /* The search string is no longer needed once the search is started. */
    PyMem_Free(path_strW);

    if (iterator->handle == INVALID_HANDLE_VALUE) {
        path_error(&iterator->path);
        goto error;
    }
#else /* POSIX */
    errno = 0;
#ifdef HAVE_FDOPENDIR
    /* A file descriptor was given instead of a path name. */
    if (path->fd != -1) {
        /* closedir() closes the FD, so we duplicate it */
        fd = _Py_dup(path->fd);
        /* NOTE(review): no path_error() here; _Py_dup() is presumably
         * expected to have set the exception itself -- confirm. */
        if (fd == -1)
            goto error;

        Py_BEGIN_ALLOW_THREADS
        iterator->dirp = fdopendir(fd);
        Py_END_ALLOW_THREADS
    }
    else
#endif
    {
        /* Fall back to "." when there is no narrow path string. */
        if (iterator->path.narrow)
            path_str = iterator->path.narrow;
        else
            path_str = ".";

        /* Release the GIL around the blocking opendir() call. */
        Py_BEGIN_ALLOW_THREADS
        iterator->dirp = opendir(path_str);
        Py_END_ALLOW_THREADS
    }

    if (!iterator->dirp) {
        path_error(&iterator->path);
#ifdef HAVE_FDOPENDIR
        /* fdopendir() failed, so the duplicated descriptor was never handed
         * over to a DIR stream; close it here to avoid leaking it. */
        if (fd != -1) {
            Py_BEGIN_ALLOW_THREADS
            close(fd);
            Py_END_ALLOW_THREADS
        }
#endif
        goto error;
    }
#endif

    return (PyObject *)iterator;

error:
    /* Drop the reference obtained from PyObject_New() above.
     * NOTE(review): releasing the copied path is presumably handled by the
     * iterator's deallocator -- confirm. */
    Py_DECREF(iterator);
    return NULL;
}

cpython/Modules/posixmodule.c - github.com より

dir_search.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import glob
import timeit
 
def walk():
    """Return the names of all '.py' files below the current directory.

    The tree is traversed with ``os.walk('.')``.  Only the bare file names
    are collected; the directory part of each hit is not included.
    """
    return [
        name
        for _path, _dirs, files in os.walk('.')
        for name in files
        if name.endswith('.py')
    ]
 
def iglob():
    """Return the relative paths ending in '.py' below the current directory.

    Every path is enumerated with ``glob.iglob('**/*', recursive=True)`` and
    the suffix filter is applied afterwards in Python.
    """
    return [
        path
        for path in glob.iglob('**/*', recursive=True)
        if path.endswith('.py')
    ]
 
def iglob2():
    """Return the relative paths matching '**/*.py' below the current directory.

    Unlike iglob(), the suffix filter is part of the glob pattern itself, so
    the result is simply the materialized iterator.
    """
    return list(glob.iglob('**/*.py', recursive=True))
 
def main():
    """Time each search strategy and print its mean seconds per run.

    The strategies run in the order walk, iglob, iglob2, each executed
    ``num_of_exec`` times against the current working directory.
    """
    num_of_exec = 10
    for search in (walk, iglob, iglob2):
        total = timeit.timeit(search, number=num_of_exec)
        print(total / num_of_exec)


if __name__ == '__main__':
    main()
$ ./dir_search.py
1.3243966199999704
1.4991581099999167
1.61260242999997
$ time find . -name '*.py' > /dev/null

real    0m0.523s
user    0m0.020s
sys     0m0.077s
  • python/pathlib.txt
  • 最終更新: 2023/05/27 09:00
  • by ともやん