python:pathlib

文書の過去の版を表示しています。


pathlib, glob, os

os.walk(top, topdown=True, onerror=None, followlinks=False) は、内部的には os.scandir(path='.') によって処理される。

glob.iglob(pathname, *, recursive=False) は、内部的には os.scandir(path='.') によって処理される。
os.scandir(path='.') によって取得された内容は list() 化されて fnmatch.filter(names, pattern) によってフィルター処理される。
fnmatch.filter(names, pattern) の pattern は、fnmatch.translate(pattern) で正規表現に変換してから re.compile() される。ここで注意が必要なのは、pattern は Unix Shell Style のワイルドカードパターンであり、正規表現そのものは利用できないという点である (正規表現のメタ文字は下の NG パターンのようにエスケープされてしまう)。

OK パターン

>>> import fnmatch
>>> fnmatch.translate('*.txt')
'(?s:.*\\.txt)\\Z'

NG パターン

>>> import fnmatch
>>> fnmatch.translate('*.(bat|com|exe)')
'(?s:.*\\.\\(bat\\|com\\|exe\\))\\Z'
python38/Lib/glob.py
"""Filename globbing utility."""
 
import os
import re
import fnmatch
import sys
 
__all__ = ["glob", "iglob", "escape"]
 
def glob(pathname, *, recursive=False):
    """Collect every path matching the pattern *pathname* into a list.

    Shell-style wildcards in the sense of fnmatch are understood, with
    one difference from fnmatch: a file name that begins with a dot is
    never matched by a '*' or '?' wildcard.

    When *recursive* is true, a '**' component stands for any files and
    for zero or more levels of directories and subdirectories.

    This is the eager counterpart of iglob(): the iterator it returns is
    fully consumed before this function returns.
    """
    matches = iglob(pathname, recursive=recursive)
    return list(matches)
 
def iglob(pathname, *, recursive=False):
    """Lazily produce the paths matching the pattern *pathname*.

    Shell-style wildcards in the sense of fnmatch are understood, with
    one difference from fnmatch: a file name that begins with a dot is
    never matched by a '*' or '?' wildcard.

    When *recursive* is true, a '**' component stands for any files and
    for zero or more levels of directories and subdirectories.
    """
    matches = _iglob(pathname, recursive, False)
    if not (recursive and _isrecursive(pathname)):
        return matches
    # For a bare '**' pattern the first item produced is an empty string
    # (the starting directory itself); consume it so callers never see it.
    first = next(matches)
    assert not first
    return matches
 
def _iglob(pathname, recursive, dironly):
    """Generator doing the actual work behind iglob().

    pathname  -- pattern to expand (str or bytes).
    recursive -- whether a '**' component is given its recursive meaning.
    dironly   -- when true, only directories are produced; used when
                 expanding the directory part of a longer pattern.

    Raises the "glob.glob" audit event when iteration starts.
    """
    sys.audit("glob.glob", pathname, recursive)
    head, tail = os.path.split(pathname)

    # Case 1: no wildcard anywhere -- the pattern is a literal path.
    if not has_magic(pathname):
        assert not dironly
        # A pattern ending with a separator (empty tail) may only match
        # a directory; otherwise any existing entry (even a dangling
        # symlink, hence lexists) is a match.
        present = os.path.lexists(pathname) if tail else os.path.isdir(head)
        if present:
            yield pathname
        return

    # Case 2: wildcard in a bare name with no directory part.
    if not head:
        expand = _glob2 if recursive and _isrecursive(tail) else _glob1
        yield from expand(head, tail, dironly)
        return

    # Case 3: there is a directory part.  Expand it first if it contains
    # wildcards itself.  os.path.split() hands back the argument unchanged
    # as the directory part for a drive or UNC path, so the inequality
    # test stops an endless recursion when such a path contains wildcard
    # characters (e.g. r'\\?\C:').
    if head != pathname and has_magic(head):
        parents = _iglob(head, recursive, True)
    else:
        parents = [head]

    # Pick the helper that lists matches inside one literal directory.
    if not has_magic(tail):
        expand = _glob0
    elif recursive and _isrecursive(tail):
        expand = _glob2
    else:
        expand = _glob1

    for parent in parents:
        for name in expand(parent, tail, dironly):
            yield os.path.join(parent, name)
 
# These 2 helper functions non-recursively glob inside a literal directory.
# They return a list of basenames.  _glob1 accepts a pattern while _glob0
# takes a literal basename (so it only has to check for its existence).
 
def _glob1(dirname, pattern, dironly):
    """Return the entries of *dirname* whose names match *pattern*.

    The directory is listed once (directories only when *dironly* is
    true).  Names starting with a dot are dropped before matching unless
    the pattern itself starts with a dot.  Matching is delegated to
    fnmatch.filter(), so the result is a list of basenames.
    """
    entries = list(_iterdir(dirname, dironly))
    if _ishidden(pattern):
        candidates = entries
    else:
        candidates = [name for name in entries if not _ishidden(name)]
    return fnmatch.filter(candidates, pattern)
 
def _glob0(dirname, basename, dironly):
    """Return [basename] if the literal name exists in *dirname*, else [].

    An empty *basename* is what os.path.split() yields for a path that
    ends with a directory separator; in that case the path must denote a
    directory ('q*x/' should match directories only).  A non-empty
    *basename* only has to exist (os.path.lexists, so a broken symlink
    still counts).  *dironly* is accepted for signature parity with the
    other helpers and is not consulted.
    """
    if basename:
        found = os.path.lexists(os.path.join(dirname, basename))
    else:
        found = os.path.isdir(dirname)
    return [basename] if found else []
 
# Following functions are not public but can be used by third-party code.
 
def glob0(dirname, pattern):
    """Existence check for the literal name *pattern* inside *dirname*.

    Thin wrapper around _glob0() with directory-only filtering disabled.
    """
    return _glob0(dirname, pattern, dironly=False)
 
def glob1(dirname, pattern):
    """Wildcard match of *pattern* against the entries of *dirname*.

    Thin wrapper around _glob1() with directory-only filtering disabled.
    """
    return _glob1(dirname, pattern, dironly=False)
 
# This helper function recursively yields relative pathnames inside a literal
# directory.
 
def _glob2(dirname, pattern, dironly):
    """Expand a '**' component below the literal directory *dirname*.

    The first item is an empty str/bytes (same type as *pattern*), which
    stands for *dirname* itself; after that come all relative paths
    produced by _rlistdir().
    """
    assert _isrecursive(pattern)
    # Slicing keeps the str/bytes type of the pattern.
    empty = pattern[:0]
    yield empty
    yield from _rlistdir(dirname, dironly)
 
# If dironly is false, yields all file names inside a directory.
# If dironly is true, yields only directory names.
def _iterdir(dirname, dironly):
    """Yield the entry names found directly inside *dirname*.

    An empty *dirname* means the current directory, spelled as bytes
    when *dirname* is bytes so the yielded names keep the same type.
    With *dironly* true, only entries for which is_dir() holds are
    yielded.  An OSError on a single entry skips that entry; an OSError
    while opening or reading the directory ends the iteration quietly.
    """
    if not dirname:
        if isinstance(dirname, bytes):
            dirname = os.curdir.encode('ASCII')
        else:
            dirname = os.curdir
    try:
        with os.scandir(dirname) as entries:
            for entry in entries:
                try:
                    if dironly and not entry.is_dir():
                        continue
                    yield entry.name
                except OSError:
                    pass
    except OSError:
        return
 
# Recursively yields relative pathnames inside a literal directory.
def _rlistdir(dirname, dironly):
    """Depth-first walk yielding paths relative to *dirname*.

    Each non-hidden entry is yielded, immediately followed by everything
    found beneath it (prefixed with the entry's name).  Entries whose
    name starts with a dot are skipped and not descended into.
    """
    for entry in list(_iterdir(dirname, dironly)):
        if _ishidden(entry):
            continue
        yield entry
        # Avoid a leading separator when starting from the empty dirname.
        child = os.path.join(dirname, entry) if dirname else entry
        for below in _rlistdir(child, dironly):
            yield os.path.join(entry, below)
 
 
# Compiled once at import time: one occurrence of any glob wildcard
# character ('*', '?' or '['), captured as group 1.  A str and a bytes
# variant are kept so both kinds of pathname can be searched.
magic_check = re.compile('([*?[])')
magic_check_bytes = re.compile(b'([*?[])')

def has_magic(s):
    """Return True if *s* (str or bytes) contains '*', '?' or '['."""
    checker = magic_check_bytes if isinstance(s, bytes) else magic_check
    return checker.search(s) is not None
 
def _ishidden(path):
    """Return True if *path* (str or bytes) starts with a dot.

    An empty name is reported as not hidden instead of raising
    IndexError, so an empty pattern or name is handled gracefully.
    """
    # Indexing bytes gives an int, so the first element is compared
    # against both '.' and the int value of b'.'; this never compares a
    # str with a bytes object (which would trigger BytesWarning under -b).
    return bool(path) and path[0] in ('.', b'.'[0])
 
def _isrecursive(pattern):
    """Return True if *pattern* is exactly the recursive wildcard '**'.

    The comparison literal is chosen to match the type of *pattern*, so
    a str is never compared with bytes.
    """
    wildcard = b'**' if isinstance(pattern, bytes) else '**'
    return pattern == wildcard
 
def escape(pathname):
    """Return *pathname* with every glob wildcard character neutralised.

    Each of '*', '?' and '[' is wrapped in square brackets, which makes
    it match only itself.  The drive part (as split off by
    os.path.splitdrive) is left untouched: metacharacters do not work
    there and should not be escaped.  Works for str and bytes.
    """
    drive, rest = os.path.splitdrive(pathname)
    if isinstance(rest, bytes):
        checker, bracketed = magic_check_bytes, br'[\1]'
    else:
        checker, bracketed = magic_check, r'[\1]'
    return drive + checker.sub(bracketed, rest)

os — Miscellaneous operating system interfaces — Python 3.8.2 documentation

Note On Unix-based systems, scandir() uses the system’s opendir() and readdir() functions. On Windows, it uses the Win32 FindFirstFileW and FindNextFileW functions.
cpython/Modules/posixmodule.c
/*[clinic input]
os.scandir
 
    path : path_t(nullable=True, allow_fd='PATH_HAVE_FDOPENDIR') = None
 
Return an iterator of DirEntry objects for given path.
 
path can be specified as either str, bytes, or a path-like object.  If path
is bytes, the names of yielded DirEntry objects will also be bytes; in
all other circumstances they will be str.
 
If path is None, uses the path='.'.
[clinic start generated code]*/
 
static PyObject *
os_scandir_impl(PyObject *module, path_t *path)
/*[clinic end generated code: output=6eb2668b675ca89e input=6bdd312708fc3bb0]*/
{
    /* Implementation of os.scandir(): allocate a ScandirIterator, hand the
     * contents of `path` over to it, and open the directory with the
     * platform API (FindFirstFileW on Windows; fdopendir()/opendir()
     * otherwise).  Returns the new iterator, or NULL on failure. */
    ScandirIterator *iterator;
#ifdef MS_WINDOWS
    wchar_t *path_strW;
#else
    const char *path_str;
#ifdef HAVE_FDOPENDIR
    /* Duplicated descriptor; stays -1 unless the caller passed an fd. */
    int fd = -1;
#endif
#endif
 
    /* Raise the "os.scandir" audit event with the original path object
     * (or None); a negative result aborts the call. */
    if (PySys_Audit("os.scandir", "O",
                    path->object ? path->object : Py_None) < 0) {
        return NULL;
    }
 
    PyObject *ScandirIteratorType = get_posix_state(module)->ScandirIteratorType;
    iterator = PyObject_New(ScandirIterator, (PyTypeObject *)ScandirIteratorType);
    if (!iterator)
        return NULL;
 
    /* Mark the OS-level handle as "not opened" before anything can fail.
     * NOTE(review): presumably the iterator's teardown relies on this to
     * know there is nothing to close -- confirm in the type's finalizer. */
#ifdef MS_WINDOWS
    iterator->handle = INVALID_HANDLE_VALUE;
#else
    iterator->dirp = NULL;
#endif
 
    /* Shallow copy of the whole path_t; the caller's copy then gives up
     * `object` and `cleanup` so only the iterator's copy refers to them.
     * Other fields (e.g. path->fd) remain readable through `path`. */
    memcpy(&iterator->path, path, sizeof(path_t));
    /* Move the ownership to iterator->path */
    path->object = NULL;
    path->cleanup = NULL;
 
#ifdef MS_WINDOWS
    /* NOTE(review): presumably signals that file_data already holds the
     * first entry returned by FindFirstFileW -- confirm in the iterator's
     * next() implementation. */
    iterator->first_time = 1;
 
    /* Build the search pattern from the directory path and "*.*". */
    path_strW = join_path_filenameW(iterator->path.wide, L"*.*");
    if (!path_strW)
        goto error;
 
    /* The directory lookup runs between the ALLOW_THREADS macros
     * (GIL released around the potentially blocking call). */
    Py_BEGIN_ALLOW_THREADS
    iterator->handle = FindFirstFileW(path_strW, &iterator->file_data);
    Py_END_ALLOW_THREADS
 
    /* The pattern buffer is no longer needed whether or not the call
     * succeeded. */
    PyMem_Free(path_strW);
 
    if (iterator->handle == INVALID_HANDLE_VALUE) {
        path_error(&iterator->path);
        goto error;
    }
#else /* POSIX */
    errno = 0;
#ifdef HAVE_FDOPENDIR
    /* The caller supplied an open directory descriptor instead of a name. */
    if (path->fd != -1) {
        /* closedir() closes the FD, so we duplicate it */
        fd = _Py_dup(path->fd);
        if (fd == -1)
            goto error;
 
        Py_BEGIN_ALLOW_THREADS
        iterator->dirp = fdopendir(fd);
        Py_END_ALLOW_THREADS
    }
    else
#endif
    /* When HAVE_FDOPENDIR is not defined, the `if`/`else` above is compiled
     * out and this brace block runs unconditionally. */
    {
        /* No narrow path string available: fall back to the current
         * directory ".". */
        if (iterator->path.narrow)
            path_str = iterator->path.narrow;
        else
            path_str = ".";
 
        Py_BEGIN_ALLOW_THREADS
        iterator->dirp = opendir(path_str);
        Py_END_ALLOW_THREADS
    }
 
    if (!iterator->dirp) {
        path_error(&iterator->path);
#ifdef HAVE_FDOPENDIR
        /* fdopendir() failed, so no directory stream took over the
         * duplicated descriptor; close it here to avoid leaking it. */
        if (fd != -1) {
            Py_BEGIN_ALLOW_THREADS
            close(fd);
            Py_END_ALLOW_THREADS
        }
#endif
        goto error;
    }
#endif
 
    return (PyObject *)iterator;
 
    /* Common failure exit: drop the reference obtained from PyObject_New. */
error:
    Py_DECREF(iterator);
    return NULL;
}

cpython/Modules/posixmodule.c - github.com より

dir_search.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import glob
import timeit
 
def walk():
    """Return the base names of all '.py' files below the current directory.

    The tree is traversed with os.walk('.'); only the file names (without
    their directory part) are collected.
    """
    return [
        name
        for _path, _dirs, files in os.walk('.')
        for name in files
        if name.endswith('.py')
    ]
 
def iglob():
    """Return relative paths of all '.py' entries below the current directory.

    Every path is enumerated with the recursive pattern '**/*' and the
    '.py' suffix test is applied afterwards in Python.
    """
    everything = glob.iglob('**/*', recursive=True)
    return [path for path in everything if path.endswith('.py')]
 
def iglob2():
    """Return relative paths matching the recursive pattern '**/*.py'.

    Unlike iglob(), the suffix filtering is left to the glob pattern.
    """
    return list(glob.iglob('**/*.py', recursive=True))
 
def main():
    """Benchmark the three search strategies.

    Each of walk, iglob and iglob2 is run num_of_exec times via
    timeit.timeit, and the mean time per run (in seconds) is printed on
    its own line, in that order.
    """
    num_of_exec = 10
    for candidate in (walk, iglob, iglob2):
        elapsed = timeit.timeit(candidate, number=num_of_exec)
        print(elapsed / num_of_exec)
 
if __name__ == '__main__':
    main()
$ ./dir_search.py
1.3243966199999704
1.4991581099999167
1.61260242999997
$ time find . -name '*.py' > /dev/null

real    0m0.523s
user    0m0.020s
sys     0m0.077s
  • python/pathlib.1585592573.txt.gz
  • 最終更新: 2020/03/31 03:22
  • by ともやん