python:pathlib

文書の過去の版を表示しています。


pathlib, glob, os

glob.iglob(pathname, *, recursive=False) は、内部的には os.scandir(path='.') によって処理される。

glob.iglob(pathname, *, recursive=False) は、内部的には os.scandir(path='.') によって処理される。
os.scandir(path='.') によって取得された内容は list() 化されて fnmatch.filter(names, pattern) によってフィルター処理される。
fnmatch.filter(names, pattern) の pattern は、 fnmatch.translate(pattern) で正規表現に変換してから re.compile() される。ここで注意が必要なのは、 pattern は Unix Shell Style のパターンであって、正規表現は利用できないという点である。

OK パターン

>>> import fnmatch
>>> fnmatch.translate('*.txt')
'(?s:.*\\.txt)\\Z'

NG パターン

>>> import fnmatch
>>> fnmatch.translate('*.(bat|com|exe)')
'(?s:.*\\.\\(bat\\|com\\|exe\\))\\Z'
python38/Lib/glob.py
"""Filename globbing utility."""
 
import os
import re
import fnmatch
import sys
 
# Public API of this module: the names bound by ``from glob import *``.
__all__ = ["glob", "iglob", "escape"]
 
def glob(pathname, *, recursive=False):
    """Return a list of the paths that match a pathname pattern.

    This is the eager counterpart of iglob(): the iterator it returns is
    drained into a list.

    The pattern uses simple shell-style wildcards as understood by
    fnmatch, with one difference: a filename that begins with a dot is
    not matched by the '*' and '?' wildcards.

    When recursive is true, the pattern '**' matches any files and zero
    or more directories and subdirectories.
    """
    matches = iglob(pathname, recursive=recursive)
    return list(matches)
 
def iglob(pathname, *, recursive=False):
    """Return an iterator over the paths that match a pathname pattern.

    The pattern uses simple shell-style wildcards as understood by
    fnmatch, with one difference: a filename that begins with a dot is
    not matched by the '*' and '?' wildcards.

    When recursive is true, the pattern '**' matches any files and zero
    or more directories and subdirectories.
    """
    matches = _iglob(pathname, recursive, False)
    if not (recursive and _isrecursive(pathname)):
        return matches
    # For a bare '**' pattern _glob2 yields an empty string first; drop
    # it so callers never see an empty path.
    leading = next(matches)
    assert not leading
    return matches
 
def _iglob(pathname, recursive, dironly):
    """Generate the paths matching *pathname* (the worker behind iglob).

    pathname:  the pattern; it is split into a directory part and a final
               component with os.path.split().
    recursive: when true, a component that is exactly '**' is expanded
               with _glob2 instead of _glob1.
    dironly:   forwarded unchanged to the _glob0/_glob1/_glob2 helper that
               handles the final component.  The recursive call that
               expands a wildcard directory part always passes True.

    The body runs lazily, as for any generator.  Every invocation,
    including the recursive ones, raises the "glob.glob" audit event when
    it starts executing.
    """
    sys.audit("glob.glob", pathname, recursive)
    dirname, basename = os.path.split(pathname)
    # No wildcard anywhere in the pattern: only check that the literal
    # path exists, then stop.
    if not has_magic(pathname):
        assert not dironly
        if basename:
            if os.path.lexists(pathname):
                yield pathname
        else:
            # Patterns ending with a slash should match only directories
            if os.path.isdir(dirname):
                yield pathname
        return
    # The pattern has no directory part: match it with an empty dirname
    # (which _iterdir replaces with os.curdir) and yield bare names.
    if not dirname:
        if recursive and _isrecursive(basename):
            yield from _glob2(dirname, basename, dironly)
        else:
            yield from _glob1(dirname, basename, dironly)
        return
    # `os.path.split()` returns the argument itself as a dirname if it is a
    # drive or UNC path.  Prevent an infinite recursion if a drive or UNC path
    # contains magic characters (i.e. r'\\?\C:').
    if dirname != pathname and has_magic(dirname):
        # Wildcards in the directory part: expand it first, directories only.
        dirs = _iglob(dirname, recursive, True)
    else:
        dirs = [dirname]
    # Choose the helper for the final component: _glob2 for '**' in
    # recursive mode, _glob1 for any other wildcard, _glob0 for a literal.
    if has_magic(basename):
        if recursive and _isrecursive(basename):
            glob_in_dir = _glob2
        else:
            glob_in_dir = _glob1
    else:
        glob_in_dir = _glob0
    # Combine every candidate directory with every name matched inside it.
    for dirname in dirs:
        for name in glob_in_dir(dirname, basename, dironly):
            yield os.path.join(dirname, name)
 
# These 2 helper functions non-recursively glob inside a literal directory.
# They return a list of basenames.  _glob1 accepts a pattern while _glob0
# takes a literal basename (so it only has to check for its existence).
 
def _glob1(dirname, pattern, dironly):
    """Return the entries of *dirname* whose names match *pattern*.

    The directory listing comes from _iterdir() and is filtered with
    fnmatch.filter().  Entries whose name starts with a dot are dropped
    first, unless the pattern itself starts with a dot.
    """
    entries = list(_iterdir(dirname, dironly))
    if _ishidden(pattern):
        candidates = entries
    else:
        candidates = (entry for entry in entries if not _ishidden(entry))
    return fnmatch.filter(candidates, pattern)
 
def _glob0(dirname, basename, dironly):
    """Return [basename] if the literal entry exists in dirname, else [].

    `os.path.split()` produces an empty basename for a path that ends
    with a directory separator, so an empty basename means "dirname must
    itself be a directory" ('q*x/' should match only directories).
    A non-empty basename only has to exist (os.path.lexists) inside
    dirname.  The dironly argument is accepted but not used here.
    """
    if basename:
        found = os.path.lexists(os.path.join(dirname, basename))
    else:
        found = os.path.isdir(dirname)
    return [basename] if found else []
 
# Following functions are not public but can be used by third-party code.
 
def glob0(dirname, pattern):
    """Call _glob0() for a literal name, with dironly switched off."""
    return _glob0(dirname, pattern, dironly=False)
 
def glob1(dirname, pattern):
    """Call _glob1() for a wildcard pattern, with dironly switched off."""
    return _glob1(dirname, pattern, dironly=False)
 
# This helper function recursively yields relative pathnames inside a literal
# directory.
 
def _glob2(dirname, pattern, dironly):
    """Yield an empty name, then every relative path below dirname.

    *pattern* must be the recursive wildcard (checked by assertion).  The
    empty name is sliced from the pattern so that it has the pattern's
    own type (str or bytes).
    """
    assert _isrecursive(pattern)
    empty_name = pattern[:0]
    yield empty_name
    yield from _rlistdir(dirname, dironly)
 
def _iterdir(dirname, dironly):
    """Yield the entry names found directly inside a directory.

    If dironly is false, every entry name is yielded; if it is true, only
    the names of entries that are directories are yielded.  An empty
    dirname stands for the current directory, keeping the str/bytes type
    of the argument.  OSError raised while scanning is suppressed: the
    affected entry is skipped, or the iteration simply ends.
    """
    if not dirname:
        dirname = (bytes(os.curdir, 'ASCII')
                   if isinstance(dirname, bytes) else os.curdir)
    try:
        with os.scandir(dirname) as scanner:
            for item in scanner:
                try:
                    if dironly and not item.is_dir():
                        continue
                    yield item.name
                except OSError:
                    pass
    except OSError:
        return
 
def _rlistdir(dirname, dironly):
    """Recursively yield relative pathnames inside a literal directory.

    Names starting with a dot are skipped and not descended into.  Each
    visible name is yielded first, followed by the paths found beneath
    it, expressed relative to dirname.
    """
    for name in list(_iterdir(dirname, dironly)):
        if _ishidden(name):
            continue
        yield name
        # An empty dirname means the current directory; do not join onto it.
        subdir = os.path.join(dirname, name) if dirname else name
        for nested in _rlistdir(subdir, dironly):
            yield os.path.join(name, nested)
 
 
# Patterns matching a single glob metacharacter ('*', '?' or '['), for str
# and for bytes respectively.  The character is captured as group 1.
magic_check = re.compile('([*?[])')
magic_check_bytes = re.compile(b'([*?[])')

def has_magic(s):
    """Return True if s (str or bytes) contains '*', '?' or '['."""
    checker = magic_check_bytes if isinstance(s, bytes) else magic_check
    return checker.search(s) is not None
 
def _ishidden(path):
    """Return True if the first element of path is a dot.

    Indexing a str gives a one-character string while indexing bytes
    gives an int, so both forms of the dot are accepted.
    """
    dot_markers = ('.', b'.'[0])
    leading = path[0]
    return leading in dot_markers
 
def _isrecursive(pattern):
    """Return whether pattern is exactly the recursive wildcard '**'.

    The comparison is made against a literal of the pattern's own kind
    (bytes or str).
    """
    wildcard = b'**' if isinstance(pattern, bytes) else '**'
    return pattern == wildcard
 
def escape(pathname):
    """Escape all special characters.

    Each of '*', '?' and '[' is wrapped in square brackets.  The drive
    part returned by os.path.splitdrive() is left untouched, because
    metacharacters do not work there and shouldn't be escaped.
    """
    drive, rest = os.path.splitdrive(pathname)
    if isinstance(rest, bytes):
        escaped = magic_check_bytes.sub(br'[\1]', rest)
    else:
        escaped = magic_check.sub(r'[\1]', rest)
    return drive + escaped
dir_search.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import glob
import timeit
 
def walk():
    """Return the names of all '.py' files below the current directory.

    The tree is traversed with os.walk('.').  Only bare file names are
    collected (no directory component), in traversal order.
    """
    result = []
    for _path, _dirs, files in os.walk('.'):
        result.extend(name for name in files if name.endswith('.py'))
    return result
 
def iglob():
    """Return the paths ending in '.py' below the current directory.

    Every path is enumerated with glob.iglob('**/*', recursive=True) and
    the '.py' suffix filter is applied in Python afterwards.
    """
    return [
        file
        for file in glob.iglob('**/*', recursive=True)
        if file.endswith('.py')
    ]
 
def iglob2():
    """Return the paths matching '**/*.py' below the current directory.

    Unlike iglob(), the '.py' filter is part of the glob pattern itself,
    so the matching is done entirely by glob.iglob().
    """
    return list(glob.iglob('**/*.py', recursive=True))
 
def main():
    """Print the mean time per call of walk, iglob and iglob2.

    Each function is timed with timeit over num_of_exec runs, and the
    total is divided by the run count, one result per line.
    """
    num_of_exec = 10
    for search in (walk, iglob, iglob2):
        elapsed = timeit.timeit(search, number=num_of_exec)
        print(elapsed / num_of_exec)
 
# Run the benchmark only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
    main()
$ ./dir_search.py
1.3243966199999704
1.4991581099999167
1.61260242999997
$ time find . -name '*.py' > /dev/null

real    0m0.523s
user    0m0.020s
sys     0m0.077s
  • python/pathlib.1585537464.txt.gz
  • 最終更新: 2020/03/30 12:04
  • by ともやん