差分

このページの2つのバージョン間の差分を表示します。

--- python:pathlib [2020/02/01 20:14] – [参考文献] ともやん
+++ python:pathlib [2020/03/26 09:31] – ともやん
@@ 行 1: / 行 1: @@
-====== pathlib, glob ======
+<html>
+  <style>
+    #mincode pre {
+      /*height: 300px;*/
+      overflow: scroll;
+      overflow-x: hidden;
+      font-size: 10px;
+    }
+    #mincode_long pre {
+      height: 400px;
+      overflow: scroll;
+      overflow-x: hidden;
+      font-size: 10px;
+    }
+    #mintbl table {
+      font-size: 12px;
+    }
+    .dokuwiki .plugin_wrap table {
+      width: auto;
+    }
+    #result pre {
+      /*height: 300px;*/
+      overflow: scroll;
+      overflow-x: hidden;
+      font-size: 10px;
+    }
+  </style>
+</html>
+====== pathlib, glob, os ======
+===== glob.iglob() の実装 =====
+<WRAP prewrap 100% #mincode_long>
+<code python python38/Lib/glob.py>
+"""Filename globbing utility."""
+import os
+import re
+import fnmatch
+import sys
+__all__ = ["glob", "iglob", "escape"]
+def glob(pathname, *, recursive=False):
+    """Return a list of paths matching a pathname pattern.
+    The pattern may contain simple shell-style wildcards a la
+    fnmatch. However, unlike fnmatch, filenames starting with a
+    dot are special cases that are not matched by '*' and '?'
+    patterns.
+    If recursive is true, the pattern '**' will match any files and
+    zero or more directories and subdirectories.
+    """
+    return list(iglob(pathname, recursive=recursive))
+def iglob(pathname, *, recursive=False):
+    """Return an iterator which yields the paths matching a pathname pattern.
+    The pattern may contain simple shell-style wildcards a la
+    fnmatch. However, unlike fnmatch, filenames starting with a
+    dot are special cases that are not matched by '*' and '?'
+    patterns.
+    If recursive is true, the pattern '**' will match any files and
+    zero or more directories and subdirectories.
+    """
+    it = _iglob(pathname, recursive, False)
+    if recursive and _isrecursive(pathname):
+        s = next(it)  # skip empty string
+        assert not s
+    return it
+def _iglob(pathname, recursive, dironly):
+    sys.audit("glob.glob", pathname, recursive)
+    dirname, basename = os.path.split(pathname)
+    if not has_magic(pathname):
+        assert not dironly
+        if basename:
+            if os.path.lexists(pathname):
+                yield pathname
+        else:
+            # Patterns ending with a slash should match only directories
+            if os.path.isdir(dirname):
+                yield pathname
+        return
+    if not dirname:
+        if recursive and _isrecursive(basename):
+            yield from _glob2(dirname, basename, dironly)
+        else:
+            yield from _glob1(dirname, basename, dironly)
+        return
+    # `os.path.split()` returns the argument itself as a dirname if it is a
+    # drive or UNC path.  Prevent an infinite recursion if a drive or UNC path
+    # contains magic characters (i.e. r'\\?\C:').
+    if dirname != pathname and has_magic(dirname):
+        dirs = _iglob(dirname, recursive, True)
+    else:
+        dirs = [dirname]
+    if has_magic(basename):
+        if recursive and _isrecursive(basename):
+            glob_in_dir = _glob2
+        else:
+            glob_in_dir = _glob1
+    else:
+        glob_in_dir = _glob0
+    for dirname in dirs:
+        for name in glob_in_dir(dirname, basename, dironly):
+            yield os.path.join(dirname, name)
+# These 2 helper functions non-recursively glob inside a literal directory.
+# They return a list of basenames.  _glob1 accepts a pattern while _glob0
+# takes a literal basename (so it only has to check for its existence).
+def _glob1(dirname, pattern, dironly):
+    names = list(_iterdir(dirname, dironly))
+    if not _ishidden(pattern):
+        names = (x for x in names if not _ishidden(x))
+    return fnmatch.filter(names, pattern)
+def _glob0(dirname, basename, dironly):
+    if not basename:
+        # `os.path.split()` returns an empty basename for paths ending with a
+        # directory separator.  'q*x/' should match only directories.
+        if os.path.isdir(dirname):
+            return [basename]
+    else:
+        if os.path.lexists(os.path.join(dirname, basename)):
+            return [basename]
+    return []
+# Following functions are not public but can be used by third-party code.
+def glob0(dirname, pattern):
+    return _glob0(dirname, pattern, False)
+def glob1(dirname, pattern):
+    return _glob1(dirname, pattern, False)
+# This helper function recursively yields relative pathnames inside a literal
+# directory.
+def _glob2(dirname, pattern, dironly):
+    assert _isrecursive(pattern)
+    yield pattern[:0]
+    yield from _rlistdir(dirname, dironly)
+# If dironly is false, yields all file names inside a directory.
+# If dironly is true, yields only directory names.
+def _iterdir(dirname, dironly):
+    if not dirname:
+        if isinstance(dirname, bytes):
+            dirname = bytes(os.curdir, 'ASCII')
+        else:
+            dirname = os.curdir
+    try:
+        with os.scandir(dirname) as it:
+            for entry in it:
+                try:
+                    if not dironly or entry.is_dir():
+                        yield entry.name
+                except OSError:
+                    pass
+    except OSError:
+        return
+# Recursively yields relative pathnames inside a literal directory.
+def _rlistdir(dirname, dironly):
+    names = list(_iterdir(dirname, dironly))
+    for x in names:
+        if not _ishidden(x):
+            yield x
+            path = os.path.join(dirname, x) if dirname else x
+            for y in _rlistdir(path, dironly):
+                yield os.path.join(x, y)
+magic_check = re.compile('([*?[])')
+magic_check_bytes = re.compile(b'([*?[])')
+def has_magic(s):
+    if isinstance(s, bytes):
+        match = magic_check_bytes.search(s)
+    else:
+        match = magic_check.search(s)
+    return match is not None
+def _ishidden(path):
+    return path[0] in ('.', b'.'[0])
+def _isrecursive(pattern):
+    if isinstance(pattern, bytes):
+        return pattern == b'**'
+    else:
+        return pattern == '**'
+def escape(pathname):
+    """Escape all special characters.
+    """
+    # Escaping is done by wrapping any of "*?[" between square brackets.
+    # Metacharacters do not work in the drive part and shouldn't be escaped.
+    drive, pathname = os.path.splitdrive(pathname)
+    if isinstance(pathname, bytes):
+        pathname = magic_check_bytes.sub(br'[\1]', pathname)
+    else:
+        pathname = magic_check.sub(r'[\1]', pathname)
+    return drive + pathname
+</code>
+</WRAP>
+===== パフォーマンス比較 =====
+<WRAP prewrap 100% #mincode>
+<code python dir_search.py>
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import os
+import glob
+import timeit
+def walk():
+    #result = [[file for file in files if file.endswith('.py')] for path, dirs, files in os.walk('.')]
+    result = []
+    for path, dirs, files in os.walk('.'):
+        for file in files:
+            if file.endswith('.py'):
+                result.append(file)
+    return result
+def iglob():
+    result = []
+    for file in glob.iglob('**/*', recursive=True):
+        if file.endswith('.py'):
+            result.append(file)
+    return result
+def iglob2():
+    result = []
+    for file in glob.iglob('**/*.py', recursive=True):
+        result.append(file)
+    return result
+def main():
+    # os.chdir('./')
+    num_of_exec = 10
+    print(timeit.timeit(walk, number=num_of_exec) / num_of_exec)
+    print(timeit.timeit(iglob, number=num_of_exec) / num_of_exec)
+    print(timeit.timeit(iglob2, number=num_of_exec) / num_of_exec)
+if __name__ == '__main__':
+    main()
+</code>
+</WRAP>
+<WRAP prewrap 100% #result>
+<code>
+$ ./dir_search.py
+0.3243966199999704
+0.4991581099999167
+0.61260242999997
+$ time find . -name '*.py' > /dev/null
+real    0m0.523s
+user    0m0.020s
+sys     0m0.077s
+</code>
+</WRAP>
 ===== 参考文献 =====
 [[https://docs.python.org/ja/3/library/glob.html|glob --- Unix 形式のパス名のパターン展開 — Python 3.8.1 ドキュメント]]\\
 [[https://stackoverflow.com/questions/20638040/glob-exclude-pattern|python - glob exclude pattern - Stack Overflow]]\\
-[[https://stackoverflow.com/questions/20638040/glob-exclude-pattern/36295481|python - glob exclude pattern - Stack Overflow]]\\
 [[https://github.com/python/cpython/blob/master/Lib/pathlib.py|cpython/pathlib.py at master · python/cpython]]\\
+[[https://stackoverflow.com/questions/50948391/whats-the-fastest-way-to-recursively-search-for-files-in-python/50950952|What's the fastest way to recursively search for files in python? - Stack Overflow]]\\
+[[https://qiita.com/amowwee/items/e63b3610ea750f7dba1b|Pythonでフォルダ内のファイルリストを取得する - Qiita]]\\