差分

このページの2つのバージョン間の差分を表示します。

--- python:pathlib [2020/02/01 20:15] – [参考文献] ともやん
+++ python:pathlib [2020/04/04 07:31] – ともやん
@@ 行 1: / 行 1: @@
-====== pathlib, glob ======
+<html>
+  <style>
+    #result pre, #mincode pre {
+      /*height: 300px;*/
+      overflow: scroll;
+      overflow-x: hidden;
+      font-size: 10px;
+    }
+    #mincode_long pre {
+      height: 400px;
+      overflow: scroll;
+      overflow-x: hidden;
+      font-size: 10px;
+    }
+    #mintbl table {
+      font-size: 12px;
+    }
+    .dokuwiki .plugin_wrap table {
+      width: auto;
+    }
+  </style>
+</html>
+====== pathlib, glob, os ======
+===== os.walk() の実装 =====
+**os.walk(top, topdown=True, onerror=None, followlinks=False)** は、内部的には **os.scandir(path='.')** によって処理される。\\
+===== glob.iglob() の実装 =====
+**glob.iglob(pathname, *, recursive=False)** は、内部的には **os.scandir(path='.')** によって処理される。\\
+**os.scandir(path='.')** によって取得された内容は **list()** 化されて **fnmatch.filter(names, pattern)** によってフィルター処理される。\\
+**fnmatch.filter(names, pattern)** の **pattern** は **fnmatch.translate(pattern)** で正規表現に変換してから **re.compile()** される。ここで注意が必要なのは、 **pattern** は **__Unix Shell Style__** のパターンであって、正規表現は利用できないという点である。\\
+\\
+**OK パターン**\\
+<code python>
+>>> import fnmatch
+>>> fnmatch.translate('*.txt')
+'(?s:.*\\.txt)\\Z'
+</code>
+**NG パターン** (正規表現のグループ化 ''()'' や選択 ''|'' はエスケープされ、単なる文字として扱われる)\\
+<code python>
+>>> import fnmatch
+>>> fnmatch.translate('*.(bat|com|exe)')
+'(?s:.*\\.\\(bat\\|com\\|exe\\))\\Z'
+</code>
+<WRAP prewrap 100% #mincode_long>
+<code python python38/Lib/glob.py>
+"""Filename globbing utility."""
+import os
+import re
+import fnmatch
+import sys
+__all__ = ["glob", "iglob", "escape"]
+def glob(pathname, *, recursive=False):
+    """Return a list of paths matching a pathname pattern.
+    The pattern may contain simple shell-style wildcards a la
+    fnmatch. However, unlike fnmatch, filenames starting with a
+    dot are special cases that are not matched by '*' and '?'
+    patterns.
+    If recursive is true, the pattern '**' will match any files and
+    zero or more directories and subdirectories.
+    """
+    return list(iglob(pathname, recursive=recursive))
+def iglob(pathname, *, recursive=False):
+    """Return an iterator which yields the paths matching a pathname pattern.
+    The pattern may contain simple shell-style wildcards a la
+    fnmatch. However, unlike fnmatch, filenames starting with a
+    dot are special cases that are not matched by '*' and '?'
+    patterns.
+    If recursive is true, the pattern '**' will match any files and
+    zero or more directories and subdirectories.
+    """
+    it = _iglob(pathname, recursive, False)
+    if recursive and _isrecursive(pathname):
+        s = next(it)  # skip empty string
+        assert not s
+    return it
+def _iglob(pathname, recursive, dironly):
+    sys.audit("glob.glob", pathname, recursive)
+    dirname, basename = os.path.split(pathname)
+    if not has_magic(pathname):
+        assert not dironly
+        if basename:
+            if os.path.lexists(pathname):
+                yield pathname
+        else:
+            # Patterns ending with a slash should match only directories
+            if os.path.isdir(dirname):
+                yield pathname
+        return
+    if not dirname:
+        if recursive and _isrecursive(basename):
+            yield from _glob2(dirname, basename, dironly)
+        else:
+            yield from _glob1(dirname, basename, dironly)
+        return
+    # `os.path.split()` returns the argument itself as a dirname if it is a
+    # drive or UNC path.  Prevent an infinite recursion if a drive or UNC path
+    # contains magic characters (i.e. r'\\?\C:').
+    if dirname != pathname and has_magic(dirname):
+        dirs = _iglob(dirname, recursive, True)
+    else:
+        dirs = [dirname]
+    if has_magic(basename):
+        if recursive and _isrecursive(basename):
+            glob_in_dir = _glob2
+        else:
+            glob_in_dir = _glob1
+    else:
+        glob_in_dir = _glob0
+    for dirname in dirs:
+        for name in glob_in_dir(dirname, basename, dironly):
+            yield os.path.join(dirname, name)
+# These 2 helper functions non-recursively glob inside a literal directory.
+# They return a list of basenames.  _glob1 accepts a pattern while _glob0
+# takes a literal basename (so it only has to check for its existence).
+def _glob1(dirname, pattern, dironly):
+    names = list(_iterdir(dirname, dironly))
+    if not _ishidden(pattern):
+        names = (x for x in names if not _ishidden(x))
+    return fnmatch.filter(names, pattern)
+def _glob0(dirname, basename, dironly):
+    if not basename:
+        # `os.path.split()` returns an empty basename for paths ending with a
+        # directory separator.  'q*x/' should match only directories.
+        if os.path.isdir(dirname):
+            return [basename]
+    else:
+        if os.path.lexists(os.path.join(dirname, basename)):
+            return [basename]
+    return []
+# Following functions are not public but can be used by third-party code.
+def glob0(dirname, pattern):
+    return _glob0(dirname, pattern, False)
+def glob1(dirname, pattern):
+    return _glob1(dirname, pattern, False)
+# This helper function recursively yields relative pathnames inside a literal
+# directory.
+def _glob2(dirname, pattern, dironly):
+    assert _isrecursive(pattern)
+    yield pattern[:0]
+    yield from _rlistdir(dirname, dironly)
+# If dironly is false, yields all file names inside a directory.
+# If dironly is true, yields only directory names.
+def _iterdir(dirname, dironly):
+    if not dirname:
+        if isinstance(dirname, bytes):
+            dirname = bytes(os.curdir, 'ASCII')
+        else:
+            dirname = os.curdir
+    try:
+        with os.scandir(dirname) as it:
+            for entry in it:
+                try:
+                    if not dironly or entry.is_dir():
+                        yield entry.name
+                except OSError:
+                    pass
+    except OSError:
+        return
+# Recursively yields relative pathnames inside a literal directory.
+def _rlistdir(dirname, dironly):
+    names = list(_iterdir(dirname, dironly))
+    for x in names:
+        if not _ishidden(x):
+            yield x
+            path = os.path.join(dirname, x) if dirname else x
+            for y in _rlistdir(path, dironly):
+                yield os.path.join(x, y)
+magic_check = re.compile('([*?[])')
+magic_check_bytes = re.compile(b'([*?[])')
+def has_magic(s):
+    if isinstance(s, bytes):
+        match = magic_check_bytes.search(s)
+    else:
+        match = magic_check.search(s)
+    return match is not None
+def _ishidden(path):
+    return path[0] in ('.', b'.'[0])
+def _isrecursive(pattern):
+    if isinstance(pattern, bytes):
+        return pattern == b'**'
+    else:
+        return pattern == '**'
+def escape(pathname):
+    """Escape all special characters.
+    """
+    # Escaping is done by wrapping any of "*?[" between square brackets.
+    # Metacharacters do not work in the drive part and shouldn't be escaped.
+    drive, pathname = os.path.splitdrive(pathname)
+    if isinstance(pathname, bytes):
+        pathname = magic_check_bytes.sub(br'[\1]', pathname)
+    else:
+        pathname = magic_check.sub(r'[\1]', pathname)
+    return drive + pathname
+</code>
+</WRAP>
+===== os.scandir() の実装 =====
+[[https://docs.python.org/3/library/os.html#os.scandir|os — Miscellaneous operating system interfaces — Python 3.8.2 documentation]]\\
+<note>
+Note: On Unix-based systems, scandir() uses the system’s opendir() and readdir() functions. On Windows, it uses the Win32 FindFirstFileW and FindNextFileW functions.\\
+\\
+ノート(翻訳): Unix ベースのシステムでは、scandir() はシステムの opendir() 関数と readdir() 関数を使用します。 Windows では、Win32 FindFirstFileW 関数と FindNextFileW 関数を使用します。
+</note>
+<WRAP prewrap 100% #mincode_long>
+<code c cpython/Modules/posixmodule.c>
+/*[clinic input]
+os.scandir
+    path : path_t(nullable=True, allow_fd='PATH_HAVE_FDOPENDIR') = None
+Return an iterator of DirEntry objects for given path.
+path can be specified as either str, bytes, or a path-like object.  If path
+is bytes, the names of yielded DirEntry objects will also be bytes; in
+all other circumstances they will be str.
+If path is None, uses the path='.'.
+[clinic start generated code]*/
+static PyObject *
+os_scandir_impl(PyObject *module, path_t *path)
+/*[clinic end generated code: output=6eb2668b675ca89e input=6bdd312708fc3bb0]*/
+{
+    ScandirIterator *iterator;
+#ifdef MS_WINDOWS
+    wchar_t *path_strW;
+#else
+    const char *path_str;
+#ifdef HAVE_FDOPENDIR
+    int fd = -1;
+#endif
+#endif
+    if (PySys_Audit("os.scandir", "O",
+                    path->object ? path->object : Py_None) < 0) {
+        return NULL;
+    }
+    PyObject *ScandirIteratorType = get_posix_state(module)->ScandirIteratorType;
+    iterator = PyObject_New(ScandirIterator, (PyTypeObject *)ScandirIteratorType);
+    if (!iterator)
+        return NULL;
+#ifdef MS_WINDOWS
+    iterator->handle = INVALID_HANDLE_VALUE;
+#else
+    iterator->dirp = NULL;
+#endif
+    memcpy(&iterator->path, path, sizeof(path_t));
+    /* Move the ownership to iterator->path */
+    path->object = NULL;
+    path->cleanup = NULL;
+#ifdef MS_WINDOWS
+    iterator->first_time = 1;
+    path_strW = join_path_filenameW(iterator->path.wide, L"*.*");
+    if (!path_strW)
+        goto error;
+    Py_BEGIN_ALLOW_THREADS
+    iterator->handle = FindFirstFileW(path_strW, &iterator->file_data);
+    Py_END_ALLOW_THREADS
+    PyMem_Free(path_strW);
+    if (iterator->handle == INVALID_HANDLE_VALUE) {
+        path_error(&iterator->path);
+        goto error;
+    }
+#else /* POSIX */
+    errno = 0;
+#ifdef HAVE_FDOPENDIR
+    if (path->fd != -1) {
+        /* closedir() closes the FD, so we duplicate it */
+        fd = _Py_dup(path->fd);
+        if (fd == -1)
+            goto error;
+        Py_BEGIN_ALLOW_THREADS
+        iterator->dirp = fdopendir(fd);
+        Py_END_ALLOW_THREADS
+    }
+    else
+#endif
+    {
+        if (iterator->path.narrow)
+            path_str = iterator->path.narrow;
+        else
+            path_str = ".";
+        Py_BEGIN_ALLOW_THREADS
+        iterator->dirp = opendir(path_str);
+        Py_END_ALLOW_THREADS
+    }
+    if (!iterator->dirp) {
+        path_error(&iterator->path);
+#ifdef HAVE_FDOPENDIR
+        if (fd != -1) {
+            Py_BEGIN_ALLOW_THREADS
+            close(fd);
+            Py_END_ALLOW_THREADS
+        }
+#endif
+        goto error;
+    }
+#endif
+    return (PyObject *)iterator;
+error:
+    Py_DECREF(iterator);
+    return NULL;
+}
+</code>
+</WRAP>
+[[https://github.com/python/cpython/blob/master/Modules/posixmodule.c|cpython/Modules/posixmodule.c - github.com]] より\\
+===== パフォーマンス比較 =====
+<WRAP prewrap 100% #mincode>
+<code python dir_search.py>
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import os
+import glob
+import timeit
+def walk():
+    #result = [[file for file in files if file.endswith('.py')] for path, dirs, files in os.walk('.')]
+    result = []
+    for path, dirs, files in os.walk('.'):
+        for file in files:
+            if file.endswith('.py'):
+                result.append(file)
+    return result
+def iglob():
+    result = []
+    for file in glob.iglob('**/*', recursive=True):
+        if file.endswith('.py'):
+            result.append(file)
+    return result
+def iglob2():
+    result = []
+    for file in glob.iglob('**/*.py', recursive=True):
+        result.append(file)
+    return result
+def main():
+    # os.chdir('./')
+    num_of_exec = 10
+    print(timeit.timeit(walk, number=num_of_exec) / num_of_exec)
+    print(timeit.timeit(iglob, number=num_of_exec) / num_of_exec)
+    print(timeit.timeit(iglob2, number=num_of_exec) / num_of_exec)
+if __name__ == '__main__':
+    main()
+</code>
+</WRAP>
+<WRAP prewrap 100% #result>
+<code>
+$ ./dir_search.py
+0.3243966199999704
+0.4991581099999167
+0.61260242999997
+$ time find . -name '*.py' > /dev/null
+real    0m0.523s
+user    0m0.020s
+sys     0m0.077s
+</code>
+</WRAP>
 ===== 参考文献 =====
-[[https://docs.python.org/ja/3/library/glob.html|glob --- Unix 形式のパス名のパターン展開 — Python 3.8.1 ドキュメント]]\\
+[[https://docs.python.org/ja/3/library/glob.html|glob --- Unix 形式のパス名のパターン展開 — Python ドキュメント]]\\
+[[https://stackoverflow.com/questions/51167093/how-can-i-find-the-source-code-of-os-scandir-in-python3|python - How can I find the source code of os.scandir in Python3? - Stack Overflow]]\\
 [[https://stackoverflow.com/questions/20638040/glob-exclude-pattern|python - glob exclude pattern - Stack Overflow]]\\
 [[https://github.com/python/cpython/blob/master/Lib/pathlib.py|cpython/pathlib.py at master · python/cpython]]\\
+[[https://stackoverflow.com/questions/50948391/whats-the-fastest-way-to-recursively-search-for-files-in-python/50950952|What's the fastest way to recursively search for files in python? - Stack Overflow]]\\
+[[https://qiita.com/amowwee/items/e63b3610ea750f7dba1b|Pythonでフォルダ内のファイルリストを取得する - Qiita]]\\
+[[https://stackoverflow.com/questions/24812253/how-can-i-capture-return-value-with-python-timeit-module|How can I capture return value with Python timeit module? - Stack Overflow]]\\
+[[https://living-sun.com/ja/python/705207-quicker-to-oswalk-or-glob-python-traversal-glob-oswalk-directory-walk.html|os.walkやglobにすばやく - Python、トラバーサル、グロブ、os.walk、ディレクトリウォーク]]\\
+[[https://note.dokeep.jp/post/csharp-fast-enumerate-file/|[C#] 高速でファイルとフォルダを列挙する - ざこノート]]\\