差分
このページの2つのバージョン間の差分を表示します。
| 両方とも前のリビジョン 前のリビジョン 次のリビジョン | 前のリビジョン | ||
| python:pathlib [2020/02/01 20:15] – [参考文献] ともやん | python:pathlib [2023/05/27 09:00] (現在) – [glob.iglob() の実装] ともやん | ||
|---|---|---|---|
| 行 1: | 行 1: | ||
| - | ====== pathlib, glob ====== | + | ====== pathlib, glob.iglob(), os.walk() |
| + | |||
| + | ===== os.walk() の実装 ===== | ||
| + | < | ||
| + | |||
| + | ===== glob.iglob() の実装 ===== | ||
| + | < | ||
| + | < | ||
| + | ここで注意が必要なのは、 **pattern** は **__Unix Shell Style__** のパターンであり、正規表現は利用できないという点である。\\ | ||
| + | < | ||
| + | \\ | ||
| + | **OK パターン**\\ | ||
| + | <code python> | ||
| + | >>> | ||
| + | >>> | ||
| + | ' | ||
| + | </ | ||
| + | **NG パターン**\\ | ||
| + | <code python> | ||
| + | >>> | ||
| + | >>> | ||
| + | ' | ||
| + | </ | ||
| + | |||
| + | <WRAP mincode_long> | ||
| + | <code python python38/ | ||
| + | """ | ||
| + | |||
| + | import os | ||
| + | import re | ||
| + | import fnmatch | ||
| + | import sys | ||
| + | |||
| + | __all__ = [" | ||
| + | |||
| + | def glob(pathname, | ||
| + | """ | ||
| + | |||
| + | The pattern may contain simple shell-style wildcards a la | ||
| + | fnmatch. However, unlike fnmatch, filenames starting with a | ||
| + | dot are special cases that are not matched by '*' and '?' | ||
| + | patterns. | ||
| + | |||
| + | If recursive is true, the pattern '**' will match any files and | ||
| + | zero or more directories and subdirectories. | ||
| + | """ | ||
| + | return list(iglob(pathname, | ||
| + | |||
| + | def iglob(pathname, | ||
| + | """ | ||
| + | |||
| + | The pattern may contain simple shell-style wildcards a la | ||
| + | fnmatch. However, unlike fnmatch, filenames starting with a | ||
| + | dot are special cases that are not matched by '*' and '?' | ||
| + | patterns. | ||
| + | |||
| + | If recursive is true, the pattern '**' will match any files and | ||
| + | zero or more directories and subdirectories. | ||
| + | """ | ||
| + | it = _iglob(pathname, | ||
| + | if recursive and _isrecursive(pathname): | ||
| + | s = next(it) | ||
| + | assert not s | ||
| + | return it | ||
| + | |||
| + | def _iglob(pathname, | ||
| + | sys.audit(" | ||
| + | dirname, basename = os.path.split(pathname) | ||
| + | if not has_magic(pathname): | ||
| + | assert not dironly | ||
| + | if basename: | ||
| + | if os.path.lexists(pathname): | ||
| + | yield pathname | ||
| + | else: | ||
| + | # Patterns ending with a slash should match only directories | ||
| + | if os.path.isdir(dirname): | ||
| + | yield pathname | ||
| + | return | ||
| + | if not dirname: | ||
| + | if recursive and _isrecursive(basename): | ||
| + | yield from _glob2(dirname, | ||
| + | else: | ||
| + | yield from _glob1(dirname, | ||
| + | return | ||
| + | # `os.path.split()` returns the argument itself as a dirname if it is a | ||
| + | # drive or UNC path. Prevent an infinite recursion if a drive or UNC path | ||
| + | # contains magic characters (i.e. r'\\?\C:'). | ||
| + | if dirname != pathname and has_magic(dirname): | ||
| + | dirs = _iglob(dirname, | ||
| + | else: | ||
| + | dirs = [dirname] | ||
| + | if has_magic(basename): | ||
| + | if recursive and _isrecursive(basename): | ||
| + | glob_in_dir = _glob2 | ||
| + | else: | ||
| + | glob_in_dir = _glob1 | ||
| + | else: | ||
| + | glob_in_dir = _glob0 | ||
| + | for dirname in dirs: | ||
| + | for name in glob_in_dir(dirname, | ||
| + | yield os.path.join(dirname, | ||
| + | |||
| + | # These 2 helper functions non-recursively glob inside a literal directory. | ||
| + | # They return a list of basenames. _glob1 accepts a pattern while _glob0 | ||
| + | # takes a literal basename (so it only has to check for its existence). | ||
| + | |||
| + | def _glob1(dirname, | ||
| + | names = list(_iterdir(dirname, | ||
| + | if not _ishidden(pattern): | ||
| + | names = (x for x in names if not _ishidden(x)) | ||
| + | return fnmatch.filter(names, | ||
| + | |||
| + | def _glob0(dirname, | ||
| + | if not basename: | ||
| + | # `os.path.split()` returns an empty basename for paths ending with a | ||
| + | # directory separator. | ||
| + | if os.path.isdir(dirname): | ||
| + | return [basename] | ||
| + | else: | ||
| + | if os.path.lexists(os.path.join(dirname, | ||
| + | return [basename] | ||
| + | return [] | ||
| + | |||
| + | # Following functions are not public but can be used by third-party code. | ||
| + | |||
| + | def glob0(dirname, | ||
| + | return _glob0(dirname, | ||
| + | |||
| + | def glob1(dirname, | ||
| + | return _glob1(dirname, | ||
| + | |||
| + | # This helper function recursively yields relative pathnames inside a literal | ||
| + | # directory. | ||
| + | |||
| + | def _glob2(dirname, | ||
| + | assert _isrecursive(pattern) | ||
| + | yield pattern[: | ||
| + | yield from _rlistdir(dirname, | ||
| + | |||
| + | # If dironly is false, yields all file names inside a directory. | ||
| + | # If dironly is true, yields only directory names. | ||
| + | def _iterdir(dirname, | ||
| + | if not dirname: | ||
| + | if isinstance(dirname, | ||
| + | dirname = bytes(os.curdir, | ||
| + | else: | ||
| + | dirname = os.curdir | ||
| + | try: | ||
| + | with os.scandir(dirname) as it: | ||
| + | for entry in it: | ||
| + | try: | ||
| + | if not dironly or entry.is_dir(): | ||
| + | yield entry.name | ||
| + | except OSError: | ||
| + | pass | ||
| + | except OSError: | ||
| + | return | ||
| + | |||
| + | # Recursively yields relative pathnames inside a literal directory. | ||
| + | def _rlistdir(dirname, | ||
| + | names = list(_iterdir(dirname, | ||
| + | for x in names: | ||
| + | if not _ishidden(x): | ||
| + | yield x | ||
| + | path = os.path.join(dirname, | ||
| + | for y in _rlistdir(path, | ||
| + | yield os.path.join(x, | ||
| + | |||
| + | |||
| + | magic_check = re.compile(' | ||
| + | magic_check_bytes = re.compile(b' | ||
| + | |||
| + | def has_magic(s): | ||
| + | if isinstance(s, | ||
| + | match = magic_check_bytes.search(s) | ||
| + | else: | ||
| + | match = magic_check.search(s) | ||
| + | return match is not None | ||
| + | |||
| + | def _ishidden(path): | ||
| + | return path[0] in (' | ||
| + | |||
| + | def _isrecursive(pattern): | ||
| + | if isinstance(pattern, | ||
| + | return pattern == b' | ||
| + | else: | ||
| + | return pattern == ' | ||
| + | |||
| + | def escape(pathname): | ||
| + | """ | ||
| + | """ | ||
| + | # Escaping is done by wrapping any of "*?[" between square brackets. | ||
| + | # Metacharacters do not work in the drive part and shouldn't be escaped. | ||
| + | drive, pathname = os.path.splitdrive(pathname) | ||
| + | if isinstance(pathname, | ||
| + | pathname = magic_check_bytes.sub(br' | ||
| + | else: | ||
| + | pathname = magic_check.sub(r' | ||
| + | return drive + pathname | ||
| + | </ | ||
| + | </ | ||
| + | |||
| + | ===== os.scandir() の実装 ===== | ||
| + | [[https:// | ||
| + | < | ||
| + | Note On Unix-based systems, scandir() uses the system’s opendir() and readdir() functions. On Windows, it uses the Win32 FindFirstFileW and FindNextFileW functions.\\ | ||
| + | \\ | ||
| + | ノート(翻訳) Unix ベースのシステムでは、scandir() はシステムの opendir() 関数と readdir() 関数を使用します。 Windows では、Win32 FindFirstFileW 関数と FindNextFileW 関数を使用します。 | ||
| + | </ | ||
| + | <WRAP mincode_long> | ||
| + | <code c cpython/ | ||
| + | /*[clinic input] | ||
| + | os.scandir | ||
| + | |||
| + | path : path_t(nullable=True, | ||
| + | |||
| + | Return an iterator of DirEntry objects for given path. | ||
| + | |||
| + | path can be specified as either str, bytes, or a path-like object. If path | ||
| + | is bytes, the names of yielded DirEntry objects will also be bytes; in | ||
| + | all other circumstances they will be str. | ||
| + | |||
| + | If path is None, uses the path='.'. | ||
| + | [clinic start generated code]*/ | ||
| + | |||
| + | static PyObject * | ||
| + | os_scandir_impl(PyObject *module, path_t *path) | ||
| + | /*[clinic end generated code: output=6eb2668b675ca89e input=6bdd312708fc3bb0]*/ | ||
| + | { | ||
| + | ScandirIterator *iterator; | ||
| + | #ifdef MS_WINDOWS | ||
| + | wchar_t *path_strW; | ||
| + | #else | ||
| + | const char *path_str; | ||
| + | #ifdef HAVE_FDOPENDIR | ||
| + | int fd = -1; | ||
| + | #endif | ||
| + | #endif | ||
| + | |||
| + | if (PySys_Audit(" | ||
| + | path-> | ||
| + | return NULL; | ||
| + | } | ||
| + | |||
| + | PyObject *ScandirIteratorType = get_posix_state(module)-> | ||
| + | iterator = PyObject_New(ScandirIterator, | ||
| + | if (!iterator) | ||
| + | return NULL; | ||
| + | |||
| + | #ifdef MS_WINDOWS | ||
| + | iterator-> | ||
| + | #else | ||
| + | iterator-> | ||
| + | #endif | ||
| + | |||
| + | memcpy(& | ||
| + | /* Move the ownership to iterator-> | ||
| + | path-> | ||
| + | path-> | ||
| + | |||
| + | #ifdef MS_WINDOWS | ||
| + | iterator-> | ||
| + | |||
| + | path_strW = join_path_filenameW(iterator-> | ||
| + | if (!path_strW) | ||
| + | goto error; | ||
| + | |||
| + | Py_BEGIN_ALLOW_THREADS | ||
| + | iterator-> | ||
| + | Py_END_ALLOW_THREADS | ||
| + | |||
| + | PyMem_Free(path_strW); | ||
| + | |||
| + | if (iterator-> | ||
| + | path_error(& | ||
| + | goto error; | ||
| + | } | ||
| + | #else /* POSIX */ | ||
| + | errno = 0; | ||
| + | #ifdef HAVE_FDOPENDIR | ||
| + | if (path-> | ||
| + | /* closedir() closes the FD, so we duplicate it */ | ||
| + | fd = _Py_dup(path-> | ||
| + | if (fd == -1) | ||
| + | goto error; | ||
| + | |||
| + | Py_BEGIN_ALLOW_THREADS | ||
| + | iterator-> | ||
| + | Py_END_ALLOW_THREADS | ||
| + | } | ||
| + | else | ||
| + | #endif | ||
| + | { | ||
| + | if (iterator-> | ||
| + | path_str = iterator-> | ||
| + | else | ||
| + | path_str = " | ||
| + | |||
| + | Py_BEGIN_ALLOW_THREADS | ||
| + | iterator-> | ||
| + | Py_END_ALLOW_THREADS | ||
| + | } | ||
| + | |||
| + | if (!iterator-> | ||
| + | path_error(& | ||
| + | #ifdef HAVE_FDOPENDIR | ||
| + | if (fd != -1) { | ||
| + | Py_BEGIN_ALLOW_THREADS | ||
| + | close(fd); | ||
| + | Py_END_ALLOW_THREADS | ||
| + | } | ||
| + | #endif | ||
| + | goto error; | ||
| + | } | ||
| + | #endif | ||
| + | |||
| + | return (PyObject *)iterator; | ||
| + | |||
| + | error: | ||
| + | Py_DECREF(iterator); | ||
| + | return NULL; | ||
| + | } | ||
| + | </ | ||
| + | </ | ||
| + | [[https:// | ||
| + | |||
| + | ===== パフォーマンス比較 ===== | ||
| + | <WRAP prewrap 100% # | ||
| + | <code python dir_search.py> | ||
| + | # | ||
| + | # -*- coding: utf-8 -*- | ||
| + | import os | ||
| + | import glob | ||
| + | import timeit | ||
| + | |||
| + | def walk(): | ||
| + | #result = [[file for file in files if file.endswith(' | ||
| + | result = [] | ||
| + | for path, dirs, files in os.walk(' | ||
| + | for file in files: | ||
| + | if file.endswith(' | ||
| + | result.append(file) | ||
| + | return result | ||
| + | |||
| + | def iglob(): | ||
| + | result = [] | ||
| + | for file in glob.iglob(' | ||
| + | if file.endswith(' | ||
| + | result.append(file) | ||
| + | return result | ||
| + | |||
| + | def iglob2(): | ||
| + | result = [] | ||
| + | for file in glob.iglob(' | ||
| + | result.append(file) | ||
| + | return result | ||
| + | |||
| + | def main(): | ||
| + | # os.chdir(' | ||
| + | num_of_exec = 10 | ||
| + | print(timeit.timeit(walk, | ||
| + | print(timeit.timeit(iglob, | ||
| + | print(timeit.timeit(iglob2, | ||
| + | |||
| + | if __name__ == ' | ||
| + | main() | ||
| + | </ | ||
| + | </ | ||
| + | <WRAP prewrap 100% # | ||
| + | < | ||
| + | $ ./ | ||
| + | 1.3243966199999704 | ||
| + | 1.4991581099999167 | ||
| + | 1.61260242999997 | ||
| + | $ time find . -name ' | ||
| + | |||
| + | real 0m0.523s | ||
| + | user 0m0.020s | ||
| + | sys | ||
| + | </ | ||
| + | </ | ||
| ===== 参考文献 ===== | ===== 参考文献 ===== | ||
| - | [[https:// | + | [[https:// |
| + | [[https:// | ||
| [[https:// | [[https:// | ||
| [[https:// | [[https:// | ||
| + | [[https:// | ||
| + | [[https:// | ||
| + | [[https:// | ||
| + | [[https:// | ||
| + | [[https:// | ||