差分
このページの2つのバージョン間の差分を表示します。
両方とも前のリビジョン 前のリビジョン 次のリビジョン | 前のリビジョン | ||
python:pathlib [2020/02/01 20:15] – [参考文献] ともやん | python:pathlib [2023/05/27 09:00] (現在) – [glob.iglob() の実装] ともやん | ||
---|---|---|---|
行 1: | 行 1: | ||
- | ====== pathlib, glob ====== | + | ====== pathlib, glob.iglob(), os.walk() ====== |
+ | |||
+ | ===== os.walk() の実装 ===== | ||
+ | < | ||
+ | |||
+ | ===== glob.iglob() の実装 ===== | ||
+ | < | ||
+ | < | ||
+ | ここで注意が必要なのは、 **pattern** は **__Unix Shell Style__** のパターンであって、正規表現は利用できないという点である。\\ | ||
+ | < | ||
+ | \\ | ||
+ | **OK パターン**\\ | ||
+ | <code python> | ||
+ | >>> | ||
+ | >>> | ||
+ | ' | ||
+ | </ | ||
+ | **NG パターン**\\ | ||
+ | <code python> | ||
+ | >>> | ||
+ | >>> | ||
+ | ' | ||
+ | </ | ||
+ | |||
+ | <WRAP mincode_long> | ||
+ | <code python python38/ | ||
+ | """ | ||
+ | |||
+ | import os | ||
+ | import re | ||
+ | import fnmatch | ||
+ | import sys | ||
+ | |||
+ | __all__ = [" | ||
+ | |||
+ | def glob(pathname, | ||
+ | """ | ||
+ | |||
+ | The pattern may contain simple shell-style wildcards a la | ||
+ | fnmatch. However, unlike fnmatch, filenames starting with a | ||
+ | dot are special cases that are not matched by ' | ||
+ | patterns. | ||
+ | |||
+ | If recursive is true, the pattern ' | ||
+ | zero or more directories and subdirectories. | ||
+ | """ | ||
+ | return list(iglob(pathname, | ||
+ | |||
+ | def iglob(pathname, | ||
+ | """ | ||
+ | |||
+ | The pattern may contain simple shell-style wildcards a la | ||
+ | fnmatch. However, unlike fnmatch, filenames starting with a | ||
+ | dot are special cases that are not matched by ' | ||
+ | patterns. | ||
+ | |||
+ | If recursive is true, the pattern ' | ||
+ | zero or more directories and subdirectories. | ||
+ | """ | ||
+ | it = _iglob(pathname, | ||
+ | if recursive and _isrecursive(pathname): | ||
+ | s = next(it) | ||
+ | assert not s | ||
+ | return it | ||
+ | |||
+ | def _iglob(pathname, | ||
+ | sys.audit(" | ||
+ | dirname, basename = os.path.split(pathname) | ||
+ | if not has_magic(pathname): | ||
+ | assert not dironly | ||
+ | if basename: | ||
+ | if os.path.lexists(pathname): | ||
+ | yield pathname | ||
+ | else: | ||
+ | # Patterns ending with a slash should match only directories | ||
+ | if os.path.isdir(dirname): | ||
+ | yield pathname | ||
+ | return | ||
+ | if not dirname: | ||
+ | if recursive and _isrecursive(basename): | ||
+ | yield from _glob2(dirname, | ||
+ | else: | ||
+ | yield from _glob1(dirname, | ||
+ | return | ||
+ | # `os.path.split()` returns the argument itself as a dirname if it is a | ||
+ | # drive or UNC path. Prevent an infinite recursion if a drive or UNC path | ||
+ | # contains magic characters (i.e. r' | ||
+ | if dirname != pathname and has_magic(dirname): | ||
+ | dirs = _iglob(dirname, | ||
+ | else: | ||
+ | dirs = [dirname] | ||
+ | if has_magic(basename): | ||
+ | if recursive and _isrecursive(basename): | ||
+ | glob_in_dir = _glob2 | ||
+ | else: | ||
+ | glob_in_dir = _glob1 | ||
+ | else: | ||
+ | glob_in_dir = _glob0 | ||
+ | for dirname in dirs: | ||
+ | for name in glob_in_dir(dirname, | ||
+ | yield os.path.join(dirname, | ||
+ | |||
+ | # These 2 helper functions non-recursively glob inside a literal directory. | ||
+ | # They return a list of basenames. _glob1 accepts a pattern while _glob0 | ||
+ | # takes a literal basename (so it only has to check for its existence). | ||
+ | |||
+ | def _glob1(dirname, | ||
+ | names = list(_iterdir(dirname, | ||
+ | if not _ishidden(pattern): | ||
+ | names = (x for x in names if not _ishidden(x)) | ||
+ | return fnmatch.filter(names, | ||
+ | |||
+ | def _glob0(dirname, | ||
+ | if not basename: | ||
+ | # `os.path.split()` returns an empty basename for paths ending with a | ||
+ | # directory separator. | ||
+ | if os.path.isdir(dirname): | ||
+ | return [basename] | ||
+ | else: | ||
+ | if os.path.lexists(os.path.join(dirname, | ||
+ | return [basename] | ||
+ | return [] | ||
+ | |||
+ | # Following functions are not public but can be used by third-party code. | ||
+ | |||
+ | def glob0(dirname, | ||
+ | return _glob0(dirname, | ||
+ | |||
+ | def glob1(dirname, | ||
+ | return _glob1(dirname, | ||
+ | |||
+ | # This helper function recursively yields relative pathnames inside a literal | ||
+ | # directory. | ||
+ | |||
+ | def _glob2(dirname, | ||
+ | assert _isrecursive(pattern) | ||
+ | yield pattern[: | ||
+ | yield from _rlistdir(dirname, | ||
+ | |||
+ | # If dironly is false, yields all file names inside a directory. | ||
+ | # If dironly is true, yields only directory names. | ||
+ | def _iterdir(dirname, | ||
+ | if not dirname: | ||
+ | if isinstance(dirname, | ||
+ | dirname = bytes(os.curdir, | ||
+ | else: | ||
+ | dirname = os.curdir | ||
+ | try: | ||
+ | with os.scandir(dirname) as it: | ||
+ | for entry in it: | ||
+ | try: | ||
+ | if not dironly or entry.is_dir(): | ||
+ | yield entry.name | ||
+ | except OSError: | ||
+ | pass | ||
+ | except OSError: | ||
+ | return | ||
+ | |||
+ | # Recursively yields relative pathnames inside a literal directory. | ||
+ | def _rlistdir(dirname, | ||
+ | names = list(_iterdir(dirname, | ||
+ | for x in names: | ||
+ | if not _ishidden(x): | ||
+ | yield x | ||
+ | path = os.path.join(dirname, | ||
+ | for y in _rlistdir(path, | ||
+ | yield os.path.join(x, | ||
+ | |||
+ | |||
+ | magic_check = re.compile(' | ||
+ | magic_check_bytes = re.compile(b' | ||
+ | |||
+ | def has_magic(s): | ||
+ | if isinstance(s, | ||
+ | match = magic_check_bytes.search(s) | ||
+ | else: | ||
+ | match = magic_check.search(s) | ||
+ | return match is not None | ||
+ | |||
+ | def _ishidden(path): | ||
+ | return path[0] in (' | ||
+ | |||
+ | def _isrecursive(pattern): | ||
+ | if isinstance(pattern, | ||
+ | return pattern == b' | ||
+ | else: | ||
+ | return pattern == ' | ||
+ | |||
+ | def escape(pathname): | ||
+ | """ | ||
+ | """ | ||
+ | # Escaping is done by wrapping any of " | ||
+ | # Metacharacters do not work in the drive part and shouldn' | ||
+ | drive, pathname = os.path.splitdrive(pathname) | ||
+ | if isinstance(pathname, | ||
+ | pathname = magic_check_bytes.sub(br' | ||
+ | else: | ||
+ | pathname = magic_check.sub(r' | ||
+ | return drive + pathname | ||
+ | </ | ||
+ | </ | ||
+ | |||
+ | ===== os.scandir() の実装 ===== | ||
+ | [[https:// | ||
+ | < | ||
+ | Note On Unix-based systems, scandir() uses the system’s opendir() and readdir() functions. On Windows, it uses the Win32 FindFirstFileW and FindNextFileW functions.\\ | ||
+ | \\ | ||
+ | ノート(翻訳) Unix ベースのシステムでは、scandir() はシステムの opendir() 関数と readdir() 関数を使用します。 Windows では、Win32 FindFirstFileW 関数と FindNextFileW 関数を使用します。 | ||
+ | </ | ||
+ | <WRAP mincode_long> | ||
+ | <code c cpython/ | ||
+ | /*[clinic input] | ||
+ | os.scandir | ||
+ | |||
+ | path : path_t(nullable=True, | ||
+ | |||
+ | Return an iterator of DirEntry objects for given path. | ||
+ | |||
+ | path can be specified as either str, bytes, or a path-like object. If path | ||
+ | is bytes, the names of yielded DirEntry objects will also be bytes; in | ||
+ | all other circumstances they will be str. | ||
+ | |||
+ | If path is None, uses the path='.'. | ||
+ | [clinic start generated code]*/ | ||
+ | |||
+ | static PyObject * | ||
+ | os_scandir_impl(PyObject *module, path_t *path) | ||
+ | /*[clinic end generated code: output=6eb2668b675ca89e input=6bdd312708fc3bb0]*/ | ||
+ | { | ||
+ | ScandirIterator *iterator; | ||
+ | #ifdef MS_WINDOWS | ||
+ | wchar_t *path_strW; | ||
+ | #else | ||
+ | const char *path_str; | ||
+ | #ifdef HAVE_FDOPENDIR | ||
+ | int fd = -1; | ||
+ | #endif | ||
+ | #endif | ||
+ | |||
+ | if (PySys_Audit(" | ||
+ | path-> | ||
+ | return NULL; | ||
+ | } | ||
+ | |||
+ | PyObject *ScandirIteratorType = get_posix_state(module)-> | ||
+ | iterator = PyObject_New(ScandirIterator, | ||
+ | if (!iterator) | ||
+ | return NULL; | ||
+ | |||
+ | #ifdef MS_WINDOWS | ||
+ | iterator-> | ||
+ | #else | ||
+ | iterator-> | ||
+ | #endif | ||
+ | |||
+ | memcpy(& | ||
+ | /* Move the ownership to iterator-> | ||
+ | path-> | ||
+ | path-> | ||
+ | |||
+ | #ifdef MS_WINDOWS | ||
+ | iterator-> | ||
+ | |||
+ | path_strW = join_path_filenameW(iterator-> | ||
+ | if (!path_strW) | ||
+ | goto error; | ||
+ | |||
+ | Py_BEGIN_ALLOW_THREADS | ||
+ | iterator-> | ||
+ | Py_END_ALLOW_THREADS | ||
+ | |||
+ | PyMem_Free(path_strW); | ||
+ | |||
+ | if (iterator-> | ||
+ | path_error(& | ||
+ | goto error; | ||
+ | } | ||
+ | #else /* POSIX */ | ||
+ | errno = 0; | ||
+ | #ifdef HAVE_FDOPENDIR | ||
+ | if (path-> | ||
+ | /* closedir() closes the FD, so we duplicate it */ | ||
+ | fd = _Py_dup(path-> | ||
+ | if (fd == -1) | ||
+ | goto error; | ||
+ | |||
+ | Py_BEGIN_ALLOW_THREADS | ||
+ | iterator-> | ||
+ | Py_END_ALLOW_THREADS | ||
+ | } | ||
+ | else | ||
+ | #endif | ||
+ | { | ||
+ | if (iterator-> | ||
+ | path_str = iterator-> | ||
+ | else | ||
+ | path_str = " | ||
+ | |||
+ | Py_BEGIN_ALLOW_THREADS | ||
+ | iterator-> | ||
+ | Py_END_ALLOW_THREADS | ||
+ | } | ||
+ | |||
+ | if (!iterator-> | ||
+ | path_error(& | ||
+ | #ifdef HAVE_FDOPENDIR | ||
+ | if (fd != -1) { | ||
+ | Py_BEGIN_ALLOW_THREADS | ||
+ | close(fd); | ||
+ | Py_END_ALLOW_THREADS | ||
+ | } | ||
+ | #endif | ||
+ | goto error; | ||
+ | } | ||
+ | #endif | ||
+ | |||
+ | return (PyObject *)iterator; | ||
+ | |||
+ | error: | ||
+ | Py_DECREF(iterator); | ||
+ | return NULL; | ||
+ | } | ||
+ | </ | ||
+ | </ | ||
+ | [[https:// | ||
+ | |||
+ | ===== パフォーマンス比較 ===== | ||
+ | <WRAP prewrap 100% # | ||
+ | <code python dir_search.py> | ||
+ | # | ||
+ | # -*- coding: utf-8 -*- | ||
+ | import os | ||
+ | import glob | ||
+ | import timeit | ||
+ | |||
+ | def walk(): | ||
+ | #result = [[file for file in files if file.endswith(' | ||
+ | result = [] | ||
+ | for path, dirs, files in os.walk(' | ||
+ | for file in files: | ||
+ | if file.endswith(' | ||
+ | result.append(file) | ||
+ | return result | ||
+ | |||
+ | def iglob(): | ||
+ | result = [] | ||
+ | for file in glob.iglob(' | ||
+ | if file.endswith(' | ||
+ | result.append(file) | ||
+ | return result | ||
+ | |||
+ | def iglob2(): | ||
+ | result = [] | ||
+ | for file in glob.iglob(' | ||
+ | result.append(file) | ||
+ | return result | ||
+ | |||
+ | def main(): | ||
+ | # os.chdir(' | ||
+ | num_of_exec = 10 | ||
+ | print(timeit.timeit(walk, | ||
+ | print(timeit.timeit(iglob, | ||
+ | print(timeit.timeit(iglob2, | ||
+ | |||
+ | if __name__ == ' | ||
+ | main() | ||
+ | </ | ||
+ | </ | ||
+ | <WRAP prewrap 100% # | ||
+ | < | ||
+ | $ ./ | ||
+ | 1.3243966199999704 | ||
+ | 1.4991581099999167 | ||
+ | 1.61260242999997 | ||
+ | $ time find . -name ' | ||
+ | |||
+ | real 0m0.523s | ||
+ | user 0m0.020s | ||
+ | sys | ||
+ | </ | ||
+ | </ | ||
===== 参考文献 ===== | ===== 参考文献 ===== | ||
- | [[https:// | + | [[https:// |
+ | [[https:// | ||
[[https:// | [[https:// | ||
[[https:// | [[https:// | ||
+ | [[https:// | ||
+ | [[https:// | ||
+ | [[https:// | ||
+ | [[https:// | ||
+ | [[https:// | ||