import inspect
import os
import re
from typing import Iterable, Optional, Set
[docs]class PythonFileCollector:
"""
PythonFileCollector class provides general configurations
and utility methods for collecting Python files
"""
_WILDCARD = "*"
_NEGATION = "!"
_INIT_FILE = "__init__.py"
_DEFAULT_FILE_EXCLUSION_PATTERNS = {"!*.py", ".*", "~*"}
_DEFAULT_DIR_EXCLUSION_PATTERNS = {
"__pycache__",
"tmp",
"build",
"dist",
"sdist",
"wheelhouse",
"develop-eggs",
"parts",
"eggs",
"var",
"htmlcov",
"bin",
"venv*",
"pyvenv*",
".*",
"*~",
}
_DEFAULT_FILE_EXCLUSION_REGEX_PATTERNS = {r"(?!(.*\.py(?!.)))", r"^\..*", r"^\~.*"}
_DEFAULT_DIR_EXCLUSION_REGEX_PATTERNS = {
r"^__pycache__(?!.)",
r"^tmp(?!.)",
r"^build(?!.)",
r"^dist(?!.)",
r"^sdist(?!.)",
r"^wheelhouse(?!.)",
r"^develop-eggs(?!.)",
r"^parts(?!.)",
r"^eggs(?!.)",
r"^var(?!.)",
r"^htmlcov(?!.)",
r"^bin(?!.)",
r"^venv.*",
r"^pyvenv.*",
r"^\..*",
r".*~(?!.)",
}
def __init__(
self,
use_regex_patterns: bool = False,
additional_file_exclusion_patterns: Iterable[str] = None,
additional_dir_exclusion_patterns: Iterable[str] = None,
):
"""
PythonFileCollector provides method "collect" to collect files
while applying exclusion patterns to files and directories based exclusively on
their names and NOT taking into account their absolute nor relative paths.
When not using regex pattern a single wildcard, "*", can be used anywhere in
a pattern to filter names "starting with" and/or "ending with". Also, a single
exclamation mark, "!", can be used at the beginning of the pattern to negate it.
These only applies when :param use_regex_patterns: is False.
Notice that using regex patterns may be slower.
:param use_regex_patterns:
(default: False) flag to indicate whether or not to use regex to match
patterns. When this flag it set to False the PythonFileCollector._WILDCARD
(default: "*") character is interpreted as wildcard and patterns starting
with the PythonFileCollector._NEGATION (default: "!") character are negated.
:param additional_file_exclusion_patterns:
(default: None) additional patterns to filter out of collection files. In
addition to the PythonFileCollector._DEFAULT_FILE_EXCLUSION_PATTERNS or
PythonFileCollector._DEFAULT_FILE_EXCLUSION_REGEX_PATTERNS any file that
matches these patterns will be excluded from collection.
:param additional_dir_exclusion_patterns:
(default: None) additional patterns to filter out of collection directories.
In addition to the PythonFileCollector._DEFAULT_DIR_EXCLUSION_PATTERNS or
PythonFileCollector._DEFAULT_DIR_EXCLUSION_REGEX_PATTERNS any directory that
matches these patterns will be excluded from collection.
"""
self.enable_regex_patterns = use_regex_patterns
if self.enable_regex_patterns:
self.file_exclusion_patterns = (
self._DEFAULT_FILE_EXCLUSION_REGEX_PATTERNS.copy()
)
self.dir_exclusion_patterns = (
self._DEFAULT_DIR_EXCLUSION_REGEX_PATTERNS.copy()
)
else:
self.file_exclusion_patterns = self._DEFAULT_FILE_EXCLUSION_PATTERNS.copy()
self.dir_exclusion_patterns = self._DEFAULT_DIR_EXCLUSION_PATTERNS.copy()
if additional_file_exclusion_patterns:
self.file_exclusion_patterns.update(additional_file_exclusion_patterns)
if additional_dir_exclusion_patterns:
self.dir_exclusion_patterns.update(additional_dir_exclusion_patterns)
@staticmethod
def _get_caller_path() -> str:
caller_filename = inspect.stack(2)[1]
caller_abs_path = os.path.abspath(caller_filename)
return os.path.dirname(caller_abs_path)
def collect(
self,
search_path: Optional[str] = None,
recursion_limit: Optional[int] = None,
follow_symlinks: bool = True,
) -> Set[os.DirEntry]:
if search_path is None:
search_path = self._get_caller_path()
collected_files = set() # type: Set[os.DirEntry]
excluded_files = set() # type: Set[os.DirEntry]
collected_dirs = set() # type: Set[os.DirEntry]
excluded_dirs = set() # type: Set[os.DirEntry]
for entry in os.scandir(search_path):
if entry.is_file(follow_symlinks=follow_symlinks):
if self._should_exclude_file(entry):
excluded_files.add(entry)
continue
collected_files.add(entry)
continue
if entry.is_dir(follow_symlinks=follow_symlinks):
if self._should_exclude_dir(entry):
excluded_dirs.add(entry)
continue
collected_dirs.add(entry)
continue
if recursion_limit is None or recursion_limit > 0:
if recursion_limit is not None:
recursion_limit -= 1
for subdir in collected_dirs:
if not any(
entry.is_file(follow_symlinks=follow_symlinks)
and self._INIT_FILE == entry.name
for entry in os.scandir(subdir.path)
):
continue
collected_files.update(
self.collect(
search_path=subdir.path,
recursion_limit=recursion_limit,
follow_symlinks=follow_symlinks,
)
)
return collected_files
def _should_exclude_file(self, entry: os.DirEntry) -> bool:
filename = entry.name # type: str
return any(
self._matches_pattern(pattern, filename)
for pattern in self.file_exclusion_patterns
)
def _should_exclude_dir(self, entry: os.DirEntry) -> bool:
dirname = entry.name # type: str
return any(
self._matches_pattern(pattern, dirname)
for pattern in self.dir_exclusion_patterns
)
def _matches_pattern(self, pattern: str, name: str) -> bool:
if self.enable_regex_patterns:
return bool(re.match(pattern, name))
negate = False
if pattern.startswith(self._NEGATION):
negate = True
pattern = pattern[1:]
splitted = pattern.split(self._WILDCARD)
matches_pattern = name.startswith(splitted[0]) and name.endswith(splitted[-1])
return matches_pattern if not negate else not matches_pattern