Source code for ms3.view

import os
import random
import re
from collections import defaultdict, Counter
from copy import deepcopy
from typing import Collection, Union, Iterable, List, Dict, Iterator, Optional, Tuple, Set, Literal

import numpy as np
import numpy.typing as npt

from .score import Score
from ._typing import FileList, Category, Categories
from .utils import File, unpack_json_paths, resolve_paths_argument
from .logger import LoggedClass


def empty_counts():
    """Array for counting kept items, discarded items, and their sum."""
    return np.zeros(3, dtype=int)


class View(LoggedClass):
    """
    Object storing regular expressions and filter lists, and keeping track of the things they filter out.
    """
    review_regex = "review"
    categories = (
        'corpora',
        'folders',
        'fnames',
        'files',
        'suffixes',
        'facets',
    )
    available_facets = ('scores',) + Score.dataframe_types + ('unknown',)
    singular2category: Dict[str, Category] = dict(zip(('corpus', 'folder', 'fname', 'file', 'suffix', 'facet'),
                                                      categories))
    tsv_regex = re.compile(r"\.tsv$", re.IGNORECASE)
    convertible_regex = Score.make_extension_regex(native=False, convertible=True, tsv=False)
    registered_regexes = (convertible_regex, review_regex, tsv_regex)

    def __init__(self,
                 view_name: Optional[str] = 'all',
                 only_metadata_fnames: bool = False,
                 include_convertible: bool = True,
                 include_tsv: bool = True,
                 exclude_review: bool = False,
                 **logger_cfg
                 ):
        super().__init__(subclass='View', logger_cfg=logger_cfg)
        # fields
        self._name: str = ''
        # the two main dicts
        self.including: dict = {c: [] for c in self.categories}
        self.excluding: dict = {c: [] for c in self.categories}
        self.excluded_file_paths: List[str] = []
        self.selected_facets = self.available_facets
        self._last_filtering_counts: Dict[str, npt.NDArray[np.int_]] = defaultdict(empty_counts)
        """For each filter method, store the counts of the last run as [n_kept, n_discarded, N (the sum)].
        Keys are the respective category for :meth:`filter_by_token` and 'files' or 'parsed' for
        :meth:`filtered_file_list`. To inspect, you can use the method :meth:`filtering_report`.
        """
        self._discarded_items: Dict[str, Set[str]] = defaultdict(set)
        self._discarded_file_criteria: Dict[Literal['subdir', 'file', 'suffix'], Counter] = defaultdict(Counter)
        """{criterion -> {excluded_name -> n_excluded}} dict for keeping track of which file was discarded
        based on which criterion.
        """
        # booleans
        self.fnames_in_metadata: bool = True
        self.fnames_not_in_metadata: bool = not only_metadata_fnames
        self.fnames_with_incomplete_facets = True
        # properties
        self.name = view_name
        self.include_convertible = include_convertible
        self.include_tsv = include_tsv
        self.exclude_review = exclude_review

    # %%%%%%%%%%%%%%%%%%%%%%%%%%%%% END of __init__() %%%%%%%%%%%%%%%%%%%%%%%%%%%%%#
    # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%#

    @staticmethod
    def check_name(view_name) -> Tuple[bool, str]:
        if not isinstance(view_name, str):
            return False, f"Name of the view should be a string, not '{type(view_name)}'"
        if not view_name.isidentifier():
            return False, f"The string '{view_name}' cannot be used as attribute name."
        return True, ''

    @property
    def name(self) -> str:
        return self._name

    @name.setter
    def name(self, new_name: str):
        name_valid, msg = self.check_name(new_name)
        if not name_valid:
            raise ValueError(msg)
        self._name = new_name
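
    # Usage sketch, assuming ms3 is importable; 'my_view' is an illustrative name. View names must be
    # valid Python identifiers (see check_name() above):
    #
    #     v = View('my_view')      # permissive view that keeps everything
    #     v.name                   # 'my_view'
    #     v.including['fnames']    # [] -- no including regexes registered yet
    #     View('not-valid')        # raises ValueError: not a valid identifier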

    def is_default(self, relax_for_cli: bool = False) -> bool:
        """Checks the includes and excludes that may influence the selection of fnames and returns True
        if the settings do not filter out any fnames. Only if ``relax_for_cli`` is set to True are the
        filters :attr:`include_convertible` and :attr:`exclude_review` tolerated as well.
        """
        # define the expected number of filter regexes per category (ignoring 'corpora' and 'facets')
        default_excluding_lengths = {
            'suffixes': 0,
            'folders': 0,
            'fnames': 0,
            'files': 0
        }
        if relax_for_cli:
            if self.exclude_review:
                default_excluding_lengths.update({
                    'folders': 1,
                    'fnames': 1,
                    'files': 1
                })
            default_excluding_lengths['files'] += not self.include_convertible
        ## debugging:
        # print(f"""no includes: {all(len(self.including[category]) == 0 for category in default_excluding_lengths.keys())}
        # default_excludes: {all(len(self.excluding[category]) == expected for category, expected in default_excluding_lengths.items())}
        # exclude_review: {not self.exclude_review or relax_for_cli}
        # include_convertible: {self.include_convertible or relax_for_cli}
        # no paths excluded: {len(self.excluded_file_paths) == 0}
        # fnames in metadata: {self.fnames_in_metadata}
        # not in metadata excluded: {self.fnames_not_in_metadata or relax_for_cli}
        # incomplete facets: {self.fnames_with_incomplete_facets}""")
        return (
                all(len(self.including[category]) == 0 for category in default_excluding_lengths.keys())
                and all(len(self.excluding[category]) == expected
                        for category, expected in default_excluding_lengths.items())
                and len(self.excluded_file_paths) == 0
                and self.fnames_in_metadata
                and self.fnames_with_incomplete_facets
                and (relax_for_cli or (
                        self.include_convertible
                        and self.fnames_not_in_metadata
                ))
        )
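
    # Sketch of the semantics, using the permissive defaults of View():
    #
    #     v = View()
    #     v.is_default()                    # True: nothing is filtered out
    #     v.exclude_review = True
    #     v.is_default()                    # False: review files/folders/fnames are now excluded
    #     v.is_default(relax_for_cli=True)  # True: this particular filter is tolerated for CLI use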

    def copy(self, new_name: Optional[str] = None) -> 'View':
        """Returns a copy of this view, i.e., a new View object."""
        if new_name is None:
            new_name = get_ferocious_name()
        new_view = self.__class__(view_name=new_name)
        new_view.including = deepcopy(self.including)
        new_view.excluding = deepcopy(self.excluding)
        new_view.update_facet_selection()
        new_view.excluded_file_paths = list(self.excluded_file_paths)
        new_view.fnames_in_metadata = self.fnames_in_metadata
        new_view.fnames_not_in_metadata = self.fnames_not_in_metadata
        new_view.fnames_with_incomplete_facets = self.fnames_with_incomplete_facets
        return new_view

    def update_config(self,
                      view_name: Optional[str] = None,
                      only_metadata: bool = None,
                      include_convertible: bool = None,
                      include_tsv: bool = None,
                      exclude_review: bool = None,
                      paths=None,
                      file_re=None,
                      folder_re=None,
                      exclude_re=None,
                      **logger_cfg):
        for param, value in zip(('view_name', 'only_metadata', 'include_convertible', 'include_tsv', 'exclude_review'),
                                (view_name, only_metadata, include_convertible, include_tsv, exclude_review)
                                ):
            if value is None:
                continue
            old_value = getattr(self, param)
            if value != old_value:
                setattr(self, param, value)
                self.logger.debug(f"Set '{param}' (previously {old_value}) to {value}.")
        if file_re is not None and file_re != '.*':
            self.include('files', file_re)
        if folder_re is not None and folder_re != '.*':
            self.include('folders', folder_re)
        if exclude_re is not None:
            self.exclude(('files', 'folders'), exclude_re)
        if paths is not None:
            resolved_paths = resolve_paths_argument(paths)
            if len(resolved_paths) > 0:
                unpack_json_paths(resolved_paths)
                # build the regexes from the resolved (and JSON-unpacked) paths
                regexes = [re.escape(os.path.basename(p)) for p in resolved_paths]
                self.include('files', *regexes)
        if len(logger_cfg) > 0:
            self.change_logger_cfg(**logger_cfg)

    @property
    def include_convertible(self):
        return self.convertible_regex not in self.excluding['files']

    @include_convertible.setter
    def include_convertible(self, yes: bool):
        if yes:
            self.unexclude('files', self.convertible_regex)
        else:
            self.exclude('files', self.convertible_regex)

    @property
    def include_tsv(self):
        return self.tsv_regex not in self.excluding['files']

    @include_tsv.setter
    def include_tsv(self, yes: bool):
        if yes:
            self.unexclude('files', self.tsv_regex)
        else:
            self.exclude('files', self.tsv_regex)

    @property
    def exclude_review(self):
        return all(self.review_regex in self.excluding[what_to_exclude]
                   for what_to_exclude in ('files', 'fnames', 'folders'))

    @exclude_review.setter
    def exclude_review(self, yes: bool):
        if yes:
            self.exclude(('files', 'fnames', 'folders'), self.review_regex)
        else:
            self.unexclude(('files', 'fnames', 'folders'), self.review_regex)
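
    # Sketch of how the boolean properties map onto the excluding lists; 'draft' is an illustrative regex:
    #
    #     v = View()
    #     v.include_tsv                        # True: tsv_regex is not in v.excluding['files']
    #     v.include_tsv = False                # appends View.tsv_regex to v.excluding['files']
    #     v.update_config(exclude_re='draft')  # excludes files and folders matching 'draft'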

    def check_token(self, category: Category, token: str) -> bool:
        """Checks if a string pertaining to a certain category should be included in the view or not."""
        category = self.resolve_category(category)
        if any(re.search(rgx, token) is not None for rgx in self.excluding[category]):
            return False
        if len(self.including[category]) == 0:
            return True
        return any(re.search(rgx, token) is not None for rgx in self.including[category])
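
    # Sketch: singular category names resolve to their plural form, and excluding regexes take precedence:
    #
    #     v = View('my_view')
    #     v.include('fnames', 'sonata')
    #     v.check_token('fname', 'sonata01')  # True ('fname' resolves to the 'fnames' category)
    #     v.check_token('fname', 'menuett')   # False: no including regex matches
    #     v.exclude('fnames', '01$')
    #     v.check_token('fname', 'sonata01')  # False: matches an excluding regex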

    def check_file(self, file: File) -> Tuple[bool, str]:
        """Check if an individual File passes all filters w.r.t. its subdirectories, file name and suffix.

        Args:
            file: :obj:`File` to be checked against this view's filters.

        Returns:
            False if the file is to be discarded from this view, True otherwise.
            The criterion ('subdir', 'file', or 'suffix') based on which the file is being excluded;
            'files' if it is kept.
        """
        if file.full_path in self.excluded_file_paths:
            return False, 'file'
        category2file_component = dict(zip((('folders', 'subdir'), ('files', 'file'), ('suffixes', 'suffix')),
                                           (file.subdir, file.file, file.suffix)
                                           ))
        for (category, criterion), component in category2file_component.items():
            if any(re.search(rgx, component) is not None for rgx in self.excluding[category]):
                return False, criterion
        for (category, criterion), component in category2file_component.items():
            if len(self.including[category]) == 0:
                continue
            if not any(re.search(rgx, component) is not None for rgx in self.including[category]):
                return False, criterion
        return True, 'files'

    def reset_filtering_data(self, categories: Categories = None):
        if categories is None:
            # reset everything
            self._last_filtering_counts = defaultdict(empty_counts)
            self._discarded_items = defaultdict(set)
            self._discarded_file_criteria = defaultdict(Counter)
        else:
            categories = self.resolve_categories(categories)
            for ctgr in categories:
                if ctgr in self._last_filtering_counts:
                    del self._last_filtering_counts[ctgr]
                if ctgr in self._discarded_items:
                    del self._discarded_items[ctgr]
            if 'files' in categories:
                self._discarded_file_criteria = defaultdict(Counter)
        self.update_facet_selection()

    def reset_view(self):
        self.__init__()

    def filter_by_token(self, category: Category, tuples: Iterable[tuple]) -> Iterator[tuple]:
        """Filters out those tuples whose token (first element) does not pass _.check_token(category, token)."""
        category = self.resolve_category(category)
        n_kept, n_discarded, N = 0, 0, 0
        discarded_items = []
        for tup in tuples:
            N += 1
            token, *_ = tup
            if self.check_token(category, token=token):
                n_kept += 1
                yield tup
            else:
                n_discarded += 1
                discarded_items.append(token)
        key = category
        self._last_filtering_counts[key] += np.array([n_kept, n_discarded, N], dtype='int')
        self._discarded_items[key].update(discarded_items)

    def filtered_tokens(self, category: Category, tokens: Collection[str]) -> List[str]:
        """Applies :meth:`filter_by_token` to a collection of tokens."""
        return [token[0] for token in self.filter_by_token(category, ((t,) for t in tokens))]
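
    # Sketch with illustrative fnames; note that filter_by_token() is a generator, so the filtering
    # counts are only recorded once it has been exhausted:
    #
    #     v = View('my_view', exclude_review=True)
    #     v.filtered_tokens('fnames', ['sonata01', 'sonata01_review'])  # ['sonata01']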

    def filtered_file_list(self, files: Collection[File], key: str = None) -> FileList:
        """Keep only the files that pass _.check_file().

        Args:
            files: :obj:`File` objects to be filtered.
            key: Aggregate results from several filter runs under this dictionary key.

        Returns:
            The list of those :obj:`File` objects that passed all filters.
        """
        if len(files) == 0:
            return []
        result, discarded_items = [], []
        for file in files:
            accept, criterion = self.check_file(file)
            if accept:
                result.append(file)
            else:
                discarded_items.append(file.rel_path)
                if key is None:
                    # do not track discarding criteria for special keys such as 'parsed', used by View.iter_facet2parsed
                    self._discarded_file_criteria[criterion][getattr(file, criterion)] += 1
        n_kept, n_discarded, N = len(result), len(discarded_items), len(files)
        if key is None:
            key = 'files'
        self._last_filtering_counts[key] += np.array([n_kept, n_discarded, N], dtype='int')
        self._discarded_items[key].update(discarded_items)
        return result
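
    # Sketch, assuming `files` is a collection of File objects gathered elsewhere (e.g. by a Corpus object);
    # discarded files are tallied in _discarded_file_criteria unless a special key such as 'parsed' is passed:
    #
    #     kept = v.filtered_file_list(files)
    #     len(kept) <= len(files)   # True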

    def filtering_report(self, drop_zero=True, show_discarded=True, return_str=False) -> Optional[str]:
        aggregated_counts = defaultdict(empty_counts)
        for key, counts in self._last_filtering_counts.items():
            aggregated_counts[key] += counts
        if show_discarded:
            discarded = defaultdict(list)
            for key, items in self._discarded_items.items():
                discarded[key].extend(items)
        msg = ''
        for key, (_, n_discarded, N) in aggregated_counts.items():
            if not drop_zero or n_discarded > 0:
                msg += f"{n_discarded}/{N} {key} are excluded from this view"
                if show_discarded:
                    if len(discarded[key]) > 0:
                        msg += f":\n{sorted(discarded[key])}\n\n"
                    else:
                        msg += ", but unfortunately I don't know which ones.\n"
                else:
                    msg += '.\n'
        if len(self._discarded_file_criteria) > 0:
            msg += '\n'
            for criterion, cntr in self._discarded_file_criteria.items():
                crit = 'file name' if criterion == 'file' else criterion
                msg += f"{sum(cntr.values())} files have been excluded based on their {crit}"
                if show_discarded:
                    msg += ':\n'
                    for excluded_name, n in cntr.items():
                        msg += f"\t- '{excluded_name}': {n}\n"
                else:
                    msg += '.\n'
        if return_str:
            return msg
        print(msg)

    def info(self, return_str=False):
        msg_components = []
        if self.fnames_in_metadata + self.fnames_not_in_metadata == 0:
            msg = f"This view is called '{self.name}'. It excludes everything because both its attributes " \
                  f"fnames_in_metadata and fnames_not_in_metadata are set to False."
            if return_str:
                return msg
            print(msg)
            return
        if not self.fnames_in_metadata:
            msg_components.append("excludes fnames that are contained in the metadata")
        if not self.fnames_not_in_metadata:
            msg_components.append("excludes fnames that are not contained in the metadata")
        if not self.fnames_with_incomplete_facets:
            msg_components.append("excludes pieces that do not have at least one file per selected facet")
        if not self.include_convertible:
            msg_components.append("filters out file extensions requiring conversion (such as .xml)")
        if not self.include_tsv:
            msg_components.append("disregards all TSV files")
        if self.exclude_review:
            msg_components.append("excludes review files and folders")
        included_re = {what_to_include: [rgx for rgx in regexes if rgx not in self.registered_regexes]
                       for what_to_include, regexes in self.including.items()}
        excluded_re = {what_to_exclude: [rgx for rgx in regexes if rgx not in self.registered_regexes]
                       for what_to_exclude, regexes in self.excluding.items()}
        for what_to_include, re_strings in included_re.items():
            n_included = len(re_strings)
            if n_included == 0:
                continue
            if n_included == 1:
                included = f"'{re_strings[0]}'"
            elif n_included < 11:
                included = 'one of ' + str(re_strings)
            else:
                included = 'one of [' + ', '.join(f"'{regex}'" for regex in re_strings[:10]) + '... '
                included += f" ({n_included - 10} more, see filtering_report())"
            msg_components.append(f"includes only {what_to_include} containing {included}")
        for what_to_exclude, re_strings in excluded_re.items():
            n_excluded = len(re_strings)
            if n_excluded == 0:
                continue
            if n_excluded == 1:
                excluded = f"'{re_strings[0]}'"
            elif n_excluded < 11:
                excluded = 'one of ' + str(re_strings)
            else:
                excluded = 'one of [' + ', '.join(f"'{regex}'" for regex in re_strings[:10]) + '... '
                excluded += f" ({n_excluded - 10} more, see filtering_report())"
            msg_components.append(f"excludes any {what_to_exclude} containing {excluded}")
        if len(self.excluded_file_paths) > 0:
            msg_components.append(f"excludes {len(self.excluded_file_paths)} files based on user input")
        msg = f"This view is called '{self.name}'. It "
        n_components = len(msg_components)
        if n_components == 0:
            msg += "selects everything."
        elif n_components == 1:
            msg += msg_components[0] + "."
        else:
            separator = '\n\t- '
            msg += separator + (',' + separator).join(msg_components[:-1])
            msg += f", and{separator}{msg_components[-1]}."
        if return_str:
            return msg
        print(msg)

    def resolve_category(self, category: Category) -> Category:
        if isinstance(category, str):
            if category not in self.categories:
                if category in self.singular2category:
                    return self.singular2category[category]
                else:
                    raise ValueError(f"'{category}' is not one of the known categories {self.categories}")
            return category
        else:
            raise ValueError(f"Pass a single category string ∈ {self.categories}, not a '{type(category)}'")

    def resolve_categories(self, categories: Categories) -> List[str]:
        if isinstance(categories, str):
            categories = [categories]
        return [self.resolve_category(categ) for categ in categories]

    def update_facet_selection(self):
        selected, discarded = [], []
        for facet in self.available_facets:
            if self.check_token('facet', facet):
                selected.append(facet)
            else:
                discarded.append(facet)
        self.selected_facets = selected
        key = 'facets'
        if len(discarded) == 0:
            if key in self._last_filtering_counts:
                del self._last_filtering_counts[key]
            if key in self._discarded_items:
                del self._discarded_items[key]
            return
        n_kept, n_discarded = len(selected), len(discarded)
        counts = np.array([n_kept, n_discarded, n_kept + n_discarded])
        self._last_filtering_counts[key] = counts
        self._discarded_items[key] = set(discarded)

    def include(self, categories: Categories, *regex: Union[str, re.Pattern]):
        categories = self.resolve_categories(categories)
        for what_to_include in categories:
            for rgx in regex:
                if rgx not in self.including[what_to_include]:
                    self.including[what_to_include].append(rgx)
            if what_to_include == 'facets':
                self.update_facet_selection()

    def exclude(self, categories: Categories, *regex: Union[str, re.Pattern]):
        categories = self.resolve_categories(categories)
        for what_to_exclude in categories:
            for rgx in regex:
                if rgx not in self.excluding[what_to_exclude]:
                    self.excluding[what_to_exclude].append(rgx)
            if what_to_exclude == 'facets':
                self.update_facet_selection()

    def uninclude(self, categories: Categories, *regex: Union[str, re.Pattern]):
        categories = self.resolve_categories(categories)
        for what_to_uninclude in categories:
            for rgx in regex:
                try:
                    self.including[what_to_uninclude].remove(rgx)
                except ValueError:
                    pass

    def unexclude(self, categories: Categories, *regex: Union[str, re.Pattern]):
        categories = self.resolve_categories(categories)
        for what_to_unexclude in categories:
            for rgx in regex:
                try:
                    self.excluding[what_to_unexclude].remove(rgx)
                except ValueError:
                    pass

    def __repr__(self):
        return self.info(return_str=True)
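
# Sketch of managing filters and inspecting their effect; 'archive' and '_reviewed$' are illustrative regexes:
#
#     v = View('my_view')
#     v.exclude('folders', 'archive')
#     v.include('suffixes', '_reviewed$')
#     v.unexclude('folders', 'archive')           # registered regexes can be removed again
#     print(v.info(return_str=True))              # human-readable summary of the active filters
#     print(v.filtering_report(return_str=True))  # counts of items discarded by previous filter runs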


class DefaultView(View):

    def __init__(self,
                 view_name: Optional[str] = 'default',
                 only_metadata_fnames: bool = True,
                 include_convertible: bool = False,
                 include_tsv: bool = True,
                 exclude_review: bool = True,
                 **logger_cfg
                 ):
        super().__init__(view_name=view_name,
                         only_metadata_fnames=only_metadata_fnames,
                         include_convertible=include_convertible,
                         include_tsv=include_tsv,
                         exclude_review=exclude_review,
                         **logger_cfg
                         )

    def is_default(self, relax_for_cli: bool = False) -> bool:
        default_excluding_lengths = {
            'folders': 1,
            'fnames': 1,
            'files': 2,
            'suffixes': 0,
        }
        if relax_for_cli:
            default_excluding_lengths['files'] -= self.include_convertible
        ## debugging:
        # print(f"""no includes: {all(len(self.including[category]) == 0 for category in default_excluding_lengths.keys())}
        # default_excludes: {all(len(self.excluding[category]) == expected for category, expected in default_excluding_lengths.items())}
        # exclude_review: {self.exclude_review}
        # include_convertible: {not self.include_convertible or relax_for_cli}
        # no paths excluded: {len(self.excluded_file_paths) == 0}
        # fnames in metadata: {self.fnames_in_metadata}
        # not in metadata excluded: {not self.fnames_not_in_metadata or relax_for_cli}
        # incomplete facets: {self.fnames_with_incomplete_facets}""")
        return (
                all(len(self.including[category]) == 0 for category in default_excluding_lengths.keys())
                and all(len(self.excluding[category]) == expected
                        for category, expected in default_excluding_lengths.items())
                and len(self.excluded_file_paths) == 0
                and self.fnames_in_metadata
                and self.fnames_with_incomplete_facets
                and (relax_for_cli or (
                        not self.include_convertible
                        and not self.fnames_not_in_metadata
                ))
        )
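
# Sketch of the DefaultView settings compared to a plain View:
#
#     dv = DefaultView()
#     dv.include_convertible     # False: convertible formats (e.g. .xml) are excluded
#     dv.exclude_review          # True
#     dv.fnames_not_in_metadata  # False: only fnames listed in the metadata are selected
#     dv.is_default()            # True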


def create_view_from_parameters(only_metadata_fnames: bool = True,
                                include_convertible: bool = False,
                                include_tsv: bool = True,
                                exclude_review: bool = True,
                                paths=None,
                                file_re=None,
                                folder_re=None,
                                exclude_re=None,
                                level=None
                                ) -> View:
    no_legacy_params = all(param is None for param in (paths, file_re, folder_re, exclude_re))
    all_default = only_metadata_fnames and include_tsv and exclude_review and not include_convertible
    if no_legacy_params and all_default:
        return DefaultView(level=level)
    ferocious_name = get_ferocious_name()
    view = View(ferocious_name,
                only_metadata_fnames=only_metadata_fnames,
                include_convertible=include_convertible,
                include_tsv=include_tsv,
                exclude_review=exclude_review,
                level=level
                )
    view.update_config(paths=paths, file_re=file_re, folder_re=folder_re, exclude_re=exclude_re)
    return view


def get_ferocious_name():
    path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'ferocious_names.txt')
    with open(path, 'r', encoding='utf-8') as f:
        return random.choice(f.readlines()).strip('\n')
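
# Sketch of the factory function; 'sonata' is an illustrative regex:
#
#     create_view_from_parameters()                      # -> DefaultView, since all parameters are default
#     v = create_view_from_parameters(file_re='sonata')  # -> View with a random name from ferocious_names.txt
#     v.is_default()                                     # False: an including regex has been registered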