from functools import lru_cache
from logging import Logger
from typing import Literal, Collection, Dict, List, Union, Tuple, Iterator, Optional
import io
import os
import re
import sys
from itertools import zip_longest
import pathos.multiprocessing as mp
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
from git import Repo, InvalidGitRepositoryError
from gitdb.exc import BadName
from .annotations import Annotations
from .logger import LoggedClass, get_logger, get_log_capture_handler, temporarily_suppress_warnings, function_logger, normalize_logger_name
from .piece import Piece
from .score import Score, compare_two_score_objects
from ._typing import (FileDict, FileList, ParsedFile, FileParsedTuple, ScoreFacets, FileDataframeTupleMaybe,
                      FacetArguments, Facet, ScoreFacet, FileScoreTuple, FileDataframeTuple, AnnotationsFacet)
from .utils import (File, column_order, get_musescore, get_path_component, group_id_tuples, iter_selection,
                    join_tsvs, load_tsv, make_continuous_offset_series, make_id_tuples, make_playthrough2mc,
                    METADATA_COLUMN_ORDER, path2type, pretty_dict, resolve_dir, update_labels_cfg, write_metadata,
                    write_tsv, available_views2str, prepare_metadata_for_writing, files2disambiguation_dict,
                    ask_user_to_choose, resolve_paths_argument, make_file_path, resolve_facets_param,
                    check_argument_against_literal_type, LATEST_MUSESCORE_VERSION, convert, string2identifier,
                    write_markdown, parse_ignored_warnings_file, parse_tsv_file_at_git_revision, disambiguate_files,
                    enforce_fname_index_for_metadata, store_csvw_jsonld)
from .view import DefaultView, View, create_view_from_parameters
class Corpus(LoggedClass):
"""
Collection of scores and TSV files that can be matched to each other based on their file names.
"""
default_count_index = pd.MultiIndex.from_product([
('scores',) + Score.dataframe_types,
('found', 'parsed')
])
def __init__(self, directory: str,
view: View = None,
only_metadata_fnames: bool = True,
include_convertible: bool = False,
include_tsv: bool = True,
exclude_review: bool = True,
file_re: Optional[Union[str, re.Pattern]] = None,
folder_re: Optional[Union[str, re.Pattern]] = None,
exclude_re: Optional[Union[str, re.Pattern]] = None,
paths: Optional[Collection[str]] = None,
labels_cfg={},
ms=None,
**logger_cfg):
"""
Parameters
----------
        directory : :obj:`str`
            Path to the corpus folder. All parseable files found by walking through it recursively are detected.
        file_re, folder_re, exclude_re : :obj:`str` or :obj:`re.Pattern`, optional
            Regular expressions for including or excluding files and folders when creating the initial view.
        paths : :obj:`~collections.abc.Collection` or :obj:`str`, optional
            List of file paths you want to add. If ``directory`` is also passed, all files will be combined in the same object.
logger_cfg : :obj:`dict`, optional
| The following options are available:
| 'name': LOGGER_NAME -> by default the logger name is based on the parsed file(s)
| 'level': {'W', 'D', 'I', 'E', 'C', 'WARNING', 'DEBUG', 'INFO', 'ERROR', 'CRITICAL'}
| 'path': Directory in which log files are stored. If 'file' is relative, this path is used as root, otherwise, it is ignored.
| 'file': PATH_TO_LOGFILE Pass absolute path to store all log messages in a single log file.
If PATH_TO_LOGFILE is relative, multiple log files are created dynamically, relative to the original MSCX files' paths.
If 'path' is set, the corresponding subdirectory structure is created there.
ms : :obj:`str`, optional
If you pass the path to your local MuseScore 3 installation, ms3 will attempt to parse musicXML, MuseScore 2,
and other formats by temporarily converting them. If you're using the standard path, you may try 'auto', or 'win' for
Windows, 'mac' for MacOS, or 'mscore' for Linux. In case you do not pass the 'file_re' and the MuseScore executable is
detected, all convertible files are automatically selected, otherwise only those that can be parsed without conversion.
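        Example
        -------
        A minimal usage sketch (the corpus path and the ``'auto'`` MuseScore detection are hypothetical)::

            corpus = Corpus('~/corpora/my_corpus', ms='auto')
            corpus.parse()
            corpus.info()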
"""
directory = resolve_dir(directory)
assert os.path.isdir(directory), f"{directory} is not an existing directory."
self.corpus_path: str = directory
"""Path where the corpus is located."""
self.name = os.path.basename(directory).strip(r'\/')
"""Folder name of the corpus."""
if 'name' not in logger_cfg or logger_cfg['name'] is None or logger_cfg['name'] == '':
logger_cfg['name'] = 'ms3.Corpus.' + self.name.replace('.', '')
super().__init__(subclass='Corpus', logger_cfg=logger_cfg)
self.files: list = []
"""
        ``[File]`` list of :obj:`File` data objects containing path information for all detected files.
"""
self._views: dict = {}
if view is None:
initial_view = create_view_from_parameters(only_metadata_fnames=only_metadata_fnames,
include_convertible=include_convertible,
include_tsv=include_tsv,
exclude_review=exclude_review,
paths=paths,
file_re=file_re,
folder_re=folder_re,
exclude_re=exclude_re,
level=self.logger.getEffectiveLevel())
self._views[None] = initial_view
else:
legacy_params = any(param is not None for param in (paths, file_re, folder_re, exclude_re))
not_default = not only_metadata_fnames or not include_tsv or not exclude_review or include_convertible
if legacy_params or not_default:
self.logger.warning(f"If you pass an existing view, other view-related parameters are ignored.")
self._views[None] = view
if 'default' not in self.view_names:
self._views['default'] = DefaultView(level=self.logger.getEffectiveLevel())
if 'all' not in self.view_names:
self._views['all'] = View(level=self.logger.getEffectiveLevel())
self._ms = get_musescore(ms, logger=self.logger)
""":obj:`str`
Path or command of the local MuseScore 3 installation if specified by the user."""
self._ignored_warnings = defaultdict(list)
""":obj:`collections.defaultdict`
{'logger_name' -> [(message_id), ...]} This dictionary stores the warnings to be ignored
upon loading them from an IGNORED_WARNINGS file.
"""
self.labels_cfg = {
'staff': None,
'voice': None,
'harmony_layer': None,
'positioning': False,
'decode': True,
'column_name': 'label',
'color_format': None,
}
""":obj:`dict`
Configuration dictionary to determine the output format of :py:attr:`~.score.Score.labels` and
    :py:attr:`~.score.Score.expanded` tables. The dictionary is passed to :py:attr:`~.score.Score` upon parsing.
"""
self.labels_cfg.update(update_labels_cfg(labels_cfg, logger=self.logger))
self._pieces: Dict[str, Piece] = {}
"""{fname -> :class:`Piece`} The objects holding together information for the same piece
from various files.
"""
self.metadata_tsv: pd.DataFrame = None
"""The parsed 'metadata.tsv' file for the corpus."""
self.metadata_ix: int = None
"""The index of the 'metadata.tsv' file for the corpus."""
self.ix2fname: Dict[int, str] = {}
"""{ix -> fname} dict for associating files with the piece they have been matched to.
None for indices that could not be matched, e.g. metadata.
"""
self.ix2metadata_file: Dict[int, File] = {}
"""{ix -> File} dict for collecting all metadata files."""
self.ix2orphan_file: Dict[int, File] = {}
"""{ix -> File} dict for collecting all metadata files."""
self._ix2parsed: Dict[int, pd.DataFrame] = {}
"""{ix -> DataFrame} dictionary for parsed metadata and orphan TSV files. Managed by :attr:`ix2parsed`."""
self.score_fnames: List[str] = []
"""Sorted list of unique file names of all detected scores"""
self.detect_parseable_files()
assert len(self.files) > 0, f"The path {self.corpus_path} contains no files that I can parse."
self.collect_fnames_from_scores()
self.find_and_load_metadata()
self.create_pieces()
self.look_for_ignored_warnings()
self.register_files_with_pieces()
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%% END of __init__() %%%%%%%%%%%%%%%%%%%%%%%%%%%%%#
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%#
@property
def ix2file(self) -> dict:
return dict(enumerate(self.files))
@property
def ix2parsed(self) -> Dict[int, ParsedFile]:
result = dict(self._ix2parsed)
for _, piece in self:
result.update(piece.ix2parsed)
return result
@property
def ix2parsed_score(self) -> Dict[int, Score]:
result = {}
for _, piece in self:
result.update(piece.ix2parsed_score)
return result
@property
def ix2parsed_tsv(self) -> Dict[int, pd.DataFrame]:
result = {}
for _, piece in self:
result.update(piece.ix2parsed_tsv)
return result
@property
def ix2annotations(self) -> Dict[int, ParsedFile]:
result = {}
for _, piece in self:
result.update(piece.ix2annotations)
return result
@property
def ms(self):
return self._ms
@ms.setter
def ms(self, ms):
self._ms = get_musescore(ms, logger=self.logger)
@property
def n_detected(self):
return len(self.files)
@property
def n_orphans(self):
return len(self.ix2orphan_file)
@property
def n_parsed(self):
        return sum(ix not in self._ix2parsed for ix in self.ix2parsed)
@property
def n_parsed_scores(self):
return len(self.ix2parsed_score)
@property
def n_unparsed_scores(self):
return sum(file.ix not in self.ix2parsed_score for file in self.files if file.type == 'scores')
@property
def n_parsed_tsvs(self):
return len(self.ix2parsed_tsv)
@property
def n_unparsed_tsvs(self):
        return sum(file.type != 'scores'
                   and file.ix not in self.ix2parsed_tsv
                   and file.ix not in self.ix2orphan_file
                   and file.ix not in self.ix2metadata_file
                   for file in self.files)
@property
def n_annotations(self):
return len(self.ix2annotations)
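    # The methods cadences() through rests() below are thin convenience wrappers around
    # get_facet(), one per score facet. With concatenate=True (the default) they return a
    # single DataFrame whose MultiIndex levels are ('fname', '<facet>_i'); otherwise a
    # {fname -> (File, DataFrame)} dict.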
def cadences(self,
view_name: Optional[str] = None,
choose: Literal['auto', 'ask'] = 'auto',
unfold: bool = False,
interval_index: bool = False,
concatenate: bool = True,
) -> Union[Dict[str, FileParsedTuple], pd.DataFrame]:
return self.get_facet('cadences',
view_name=view_name,
choose=choose,
unfold=unfold,
interval_index=interval_index,
concatenate=concatenate)
def chords(self,
view_name: Optional[str] = None,
choose: Literal['auto', 'ask'] = 'auto',
unfold: bool = False,
interval_index: bool = False,
concatenate: bool = True,
) -> Union[Dict[str, FileParsedTuple], pd.DataFrame]:
return self.get_facet('chords',
view_name=view_name,
choose=choose,
unfold=unfold,
interval_index=interval_index,
concatenate=concatenate)
def events(self,
view_name: Optional[str] = None,
choose: Literal['auto', 'ask'] = 'auto',
unfold: bool = False,
interval_index: bool = False,
concatenate: bool = True,
) -> Union[Dict[str, FileParsedTuple], pd.DataFrame]:
return self.get_facet('events',
view_name=view_name,
choose=choose,
unfold=unfold,
interval_index=interval_index,
concatenate=concatenate)
def expanded(self,
view_name: Optional[str] = None,
choose: Literal['auto', 'ask'] = 'auto',
unfold: bool = False,
interval_index: bool = False,
concatenate: bool = True,
) -> Union[Dict[str, FileParsedTuple], pd.DataFrame]:
return self.get_facet('expanded',
view_name=view_name,
choose=choose,
unfold=unfold,
interval_index=interval_index,
concatenate=concatenate)
def form_labels(self,
view_name: Optional[str] = None,
choose: Literal['auto', 'ask'] = 'auto',
unfold: bool = False,
interval_index: bool = False,
concatenate: bool = True,
) -> Union[Dict[str, FileParsedTuple], pd.DataFrame]:
return self.get_facet('form_labels',
view_name=view_name,
choose=choose,
unfold=unfold,
interval_index=interval_index,
concatenate=concatenate)
def labels(self,
view_name: Optional[str] = None,
choose: Literal['auto', 'ask'] = 'auto',
unfold: bool = False,
interval_index: bool = False,
concatenate: bool = True,
) -> Union[Dict[str, FileParsedTuple], pd.DataFrame]:
return self.get_facet('labels',
view_name=view_name,
choose=choose,
unfold=unfold,
interval_index=interval_index,
concatenate=concatenate)
def measures(self,
view_name: Optional[str] = None,
choose: Literal['auto', 'ask'] = 'auto',
unfold: bool = False,
interval_index: bool = False,
concatenate: bool = True,
) -> Union[Dict[str, FileParsedTuple], pd.DataFrame]:
return self.get_facet('measures',
view_name=view_name,
choose=choose,
unfold=unfold,
interval_index=interval_index,
concatenate=concatenate)
def notes(self,
view_name: Optional[str] = None,
choose: Literal['auto', 'ask'] = 'auto',
unfold: bool = False,
interval_index: bool = False,
concatenate: bool = True,
) -> Union[Dict[str, FileParsedTuple], pd.DataFrame]:
return self.get_facet('notes',
view_name=view_name,
choose=choose,
unfold=unfold,
interval_index=interval_index,
concatenate=concatenate)
def notes_and_rests(self,
view_name: Optional[str] = None,
choose: Literal['auto', 'ask'] = 'auto',
unfold: bool = False,
interval_index: bool = False,
concatenate: bool = True,
) -> Union[Dict[str, FileParsedTuple], pd.DataFrame]:
return self.get_facet('notes_and_rests',
view_name=view_name,
choose=choose,
unfold=unfold,
interval_index=interval_index,
concatenate=concatenate)
def rests(self,
view_name: Optional[str] = None,
choose: Literal['auto', 'ask'] = 'auto',
unfold: bool = False,
interval_index: bool = False,
concatenate: bool = True,
) -> Union[Dict[str, FileParsedTuple], pd.DataFrame]:
return self.get_facet('rests',
view_name=view_name,
choose=choose,
unfold=unfold,
interval_index=interval_index,
concatenate=concatenate)
def __getattr__(self, view_name):
if view_name in self.view_names:
if view_name != self.view_name:
self.switch_view(view_name, show_info=False)
return self
else:
raise AttributeError(f"'{view_name}' is not an existing view. Use _.get_view('{view_name}') to create it.")
def ix_logger(self, ix):
return get_logger(self.logger_names[ix])
def count_changed_scores(self, view_name: Optional[str] = None):
return sum(piece.count_changed_scores(view_name) for _, piece in self.iter_pieces(view_name))
def count_files(self,
detected: bool = True,
parsed: bool = True,
as_dict: bool = False,
drop_zero: bool = True,
view_name: Optional[str] = None) -> Union[pd.DataFrame, dict]:
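        """Count the detected and/or parsed files per piece under the given view.

        Returns a DataFrame with one row per fname (or a {fname -> counts} dict if
        ``as_dict=True``); when both ``detected`` and ``parsed`` are requested, column
        names are prefixed accordingly. A hedged usage sketch::

            counts = corpus.count_files(parsed=False)
        """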
assert detected + parsed > 0, "At least one parameter needs to be True"
fname2counts = {}
prefix = detected + parsed == 2
for fname, piece in self.iter_pieces(view_name=view_name):
if detected:
type_count = piece.count_detected(view_name=view_name, include_empty=True, prefix=prefix)
if not parsed:
fname2counts[fname] = type_count
if parsed:
parsed_count = piece.count_parsed(view_name=view_name, include_empty=True, prefix=prefix)
if not detected:
fname2counts[fname] = parsed_count
            if detected and parsed:
alternating_counts = {}
for (k1, v1), (k2, v2) in zip_longest(type_count.items(), parsed_count.items(), fillvalue=(None, None)):
if k1 is not None:
alternating_counts[k1] = v1
if k2 is not None:
alternating_counts[k2] = v2
fname2counts[fname] = alternating_counts
if as_dict:
return fname2counts
df = pd.DataFrame.from_dict(fname2counts, orient='index')
if prefix:
try:
df.columns = df.columns.str.split('_', n=1, expand=True).swaplevel()
except TypeError:
pass
if drop_zero:
empty_cols = df.columns[df.sum() == 0]
return df.drop(columns=empty_cols)
return df
def _summed_file_count(self,
types=True,
parsed=True,
view_name: Optional[str] = None) -> pd.Series:
"""The sum of _.count_files() but returning zero-filled Series if no fnames have been selected."""
file_count = self.count_files(detected=types, parsed=parsed, drop_zero=False, view_name=view_name)
if len(file_count) == 0 and types and parsed:
return pd.Series(0, index=self.default_count_index)
return file_count.sum()
def add_file_paths(self, paths: Collection[str]) -> FileList:
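        """Add the given file paths to the corpus and, if pieces exist already, register the new
        files with them. Returns the list of newly created :obj:`File` objects; paths that are
        already present or have no parseable extension are skipped.
        """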
resolved_paths = resolve_paths_argument(paths, logger=self.logger)
        if len(resolved_paths) == 0:
            return []
score_extensions = ['.' + ext for ext in Score.parseable_formats]
detected_extensions = score_extensions + ['.tsv']
newly_added = []
existing_paths = self.files_df.full_path.to_list()
for full_path in resolved_paths:
if full_path in existing_paths:
self.logger.debug(f"Skipping {full_path} because it was already there.")
continue
current_path, file = os.path.split(full_path)
file_name, file_ext = os.path.splitext(file)
if file_ext not in detected_extensions:
continue
current_subdir = os.path.relpath(current_path, self.corpus_path)
rel_path = os.path.join(current_subdir, file)
if current_subdir.startswith('..'):
self.logger.warning(f"The file {rel_path} is lies outside the corpus folder {self.corpus_path}. "
f"This may lead to invalid relative paths in the metadata.tsv.")
file_type = path2type(full_path, logger=self.logger)
F = File(
ix=len(self.files),
type=file_type,
file=file,
fname=file_name,
fext=file_ext,
subdir=current_subdir,
corpus_path=self.corpus_path,
rel_path=rel_path,
full_path=full_path,
directory=current_path,
suffix='',
)
self.files.append(F)
newly_added.append(F)
if len(self._pieces) > 0:
self.register_files_with_pieces(newly_added)
return newly_added
    def collect_fnames_from_scores(self) -> None:
"""Construct sorted list of fnames from all detected scores."""
files_df = self.files_df
detected_scores = files_df.loc[files_df.type == 'scores']
self.score_fnames = sorted(detected_scores.fname.unique())
    def create_pieces(self, fnames: Union[Collection[str], str] = None) -> None:
"""Creates and stores one :obj:`Piece` object per fname."""
if fnames is None:
fnames = self.get_all_fnames()
elif isinstance(fnames, str):
fnames = [fnames]
for fname in fnames:
if fname in self._pieces:
self.logger.debug(f"Piece({fname}) existed already, skipping...")
continue
logger_cfg = dict(self.logger_cfg)
logger_name = self.logger.name + '.' + fname.replace('.', '')
logger_cfg['name'] = logger_name
piece = Piece(fname, view=self.get_view(), logger_cfg=logger_cfg, ms=self.ms)
piece.set_view(**{view_name: view for view_name, view in self._views.items() if view_name is not None})
self._pieces[fname] = piece
self.logger_names[fname] = logger_name
if self.metadata_tsv is not None:
try:
fname_col = next(col for col in ('fname', 'fnames', 'name', 'names') if col in self.metadata_tsv.columns)
except StopIteration:
file = self.files[self.metadata_ix]
self.logger.warning(f"Could not attribute metadata to Pieces because {file.rel_path} has no 'fname' column.")
return
metadata_rows = self.metadata_tsv.to_dict(orient='records')
for row in metadata_rows:
fname = row[fname_col]
if pd.isnull(fname):
continue
piece = self.get_piece(fname)
piece._tsv_metadata = row
    def detect_parseable_files(self) -> None:
"""Walks through the corpus_path and collects information on all parseable files."""
score_extensions = ['.' + ext for ext in Score.parseable_formats]
detected_extensions = score_extensions + ['.tsv']
for current_path, subdirs, files in os.walk(self.corpus_path):
subdirs[:] = sorted(sd for sd in subdirs if not sd.startswith('.'))
current_subdir = os.path.relpath(current_path, self.corpus_path)
parseable_files = [(f,) + os.path.splitext(f) for f in files if os.path.splitext(f)[1] in detected_extensions]
for file, file_name, file_ext in sorted(parseable_files):
full_path = os.path.join(current_path, file)
rel_path = os.path.join(current_subdir, file)
file_type = path2type(full_path, logger=self.logger)
F = File(
ix=len(self.files),
                    type=file_type,
file=file,
fname=file_name,
fext=file_ext,
subdir=current_subdir,
corpus_path=self.corpus_path,
rel_path=rel_path,
full_path=full_path,
directory=current_path,
suffix='',
)
self.files.append(F)
    def disambiguate_facet(self, facet: Facet, view_name: Optional[str] = None, ask_for_input=True) -> None:
""" Make sure that, for a given facet, the current view includes only one or zero files. If at least one piece
has more than one file, the user will be asked which ones to use. The others will be excluded from the view.
Args:
            facet: Which facet to disambiguate.
            view_name: Name of the view to disambiguate. Defaults to the active view.
ask_for_input:
By default, if there is anything to disambiguate, the user is asked to select a group of files.
Pass False to see only the questions and choices without actually disambiguating.
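        Example:
            A hedged sketch (the facet name is an example)::

                corpus.disambiguate_facet('expanded')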
"""
assert isinstance(facet, str), f"Let's disambiguate one facet at a time. Received invalid argument {facet}"
assert facet in Facet.__args__, f"'{facet}' is not a valid facet. Choose one of {Facet.__args__}."
disambiguated = {}
no_need = {}
missing = []
all_available_files = []
selected_ixs = []
fname2files = self.get_files(facet, view_name=view_name, choose='all', flat=True, include_empty=False)
        for fname, files in fname2files.items():
            if len(files) == 0:
                missing.append(fname)
            elif len(files) == 1:
                no_need[fname] = files[0]
            else:
                all_available_files.extend(files)
                disamb2file = files2disambiguation_dict(files, include_disambiguator=True)
                disambiguated[fname] = {disamb: file.ix for disamb, file in disamb2file.items()}
if len(missing) > 0:
self.logger.info(f"{len(missing)} files don't come with '{facet}'.")
if len(disambiguated) == 0:
self.logger.info(f"No files require disambiguation for facet '{facet}'.")
return
if len(no_need) > 0:
self.logger.info(f"{len(no_need)} files do not require disambiguation for facet '{facet}'.")
N_target = len(disambiguated)
N_remaining = N_target
print(f"{N_target} pieces require disambiguation for facet '{facet}'.")
df = pd.DataFrame.from_dict(disambiguated, orient='index', dtype='Int64')
n_files_per_disambiguator = df.notna().sum().sort_values(ascending=False)
df = df.loc[:, n_files_per_disambiguator.index]
n_choices_groups = df.notna().sum(axis=1).sort_values()
gpb: Iterator[Tuple[int, pd.DataFrame]] = df.groupby(n_choices_groups) # introduces variable just for type hinting
for n_choices, chunk in gpb:
range_str = f"1-{n_choices}"
chunk = chunk.dropna(axis=1, how='all')
choices_groups = chunk.apply(lambda S: tuple(S.index[S.notna()]), axis=1)
question_gpb = chunk.groupby(choices_groups)
n_questions = question_gpb.ngroups
for i, (choices, piece_group) in enumerate(question_gpb, 1):
N_current = len(piece_group)
choices = dict(enumerate(choices, 1))
piece_group = piece_group.dropna(axis=1, how='all').sort_index(axis=1, key=lambda S: S.str.len())
remaining_string = '' if N_current == N_remaining else f" ({N_remaining} remaining)"
                if N_current == 1:
                    piece = piece_group.index[0]
prompt = f"Choose one of the files (or 0 for none):\n{pretty_dict(choices)}"
choices_representation = f"'{piece}'{remaining_string} by choosing one of the {n_choices} files:\n"
else:
try:
prompt = f"Choose one of the columns (or 0 for none):\n{pretty_dict(choices)}"
except ValueError:
print(choices, piece_group)
raise
choices_representation = f"the following {N_current} pieces{remaining_string} by choosing one of the {n_choices} columns:\n\n{piece_group.to_string()}\n"
question_number = f"({i}/{n_questions}) " if n_questions > 1 else ''
print(f"{question_number} Disambiguation for {choices_representation}")
if ask_for_input:
print(prompt)
query = f"Selection [{range_str}]: "
column = ask_user_to_choose(query, list(choices.values()))
if column is not None:
selected_ixs.extend(piece_group[column].values)
print()
N_remaining -= N_current
if ask_for_input:
excluded_file_paths = [file.full_path for file in all_available_files if file.ix not in selected_ixs]
view = self.get_view(view_name)
view.excluded_file_paths.extend(excluded_file_paths)
return
def extract_facet(self,
facet: ScoreFacet,
view_name: Optional[str] = None,
force: bool = False,
choose: Literal['auto', 'ask'] = 'auto',
unfold: bool = False,
interval_index: bool = False,
concatenate: bool = True,
) -> Union[Dict[str, FileParsedTuple], pd.DataFrame]:
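        """Extract a facet DataFrame from the parsed scores under the given view, one per piece.
        With ``concatenate=True`` (the default), the DataFrames are concatenated into one with a
        ('fname', '<facet>_i') MultiIndex; otherwise a {fname -> (File, DataFrame)} dict is returned.
        """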
view_name_display = self.view_name if view_name is None else view_name
result = {}
for fname, piece in self.iter_pieces(view_name):
file, df = piece.extract_facet(facet=facet,
view_name=view_name,
force=force,
choose=choose,
unfold=unfold,
interval_index=interval_index)
if df is None:
self.logger.info(f"The view '{view_name_display}' does not comprise a score for '{fname}' from which to "
f"extract {facet}.")
else:
result[fname] = df if concatenate else (file, df)
if concatenate:
if len(result) > 0:
df = pd.concat(result.values(), keys=result.keys())
df.index.rename(['fname', f"{facet}_i"], inplace=True)
return df
else:
return pd.DataFrame()
return result
@property
def files_df(self):
if len(self.files) == 0:
return pd.DataFrame()
return pd.DataFrame(self.files).set_index('ix')
def get_changed_scores(self,
view_name: Optional[str] = None,
include_empty: bool = False) -> Dict[str, List[FileScoreTuple]]:
result = {fname: piece.get_changed_scores(view_name) for fname, piece in self.iter_pieces(view_name)}
if include_empty:
return result
return {k: v for k, v in result.items() if len(v) > 0}
    def get_dataframes(self,
notes: bool = False,
rests: bool = False,
notes_and_rests: bool = False,
measures: bool = False,
events: bool = False,
labels: bool = False,
chords: bool = False,
expanded: bool = False,
form_labels: bool = False,
cadences: bool = False,
view_name: Optional[str] = None,
force: bool = False,
choose: Literal['all', 'auto', 'ask'] = 'all',
unfold: bool = False,
interval_index: bool = False,
flat=False,
include_empty: bool = False,
) -> Dict[str, Union[Dict[str, FileDataframeTuple], List[FileDataframeTuple]]]:
"""Renamed to :meth:`get_facets`."""
        args = locals()
        facets = [facet for facet in ScoreFacet.__args__ if args[facet]]
return self.get_facets(facets=facets,
view_name=view_name,
force=force,
choose=choose,
unfold=unfold,
interval_index=interval_index,
flat=flat,
include_empty=include_empty)
def get_facet(self,
facet: ScoreFacet,
view_name: Optional[str] = None,
choose: Literal['auto', 'ask'] = 'auto',
unfold: bool = False,
interval_index: bool = False,
concatenate: bool = True,
) -> Union[Dict[str, FileDataframeTuple], pd.DataFrame]:
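        """Retrieve a facet DataFrame for each piece under the given view, from a parsed score
        or TSV file. With ``concatenate=True`` (the default), the DataFrames are concatenated into
        one with a ('fname', '<facet>_i') MultiIndex; otherwise a {fname -> (File, DataFrame)} dict
        is returned.
        """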
view_name_display = self.view_name if view_name is None else view_name
result = {}
for fname, piece in self.iter_pieces(view_name):
file, df = piece.get_facet(facet=facet,
view_name=view_name,
choose=choose,
unfold=unfold,
interval_index=interval_index)
if df is None:
self.logger.info(f"The view '{view_name_display}' does not comprise a score or TSV file for '{fname}' from which to "
f"retrieve {facet}.")
else:
result[fname] = df if concatenate else (file, df)
if concatenate:
if len(result) > 0:
df = pd.concat(result.values(), keys=result.keys())
df.index.rename(['fname', f"{facet}_i"], inplace=True)
return df
else:
return pd.DataFrame()
return result
    def get_facets(self,
facets: ScoreFacets = None,
view_name: Optional[str] = None,
force: bool = False,
choose: Literal['all', 'auto', 'ask'] = 'all',
unfold: bool = False,
interval_index: bool = False,
flat=False,
include_empty: bool = False,
) -> Dict[str, Union[Dict[str, FileDataframeTuple], List[FileDataframeTuple]]]:
"""
Args:
            facets: One or several facets to retrieve, e.g. 'measures' or ['notes', 'expanded'].
            view_name: Name of the view to use. Defaults to the active view.
            force:
                Only relevant when ``choose='all'``. By default, only scores and TSV files that have already been
                parsed are taken into account. Set ``force=True`` to force-parse all scores and TSV files selected
                under the given view.
            choose: Whether to retrieve 'all' files, pick automatically ('auto'), or 'ask' for user disambiguation.
            unfold: Pass True to unfold repeats in the returned DataFrames.
            interval_index: Pass True to replace the default index with an :obj:`pandas.IntervalIndex`.
            flat: Pass True to get one flat list of (File, DataFrame) tuples per piece rather than one dict per facet.
            include_empty: Pass True to keep empty entries in the result.
        Returns:
            {fname -> {facet -> [(File, DataFrame)]}} dict, or {fname -> [(File, DataFrame)]} if ``flat=True``.
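        Example:
            A hedged sketch (facet names are examples)::

                fname2facets = corpus.get_facets(['measures', 'notes'], flat=True)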
"""
selected_facets = resolve_facets_param(facets, ScoreFacet, logger=self.logger)
if choose == 'ask':
for facet in selected_facets:
self.disambiguate_facet(facet, ask_for_input=True)
choose = 'auto'
if force:
self.check_number_of_unparsed_scores(view_name, choose)
result = {}
for fname, piece in self.iter_pieces(view_name=view_name):
facet2parsed = piece.get_facets(facets=selected_facets,
view_name=view_name,
force=force,
choose=choose,
unfold=unfold,
interval_index=interval_index,
flat=flat,
)
if not include_empty:
if not flat:
facet2parsed = {facet: parsed for facet, parsed in facet2parsed.items() if len(parsed) > 0}
if len(facet2parsed) == 0:
continue
result[piece.name] = facet2parsed
return result
def check_number_of_unparsed_scores(self, view_name, choose):
tmp_choose = 'auto' if choose == 'ask' else choose
unparsed_files = self.get_files('scores', view_name=view_name, parsed=False, choose=tmp_choose, flat=True)
n_unparsed = len(sum(unparsed_files.values(), []))
if n_unparsed > 10:
self.logger.warning(f"You have set force=True, which forces me to parse {n_unparsed} scores iteratively. "
f"Next time, call _.parse() on me, so we can speed this up!")
def get_file_from_path(self, full_path: str) -> Optional[File]:
for file in self.files:
if file.full_path == full_path:
return file
def get_files(self, facets: FacetArguments = None,
view_name: Optional[str] = None,
parsed: bool = True,
unparsed: bool = True,
choose: Literal['all', 'auto', 'ask'] = 'all',
flat: bool = False,
include_empty = False,
) -> Dict[str, Union[Dict[str, FileList], FileList]]:
""""""
assert parsed + unparsed > 0, "At least one of 'parsed' and 'unparsed' needs to be True."
selected_facets = resolve_facets_param(facets, logger=self.logger)
if choose == 'ask':
for facet in selected_facets:
self.disambiguate_facet(facet, ask_for_input=True)
choose = 'auto'
result = {}
for fname, piece in self.iter_pieces(view_name=view_name):
            facet2files = piece.get_files(facets=selected_facets,
view_name=view_name,
parsed=parsed,
unparsed=unparsed,
choose=choose,
flat=flat,
include_empty=include_empty)
if not include_empty:
if not flat:
facet2files = {facet: files for facet, files in facet2files.items() if len(files) > 0}
if len(facet2files) == 0:
continue
            result[piece.name] = facet2files
return result
def get_all_parsed(self, facets: FacetArguments = None,
view_name: Optional[str] = None,
force: bool = False,
choose: Literal['all', 'auto', 'ask'] = 'all',
flat: bool = False,
include_empty=False,
) -> Dict[str, Union[Dict[str, List[FileParsedTuple]], List[FileParsedTuple]]]:
""""""
selected_facets = resolve_facets_param(facets, logger=self.logger)
if choose == 'ask':
for facet in selected_facets:
self.disambiguate_facet(facet, ask_for_input=True)
choose = 'auto'
if force:
self.check_number_of_unparsed_scores(view_name, choose)
result = {}
for fname, piece in self.iter_pieces(view_name=view_name):
facet2parsed = piece.get_all_parsed(facets=selected_facets,
view_name=view_name,
force=force,
choose=choose,
flat=flat,
include_empty=include_empty)
if not include_empty:
if not flat:
facet2parsed = {facet: parsed for facet, parsed in facet2parsed.items() if len(parsed) > 0}
if len(facet2parsed) == 0:
continue
result[piece.name] = facet2parsed
return result
    def get_all_fnames(self,
fnames_in_metadata: bool = True,
fnames_not_in_metadata: bool = True) -> List[str]:
""" fnames (file names without extension and suffix) serve as IDs for pieces. Use
this function to retrieve the comprehensive list, ignoring views.
Args:
fnames_in_metadata: fnames that are listed in the 'metadata.tsv' file for this corpus, if present
fnames_not_in_metadata: fnames that are not listed in the 'metadata.tsv' file for this corpus
Returns:
The file names included in 'metadata.tsv' and/or those of all other scores.
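        Example:
            A minimal sketch::

                ids_in_metadata = corpus.get_all_fnames(fnames_not_in_metadata=False)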
"""
result = []
if fnames_in_metadata:
result.extend(self.fnames_in_metadata(self.metadata_ix))
if fnames_not_in_metadata:
result.extend(self.fnames_not_in_metadata())
return sorted(result)
    def get_fnames(self, view_name: Optional[str] = None) -> List[str]:
"""Retrieve fnames included in the current or selected view."""
return [fname for fname, _ in self.iter_pieces(view_name)]
def get_present_facets(self, view_name: Optional[str] = None):
view = self.get_view(view_name)
selected_fnames = []
if view.fnames_in_metadata:
selected_fnames.extend(self.fnames_in_metadata(self.metadata_ix))
if view.fnames_not_in_metadata:
selected_fnames.extend(self.fnames_not_in_metadata())
result = set()
        for fname, piece in self:
            if fname not in selected_fnames:
                continue
            detected_facets = piece.count_detected(include_empty=False, prefix=False)
            result.update(detected_facets.keys())
return list(result)
    def get_view(self,
view_name: Optional[str] = None,
**config
) -> View:
"""Retrieve an existing or create a new View object, potentially while updating the config."""
if view_name in self._views:
view = self._views[view_name]
elif view_name is not None and self._views[None].name == view_name:
view = self._views[None]
else:
view = self.get_view().copy(new_name=view_name)
self._views[view_name] = view
self.logger.info(f"New view '{view_name}' created.")
if len(config) > 0:
view.update_config(**config)
return view
def info(self, view_name: Optional[str] = None, return_str: bool = False, show_discarded: bool = False):
""""""
header = f"Corpus '{self.name}'"
header += "\n" + "-" * len(header) + "\n"
header += f"Location: {self.corpus_path}\n"
# get parsed scores before resetting the view's filtering counts to prevent counting twice
parsed_scores = self.get_all_parsed('scores', view_name=view_name, flat=True)
# start info message with the names of the available views, the header, and info on the active view.
view = self.get_view(view_name)
view.reset_filtering_data()
msg = available_views2str(self._views, view_name)
msg += header
view_info = f"View: {view}"
if view_name is None:
piece_views = [piece.get_view().name for _, piece in self.iter_pieces(view_name=view_name)]
if len(set(piece_views)) > 1:
view_info = f"This is a mixed view. Call _.info(view_name) to see a homogeneous one."
msg += view_info + "\n\n"
# a table counting the number of parseable files under the active view
counts_df = self.count_files(drop_zero=True, view_name=view_name)
if counts_df.isna().any().any():
counts_df = counts_df.fillna(0).astype('int')
n_pieces = len(counts_df)
if n_pieces == 0:
if self.metadata_tsv is None:
msg += "No 'metadata.tsv' file is present.\n\n"
msg += "All pieces are excluded from the current view.\n\n"
elif self.metadata_tsv is None:
msg += f"No 'metadata.tsv' file is present. Use _.create_metadata_tsv() to create one for these " \
f"{n_pieces} pieces.\n\n"
msg += counts_df.to_string()
else:
metadata_fnames = set(self.fnames_in_metadata(self.metadata_ix))
included_selector = counts_df.index.isin(metadata_fnames)
if included_selector.all():
msg += f"All {n_pieces} pieces are listed in 'metadata.tsv':\n\n"
msg += counts_df.to_string()
elif not included_selector.any():
msg = f"None of the {n_pieces} pieces is actually listed in 'metadata.tsv'.\n\n"
msg += counts_df.to_string()
else:
msg += f"Only the following {included_selector.sum()} pieces are listed in 'metadata.tsv':\n\n"
msg += counts_df[included_selector].to_string()
not_included = ~included_selector
plural = f"These {not_included.sum()} here are" if not_included.sum() > 1 else "This one is"
msg += f"\n\n{plural} missing from 'metadata.tsv':\n\n"
msg += counts_df[not_included].to_string()
n_changed_scores = 0
detached_key_counter = Counter()
for fname, tuples in parsed_scores.items():
for file, score in tuples:
n_changed_scores += score.mscx.changed
detached_key_counter.update(score._detached_annotations.keys())
has_changed = n_changed_scores > 0
has_detached = len(detached_key_counter) > 0
if has_changed or has_detached:
msg += "\n\n"
if has_changed:
plural = "s have" if n_changed_scores > 1 else " has"
msg += f"{n_changed_scores} score{plural} changed since parsing."
if has_detached:
msg += pretty_dict(detached_key_counter, "key", "#scores")
filtering_report = view.filtering_report(show_discarded=show_discarded, return_str=True)
if filtering_report != '':
msg += '\n' + filtering_report
if self.n_orphans > 0:
msg += f"\n\nThe corpus contains {self.n_orphans} orphans that could not be attributed to any of the fnames"
if show_discarded:
msg += f":\n{list(self.ix2orphan_file.values())}"
else:
msg += "."
if return_str:
return msg
print(msg)
def iter_facet(self,
facet: ScoreFacet,
view_name: Optional[str] = None,
choose: Literal['auto', 'ask'] = 'auto',
unfold: bool = False,
interval_index: bool = False) -> Iterator[FileDataframeTuple]:
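        """Iterate over one (File, DataFrame) tuple per piece that has the requested facet under
        the given view; pieces without it are skipped with an INFO message.
        """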
view_name_display = self.view_name if view_name is None else view_name
for fname, piece in self.iter_pieces(view_name):
file, df = piece.get_facet(facet=facet,
view_name=view_name,
choose=choose,
unfold=unfold,
interval_index=interval_index)
if df is None:
self.logger.info(f"The view '{view_name_display}' does not comprise a score or TSV file for '{fname}' from which to "
f"retrieve {facet}.")
else:
yield file, df
    def iter_facets(self,
facets: ScoreFacets = None,
view_name: Optional[str] = None,
choose: Literal['auto', 'ask'] = 'auto',
unfold: bool = False,
interval_index: bool = False,
include_files: bool = False,
) -> Iterator:
""" Iterate through (fname, *DataFrame) tuples containing exactly one or zero DataFrames per requested facet.
Args:
            facets: One or several facets to retrieve per piece.
            view_name: Name of the view to use. Defaults to the active view.
            choose: Whether to pick the files automatically ('auto') or 'ask' for user disambiguation.
            unfold: Pass True to unfold repeats in the yielded DataFrames.
            interval_index: Pass True to replace the default index with an :obj:`pandas.IntervalIndex`.
            include_files: Pass True to yield (File, DataFrame) tuples instead of plain DataFrames.
Returns:
(fname, *DataFrame) tuples containing exactly one or zero DataFrames per requested facet per piece (fname).
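        Example:
            A hedged sketch (facet names are examples; tuples may be shorter for pieces missing a facet)::

                for fname, *dfs in corpus.iter_facets(['measures', 'notes']):
                    print(fname, [len(df) for df in dfs])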
"""
selected_facets = resolve_facets_param(facets, ScoreFacet, logger=self.logger)
if choose == 'ask':
for facet in selected_facets:
self.disambiguate_facet(facet, ask_for_input=True)
choose = 'auto'
for fname, piece in self.iter_pieces(view_name=view_name):
facet2parsed = piece.get_facets(facets=selected_facets,
view_name=view_name,
force=True,
choose=choose,
unfold=unfold,
interval_index=interval_index,
flat=False,
)
if include_files:
result = [tup for facet in selected_facets for tup in facet2parsed[facet]]
else:
result = [df for facet in selected_facets for file, df in facet2parsed[facet]]
yield (fname, *result)
def iter_parsed(self,
facet: Facet = None,
view_name: Optional[str] = None,
force: bool = False,
choose: Literal['all', 'auto', 'ask'] = 'all',
include_empty=False,
unfold: bool = False,
interval_index: bool = False,
) -> Iterator[FileParsedTuple]:
""""""
facet = check_argument_against_literal_type(facet, Facet, logger=self.logger)
assert facet is not None, f"Pass a valid facet {Facet.__args__}"
if choose == 'ask':
self.disambiguate_facet(facet, ask_for_input=True)
choose = 'auto'
if force:
self.check_number_of_unparsed_scores(view_name, choose)
for fname, piece in self.iter_pieces(view_name=view_name):
for file, parsed in piece.iter_parsed(facet=facet,
view_name=view_name,
force=force,
choose=choose,
include_empty=include_empty,
unfold=unfold,
interval_index=interval_index):
yield file, parsed
    def iter_pieces(self, view_name: Optional[str] = None) -> Iterator[Tuple[str, Piece]]:
        """Iterate through (fname, Piece) tuples under the current or specified view."""
view = self.get_view(view_name)
view.reset_filtering_data(categories='fnames')
param_sum = view.fnames_in_metadata + view.fnames_not_in_metadata
if param_sum == 0:
# all excluded, need to update filter counts accordingly
key = 'fnames'
discarded_items, *_ = list(zip(*view.filter_by_token('fnames', self)))
view._discarded_items[key].update(discarded_items)
filtering_counts = view._last_filtering_counts[key]
filtering_counts[[0,1]] = filtering_counts[[1, 0]] # swapping counts for included & discarded in the array
yield from []
else:
n_kept, n_discarded = 0, 0
discarded_items = []
differentiate_by_presence_in_metadata = param_sum == 1
if differentiate_by_presence_in_metadata:
selected_fnames = self.fnames_in_metadata(self.metadata_ix) if view.fnames_in_metadata else self.fnames_not_in_metadata()
filter_incomplete_facets = not view.fnames_with_incomplete_facets
if filter_incomplete_facets:
selected_facets = set(self.get_present_facets(view_name))
selected_facets = selected_facets.intersection(set(view.selected_facets))
selected_facets = tuple(selected_facets)
for fname, piece in view.filter_by_token('fnames', self):
metadata_check = not differentiate_by_presence_in_metadata or fname in selected_fnames
facet_check = not filter_incomplete_facets or piece.all_facets_present(view_name=view_name,
selected_facets=selected_facets)
if metadata_check and facet_check:
if view_name not in piece._views:
if view_name is None:
piece.set_view(view)
else:
piece.set_view(**{view_name: view})
yield fname, piece
                else:
                    discarded_items.append(fname)
                    # filter_by_token() has already counted this fname as kept; move it over to discarded
                    n_kept -= 1
                    n_discarded += 1
key = 'fnames'
view._last_filtering_counts[key] += np.array([n_kept, n_discarded, 0], dtype='int')
view._discarded_items[key].update(discarded_items)
    def load_facet_into_scores(self,
facet: AnnotationsFacet,
view_name: Optional[str] = None,
force: bool = False,
choose: Literal['auto', 'ask'] = 'auto',
git_revision: Optional[str] = None,
key: str = 'detached',
infer: bool = True,
**cols) -> int:
"""Loads annotations from maximum one TSV file to maximum one score per piece. Each score will contain the
annotations as a 'detached' annotation object accessible via the indicated ``key`` (defaults to 'detached').
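        Example:
            A hedged sketch (facet and key are examples)::

                corpus.load_facet_into_scores('expanded', key='old_labels')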
"""
facet = check_argument_against_literal_type(facet, AnnotationsFacet, logger=self.logger)
assert facet is not None, f"Pass a valid facet {AnnotationsFacet.__args__}"
assert choose != 'all', "Only one set of annotations can be added under a given key."
if git_revision is None:
fname2tuples = self.get_all_parsed(facets=facet,
view_name=view_name,
force=force,
choose=choose,
flat=True,
)
fname2tuple = {fname: tuples[0] for fname, tuples in fname2tuples.items()}
revision_str = ''
else:
fname2tuple = self.get_facet_at_git_revision(facet=facet,
git_revision=git_revision,
view_name=view_name,
choose=choose)
revision_str = f" @ git revision {git_revision}"
n_pieces = len(fname2tuple)
if n_pieces == 0:
self.logger.debug(f"No parsed '{facet}' TSV files found{revision_str}.")
else:
plural = 's' if n_pieces > 1 else ''
self.logger.debug(f"Parsed '{facet}' file{revision_str} added to {n_pieces} piece{plural}.")
for fname, (file, df) in fname2tuple.items():
self[fname].load_annotation_table_into_score(df=df,
view_name=view_name,
choose=choose,
key=key,
infer=infer,
**cols)
return n_pieces
    def look_for_ignored_warnings(self, directory: Optional[str] = None):
"""Looks for a text file called IGNORED_WARNINGS and, if it exists, loads it, configuring loggers as indicated."""
if directory is None:
directory = self.corpus_path
default_ignored_warnings_path = os.path.join(directory, 'IGNORED_WARNINGS')
if os.path.isfile(default_ignored_warnings_path):
loggers, unsuccessful = self.load_ignored_warnings(default_ignored_warnings_path)
n_lgrs, n_unscfl = len(loggers), len(unsuccessful)
lgrs_plural = f"{n_lgrs} logger has" if n_lgrs == 1 else f"{n_lgrs} loggers have"
if n_unscfl == 0:
uscfl_plural = ''
else:
uscfl_plural = f", {n_unscfl} hasn't" if n_unscfl == 1 else f", {n_unscfl} haven't"
self.logger.info(f"{lgrs_plural} successfully been configured{uscfl_plural}.")
    def load_ignored_warnings(self, path: str) -> Tuple[List[str], List[str]]:
"""
Loads in a text file containing warnings that are to be ignored, i.e., wrapped in DEBUG messages. The purpose
is to mark certain warnings as OK, warranted by a human, to allow checks to pass regardless.
"""
try:
ignored_warnings = parse_ignored_warnings_file(path)
except ValueError as e:
self.logger.warning(e)
            return [], []
self.logger.debug(f"Ignored warnings contained in {path}:\n{ignored_warnings}")
logger_names = set(self.logger_names.values())
filtered_loggers = []
unsuccessful = []
for name, message_ids in ignored_warnings.items():
normalized_name = normalize_logger_name(name)
to_be_configured = [logger_name for logger_name in logger_names if normalized_name in logger_name]
if len(to_be_configured) == 0:
self.logger.warning(f"None of the logger names contains '{normalized_name}', which is the normalized name for loggers supposed to ignore "
f"warnings with message IDs {message_ids}")
unsuccessful.append(name)
continue
for name_to_configure in to_be_configured:
filtered_loggers.append(name_to_configure)
self._ignored_warnings[name_to_configure].extend(message_ids)
configured = get_logger(name_to_configure, ignored_warnings=message_ids, level=self.logger.getEffectiveLevel())
configured.debug(f"This logger has been configured to set warnings with the following IDs to DEBUG:\n{message_ids}.")
return filtered_loggers, unsuccessful
    def parse(self, view_name=None, level=None, parallel=True, only_new=True, labels_cfg={}, cols={}, infer_types=None, **kwargs):
        """ Shorthand for executing :meth:`parse_scores` and :meth:`parse_tsv` in one go.
        Args:
            view_name: Name of the view under which to parse. Defaults to the active view.
        """
self.parse_scores(level=level, parallel=parallel, only_new=only_new, labels_cfg=labels_cfg, view_name=view_name)
self.parse_tsv(view_name=view_name, level=level, cols=cols, infer_types=infer_types, only_new=only_new, **kwargs)
    def parse_mscx(self, *args, **kwargs):
"""Renamed to :meth:`parse_scores`."""
self.parse_scores(*args, **kwargs)
    def parse_scores(self,
level: str = None,
parallel: bool = True,
only_new: bool = True,
labels_cfg: dict = {},
view_name: str = None,
choose: Literal['all', 'auto', 'ask'] = 'all'):
""" Parse MuseScore 3 files (MSCX or MSCZ) and store the resulting read-only Score objects. If they need
to be writeable, e.g. for removing or adding labels, pass ``parallel=False`` which takes longer but prevents
having to re-parse at a later point.
Parameters
----------
level : {'W', 'D', 'I', 'E', 'C', 'WARNING', 'DEBUG', 'INFO', 'ERROR', 'CRITICAL'}, optional
Pass a level name for which (and above which) you want to see log records.
        parallel : :obj:`bool`, optional
            Defaults to True, meaning that all CPU cores are used simultaneously to speed up the parsing. It implies
            that the resulting Score objects are in read-only mode and that you might not be able to use the computer
            during parsing. Set to False to parse one score after the other, which takes longer but allows
            making changes to the scores.
        only_new : :obj:`bool`, optional
            By default, scores that have already been parsed are not parsed again. Pass False to parse them, too.
Returns
-------
None
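        Example
        -------
        A minimal sketch::

            corpus.parse_scores(parallel=False, level='INFO')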
"""
if level is not None:
self.change_logger_cfg(level=level)
self.labels_cfg.update(update_labels_cfg(labels_cfg, logger=self.logger))
self.logger.debug(f"Parsing scores with parameters parallel={parallel}, only_new={only_new}")
fname2files = self.get_files('scores', view_name=view_name, parsed=not only_new, choose=choose, flat=True)
selected_files = sum(fname2files.values(), start=[])
target = len(selected_files)
if target == 0:
self.logger.debug(f"Nothing to parse.")
return
selected_scores_df = pd.concat([pd.DataFrame(files) for files in fname2files.values()], keys=fname2files.keys())
self.logger.debug(f"Selected scores to parse:\n{selected_scores_df.set_index('ix')['rel_path'].to_string()}")
exts = selected_scores_df.fext.value_counts()
        needs_conversion = [ext for ext in exts.index if ext[1:].lower() in Score.convertible_formats]
        if len(needs_conversion) > 0 and parallel:
msg = f"The file extensions [{', '.join(needs_conversion)}] " \
f"require temporary conversion with your local MuseScore, which is not possible with parallel " \
f"processing. Parse with parallel=False or set View.include_convertible=False."
if self.ms is None:
msg += "\nIn case you want to temporarily convert these files, you will also have to set the " \
"property ms of this object to the path of your MuseScore 3 executable."
selected_files = [file for file in selected_files if file.fext not in needs_conversion]
msg += f"\nAfter filtering out the files, the target was reduced from N={target} to {len(selected_files)}."
target = len(selected_files)
self.logger.warning(msg)
if target == 0:
return
configs = [dict(
name=self.logger_names[ix],
) for ix in selected_scores_df.ix]
        # collect argument tuples for calling parse_musescore_file
parse_this = [(file, self.logger, conf, parallel, self.ms) for file, conf in zip(selected_files, configs)]
try:
if parallel:
pool = mp.Pool(mp.cpu_count())
res = pool.starmap(parse_musescore_file, parse_this)
pool.close()
pool.join()
successful_results = {file.ix: score for file, score in zip(selected_files, res) if score is not None}
with_captured_logs = [score for score in successful_results.values() if hasattr(score, 'captured_logs')]
if len(with_captured_logs) > 0:
log_capture_handler = get_log_capture_handler(self.logger)
if log_capture_handler is not None:
for score in with_captured_logs:
log_capture_handler.log_queue.extend(score.captured_logs)
else:
parsing_results = [parse_musescore_file(*params) for params in parse_this]
successful_results = {file.ix: score for file, score in zip(selected_files, parsing_results) if score is not None}
successful = len(successful_results)
for ix, score in successful_results.items():
self.get_piece(self.ix2fname[ix]).add_parsed_score(ix, score)
if successful > 0:
if successful == target:
quantifier = f"The file has" if target == 1 else f"All {target} files have"
self.logger.info(f"{quantifier} been parsed successfully.")
else:
self.logger.info(f"Only {successful} of the {target} files have been parsed successfully.")
else:
self.logger.info(f"None of the {target} files have been parsed successfully.")
except KeyboardInterrupt:
self.logger.info("Parsing interrupted by user.")
raise
finally:
#self._collect_annotations_objects_references(ids=ids)
pass
    def parse_tsv(self,
view_name: Optional[str] = None,
cols={},
infer_types=None,
level=None,
only_new: bool = True,
choose: Literal['all', 'auto', 'ask'] = 'all',
**kwargs):
""" Parse TSV files to be able to do something with them.
Parameters
----------
        view_name : :obj:`str`, optional
            Name of the view under which to parse. Defaults to the active view.
        only_new : :obj:`bool`, optional
            By default, TSV files that have already been parsed are not parsed again. Pass False to parse them, too.
        choose : {'all', 'auto', 'ask'}, optional
            Whether to parse 'all' selected files, pick automatically ('auto'), or 'ask' for user disambiguation.
cols : :obj:`dict`, optional
By default, if a column called ``'label'`` is found, the TSV is treated as an annotation table and turned into
an Annotations object. Pass one or several column name(s) to treat *them* as label columns instead. If you
pass ``{}`` or no label column is found, the TSV is parsed as a "normal" table, i.e. a DataFrame.
infer_types : :obj:`dict`, optional
To recognize one or several custom label type(s), pass ``{name: regEx}``.
level : {'W', 'D', 'I', 'E', 'C', 'WARNING', 'DEBUG', 'INFO', 'ERROR', 'CRITICAL'}, optional
Pass a level name for which (and above which) you want to see log records.
        **kwargs:
            Keyword arguments passed to :py:func:`load_tsv`, e.g. ``sep`` to override the delimiter. In particular,
            you might want to update the default dictionaries for ``dtypes`` and ``converters`` used in :py:func:`load_tsv`.
            Passing kwargs prevents ms3 from parsing TSVs in parallel, so it will be a bit slower.
Returns
-------
None
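        Example
        -------
        A minimal sketch::

            corpus.parse_tsv(level='INFO')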
"""
if level is not None:
self.change_logger_cfg(level=level)
fname2files = self.get_files('tsv', view_name=view_name, parsed=not only_new, choose=choose, flat=True)
selected_files = sum(fname2files.values(), start=[])
target = len(selected_files)
if target == 0:
self.logger.debug(f"Nothing to parse.")
return
parse_this = [(file, get_logger(self.ix_logger(file.ix))) for file in selected_files]
if len(kwargs) == 0:
pool = mp.Pool(mp.cpu_count())
parsing_results = pool.starmap(parse_tsv_file, parse_this)
pool.close()
pool.join()
successful_results = {file.ix: df for file, df in zip(selected_files, parsing_results) if df is not None}
        else:
            parsing_results = []
for file, logger in parse_this:
logger.debug(f"Trying to load {file.rel_path}")
try:
df = load_tsv(file.full_path, logger=logger, **kwargs)
parsing_results.append(df)
except Exception as e:
parsing_results.append(None)
logger.info(f"Couldn't be loaded, probably no tabular format or you need to specify 'sep', the delimiter as **kwargs."
f"\n{file.rel_path}\nError: {e}")
            successful_results = {file.ix: df for file, df in zip(selected_files, parsing_results) if df is not None}
successful = len(successful_results)
for ix, df in successful_results.items():
self.get_piece(self.ix2fname[ix]).add_parsed_tsv(ix, df)
if successful > 0:
            if successful == target:
                quantifier = "The file has" if target == 1 else f"All {target} files have"
                self.logger.info(f"{quantifier} been parsed successfully.")
else:
self.logger.info(f"Only {successful} of the {target} files have been parsed successfully.")
else:
self.logger.info(f"None of the {target} files have been parsed successfully.")
def register_files_with_pieces(self, files: Optional[FileList] = None, fnames: Union[Collection[str], str] = None) -> None:
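        """Match the given files (by default, all detected ones) against the given fnames
        (by default, all known piece names) and register each file with the corresponding
        :obj:`Piece`. Unmatched files with 'metadata' in their name are loaded as metadata
        files; all other unmatched files are stored as orphans.
        """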
if fnames is None:
fnames = self.get_all_fnames()
elif isinstance(fnames, str):
fnames = [fnames]
fnames = sorted(fnames, key=len, reverse=True)
# sort so that longest fnames come first for preventing errors when detecting suffixes
if len(fnames) == 0:
self.logger.info(f"Corpus contains neither scores nor metadata.")
return
if files is None:
files = self.files
for file in files:
# try to associate all detected files with one of the created pieces and
# store the mapping in :attr:`ix2fname`
piece_name = None
if file.fname in fnames:
piece_name = file.fname
else:
for fname in fnames:
if file.fname.startswith(fname):
piece_name = fname
break
if piece_name is None:
if 'metadata' in file.fname:
self.load_metadata_file(file)
else:
self.logger.debug(f"Could not associate {file.file} with any of the pieces. Stored as orphan.")
self.ix2orphan_file[file.ix] = file
else:
piece = self.get_piece(piece_name)
registration_result = piece.register_file(file)
if registration_result is None:
self.logger.debug(f"Skipping '{file.rel_path}' because it had already been registered with Piece('{piece_name}').")
elif registration_result:
self.ix2fname[file.ix] = piece_name
self.logger_names[file.ix] = piece.logger.name
else:
self.logger.warning(f"Stored '{file.rel_path}' as orphan because it could not be registered with Piece('{piece_name}')")
self.ix2orphan_file[file.ix] = file
def score_metadata(self,
view_name: Optional[str] = None,
choose: Literal['auto', 'ask'] = 'auto'):
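        """Gather the metadata extracted from all parsed scores under the given view and return
        them as a DataFrame indexed by fname."""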
fnames, rows = [], []
for fname, piece in self.iter_pieces(view_name=view_name):
try:
row = piece.score_metadata(view_name=view_name, choose=choose)
if row is not None:
rows.append(row)
fnames.append(fname)
except ValueError:
print(fname)
print(piece.score_metadata())
raise
if len(rows) > 0:
df = pd.DataFrame(rows).set_index('fname')
return column_order(df, METADATA_COLUMN_ORDER, sort=False).sort_index()
return pd.DataFrame()
    def set_view(self, active: View = None, **views: View):
"""Register one or several view_name=View pairs."""
if active is not None:
new_name = active.name
if new_name in self._views and active != self._views[new_name]:
self.logger.info(f"The existing view called '{new_name}' has been overwritten")
                del self._views[new_name]
old_view = self._views[None]
self._views[old_view.name] = old_view
self._views[None] = active
for view_name, view in views.items():
if view.name != view_name:
view.name = view_name
self._views[view_name] = view
for fname, piece in self:
if active is not None:
piece.set_view(active)
for view_name, view in views.items():
piece.set_view(**{view_name:view})
def switch_view(self, view_name: str,
show_info: bool = True,
propagate: bool = True,
) -> None:
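        """Make the view called ``view_name`` the active view, by default propagating it to all
        pieces and displaying _.info()."""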
if view_name is None:
return
new_view = self.get_view(view_name)
old_view = self.get_view()
if old_view.name is not None:
self._views[old_view.name] = old_view
self._views[None] = new_view
new_name = new_view.name
if new_name in self._views:
            del self._views[new_name]
if propagate:
for fname, piece in self:
active_view = piece.get_view()
if active_view.name != new_name or active_view != new_view:
piece.set_view(new_view)
if show_info:
self.info()
@property
def view(self):
return self.get_view()
@view.setter
def view(self, new_view: View):
        if not isinstance(new_view, View):
            raise TypeError("If you want to switch to an existing view by its name, use its name like an attribute or "
                            "call _.switch_view().")
self.set_view(new_view)
@property
def views(self):
print(pretty_dict({"[active]" if k is None else k: v for k, v in self._views.items()}, "view_name", "Description"))
@property
def view_name(self):
return self.get_view().name
@view_name.setter
def view_name(self, new_name):
view = self.get_view()
view.name = new_name
@property
def view_names(self):
return {view.name if name is None else name for name, view in self._views.items()}
def update_labels(self,
staff: Optional[int] = None,
voice: Optional[Literal[1, 2, 3, 4]] = None,
harmony_layer: Optional[Literal[0, 1, 2, 3]] = None,
above: bool = False,
safe: bool = True) -> FileList:
"""Move all labels of all parsed scores to the given layer and return the files of the scores that were altered."""
altered_files = []
for fname, piece in self.iter_pieces():
for file, score in piece.iter_parsed('scores'):
successfully_moved = score.move_labels_to_layer(staff=staff,
voice=voice,
harmony_layer=harmony_layer,
above=above,
safe=safe)
if successfully_moved:
altered_files.append(file)
return altered_files
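# Usage sketch (parameter values are illustrative): move every label in the corpus
# to staff 2, voice 1, into the Roman numeral layer, then store the changed scores.
#     altered = corpus.update_labels(staff=2, voice=1, harmony_layer=1)
#     if altered:
#         corpus.store_parsed_scores(only_changed=True)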
def update_scores(self,
root_dir: Optional[str] = None,
folder: Optional[str] = '.',
suffix: str = '',
overwrite: bool = False) -> List[str]:
""" Update scores created with an older MuseScore version to the latest MuseScore 3 version.
Args:
root_dir:
In case you want to create output paths for the updated MuseScore files based on a folder different
from :attr:`corpus_path`.
folder:
* The default '.' has the updated scores written to the same directory as the old ones, effectively
overwriting them if ``root_dir`` is None.
* If ``folder`` is None, the files will be written to ``{root_dir}/scores/``.
* If ``folder`` is an absolute path, ``root_dir`` will be ignored.
* If ``folder`` is a relative path starting with a dot ``.``, the relative path is appended to the file's subdir.
For example, ``../scores`` will resolve to a sibling directory of the one where the ``file`` is located.
* If ``folder`` is a relative path that does not begin with a dot ``.``, it will be appended to the
``root_dir``.
suffix: String to append to the file names of the updated files, e.g. '_updated'.
overwrite: By default, existing files are not overwritten. Pass True to allow this.
Returns:
A list of all up-to-date paths, whether they had to be converted or were already in the latest version.
"""
assert self.ms is not None, "Set the attribute 'ms' to your MuseScore 3 executable to update scores."
up2date_paths = []
# compare version parts numerically so that, e.g., 3.10 ranks above 3.9
latest_version = tuple(int(part) for part in LATEST_MUSESCORE_VERSION.split('.'))
for fname, piece in self.iter_pieces():
for file, score in piece.iter_parsed('scores'):
logger = self.ix_logger(file.ix)
new_path = make_file_path(file, root_dir=root_dir, folder=folder, suffix=suffix, fext='.mscx')
up2date_paths.append(new_path)
updated_existed = os.path.isfile(new_path)
if updated_existed:
if not overwrite:
logger.info(f"Skipped updating {file.rel_path} because the target file exists already and overwrite=False: {new_path}")
continue
else:
if new_path == file.full_path:
updated_file = file
else:
updated_file = self.get_file_from_path(new_path)
score_version = tuple(int(part) for part in score.mscx.metadata['musescore'].split('.'))
if score_version < latest_version:
convert(file.full_path, new_path, self.ms, logger=logger)
if not updated_existed:
new_files = self.add_file_paths([new_path])
updated_file = new_files[0]
new_score = Score(new_path)
piece.add_parsed_score(updated_file.ix, new_score)
compare_two_score_objects(score, new_score, logger=logger)
else:
updated_file = file
logger.debug(f"{file.rel_path} has version {LATEST_MUSESCORE_VERSION} already.")
return up2date_paths
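# Usage sketch (assumes a local MuseScore 3 installation; 'auto' works for standard
# install paths, see the ``ms`` parameter of Corpus):
#     corpus = Corpus('~/my_corpus', ms='auto')
#     corpus.parse_scores()
#     paths = corpus.update_scores(suffix='_updated', overwrite=True)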
def update_tsvs_on_disk(self,
facets: ScoreFacets = 'tsv',
view_name: Optional[str] = None,
force: bool = False,
choose: Literal['auto', 'ask'] = 'auto',
) -> List[str]:
"""
Update existing TSV files corresponding to one or several facets with information freshly extracted from a parsed
score, but only if the contents are identical. Otherwise, the existing TSV file is not overwritten and the
differences are displayed in a log warning. The purpose is to safely update the format of existing TSV files
(for instance with respect to column order) while making sure that the content doesn't change.
Args:
facets:
view_name:
force:
By default, only TSV files that have already been parsed are updated. Set to True in order to
force-parse for each facet one of the TSV files included in the given view, if necessary.
choose:
Returns:
List of paths that have been overwritten.
"""
paths = []
for fname, piece in self.iter_pieces(view_name=view_name):
paths.extend(piece.update_tsvs_on_disk(facets=facets,
view_name=view_name,
force=force,
choose=choose))
return paths
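# Usage sketch: refresh the layout (e.g. column order) of all previously extracted
# TSV files without changing their content. The facet names are illustrative and
# follow ScoreFacets.
#     corpus.parse()
#     overwritten = corpus.update_tsvs_on_disk(facets=['measures', 'notes'], force=True)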
#####################
# OLD, needs adapting
#####################
def _add_annotations_by_ids(self, list_of_pairs, staff=None, voice=None, harmony_layer=1,
check_for_clashes=False):
""" For each pair, adds the labels at tsv_id to the score at score_id.
Parameters
----------
list_of_pairs : list of (score_id, tsv_id)
IDs of parsed files.
staff : :obj:`int`, optional
By default, labels are added to staves as specified in the TSV or to -1 (lowest).
Pass an integer to specify a staff.
voice : :obj:`int`, optional
By default, labels are added to voices (notational layers) as specified in the TSV or to 1 (main voice).
Pass an integer to specify a voice.
harmony_layer : :obj:`int`, optional
| By default, the labels are written into the staff's layer for Roman Numeral Analysis.
| To change the behaviour pass
| * None to instead attach them as absolute ('guitar') chords, meaning that when opened next time,
MuseScore will split and encode those beginning with a note name (resulting in ms3-internal harmony_layer 3).
| * 2 to have MuseScore interpret them as Nashville Numbers
check_for_clashes : :obj:`bool`, optional
Defaults to False. Pass True to check the positions where the labels will be inserted for existing
labels before adding the new ones.
"""
self._add_detached_annotations_by_ids(list_of_pairs, new_key='labels_to_attach')
for score_id, tsv_id in list_of_pairs:
score = self[score_id]
score.attach_labels('labels_to_attach', staff=staff, voice=voice, harmony_layer=harmony_layer,
check_for_clashes=check_for_clashes,
remove_detached=True)
def _make_grouped_ids(self, keys=None, ids=None):
if ids is not None:
grouped_ids = group_id_tuples(ids)
else:
grouped_ids = {k: list(range(len(self.fnames[k]))) for k in self._treat_key_param(keys)}
return grouped_ids
def _concat_id_df_dict(self, dict_of_dataframes, id_index=False, third_level_name=None):
"""Concatenate DataFrames contained in a {ID -> df} dictionary.
Parameters
----------
dict_of_dataframes : :obj:`dict`
{ID -> DataFrame}
id_index : :obj:`bool`, optional
By default, the concatenated data will be differentiated through a three-level MultiIndex with the levels
'rel_paths', 'fnames', and '{which}_id'. Pass True if instead you want the first two levels to correspond to the file's IDs.
third_level_name : :obj:`str`, optional
Returns
-------
:obj:`pandas.DataFrame`
"""
d = {k: v for k, v in dict_of_dataframes.items() if v.shape[0] > 0}
if len(d) == 0:
self.logger.info(f"Nothing to concatenate:\n{dict_of_dataframes}")
return
if id_index:
result = pd.concat(d.values(), keys=d.keys())
result.index.names = ['key', 'i', third_level_name]
else:
levels = [(self.rel_paths[key][i], self.fnames[key][i]) for key, i in d.keys()]
result = pd.concat(d.values(), keys=levels)
result.index.names = ['rel_paths', 'fnames', third_level_name]
return result
def _concat_lists(self, which, id_index=False, keys=None, ids=None, quarterbeats=False, unfold=False, interval_index=False):
""" Boiler plate for concatenating DataFrames with the same type of information.
Parameters
----------
which : {'cadences', 'chords', 'events', 'expanded', 'labels', 'measures', 'notes_and_rests', 'notes', 'rests', 'form_labels'}
id_index : :obj:`bool`, optional
By default, the concatenated data will be differentiated through a three-level MultiIndex with the levels
'rel_paths', 'fnames', and '{which}_id'. Pass True if instead you want the first two levels to correspond to the file's IDs.
keys
ids
Returns
-------
"""
d = self.extract_facets(keys, flat=False, unfold=unfold, interval_index=interval_index, **{which: True})
d = d[which] if which in d else {}
msg = {
'cadences': 'cadence lists',
'chords': '<chord> tables',
'events': 'event tables',
'expanded': 'expandable annotation tables',
'labels': 'annotation tables',
'measures': 'measure lists',
'notes': 'note lists',
'notes_and_rests': 'note and rest lists',
'rests': 'rest lists',
'form_labels': 'form label tables'
}
if len(d) == 0:
if keys is None and ids is None:
self.logger.info(f'This Corpus object does not include any {msg[which]}.')
else:
self.logger.info(f'keys={keys}, ids={ids} does not yield any {msg[which]}.')
return pd.DataFrame()
return self._concat_id_df_dict(d, id_index=id_index, third_level_name=f"{which}_id")
def add_detached_annotations(self, keys=None, use=None, tsv_key=None, new_key='old', revision_specifier=None):
""" Add :py:attr:`~.annotations.Annotations` objects generated from TSV files to the :py:attr:`~.score.Score`
objects to which they are being matched based on their filenames or on ``match_dict``.
Parameters
----------
keys : :obj:`str` or :obj:`~collections.abc.Collection`, optional
Key(s) under which score files are stored. By default, all keys are selected.
use : :obj:`str`, optional
By default, if several sets of annotation files are found, the user is asked in which
order to pick them. Instead, you can specify the name of a column of
_.pieces(), especially 'expanded' or 'labels', to use only those.
new_key : :obj:`str`, optional
The key under which the :py:attr:`~.annotations.Annotations` objects will be available after attaching
them to the :py:attr:`~.score.Score` objects (``Corpus.parsed_mscx[ID].new_key``).
tsv_key : :obj:`str`, optional
A key under which parsed TSV files are stored of which the type has been inferred as 'labels'.
Note that passing ``tsv_key`` results in the deprecated use of Corpus.match_files(). The preferred way
is to parse the labels to be attached under the same key as the scores and use
View.add_detached_labels().
revision_specifier : :obj:`str`, optional
If you want to retrieve a previous version of the TSV file from a git commit (e.g. for
using compare_labels()), pass the commit's SHA.
"""
keys = self._treat_key_param(keys)
if tsv_key is None:
for key in keys:
view = self._get_view(key)
view.add_detached_annotations(use=use, new_key=new_key, revision_specifier=revision_specifier)
return
matches = self.match_files(keys=keys + [tsv_key])
matches = matches[matches.labels.notna() | matches.expanded.notna()]
matches.labels.fillna(matches.expanded, inplace=True)
list_of_pairs = list(matches[['scores', 'labels']].itertuples(name=None, index=False))
if len(list_of_pairs) == 0:
self.logger.error(f"No files could be matched based on file names, probably a bug due to the deprecated use of the tsv_key parameter."
f"The preferred way of adding labels is parsing them under the same key as the scores and use Corpus[key].add_detached_annotations()")
return
self._add_detached_annotations_by_ids(list_of_pairs, new_key=new_key)
def annotation_objects(self):
"""Iterator through all annotation objects."""
yield from self._annotations.items()
def attach_labels(self, keys=None, annotation_key=None, staff=None, voice=None, harmony_layer=None, check_for_clashes=True):
""" Attach all :py:attr:`~.annotations.Annotations` objects that are reachable via ``Score.annotation_key`` to their
respective :py:attr:`~.score.Score`, changing their current XML. Calling :py:meth:`.store_parsed_scores` will output
MuseScore files where the annotations show in the score.
Parameters
----------
keys : :obj:`str` or :obj:`~collections.abc.Collection`, optional
Key(s) under which parsed MuseScore files are stored. By default, all keys are selected.
annotation_key : :obj:`str` or :obj:`list` or :obj:`tuple`, optional
Key(s) under which the :py:attr:`~.annotations.Annotations` objects to be attached are stored in the
:py:attr:`~.score.Score` objects. By default, all keys are selected.
staff : :obj:`int`, optional
If you pass a staff ID, the labels will be attached to that staff, where 1 is the upper staff.
By default, the staves indicated in the 'staff' column of :obj:`ms3.annotations.Annotations.df`
will be used.
voice : {1, 2, 3, 4}, optional
If you pass the ID of a notational layer (where 1 is the upper voice, blue in MuseScore),
the labels will be attached to that one.
By default, the notational layers indicated in the 'voice' column of
:obj:`ms3.annotations.Annotations.df` will be used.
check_for_clashes : :obj:`bool`, optional
By default, warnings are thrown when there already exists a label at a position (and in a notational
layer) where a new one is attached. Pass False to deactivate these warnings.
"""
layers = self.count_annotation_layers(keys, which='detached', per_key=True)
if len(layers) == 0:
ks = '' if keys is None else f" under the key(s) {keys}"
self.logger.warning(f"No detached annotations found{ks}.")
return
if annotation_key is None:
annotation_key = list(layers.keys())
elif isinstance(annotation_key, str):
annotation_key = [annotation_key]
if any(True for k in annotation_key if k not in layers):
wrong = [k for k in annotation_key if k not in layers]
annotation_key = [k for k in annotation_key if k in layers]
if len(annotation_key) == 0:
self.logger.error(
f"""'{wrong}' are currently not keys for sets of detached labels that have been added to parsed scores.
Currently available annotation keys are {list(layers.keys())}""")
return
else:
self.logger.warning(
f"""'{wrong}' are currently not keys for sets of detached labels that have been added to parsed scores.
Continuing with {annotation_key}.""")
ids = list(self._iterids(keys, only_detached_annotations=True))
reached, goal = 0, 0
for id in ids:
for anno_key in annotation_key:
if anno_key in self._parsed_mscx[id]:
r, g = self._parsed_mscx[id].attach_labels(anno_key, staff=staff, voice=voice, harmony_layer=harmony_layer, check_for_clashes=check_for_clashes)
self.logger.info(f"{r}/{g} labels successfully added to {self.files[id[0]][id[1]]}")
reached += r
goal += g
self.logger.info(f"{reached}/{goal} labels successfully added to {len(ids)} files.")
self._collect_annotations_objects_references(ids=ids)
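# Usage sketch: a typical round trip, detaching the currently attached labels
# under a custom key and re-attaching them later (the key 'backup' is illustrative):
#     corpus.detach_labels(annotation_key='backup', delete=True)
#     # ... edit or inspect the scores ...
#     corpus.attach_labels(annotation_key='backup', check_for_clashes=True)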
def change_labels_cfg(self, labels_cfg={}, staff=None, voice=None, harmony_layer=None, positioning=None, decode=None, column_name=None, color_format=None):
""" Update :obj:`Corpus.labels_cfg` and retrieve new 'labels' tables accordingly.
Parameters
----------
labels_cfg : :obj:`dict`
Using an entire dictionary or, to change only particular options, choose from:
staff, voice, harmony_layer, positioning, decode, column_name
Arguments as they will be passed to :py:meth:`~ms3.annotations.Annotations.get_labels`
"""
keys = ['staff', 'voice', 'harmony_layer', 'positioning', 'decode', 'column_name', 'color_format']
for k in keys:
val = locals()[k]
if val is not None:
labels_cfg[k] = val
updated = update_labels_cfg(labels_cfg, logger=self.logger)
self.labels_cfg.update(updated)
for score in self._parsed_mscx.values():
score.change_labels_cfg(labels_cfg=updated)
ids = list(self._labellists.keys())
if len(ids) > 0:
self._extract_and_cache_dataframes(ids=ids, labels=True)
def check_labels(self, keys=None, ids=None):
if len(self._parsed_mscx) == 0:
self.logger.info("No scores have been parsed so far. Use parse_scores()")
return
if ids is None:
ids = list(self._iterids(keys, only_parsed_mscx=True))
checks = {id: self._parsed_mscx[id].check_labels() for id in ids}
checks = {k: v for k, v in checks.items() if v is not None and len(v) > 0}
if len(checks) > 0:
idx = self.ids2idx(checks.keys(), pandas_index=True)
return pd.concat(checks.values(), keys=idx, names=idx.names)
return pd.DataFrame()
def color_non_chord_tones(self,
color_name: str = 'red',
view_name: Optional[str] = None,
force: bool = False,
choose: Literal['all', 'auto', 'ask'] = 'all') -> Dict[str, List[FileDataframeTuple]]:
if self.n_parsed_scores == 0:
self.logger.info("No scores have been parsed so far. Use parse_scores()")
return dict()
result = defaultdict(list)
for file, score in self.iter_parsed('scores',
view_name=view_name,
force=force,
choose=choose,):
report = score.color_non_chord_tones(color_name=color_name)
if report is not None:
result[file.fname].append((file, report))
return dict(result)
def compare_labels(self,
key: str = 'detached',
new_color: str = 'ms3_darkgreen',
old_color: str = 'ms3_darkred',
detached_is_newer: bool = False,
add_to_rna: bool = True,
view_name: Optional[str] = None) -> Tuple[int, int]:
""" Compare detached labels ``key`` to the ones attached to the Score to create a diff.
By default, the attached labels are considered the reviewed version: labels that have changed or been added
in comparison to the detached labels are colored in green, whereas the previous versions of changed labels are
attached to the Score in red, just like any deleted label.
Args:
key: Key of the detached labels you want to compare to the ones in the score.
new_color, old_color:
The colors by which new and old labels are differentiated. Identical labels remain unchanged. Colors can be
CSS colors or MuseScore colors (see :py:attr:`utils.MS3_COLORS`).
detached_is_newer:
Pass True if the detached labels are to be added with ``new_color`` whereas the attached changed labels
will turn ``old_color``, as opposed to the default.
add_to_rna:
By default, new labels are attached to the Roman Numeral layer.
Pass False to attach them to the chord layer instead.
Returns:
Number of scores in which labels have changed.
Number of scores in which no label has changed.
"""
changed, unchanged = 0, 0
for fname, piece in self.iter_pieces(view_name=view_name):
c, u = piece.compare_labels(key=key,
new_color=new_color,
old_color=old_color,
detached_is_newer=detached_is_newer,
add_to_rna=add_to_rna,
view_name=view_name)
changed += c
unchanged += u
return changed, unchanged
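# Usage sketch: load a previous version of the annotations as detached labels and
# create a colored diff in the scores (the key 'old' is illustrative):
#     corpus.add_detached_annotations(new_key='old', revision_specifier='HEAD~1')
#     changed, unchanged = corpus.compare_labels(key='old')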
def count_annotation_layers(self, keys=None, which='attached', per_key=False):
""" Counts the labels for each annotation layer defined as (staff, voice, harmony_layer).
By default, only labels attached to a score are counted.
Parameters
----------
keys : :obj:`str` or :obj:`~collections.abc.Collection`, optional
Key(s) for which to count annotation layers. By default, all keys are selected.
which : {'attached', 'detached', 'tsv'}, optional
'attached': Counts layers from annotations attached to a score.
'detached': Counts layers from annotations that are in a Score object, but detached from the score.
'tsv': Counts layers from Annotation objects that have been loaded from or into annotation tables.
per_key : :obj:`bool`, optional
If set to True, the results are returned as a dict {key: Counter}, otherwise the counts are summed up in one Counter.
If ``which='detached'``, the keys are keys from Score objects, otherwise they are keys from this Corpus object.
Returns
-------
:obj:`dict` or :obj:`collections.Counter`
By default, the function returns a Counter of labels for every annotation layer (staff, voice, harmony_layer)
If ``per_key`` is set to True, a dictionary {key: Counter} is returned, separating the counts.
"""
res_dict = defaultdict(Counter)
if which == 'detached':
for id in self._iterids(keys, only_detached_annotations=True):
for key, annotations in self._parsed_mscx[id]._detached_annotations.items():
if key != 'annotations':
_, layers = annotations.annotation_layers
layers_dict = {tuple(None if pd.isnull(e) else e for e in t): count for t, count in
layers.to_dict().items()}
res_dict[key].update(layers_dict)
elif which in ['attached', 'tsv']:
for key, i in self._iterids(keys):
if (key, i) in self._annotations:
ext = self.fexts[key][i]
if (which == 'attached' and ext == '.mscx') or (which == 'tsv' and ext != '.mscx'):
_, layers = self._annotations[(key, i)].annotation_layers
layers_dict = {tuple(None if pd.isnull(e) else e for e in t): count for t, count in
layers.to_dict().items()}
res_dict[key].update(layers_dict)
else:
self.logger.error(f"Parameter 'which' needs to be one of {{'attached', 'detached', 'tsv'}}, not {which}.")
return {} if per_key else pd.Series(dtype='object')
def make_series(counts):
if len(counts) == 0:
return pd.Series(dtype='object')
data = counts.values()
ks = list(counts.keys())
names = ['staff', 'voice', 'harmony_layer', 'color']
ix = pd.MultiIndex.from_tuples(ks, names=names)
return pd.Series(data, ix)
if per_key:
res = {k: make_series(v) for k, v in res_dict.items()}
else:
res = make_series(sum(res_dict.values(), Counter()))
if len(res) == 0:
self.logger.info("No annotations found. Maybe no scores have been parsed using parse_scores()?")
return res
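# Usage sketch: with per_key=False (the default), the result is a Series whose
# MultiIndex uses the level names defined in make_series() above:
#     layers = corpus.count_annotation_layers(which='attached')
#     print(layers.sort_values(ascending=False))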
def count_extensions(self, view_name: Optional[str] = None, include_metadata: bool = False):
""""""
selected_files = self.get_files(view_name=view_name, flat=True)
result = {fname: Counter(file.fext for file in files) for fname, files in selected_files.items()}
if include_metadata and 'metadata' not in result:
result['metadata'] = Counter(file.fext for file in self.ix2metadata_file.values())
return result
def _get_parsed_score_files(self, view_name: Optional[str] = None, flat=True) -> Union[FileList, FileDict]:
file_dict = self.get_files('scores', view_name=view_name, unparsed=False, flat=True)
all_files = sum(file_dict.values(), [])
if flat:
return all_files
else:
return {'scores': all_files}
def _get_unparsed_score_files(self, view_name: Optional[str] = None, flat=True) -> Union[FileList, FileDict]:
file_dict = self.get_files('scores', view_name=view_name, parsed=False, flat=True)
all_files = sum(file_dict.values(), [])
if flat:
return all_files
else:
return {'scores': all_files}
def _get_parsed_tsv_files(self, view_name: Optional[str] = None, flat=True) -> Union[FileList, FileDict]:
if flat:
file_dict = self.get_files('tsv', view_name=view_name, unparsed=False, flat=True)
return sum(file_dict.values(), [])
else:
result = defaultdict(list)
for fname, piece in self.iter_pieces(view_name=view_name):
file_dict = piece._get_parsed_tsv_files(view_name=view_name, flat=False)
for facet, files in file_dict.items():
result[facet].extend(files)
return dict(result)
def _get_unparsed_tsv_files(self, view_name: Optional[str] = None, flat: bool = True) -> Union[FileList, FileDict]:
if flat:
file_dict = self.get_files('tsv', view_name=view_name, parsed=False, flat=True)
return sum(file_dict.values(), [])
else:
result = defaultdict(list)
for fname, piece in self.iter_pieces(view_name=view_name):
file_dict = piece._get_unparsed_tsv_files(view_name=view_name, flat=False)
for facet, files in file_dict.items():
result[facet].extend(files)
return dict(result)
def count_labels(self, keys=None, per_key=False):
""" Count label types.
Parameters
----------
keys : :obj:`str` or :obj:`~collections.abc.Collection`, optional
Key(s) for which to count label types. By default, all keys are selected.
per_key : :obj:`bool`, optional
If set to True, the results are returned as a dict {key: Counter},
otherwise the counts are summed up in one Counter.
Returns
-------
:obj:`dict` or :obj:`collections.Counter`
By default, the function returns a Counter of label types.
If ``per_key`` is set to True, a dictionary {key: Counter} is returned, separating the counts.
"""
annotated = [id for id in self._iterids(keys) if id in self._annotations]
res_dict = defaultdict(Counter)
for key, i in annotated:
res_dict[key].update(self._annotations[(key, i)].harmony_layer_counts)
if len(res_dict) == 0:
if len(self._parsed_mscx) == 0:
self.logger.error("No scores have been parsed so far. Use parse_scores().")
else:
self.logger.info("None of the scores contain annotations.")
if per_key:
return {k: dict(v) for k, v in res_dict.items()}
return dict(sum(res_dict.values(), Counter()))
def count_tsv_types(self, keys=None, per_key=False):
""" Count inferred TSV types.
Parameters
----------
keys : :obj:`str` or :obj:`~collections.abc.Collection`, optional
Key(s) for which to count inferred TSV types. By default, all keys are selected.
per_key : :obj:`bool`, optional
If set to True, the results are returned as a dict {key: Counter},
otherwise the counts are summed up in one Counter.
Returns
-------
:obj:`dict` or :obj:`collections.Counter`
By default, the function returns a Counter of inferred TSV types.
If ``per_key`` is set to True, a dictionary {key: Counter} is returned, separating the counts.
"""
res_dict = defaultdict(Counter)
for key, i in self._iterids(keys, only_parsed_mscx=True):
t = self._tsv_types[(key, i)] if (key, i) in self._tsv_types else None
res_dict[key].update([t])
if per_key:
return {k: dict(v) for k, v in res_dict.items()}
return dict(sum(res_dict.values(), Counter()))
def detach_labels(self, keys=None, annotation_key='detached', staff=None, voice=None, harmony_layer=None, delete=True):
""" Calls :py:meth:`Score.detach_labels <ms3.score.Score.detach_labels>` on every parsed score with key ``key``.
"""
assert annotation_key != 'annotations', "The key 'annotations' is reserved, please choose a different one."
ids = list(self._iterids(keys, only_attached_annotations=True))
if len(ids) == 0:
self.logger.info(f"Selection did not contain scores with labels: keys = '{keys}'")
for id in ids:
score = self._parsed_mscx[id]
try:
score.detach_labels(key=annotation_key, staff=staff, voice=voice, harmony_layer=harmony_layer, delete=delete)
except Exception:
score.logger.error(f"Detaching labels failed with the following error:\n{sys.exc_info()[1]}")
self._collect_annotations_objects_references(ids=ids)
def get_labels(self, keys=None, staff=None, voice=None, harmony_layer=None, positioning=True, decode=False, column_name=None,
color_format=None, concat=True):
""" This function does not take into account self.labels_cfg """
if len(self._annotations) == 0:
self.logger.error("No labels available so far. Add files using add_dir() and parse them using parse().")
return pd.DataFrame()
keys = self._treat_key_param(keys)
harmony_layer = self._treat_harmony_layer_param(harmony_layer)
self._extract_and_cache_dataframes(labels=True, only_new=True)
l = locals()
params = {p: l[p] for p in self.labels_cfg.keys()}
ids = [id for id in self._iterids(keys) if id in self._annotations]
if len(ids) == 0:
self.logger.info(f"No labels match the criteria.")
return pd.DataFrame()
annotation_tables = [self._annotations[id].get_labels(**params, inverse=False) for id in ids]
idx, names = self.ids2idx(ids)
if names is None:
names = (None,) * len(idx[0])
names += tuple(annotation_tables[0].index.names)
if concat:
return pd.concat(annotation_tables, keys=idx, names=names)
return annotation_tables
def get_facet_at_git_revision(self,
facet: ScoreFacet,
git_revision: str,
view_name: Optional[str] = None,
choose: Literal['auto', 'ask'] = 'auto',
concatenate: bool = False,
) -> Union[Dict[str, FileDataframeTuple], pd.DataFrame]:
facet = check_argument_against_literal_type(facet, ScoreFacet, logger=self.logger)
assert facet is not None, f"Pass a valid facet {ScoreFacet.__args__}"
if choose == 'ask':
self.disambiguate_facet(facet, ask_for_input=True)
choose = 'auto'
fname2files = self.get_files(facets=facet,
view_name=view_name,
flat=True
)
fname2selected = {}
for fname, files in fname2files.items():
parsed_files = [file for file in files if file.ix in self.ix2parsed]
unparsed_files = [file for file in files if file.ix not in self.ix2parsed]
n_parsed = len(parsed_files)
n_unparsed = len(unparsed_files)
if n_parsed == 1:
fname2selected[fname] = parsed_files[0]
else:
if n_parsed == 0:
if n_unparsed > 1:
selected = disambiguate_files(unparsed_files,
self.name,
facet,
choose=choose,
logger=self.logger)
else:
selected = unparsed_files[0]
else:
selected = disambiguate_files(parsed_files,
self.name,
facet,
choose=choose,
logger=self.logger)
if selected is None:
continue
fname2selected[fname] = selected
fname2tuples = {}
for fname, file in fname2selected.items():
new_file, parsed = parse_tsv_file_at_git_revision(file, git_revision, self.corpus_path, logger=self.ix_logger(file.ix))
if parsed is None:
self.logger.warning(f"Could not retrieve {file.rel_path} @ '{git_revision}'.")
else:
fname2tuples[fname] = (new_file, parsed)
if concatenate:
if len(fname2tuples) > 0:
dfs = [df for file, df in fname2tuples.values()]
df = pd.concat(dfs, keys=fname2tuples.keys())
df.index.rename(['fname', f"{facet}_i"], inplace=True)
return df
else:
return pd.DataFrame()
return fname2tuples
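# Usage sketch: retrieve the 'expanded' harmony tables as they were at an earlier
# git revision (any specifier accepted by git.Repo.commit(), e.g. a SHA, tag, or
# 'HEAD~5'; the values here are illustrative):
#     old_harmonies = corpus.get_facet_at_git_revision('expanded', 'HEAD~5', concatenate=True)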
def get_parsed_at_index(self, ix: int) -> Optional[ParsedFile]:
parsed = self.ix2parsed
if ix in parsed:
return parsed[ix]
if ix in self.ix2fname:
piece = self.get_piece(self.ix2fname[ix])
try:
return piece._get_parsed_at_index(ix)
except RuntimeError:
return None
if ix in self.ix2orphan_file:
file = self.ix2orphan_file[ix]
try:
df = load_tsv(file.full_path)
except Exception:
df = pd.read_csv(file.full_path, sep='\t')
self._ix2parsed[file.ix] = df
return df
self.logger.warning(f"This Corpus does not include a file with index {ix}.")
def get_playthrough2mc(self, keys=None, ids=None):
if ids is None:
ids = list(self._iterids(keys))
_ = self.match_files(ids=ids)
res = {}
for id in ids:
unf_mcs = self._get_playthrough2mc(id)
if unf_mcs is not None:
res[id] = unf_mcs
# else:
# self.id_logger(id).(f"Unable to unfold.")
return res
def _get_measure_list(self, id, unfold=False):
""" Tries to retrieve the corresponding measure list, e.g. for unfolding repeats. Preference is given to
parsed MSCX files, then checks for parsed TSVs.
Parameters
----------
id : :obj:`tuple`
(key, i) ID of the file for which to retrieve a measure list.
unfold : :obj:`bool` or ``'raw'``
Defaults to False, meaning that all voltas except second ones are dropped.
If set to True, the measure list is unfolded.
Pass the string 'raw' to leave the measure list as it is, with all voltas.
Returns
-------
"""
logger = self.ix_logger(id)
piece = self.get_piece(id)
if piece is None:
logger.warning(f"Could not associate ID {id} with a Piece object.")
return
return piece.get_dataframe('measures', unfold=unfold)
def _get_playthrough2mc(self, id):
logger = self.ix_logger(id)
if id in self._playthrough2mc:
return self._playthrough2mc[id]
ml = self._get_measure_list(id)
if ml is None:
logger.warning("No measures table available.")
self._playthrough2mc[id] = None
return
mc_playthrough = make_playthrough2mc(ml, logger=logger)
if len(mc_playthrough) == 0:
logger.warning(f"Error in the repeat structure for ID {id}: Did not reach the stopping value -1 in measures.next:\n{ml.set_index('mc').next}")
mc_playthrough = None
else:
logger.debug("Measures successfully unfolded.")
self._playthrough2mc[id] = mc_playthrough
return mc_playthrough
def get_continuous_offsets(self, id, unfold):
""" Using a corresponding measure list, return a dictionary mapping MCs to their absolute distance from MC 1,
measured in quarter notes.
Parameters
----------
id : :obj:`tuple`
(key, i) ID.
unfold : :obj:`bool`
If True, return ``{mc_playthrough -> offset}``, otherwise ``{mc -> offset}``, keeping only second endings.
Returns
-------
"""
logger = self.ix_logger(id)
if id in self._quarter_offsets[unfold]:
return self._quarter_offsets[unfold][id]
ml = self._get_measure_list(id, unfold=unfold)
if ml is None:
logger.warning(f"Could not find measure list for id {id}.")
return None
offset_col = make_continuous_offset_series(ml, logger=logger)
offsets = offset_col.to_dict()
self._quarter_offsets[unfold][id] = offsets
return offsets
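# Usage sketch ((key, i) IDs belong to the old-style API used in this section;
# the values are illustrative):
#     offsets = corpus.get_continuous_offsets(('orchestral', 0), unfold=False)
#     offsets[5]  # quarter-note distance of MC 5 from MC 1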
def iter(self, columns, keys=None, skip_missing=False):
keys = self._treat_key_param(keys)
for key in keys:
for tup in self[key].iter(columns=columns, skip_missing=skip_missing):
if tup is not None:
yield (key, *tup)
def iter_transformed(self, columns, keys=None, skip_missing=False, unfold=False, quarterbeats=False, interval_index=False):
keys = self._treat_key_param(keys)
for key in keys:
for tup in self[key].iter_transformed(columns=columns, skip_missing=skip_missing, unfold=unfold, quarterbeats=quarterbeats, interval_index=interval_index):
if tup is not None:
yield (key, *tup)
def iter_notes(self, keys=None, unfold=False, quarterbeats=False, interval_index=False, skip_missing=False, weight_grace_durations=0):
keys = self._treat_key_param(keys)
for key in keys:
for tup in self[key].iter_notes(unfold=unfold, quarterbeats=quarterbeats, interval_index=interval_index, skip_missing=skip_missing,
weight_grace_durations=weight_grace_durations):
if tup is not None:
yield (key, *tup)
def join(self, keys=None, ids=None, what=None, use_index=True):
if what is not None:
what = [w for w in what if w != 'scores']
matches = self.match_files(keys=keys, ids=ids, what=what)
join_this = set(map(tuple, matches.values))
key_ids, *_ = zip(*join_this)
if use_index:
key_ids = self.ids2idx(key_ids, pandas_index=True)
join_this = [[self._parsed_tsv[id] for id in ids if not pd.isnull(id)] for ids in join_this]
joined = [join_tsvs(dfs) for dfs in join_this]
return pd.concat(joined, keys=key_ids)
def _parse_tsv_from_git_revision(self, tsv_id, revision_specifier):
""" Takes the ID of an annotation table, and parses the same file's previous version at ``revision_specifier``.
Parameters
----------
tsv_id
ID of the TSV file containing an annotation table, for which to parse a previous version.
revision_specifier : :obj:`str`
String used by git.Repo.commit() to find the desired git revision.
Can be a long or short SHA, git tag, branch name, or relative specifier such as 'HEAD~1'.
Returns
-------
ID
(key, i) of the newly added annotation table.
"""
key, i = tsv_id
corpus_path = self.corpus_paths[key]
try:
repo = Repo(corpus_path, search_parent_directories=True)
except InvalidGitRepositoryError:
self.logger.error(f"{corpus_path} seems not to be (part of) a git repository.")
return
try:
git_repo = repo.remote("origin").url
except ValueError:
git_repo = os.path.basename(corpus_path)
try:
commit = repo.commit(revision_specifier)
commit_sha = commit.hexsha
short_sha = commit_sha[:7]
commit_info = f"{short_sha} with message '{commit.message}'"
except BadName:
self.logger.error(f"{revision_specifier} does not resolve to a commit for repo {git_repo}.")
return
tsv_type = self._tsv_types[tsv_id]
tsv_path = self.full_paths[key][i]
rel_path = os.path.relpath(tsv_path, corpus_path)
new_directory = os.path.join(corpus_path, short_sha)
new_path = os.path.join(new_directory, self.files[key][i])
if new_path in self.full_paths[key]:
existing_i = self.full_paths[key].index(new_path)
existing_tsv_type = self._tsv_types[(key, existing_i)]
if tsv_type == existing_tsv_type:
self.logger.error(f"Had already loaded a {tsv_type} table for commit {commit_info} of repo {git_repo}.")
return
if tsv_type not in ('labels', 'expanded'):
raise NotImplementedError(f"Currently, only annotations are to be loaded from a git revision but {rel_path} is a {tsv_type}.")
try:
targetfile = commit.tree / rel_path
except KeyError:
# if the file was not found, try and see if at the time of the git revision the folder was still called 'harmonies'
if tsv_type == 'expanded':
folder, tsv_name = os.path.split(rel_path)
if folder != 'harmonies':
old_rel_path = os.path.join('harmonies', tsv_name)
try:
targetfile = commit.tree / old_rel_path
self.logger.debug(f"{rel_path} did not exist at commit {commit_info}, using {old_rel_path} instead.")
rel_path = old_rel_path
except KeyError:
self.logger.error(f"Neither {rel_path} nor its older version {old_rel_path} existed at commit {commit_info}.")
return
else:
self.logger.error(f"{rel_path} did not exist at commit {commit_info}.")
return
self.logger.info(f"Successfully loaded {rel_path} from {commit_info}.")
try:
with io.BytesIO(targetfile.data_stream.read()) as f:
df = load_tsv(f)
except Exception:
self.logger.error(f"Parsing {rel_path} @ commit {commit_info} failed with the following error:\n{sys.exc_info()[1]}")
return
new_id = self._handle_path(new_path, key, skip_checks=True)
self._parsed_tsv[new_id] = df
self._dataframes[tsv_type][new_id] = df
self._tsv_types[new_id] = tsv_type
logger_cfg = dict(self.logger_cfg)
logger_cfg['name'] = self.logger_names[(key, i)]
if tsv_id in self._annotations:
anno_obj = self._annotations[tsv_id] # get Annotation object's settings from the existing one
cols = anno_obj.cols
infer_types = anno_obj.regex_dict
else:
cols = dict(label='label')
infer_types = None
self._annotations[new_id] = Annotations(df=df, cols=cols, infer_types=infer_types,
**logger_cfg)
self.logger.debug(
f"{rel_path} successfully parsed from commit {short_sha}.")
return new_id
def pieces(self, parsed_only=False):
# pieces_dfs = [self[k].pieces(parsed_only=parsed_only) for k in self.keys()]
# result = pd.concat(pieces_dfs, keys=self.keys())
# result.index.names = ['key', 'metadata_row']
# return result
raise NotImplementedError
def _collect_annotations_objects_references(self, keys=None, ids=None):
""" Updates the dictionary self._annotations with all parsed Scores that have labels attached (or not any more). """
if ids is None:
ids = list(self._iterids(keys, only_parsed_mscx=True))
updated = {}
for id in ids:
if id in self._parsed_mscx:
score = self._parsed_mscx[id]
if score is not None:
if 'annotations' in score:
updated[id] = score.annotations
elif id in self._annotations:
del (self._annotations[id])
else:
del (self._parsed_mscx[id])
self._annotations.update(updated)
def _handle_path(self, full_path, key, skip_checks=False):
"""Store information about the file at ``full_path`` in their various fields under the given key."""
full_path = resolve_dir(full_path)
if not skip_checks and not os.path.isfile(full_path):
self.logger.error("No file found at this path: " + full_path)
return (None, None)
file_path, file = os.path.split(full_path)
file_name, file_ext = os.path.splitext(file)
if not skip_checks and file_ext[1:] not in Score.parseable_formats + ('tsv',):
ext_string = "without extension" if file_ext == '' else f"with extension {file_ext}"
self.logger.debug(f"ms3 does not handle files {ext_string} -> discarding" + full_path)
return (None, None)
rel_path = os.path.relpath(file_path, self.last_scanned_dir)
subdir = get_path_component(rel_path, key)
if file in self.files[key]:
same_name = [i for i, f in enumerate(self.files[key]) if f == file]
if not skip_checks and any(True for i in same_name if self.rel_paths[key][i] == rel_path):
self.logger.debug(
f"""The file name {file} is already registered for key '{key}' and both files have the relative path {rel_path}.
Load one of the identically named files with a different key using add_dir(key='KEY').""")
return (None, None)
self.logger.debug(
f"The file {file} is already registered for key '{key}' but can be distinguished via the relative path {rel_path}.")
i = len(self.files[key])
self.logger_names[(key, i)] = f"{self.logger.name}.{key}.{file_name.replace('.', '')}{file_ext}"
self.full_paths[key].append(full_path)
self.scan_paths[key].append(self.last_scanned_dir)
self.rel_paths[key].append(rel_path)
self.subdirs[key].append(subdir)
self.paths[key].append(file_path)
self.files[key].append(file)
self.logger_names[(key, i)] = f"{self.logger.name}.{key}.{file_name.replace('.', '')}{file_ext}"
self.fnames[key].append(file_name)
self.fexts[key].append(file_ext)
F = File(
id=(key, i),
full_path=full_path,
scan_path=self.last_scanned_dir,
rel_path=rel_path,
subdir=subdir,
path=file_path,
file=file,
fname=file_name,
fext=file_ext
)
self.id2file_info[(key, i)] = F
return key, i
def _iterids(self, keys=None, only_parsed_mscx=False, only_parsed_tsv=False, only_attached_annotations=False, only_detached_annotations=False):
"""Iterator through IDs for a given set of keys.
Parameters
----------
keys
only_parsed_mscx
only_attached_annotations
only_detached_annotations
Yields
------
:obj:`tuple`
(str, int)
"""
keys = self._treat_key_param(keys)
for key in sorted(keys):
for id in make_id_tuples(key, len(self.fnames[key])):
if only_parsed_mscx or only_attached_annotations or only_detached_annotations:
if id not in self._parsed_mscx:
continue
if only_attached_annotations:
if 'annotations' in self._parsed_mscx[id]:
pass
else:
continue
elif only_detached_annotations:
if self._parsed_mscx[id].has_detached_annotations:
pass
else:
continue
elif only_parsed_tsv:
if id in self._parsed_tsv:
pass
else:
continue
yield id
def _iter_subdir_selectors(self, keys=None, ids=None):
""" Iterate through the specified ids grouped by subdirs.
Yields
------
:obj:`tuple`
(key: str, subdir: str, ixs: list) tuples. IDs can be created by combining key with each i in ixs.
The yielded ``ixs`` are typically used as parameter for ``.utils.iter_selection``.
"""
grouped_ids = self._make_grouped_ids(keys, ids)
for k, ixs in grouped_ids.items():
subdirs = self.subdirs[k]
for subdir in sorted(set(iter_selection(subdirs, ixs))):
yield k, subdir, [i for i in ixs if subdirs[i] == subdir]
def _score_ids(self, keys=None, score_extensions=None, native=True, convertible=True, opposite=False):
""" Return IDs of all detected scores with particular file extensions, or all others if ``opposite==True``.
Parameters
----------
keys : :obj:`str` or :obj:`collections.abc.Iterable`, optional
Only get IDs for particular keys.
score_extensions : :obj:`collections.abc.Collection`, optional
Get IDs for files with the given extensions (each starting with a dot). If this parameter is defined,
``native`` and ``convertible`` are being ignored.
native : :obj:`bool`, optional
If ``score_extensions`` is not set, ``native=True`` selects all scores that ms3 can parse without using
a MuseScore 3 executable.
convertible : :obj:`bool`, optional
If ``score_extensions`` is not set, ``convertible=True`` selects all scores that ms3 can parse as long as
a MuseScore 3 executable is defined.
opposite : :obj:`bool`, optional
Set to True if you want to get the IDs of all the scores that do NOT have the specified extensions.
Returns
-------
:obj:`list`
A list of IDs.
"""
if score_extensions is None:
score_extensions = []
if native:
score_extensions.extend(Score.native_formats)
if convertible:
score_extensions.extend(Score.convertible_formats)
if opposite:
return [(k, i) for k, i in self._iterids(keys) if self.fexts[k][i][1:].lower() not in score_extensions]
return [(k, i) for k, i in self._iterids(keys) if self.fexts[k][i][1:].lower() in score_extensions]
def store_parsed_scores(self,
view_name: Optional[str] = None,
only_changed: bool = True,
root_dir: Optional[str] = None,
folder: str = '.',
suffix: str = '',
overwrite: bool = False,
simulate=False) -> List[str]:
""" Stores all parsed scores under this view as MuseScore 3 files.
Args:
view_name:
only_changed:
By default, only scores that have been modified since parsing are written. Set to False to store
all scores regardless.
root_dir:
folder:
suffix: Suffix to append to the original file name.
overwrite: Pass True to overwrite existing files.
simulate: Set to True if no files are to be written.
Returns:
Paths of the stored files.
"""
file_paths = []
for fname, piece in self.iter_pieces(view_name):
if only_changed:
score_iterator = piece.get_changed_scores(view_name)
else:
score_iterator = piece.get_parsed_scores(view_name)
for file, score in score_iterator:
path = piece.store_parsed_score_at_ix(ix=file.ix,
root_dir=root_dir,
folder=folder,
suffix=suffix,
overwrite=overwrite,
simulate=simulate)
if path is None:
continue
file_paths.append(path)
if path == file.full_path:
continue
if self.corpus_path not in path:
continue
new_files = self.add_file_paths([path])
if len(new_files) == 0:
file_to_register = self.get_file_from_path(path)
else:
file_to_register = new_files[0]
piece.add_parsed_score(file_to_register.ix, score)
return file_paths
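# Usage sketch: write only the scores modified since parsing into a (hypothetical)
# folder 'modified' below the corpus directory (folder semantics as documented for
# update_scores() above):
#     stored = corpus.store_parsed_scores(folder='modified', overwrite=True)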
def _treat_key_param(self, keys):
if keys is None:
keys = list(self.full_paths.keys())
elif isinstance(keys, str):
keys = [keys]
return [k for k in sorted(set(keys)) if k in self.files]
def _treat_harmony_layer_param(self, harmony_layer):
if harmony_layer is None:
return None
all_types = {str(k): k for k in self.count_labels().keys()}
if isinstance(harmony_layer, int) or isinstance(harmony_layer, str):
harmony_layer = [harmony_layer]
lt = [str(t) for t in harmony_layer]
def matches_any_type(user_input):
return any(True for t in all_types if user_input in t)
def get_matches(user_input):
return [t for t in all_types if user_input in t]
not_found = [t for t in lt if not matches_any_type(t)]
if len(not_found) > 0:
plural = len(not_found) > 1
plural_s = 's' if plural else ''
self.logger.warning(
f"No labels found with {'these' if plural else 'this'} label{plural_s} harmony_layer{plural_s}: {', '.join(not_found)}")
return [all_types[t] for user_input in lt for t in get_matches(user_input)]
# def update_metadata(self, allow_suffix=False):
# """Uses all parsed metadata TSVs to update the information in the corresponding parsed MSCX files and returns
# the IDs of those that have been changed.
#
# Parameters
# ----------
# allow_suffix : :obj:`bool`, optional
# If set to True, this would also update the metadata for currently parsed MuseScore files
# corresponding to the columns 'rel_paths' and 'fnames' + [ANY SUFFIX]. For example,
# the row ('MS3', 'bwv846') would also update the metadata of 'MS3/bwv846_reviewed.mscx'.
#
# Returns
# -------
# :obj:`list`
# IDs of the parsed MuseScore files whose metadata has been updated.
# """
# metadata_dfs = self.metadata_tsv()
# if len(metadata_dfs) > 0:
# metadata = pd.concat(metadata_dfs.values(), keys=metadata_dfs.keys())
# else:
# metadata = self._metadata
# if len(metadata) == 0:
# self.logger.debug("No parsed metadata found.")
# return
# old = metadata
# if old.index.names != ['rel_paths', 'fnames']:
# try:
# old = old.set_index(['rel_paths', 'fnames'])
# except KeyError:
# self.logger.warning(f"Corpusd metadata do not contain the columns 'rel_paths' and 'fnames' "
# f"needed to match information on identical files.")
# return []
# new = self.metadata_from_parsed(from_tsv=False).set_index(['rel_paths', 'fnames'])
# excluded_cols = ['ambitus', 'annotated_key', 'KeySig', 'label_count', 'last_mc', 'last_mn', 'musescore',
# 'TimeSig', 'length_qb', 'length_qb_unfolded', 'all_notes_qb', 'n_onsets', 'n_onset_positions']
# old_cols = sorted([c for c in old.columns if c not in excluded_cols and c[:5] != 'staff'])
#
# parsed = old.index.map(lambda i: i in new.index)
# relevant = old.loc[parsed, old_cols]
# updates = defaultdict(dict)
# for i, row in relevant.iterrows():
# new_row = new.loc[i]
# for j, val in row[row.notna()].iteritems():
# val = str(val)
# if j not in new_row or str(new_row[j]) != val:
# updates[i][j] = val
#
# l = len(updates)
# ids = []
# if l > 0:
# for (rel_path, fname), new_dict in updates.items():
# matches = self.fname2ids(fname=fname, rel_path=rel_path, allow_suffix=allow_suffix)
# match_ids = [id for id in matches.keys() if id in self._parsed_mscx]
# n_files_to_update = len(match_ids)
# if n_files_to_update == 0:
# self.logger.debug(
# f"rel_path={rel_path}, fname={fname} does not correspond to a currently parsed MuseScore file.")
# continue
# for id in match_ids:
# for name, val in new_dict.items():
# self._parsed_mscx[id].mscx.parsed.metatags[name] = val
# self._parsed_mscx[id].mscx.parsed.update_metadata()
# self.ix_logger(id).debug(f"Updated with {new_dict}")
# ids.append(id)
#
# self.logger.info(f"{l} files updated.")
# else:
# self.logger.info("Nothing to update.")
# return ids
def __getstate__(self):
""" Override the method of superclass """
return self.__dict__
def __getitem__(self, fname_or_ix: Union[str, int]) -> Union[Piece, ParsedFile]:
if isinstance(fname_or_ix, str):
return self.get_piece(fname_or_ix)
if isinstance(fname_or_ix, int):
return self.get_parsed_at_index(fname_or_ix)
if isinstance(fname_or_ix, tuple):
if len(fname_or_ix) == 1:
return self.get_piece(fname_or_ix[0])
fname, ix = fname_or_ix
return self.get_piece(fname)[ix]
raise TypeError(f"Index needs to be fname (str) or ix (int), not {fname_or_ix} ({type(fname_or_ix)})")
def __iter__(self) -> Iterator[Tuple[str, Piece]]:
""" Iterate through all (fname, Piece) tuples, regardless of any Views.
Yields: (fname, Piece) tuples
"""
yield from self._pieces.items()
def __repr__(self):
return self.info(return_str=True)
def get_piece(self, fname) -> Piece:
"""Returns the :obj:`Piece` object for fname."""
assert fname in self._pieces, f"'{fname}' is not a piece in this corpus."
return self._pieces[fname]
########################################################################################################################
########################################################################################################################
################################################# End of Corpus() ########################################################
########################################################################################################################
########################################################################################################################
def parse_musescore_file(file: File,
logger: Logger,
logger_cfg: dict = {},
read_only: bool = False,
ms: Optional[str]= None) -> Score:
"""Performs a single parse and returns the resulting Score object or None.
Args:
file: File object with path information of a score that can be opened (or converted) with MuseScore 3.
logger: Logger to be used within this function (not for the parsing itself).
logger_cfg: Logger config for the new Score object (and therefore for the parsing itself).
read_only:
Pass True to return smaller objects that do not keep a copy of the original XML structure in memory.
In order to make changes to the score after parsing, this needs to be False (default).
ms: MuseScore executable in case the file needs to be converted.
Returns:
The parsed score.
"""
path = file.full_path
filename = file.file
logger.debug(f"Attempting to parse {filename}")
try:
score = Score(path, read_only=read_only, ms=ms, **logger_cfg)
if score is None:
logger.debug(f"Encountered errors when parsing {filename}")
else:
logger.debug(f"Successfully parsed {filename}")
return score
except (KeyboardInterrupt, SystemExit):
logger.info("Process aborted.")
raise
except Exception as e:
logger.error(f"Unable to parse {path} due to the following exception:\n{e}")
return None
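# Usage sketch: this module-level helper is what Corpus uses (including in worker
# processes) to parse a single score; ``file`` is an ms3 File object and the logger
# can be any configured Logger, e.g. one obtained via get_logger():
#     score = parse_musescore_file(file, get_logger('ms3.demo'), read_only=True)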
def parse_tsv_file(file, logger) -> Optional[pd.DataFrame]:
"""Load the TSV file at ``file.full_path`` as a DataFrame; logs and returns None if it cannot be parsed."""
path = file.full_path
logger.debug(f"Trying to load {file.rel_path}")
try:
df = load_tsv(path)
return df
except Exception as e:
logger.info(f"Couldn't be loaded, probably not a tabular format, or you need to pass the delimiter as 'sep' via **kwargs."
f"\n{path}\nError: {e}")