Source code for mavis.checker

"""
module responsible for checking MAVIS output. Determines if jobs completed correctly and
what the status of the pipeline is
"""
import glob
import os
import re

from .constants import COMPLETE_STAMP, DISEASE_STATUS, SUBCOMMAND, PROTOCOL
from .util import bash_expands, log, MavisNamespace, unique_exists


# regex for library output directory names: <name>_<disease status>_<protocol>, where the last two
# parts must be one of the DISEASE_STATUS / PROTOCOL values
LIBRARY_DIR_REGEX = r'^[\w-]+_({})_({})$'.format('|'.join(DISEASE_STATUS.values()), '|'.join(PROTOCOL.values()))
# glob pattern for job output logs (presumably the SGE scheduler '<name>.o<job id>' files; verify)
SGE_LOG_PATTERN = r'*.o*'
# glob pattern for plain log files
LOG_PATTERN = r'*.log'
# regex fragment for the batch id prefix; PipelineStageRun appends '-<job number>' to find job ids
BATCH_ID_PATTERN = 'batch-[0-9a-zA-Z-]+'

# the states LogDetails assigns to a log file
LOGFILE_STATUS = MavisNamespace(
    EMPTY='empty',
    CRASH='crash',
    INCOMPLETE='incomplete',
    COMPLETE='complete'
)


class LogDetails:
    """
    summarizes the state of a single job log file

    Attributes set by the constructor:
        filename: path of the log file that was read
        status: one of the LOGFILE_STATUS values
        message: the lower-cased last non-empty line for a crashed log, or the stripped last line
            for an incomplete log (None otherwise)
        run_time: the run time in seconds reported by a complete log (None otherwise)
        last_mod: modification time of an incomplete log (None otherwise)
    """
    def __init__(self, filename):
        """
        Args:
            filename (str): path to the log file to read and classify
        """
        self.filename = filename
        self.status = None
        self.message = None
        self.run_time = None
        self.last_mod = None

        with open(filename, 'r') as fh:
            lines = fh.readlines()

        if not lines:
            self.status = LOGFILE_STATUS.EMPTY
            return

        # the last line with content decides whether the job crashed; when every line is
        # blank the raw last line is used instead
        stripped_from_end = (text.strip() for text in reversed(lines))
        last_text = next((text.lower() for text in stripped_from_end if text), lines[-1].lower())

        if re.search(r'\b(error|fault|fatal|aborted|core dumped|killed|died|^\S+error)\b', last_text):
            self.status = LOGFILE_STATUS.CRASH
            self.message = last_text.strip()
            return

        # a finished job reports its run time within the last 10 lines of the log
        tail_matches = (re.match(r'^\s*run time \(s\): (\d+)\s*$', text) for text in lines[-10:])
        reported = next((int(found.group(1)) for found in tail_matches if found), None)

        if reported is not None:
            self.run_time = reported
            self.status = LOGFILE_STATUS.COMPLETE
        else:
            self.status = LOGFILE_STATUS.INCOMPLETE
            self.message = lines[-1].strip()
            self.last_mod = os.path.getmtime(filename)
def parse_run_time(filename):
    """
    reads the run time from a log or complete stamp file

    Args:
        filename (str): path to the file to read

    Returns:
        int: the value of the last 'run time (s): <seconds>' line in the file, or None if the
        file has no such line
    """
    run_time_line = re.compile(r'^\s*run time \(s\): (\d+)\s*$')
    with open(filename, 'r') as fh:
        content = fh.readlines()
    # scan from the end so that the most recent entry wins
    for text in reversed(content):
        found = run_time_line.match(text)
        if found is not None:
            return int(found.group(1))
    return None
class PipelineStageRun:
    """
    tracks the log files, complete stamps and run times for the jobs of one pipeline stage
    """

    def __init__(self, name, output_dir):
        """
        Args:
            name (str): the pipeline stage name (compared against the SUBCOMMAND values)
            output_dir (str): path to the output directory of the stage

        Raises:
            OSError: the output directory does not exist
        """
        self.name = name
        self.output_dir = output_dir
        if not os.path.exists(output_dir):
            raise OSError('missing output_dir', output_dir)
        self.times = {}
        self.logs = {}  # job id => LogDetails
        self.stamps = {}  # job id => value returned by unique_exists for the complete stamp
        self.job_ids = set()
        self.single = True  # stays True for stages other than annotate/validate/cluster
        self.max_run_time = None
        self.total_run_time = None
        self.avg_run_time = None

        if self.name in [SUBCOMMAND.ANNOTATE, SUBCOMMAND.VALIDATE, SUBCOMMAND.CLUSTER]:
            self.single = False
            # job ids are the numeric suffix of the batch-named entries in the output directory
            job_name_regex = re.compile(r'^' + BATCH_ID_PATTERN + r'-(\d+)(\.tab)?$')
            for path in glob.glob(os.path.join(output_dir, '*')):
                match = job_name_regex.match(os.path.basename(path))
                if match:
                    self.job_ids.add(int(match.group(1)))
            if not self.job_ids:
                self.job_ids.add(1)

    def report(self, indent=' ', indent_level=0, time_stamp=False):
        """
        parses log files and checks for complete stamps. Reports any errors observed

        Args:
            indent (str): the string repeated to indent the logged messages
            indent_level (int): the indent level of the stage heading; details are one level deeper
            time_stamp (bool): passed to log for the stage heading line

        Returns:
            bool: success

            - True: no errors or incomplete files were found
            - False: some errors or incomplete files
        """
        for job_task_id in self.job_ids:
            self.collect_log(job_task_id)
            self.collect_stamp(job_task_id)
        if self.single:
            # single-job stages are tracked under the job id None
            self.collect_log()
            self.collect_stamp()
            self.job_ids = {None}

        incomplete_jobs = set()
        missing_logs = set()
        missing_stamp = set()
        missing_both = set()
        errors = set()

        for job_task_id in sorted(self.job_ids):
            logfile = self.logs.get(job_task_id)
            if job_task_id in self.stamps:
                if logfile is None:
                    # complete but unlogged?
                    missing_logs.add(job_task_id)
                elif logfile.status == LOGFILE_STATUS.CRASH:
                    errors.add(job_task_id)
            elif logfile is None:
                missing_both.add(job_task_id)
            elif logfile.status == LOGFILE_STATUS.CRASH:
                errors.add(job_task_id)
            elif logfile.status == LOGFILE_STATUS.COMPLETE:
                missing_stamp.add(job_task_id)
            else:
                incomplete_jobs.add(job_task_id)

        heading = indent * indent_level + self.name
        detail_prefix = indent * (indent_level + 1)

        # report the overall status
        if not any([self.job_ids, self.logs, self.stamps]):
            log(heading, 'FAIL', time_stamp=time_stamp)
            log(indent * indent_level + ' no files found: stage not started, or skipped', time_stamp=False)
            return False

        if any([incomplete_jobs, missing_both, missing_stamp, errors]):
            log(heading, 'FAIL', time_stamp=time_stamp)
            # summarize the errors
            if None not in self.job_ids or len(self.job_ids) > 1:
                summaries = [
                    (missing_logs, '{}{} jobs stamped complete but missing log files (jobs: {})'),
                    (missing_stamp, '{}{} jobs logged complete but missing stamp (jobs: {})'),
                    (missing_both, '{}{} jobs not started (no log/stamp) (jobs: {})'),
                    (incomplete_jobs, '{}{} jobs incomplete without errors (jobs: {})'),
                    (errors, '{}{} jobs CRASHED (jobs: {})'),
                ]
                for jobs, template in summaries:
                    if jobs:
                        log(
                            template.format(detail_prefix, len(jobs), convert_set_to_ranges(jobs)),
                            time_stamp=False)
                # group the crashed jobs by their crash message (sorted for a stable output order)
                details = {}
                for job_task_id in sorted(errors):
                    details.setdefault(self.logs[job_task_id].message, set()).add(job_task_id)
                for msg, jobs in details.items():
                    if len(msg) > 80:
                        msg = msg[:80] + ' ...'
                    log('{}{} (jobs: {})'.format(
                        indent * (indent_level + 2), msg, convert_set_to_ranges(jobs)), time_stamp=False)
            else:
                if missing_logs:
                    log(detail_prefix + 'job stamped complete but missing log file', time_stamp=False)
                if missing_stamp:
                    log(detail_prefix + 'job logged complete but missing complete stamp', time_stamp=False)
                if missing_both:
                    log(detail_prefix + 'job not started (no log/stamp)', time_stamp=False)
                if incomplete_jobs:
                    log(detail_prefix + 'job incomplete without errors', time_stamp=False)
                if errors:
                    log(detail_prefix + 'job CRASHED', self.logs[None].message, time_stamp=False)
            return False

        log(heading, 'OK', time_stamp=time_stamp)
        run_times, all_times = self.estimate_run_time()
        if run_times:
            self.max_run_time = max(run_times)
            self.total_run_time = sum(run_times)
            self.avg_run_time = int(round(self.total_run_time / len(self.job_ids), 0))
            # the times are only a lower bound when some jobs did not report a run time
            prefix = detail_prefix + ('' if all_times else 'min ')
            if self.name in [SUBCOMMAND.ANNOTATE, SUBCOMMAND.VALIDATE]:
                log(prefix + 'run times ({} jobs): {} (max), {} (total), {} (average)'.format(
                    len(self.job_ids), self.max_run_time, self.total_run_time, self.avg_run_time), time_stamp=False)
            else:
                log(prefix + 'run time:', self.max_run_time, time_stamp=False)
        else:
            log(detail_prefix + 'error parsing run-times from the log files', time_stamp=False)
        return True

    def estimate_run_time(self):
        """
        pull the run time information from the logs/stamps

        A stamp is preferred over a log, and each stamp or log file contributes at most once
        even when several jobs share it.

        Returns:
            tuple: the collected run times (a view of the dict values) and a bool which is False
            when a job was checked but no run time could be found for it
        """
        run_time = {}
        files_used = set()
        all_times = True
        for job_task_id in self.job_ids:
            if job_task_id in self.stamps:
                stamp = self.stamps[job_task_id]
                if stamp in files_used:
                    continue
                files_used.add(stamp)
                job_run_time = parse_run_time(stamp)
                if job_run_time is not None:
                    run_time[job_task_id] = job_run_time
                    continue
            if job_task_id in self.logs:
                logfile = self.logs[job_task_id]
                # files_used holds file names, so compare the log's file name and not the
                # LogDetails object itself
                if logfile.filename in files_used:
                    continue
                files_used.add(logfile.filename)
                if logfile.run_time is not None:
                    run_time[job_task_id] = logfile.run_time
            if job_task_id not in run_time:
                all_times = False
        return run_time.values(), all_times

    def collect_stamp(self, job_task_id=None):
        """
        finds and stores the job complete stamp

        Args:
            job_task_id (int): the job to look up; None for single-job stages

        Raises:
            NotImplementedError: the stage name is not one the checker knows about
        """
        if self.name in [SUBCOMMAND.ANNOTATE, SUBCOMMAND.VALIDATE]:
            # annotation and validation are setup in subdirectories each with their own complete stamp
            stamp_pattern = os.path.join(self.output_dir, '*-' + str(job_task_id), COMPLETE_STAMP)
        elif self.name in [SUBCOMMAND.CLUSTER, SUBCOMMAND.SUMMARY, SUBCOMMAND.PAIR]:
            # single stamp for top-level directory
            stamp_pattern = os.path.join(self.output_dir, COMPLETE_STAMP)
        else:
            raise NotImplementedError('checker has not been implemented for pipeline stage', self.name)
        try:
            self.stamps[job_task_id] = unique_exists(stamp_pattern)
        except OSError:
            # an OSError from the lookup means there is no stamp to record for this job
            pass

    def collect_log(self, job_task_id=None):
        """
        finds and stores the job log file

        The candidate patterns are tried in order and the first one that yields a readable
        log is kept.

        Args:
            job_task_id (int): the job to look up; None for single-job stages
        """
        if self.name in [SUBCOMMAND.ANNOTATE, SUBCOMMAND.VALIDATE]:
            # annotation and validation are setup in subdirectories each with their own log
            patterns = [
                os.path.join(self.output_dir, '{}.{}'.format(SGE_LOG_PATTERN, job_task_id)),  # old log pattern
                os.path.join(self.output_dir, '*-{}'.format(job_task_id), LOG_PATTERN),  # single job
                os.path.join(self.output_dir, LOG_PATTERN)  # old log pattern manual run
            ]
        else:
            patterns = [
                os.path.join(self.output_dir, SGE_LOG_PATTERN),  # single job
                os.path.join(self.output_dir, LOG_PATTERN)  # manual run
            ]
        for log_pattern in patterns:
            try:
                self.logs[job_task_id] = LogDetails(unique_exists(log_pattern, allow_none=False, get_newest=True))
            except OSError:
                # not found (or not readable) under this pattern: try the next one
                continue
            break
class LibraryRun:
    """
    stores run information for pipeline steps that are run on individual libraries

    The cluster, validation and annotation attributes each hold a PipelineStageRun, or None
    when the output directory of that stage does not exist.
    """
    def __init__(self, name, output_dir):
        """
        Args:
            name (str): the library (directory) name
            output_dir (str): path to the library output directory
        """
        self.name = name
        self.output_dir = output_dir
        self.max_run_time = 0
        self.total_run_time = 0
        self.avg_run_time = 0
        self.log_parse_error = False
        self.cluster = self._load_stage(SUBCOMMAND.CLUSTER)
        self.validation = self._load_stage(SUBCOMMAND.VALIDATE)
        self.annotation = self._load_stage(SUBCOMMAND.ANNOTATE)

    def _load_stage(self, stage_name):
        """
        builds the PipelineStageRun for a stage sub-directory, or returns None if it is missing
        """
        try:
            return PipelineStageRun(stage_name, os.path.join(self.output_dir, stage_name))
        except OSError:
            return None

    def report(self):
        """
        reports on each stage of the library and accumulates the stage run times

        The cluster and annotation stages are required: if either is missing the result is
        False. The validation stage is optional. A missing cluster stage, or any stage whose
        run time could not be determined, sets log_parse_error.

        Returns:
            bool: True if every required stage is present and every present stage reported success
        """
        self.max_run_time = 0
        self.total_run_time = 0
        self.avg_run_time = 0
        result = True

        # every stage that is present is checked for the union of the job ids seen in any of them
        present = [stage for stage in (self.cluster, self.validation, self.annotation) if stage]
        collective_job_ids = set()
        for stage in present:
            collective_job_ids.update(stage.job_ids)
        for stage in present:
            stage.job_ids.update(collective_job_ids)

        if not self.cluster or not self.cluster.report(indent_level=1):
            result = False
        if self.validation and not self.validation.report(indent_level=1):
            result = False
        if not self.annotation or not self.annotation.report(indent_level=1):
            result = False

        if not self.cluster:
            # no cluster stage means its run time is unknown
            self.log_parse_error = True
        for stage in [self.cluster, self.validation, self.annotation]:
            if not stage:
                continue
            if stage.max_run_time is not None:
                self.max_run_time += stage.max_run_time
                self.total_run_time += stage.total_run_time
                self.avg_run_time += stage.avg_run_time
            else:
                self.log_parse_error = True
        return result
def convert_set_to_ranges(input_set):
    """
    formats a collection of integers as a comma-separated string of consecutive ranges

    Args:
        input_set (set): the integers to summarize

    Returns:
        str: the sorted values with each run of consecutive integers collapsed to 'start-end'

    Example:
        >>> convert_set_to_ranges({1, 2, 3, 7, 9, 10, 11})
        '1-3, 7, 9-11'
    """
    spans = []
    for value in sorted(input_set):
        if spans and spans[-1][1] + 1 == value:
            # extends the current run
            spans[-1][1] = value
        else:
            spans.append([value, value])
    return ', '.join(
        str(low) if low == high else '{}-{}'.format(low, high)
        for low, high in spans
    )
def check_completion(target_dir, skipped_stages=None):
    """
    checks every stage found in the pipeline output directory and logs a status summary

    Library directories (names matching LIBRARY_DIR_REGEX) are reported first, followed by
    the pairing and summary stages. A pairing or summary stage with no output directory is
    reported as failed.

    Args:
        target_dir (str): path to the main pipeline output directory
        skipped_stages: accepted for backwards compatibility; not currently used

    Returns:
        bool: True if every library and both the pairing and summary stages reported success
    """
    libraries = []
    summary = None
    pairing = None

    # check the library steps first
    for subdir in sorted(glob.glob(os.path.join(target_dir, '*'))):
        stage_name = os.path.basename(subdir)
        if stage_name == SUBCOMMAND.PAIR:
            pairing = PipelineStageRun(stage_name, subdir)
        elif stage_name == SUBCOMMAND.SUMMARY:
            summary = PipelineStageRun(stage_name, subdir)
        elif re.match(LIBRARY_DIR_REGEX, stage_name):
            libraries.append(LibraryRun(stage_name, subdir))
        else:
            log('ignoring dir', subdir)

    success_flag = True
    library_max_times = []
    total_run_time = 0
    log_parse_error = False

    for lib in sorted(libraries, key=lambda x: x.name):
        log('checking library:', lib.name)
        if not lib.report():
            success_flag = False
        if lib.max_run_time:
            library_max_times.append(lib.max_run_time)
        total_run_time += lib.total_run_time
        if lib.log_parse_error:
            log_parse_error = True
    # the parallel estimate uses the slowest library and then adds the pairing/summary stages
    max_run_time = max(library_max_times + [0])

    for stage_name, stage in [(SUBCOMMAND.PAIR, pairing), (SUBCOMMAND.SUMMARY, summary)]:
        if stage is None:
            # the stage directory was never created: report it rather than fail on None
            log(stage_name, 'FAIL', time_stamp=True)
            log(' output directory not found: stage not started, or skipped', time_stamp=False)
            success_flag = False
            log_parse_error = True
            continue
        if not stage.report(time_stamp=True):
            success_flag = False
        if stage.max_run_time is not None:
            max_run_time += stage.max_run_time
            total_run_time += stage.total_run_time
        else:
            log_parse_error = True

    # times are only a lower bound when any run time could not be determined
    prefix = '' if not log_parse_error else 'min '
    log(prefix + 'parallel run time (s):', max_run_time)
    log(prefix + 'total run time (s):', total_run_time)
    return success_flag