#!/usr/bin/env python
################################################################################
#
#
# task.py
#
# Copyright (c) 10/9/2009 Leo Goodstadt
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#################################################################################
"""
********************************************
:mod:`ruffus.task` -- Overview
********************************************
.. moduleauthor:: Leo Goodstadt <ruffus@llew.org.uk>

Initial implementation of @active_if by Jacob Biesinger

============================
Decorator syntax:
============================

Pipelined tasks are created by "decorating" a function with the following syntax::

    def func_a():
        pass

    @follows(func_a)
    def func_b ():
        pass

Each task is a single function which is applied one or more times to a list of parameters
(typically input files to produce a list of output files).
Each of these is a separate, independent job (sharing the same code) which can be
run in parallel.
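
For example, the (purely illustrative) task below is applied once per input file;
each ``*.txt -> *.copy`` pair is a separate job that can run in parallel::

    @transform(["a.txt", "b.txt"], suffix(".txt"), ".copy")
    def copy_file(input_file, output_file):
        open(output_file, "w").write(open(input_file).read())
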
============================
Running the pipeline
============================

To run the pipeline::

    pipeline_run(target_tasks, forcedtorun_tasks = [], multiprocess = 1,
                    logger = stderr_logger,
                    gnu_make_maximal_rebuild_mode = True,
                    cleanup_log = "../cleanup.log")

    pipeline_cleanup(cleanup_log = "../cleanup.log")

"""
from __future__ import with_statement
import os,sys,copy, multiprocessing
#from collections import namedtuple
import collections
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
# imports
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
import logging
import re
from collections import defaultdict
# itemgetter is used by the t_job_result property accessors below
from operator import itemgetter
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
import traceback
import types
from itertools import imap
import textwrap
import time
from multiprocessing.managers import SyncManager
from contextlib import contextmanager
import cPickle as pickle
import dbdict
if __name__ == '__main__':
import sys
sys.path.insert(0,".")
from graph import *
from print_dependencies import *
from ruffus_exceptions import *
from ruffus_utility import *
from file_name_parameters import *
#
# use simplejson in place of json for python < 2.6
#
try:
import json
except ImportError:
import simplejson
json = simplejson
dumps = json.dumps
import Queue
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#
# light weight logging objects
#
#
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
class t_black_hole_logger:
"""
Does nothing!
"""
def info (self, message, *args, **kwargs):
pass
def debug (self, message, *args, **kwargs):
pass
def warning (self, message, *args, **kwargs):
pass
def error (self, message, *args, **kwargs):
pass
class t_stderr_logger:
"""
Everything to stderr
"""
def __init__ (self):
self.unique_prefix = ""
def add_unique_prefix (self):
import random
random.seed()
self.unique_prefix= str(random.randint(0,1000)) + " "
def info (self, message):
sys.stderr.write(self.unique_prefix + message + "\n")
def warning (self, message):
sys.stderr.write("\n\n" + self.unique_prefix + "WARNING:\n " + message + "\n\n")
def error (self, message):
sys.stderr.write("\n\n" + self.unique_prefix + "ERROR:\n " + message + "\n\n")
def debug (self, message):
sys.stderr.write(self.unique_prefix + message + "\n")
class t_stream_logger:
"""
    Everything to the supplied stream
"""
def __init__ (self, stream):
self.stream = stream
def info (self, message):
self.stream.write(message + "\n")
    def warning (self, message):
        self.stream.write("\n\nWARNING:\n " + message + "\n\n")
    def error (self, message):
        self.stream.write("\n\nERROR:\n " + message + "\n\n")
def debug (self, message):
self.stream.write(message + "\n")
black_hole_logger = t_black_hole_logger()
stderr_logger = t_stderr_logger()
class t_verbose_logger:
def __init__ (self, verbose, logger, runtime_data):
self.verbose = verbose
self.logger = logger
self.runtime_data = runtime_data
#_________________________________________________________________________________________
#
# logging helper function
#
#________________________________________________________________________________________
def log_at_level (logger, message_level, verbose_level, msg):
"""
    writes to the log only if message_level <= verbose_level
"""
if message_level <= verbose_level:
logger.info(msg)
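#
#   Illustrative example (assumed values): with verbose_level = 3, a message at
#   level 4 is suppressed while one at level 2 is written via logger.info()
#
#       log_at_level(stderr_logger, 4, 3, "hidden")     # not written
#       log_at_level(stderr_logger, 2, 3, "written")    # written
#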
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
# queue management objects
# inserted into queue like job parameters to control multi-processing queue
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
# fake parameters to signal in queue
class all_tasks_complete:
pass
class waiting_for_more_tasks_to_complete:
pass
#
# synchronisation data
#
#SyncManager()
#syncmanager.start()
#
# do nothing semaphore
#
@contextmanager
def do_nothing_semaphore():
yield
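#
#   Note: yielding nothing lets callers always write "with job_limit_semaphore:"
#   even when no @jobs_limit semaphore applies
#   (see run_pooled_job_without_exceptions below)
#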
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
# task_decorator
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
class task_decorator(object):
"""
Adds task to the "pipeline_task" attribute of this function but
otherwise leaves function untouched
"""
def __init__(self, *decoratorArgs):
"""
saves decorator arguments
"""
self.args = decoratorArgs
def __call__(self, func):
"""
calls func in task with the same name as the class
"""
# add task as attribute of this function
if not hasattr(func, "pipeline_task"):
func.pipeline_task = _task.create_task(func)
# call the method called
# "task.task_decorator"
# where "task_decorator" is the name of this class
decorator_function_name = "task_" + self.__class__.__name__
task_decorator_function = getattr(func.pipeline_task, decorator_function_name)
task_decorator_function(self.args)
#
# don't change the function so we can call it unaltered
#
return func
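#
#   Illustrative sketch of the dispatch convention above (names hypothetical):
#   applying a decorator both registers the function as a _task and forwards the
#   decorator arguments to the same-named "task_" method, e.g.
#
#       @transform("*.txt", suffix(".txt"), ".out")     # calls _task.task_transform(args)
#       def my_func(input_file, output_file):
#           pass
#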
#
# Basic decorators
#
class follows(task_decorator):
pass
class files(task_decorator):
pass
#
# Core
#
class split(task_decorator):
pass
class transform(task_decorator):
pass
class subdivide(task_decorator):
pass
class originate(task_decorator):
pass
class merge(task_decorator):
pass
class posttask(task_decorator):
pass
class jobs_limit(task_decorator):
pass
#
# Advanced
#
class collate(task_decorator):
pass
class active_if(task_decorator):
pass
#
# Esoteric
#
class check_if_uptodate(task_decorator):
pass
class parallel(task_decorator):
pass
#
# Obsolete
#
class files_re(task_decorator):
pass
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
# indicator objects
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#_________________________________________________________________________________________
# mkdir
#_________________________________________________________________________________________
class mkdir(task_decorator):
#def __init__ (self, *args):
# self.args = args
pass
#_________________________________________________________________________________________
# touch_file
#_________________________________________________________________________________________
class touch_file(object):
def __init__ (self, *args):
self.args = args
#_________________________________________________________________________________________
# inputs
#_________________________________________________________________________________________
class inputs(object):
def __init__ (self, *args):
self.args = args
#_________________________________________________________________________________________
# add_inputs
#_________________________________________________________________________________________
class add_inputs(object):
def __init__ (self, *args):
self.args = args
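#
#   Illustrative sketch (hypothetical task and file names): within a decorator
#   parameter list, add_inputs(...) appends to each job's input parameters
#   whereas inputs(...) replaces them entirely:
#
#       @transform(task_a, suffix(".bam"), add_inputs("genome.fa"), ".counts")
#       # each job receives (original_input, "genome.fa") as its input
#
#       @transform(task_a, suffix(".bam"), inputs("genome.fa"), ".counts")
#       # each job's input is replaced by "genome.fa"
#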
#8888888888888888888888888888888888888888888888888888888888888888888888888888888888888
# job descriptors
# given parameters, returns strings describing job
# First returned parameter is string in strong form
# Second returned parameter is a list of strings for input, output and extra parameters
# intended to be reformatted with indentation
# main use in error logging
#8888888888888888888888888888888888888888888888888888888888888888888888888888888888888
def generic_job_descriptor (param, runtime_data):
if param in ([], None):
m = "Job"
else:
m = "Job = %s" % ignore_unknown_encoder(param)
return m, [m]
def io_files_job_descriptor (param, runtime_data):
extra_param = ", " + shorten_filenames_encoder(param[2:])[1:-1] if len(param) > 2 else ""
out_param = shorten_filenames_encoder(param[1]) if len(param) > 1 else "??"
in_param = shorten_filenames_encoder(param[0]) if len(param) > 0 else "??"
return ("Job = [%s -> %s%s]" % (in_param, out_param, extra_param),
["Job = [%s" % in_param, "-> " + out_param + extra_param + "]"])
def io_files_one_to_many_job_descriptor (param, runtime_data):
extra_param = ", " + shorten_filenames_encoder(param[2:])[1:-1] if len(param) > 2 else ""
out_param = shorten_filenames_encoder(param[1]) if len(param) > 1 else "??"
in_param = shorten_filenames_encoder(param[0]) if len(param) > 0 else "??"
# start with input parameter
ret_params = ["Job = [%s" % in_param]
# add output parameter to list,
# processing one by one if multiple output parameters
if len(param) > 1:
if isinstance(param[1], (list, tuple)):
ret_params.extend("-> " + shorten_filenames_encoder(p) for p in param[1])
else:
ret_params.append("-> " + out_param)
# add extra
if len(param) > 2 :
ret_params.append(" , " + shorten_filenames_encoder(param[2:])[1:-1])
# add closing bracket
ret_params[-1] +="]"
return ("Job = [%s -> %s%s]" % (in_param, out_param, extra_param), ret_params)
def mkdir_job_descriptor (param, runtime_data):
# input, output and parameters
if len(param) == 1:
m = "Make directories %s" % (shorten_filenames_encoder(param[0]))
elif len(param) == 2:
m = "Make directories %s" % (shorten_filenames_encoder(param[1]))
else:
return [], []
return m, [m]
#8888888888888888888888888888888888888888888888888888888888888888888888888888888888888
# job wrappers
# registers files/directories for cleanup
#8888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#_________________________________________________________________________________________
# generic job wrapper
#_________________________________________________________________________________________
def job_wrapper_generic(param, user_defined_work_func, register_cleanup, touch_files_only):
"""
run func
"""
assert(user_defined_work_func)
return user_defined_work_func(*param)
#_________________________________________________________________________________________
# job wrapper for all that deal with i/o files
#_________________________________________________________________________________________
def job_wrapper_io_files(param, user_defined_work_func, register_cleanup, touch_files_only, output_files_only = False):
"""
run func on any i/o if not up to date
"""
assert(user_defined_work_func)
i,o = param[0:2]
if touch_files_only == 0:
# @originate only uses output files
if output_files_only:
ret_val = user_defined_work_func(*(param[1:]))
# all other decorators
else:
ret_val = user_defined_work_func(*param)
elif touch_files_only == 1:
#job_history = dbdict.open(RUFFUS_HISTORY_FILE, picklevalues=True)
#
# touch files only
#
for f in get_strings_in_nested_sequence(o):
#
# race condition still possible...
#
            with open(f, 'a'):
os.utime(f, None)
#if not os.path.exists(f):
# open(f, 'w')
# mtime = os.path.getmtime(f)
#else:
# os.utime(f, None)
# mtime = os.path.getmtime(f)
#chksum = JobHistoryChecksum(f, mtime, param[2:], user_defined_work_func.pipeline_task)
#job_history[f] = chksum # update file times and job details in history
#
# register strings in output file for cleanup
#
for f in get_strings_in_nested_sequence(o):
register_cleanup(f, "file")
#_________________________________________________________________________________________
# job wrapper for all that only deals with output files
#_________________________________________________________________________________________
def job_wrapper_output_files(param, user_defined_work_func, register_cleanup, touch_files_only):
"""
run func on any output file if not up to date
"""
job_wrapper_io_files(param, user_defined_work_func, register_cleanup, touch_files_only, output_files_only = True)
#_________________________________________________________________________________________
# job wrapper for mkdir
#_________________________________________________________________________________________
def job_wrapper_mkdir(param, user_defined_work_func, register_cleanup, touch_files_only):
"""
    make directories if they do not already exist
"""
#
# Just in case, swallow file exist errors because some other makedirs might be subpath
# of this directory
# Should not be necessary because of "sorted" in task_mkdir
#
#
if len(param) == 1:
dirs = param[0]
# if there are two parameters, they are i/o, and the directories to be created are the output
elif len(param) == 2:
dirs = param[1]
else:
raise Exception("Wrong number of arguments in mkdir check %s" % (param,))
# get all file names in flat list
dirs = get_strings_in_nested_sequence (dirs)
for d in dirs:
try:
os.makedirs(d)
register_cleanup(d, "makedirs")
except:
#
# ignore exception if exception == OSError / "File exists"
#
exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
if exceptionType == OSError and "File exists" in str(exceptionValue):
continue
raise
# changed for compatibility with python 3.x
#except OSError, e:
# if "File exists" not in e:
# raise
JOB_ERROR = 0
JOB_SIGNALLED_BREAK = 1
JOB_UP_TO_DATE = 2
JOB_COMPLETED = 3
#_________________________________________________________________________________________
# t_job_result
# Previously a collections.namedtuple (introduced in python 2.6)
# Now using the implementation generated by running
# t_job_result = namedtuple('t_job_result', 'task_name state job_name return_value exception', verbose =1)
# for compatibility with python 2.5
#_________________________________________________________________________________________
class t_job_result(tuple):
't_job_result(task_name, state, job_name, return_value, exception, params)'
__slots__ = ()
fields = ('task_name', 'state', 'job_name', 'return_value', 'exception', 'params')
def __new__(cls, task_name, state, job_name, return_value, exception, params):
return tuple.__new__(cls, (task_name, state, job_name, return_value, exception, params))
@classmethod
def make(cls, iterable, new=tuple.__new__, len=len):
'Make a new t_job_result object from a sequence or iterable'
result = new(cls, iterable)
if len(result) != 6:
raise TypeError('Expected 6 arguments, got %d' % len(result))
return result
def __repr__(self):
return 't_job_result(task_name=%r, state=%r, job_name=%r, return_value=%r, exception=%r, params=%r)' % self
def asdict(t):
'Return a new dict which maps field names to their values'
return {'task_name': t[0], 'state': t[1], 'job_name': t[2], 'return_value': t[3], 'exception': t[4], 'params':t[5]}
def replace(self, **kwds):
'Return a new t_job_result object replacing specified fields with new values'
result = self.make(map(kwds.pop, ('task_name', 'state', 'job_name', 'return_value', 'exception', 'params'), self))
if kwds:
raise ValueError('Got unexpected field names: %r' % kwds.keys())
return result
def __getnewargs__(self):
return tuple(self)
task_name = property(itemgetter(0))
state = property(itemgetter(1))
job_name = property(itemgetter(2))
return_value= property(itemgetter(3))
exception = property(itemgetter(4))
params = property(itemgetter(5))
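#
#   Illustrative sketch (hypothetical values): results are built and read much
#   like a namedtuple, e.g.
#
#       res = t_job_result("my_module.my_task", JOB_COMPLETED,
#                          "Job = [a.txt -> a.out]", None, None, ["a.txt", "a.out"])
#       res.state == JOB_COMPLETED          # field access via the properties above
#       res.params                          # ["a.txt", "a.out"]
#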
#_________________________________________________________________________________________
# multiprocess_callback
#
#_________________________________________________________________________________________
def run_pooled_job_without_exceptions (process_parameters):
"""
handles running jobs in parallel
Make sure exceptions are caught here:
Otherwise, these will kill the thread/process
return any exceptions which will be rethrown at the other end:
See RethrownJobError / run_all_jobs_in_task
"""
(param, task_name, job_name, job_wrapper, user_defined_work_func,
job_limit_semaphore, touch_files_only) = process_parameters
##job_history = dbdict.open(RUFFUS_HISTORY_FILE, picklevalues=True)
##outfile = param[1] if len(param) > 1 else None # mkdir has no output
##if not isinstance(outfile, list):
## outfile = [outfile]
##for o in outfile:
## job_history.pop(o, None) # remove outfile from history if it exists
if job_limit_semaphore == None:
job_limit_semaphore = do_nothing_semaphore()
try:
with job_limit_semaphore:
return_value = job_wrapper(param, user_defined_work_func, register_cleanup, touch_files_only)
#
# ensure one second between jobs
#
#if one_second_per_job:
# time.sleep(1.01)
return t_job_result(task_name, JOB_COMPLETED, job_name, return_value, None, param)
except:
# Wrap up one or more exceptions rethrown across process boundaries
#
# See multiprocessor.Server.handle_request/serve_client for an analogous function
exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
exception_stack = traceback.format_exc(exceptionTraceback)
exception_name = exceptionType.__module__ + '.' + exceptionType.__name__
exception_value = str(exceptionValue)
if len(exception_value):
exception_value = "(%s)" % exception_value
if exceptionType == JobSignalledBreak:
job_state = JOB_SIGNALLED_BREAK
else:
job_state = JOB_ERROR
return t_job_result(task_name, job_state, job_name, None,
[task_name,
job_name,
exception_name,
exception_value,
exception_stack], param)
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
# Helper function
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#_________________________________________________________________________________________
# register_cleanup
# to do
#_________________________________________________________________________________________
def register_cleanup (file_name, operation):
pass
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
# _task
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
class _task (node):
"""
pipeline task
"""
action_names = ["unspecified",
"task",
"task_files_re",
"task_split",
"task_merge",
"task_transform",
"task_collate",
"task_files_func",
"task_files",
"task_mkdir",
"task_parallel",
"task_active_if",
"task_product",
"task_permutations",
"task_combinations",
"task_combinations_with_replacement",
"task_subdivide",
"task_originate",
]
action_unspecified = 0
action_task = 1
action_task_files_re = 2
action_task_split = 3
action_task_merge = 4
action_task_transform = 5
action_task_collate = 6
action_task_files_func = 7
action_task_files = 8
action_mkdir = 9
action_parallel = 10
action_active_if = 11
action_task_product = 12
action_task_permutations = 13
action_task_combinations = 14
action_task_combinations_with_replacement = 15
action_task_subdivide = 16
action_task_originate = 17
multiple_jobs_outputs = 0
single_job_single_output = 1
job_single_matches_parent= 2
job_limit_semaphores = {}
#_________________________________________________________________________________________
# create_task / __init__
#_________________________________________________________________________________________
@staticmethod
def create_task(func):
"""
        Create task if the name has not been previously specified
Note that the task function may not have been created yet.
This allows us to create tasks and dependencies out of order
"""
func_name = func.__name__
module_name = str(func.__module__)
task_name = module_name + "." + func_name
# Link to existing dependency if task name has previously been specified
if node.is_node(task_name):
t = node.lookup_node_from_name(task_name)
if t.user_defined_work_func != None:
raise error_duplicate_task_name("Same task name %s specified multiple times in the same module" % task_name)
# otherwise create new
else:
t = _task(module_name, func_name)
t.set_action_type (_task.action_task)
t.user_defined_work_func = func
assert(t._name == task_name)
# convert description into one line
if func.__doc__:
t._description = re.sub("\n\s+", " ", func.__doc__).strip()
else:
t._description = ""
return t
#_________________________________________________________________________________________
# get_action_name
#_________________________________________________________________________________________
def get_action_name (self):
return _task.action_names[self._action_type]
#_________________________________________________________________________________________
# __init__
#_________________________________________________________________________________________
def __init__ (self, module_name, func_name):
"""
Does nothing because this might just be a dependency.
If it does not get initialised by a real task
        (i.e. a task depends on an unknown function/task),
throw an exception when running the pipeline
"""
self._module_name = module_name
self._func_name = func_name
node.__init__ (self, module_name + "." + func_name)
self._action_type = _task.action_unspecified
# Each task has its own checksum level
# At the moment this is really so multiple pipelines in the same script can have
# different checksum levels
# Though set by pipeline_xxxx functions, have initial valid value so unit tests work :-|
self.checksum_level = CHECKSUM_FILE_TIMESTAMPS
self.param_generator_func = None
self.needs_update_func = None
self.job_wrapper = job_wrapper_generic
#
self.job_descriptor = generic_job_descriptor
# jobs which produce a single output.
# special handling for task.get_output_files for dependency chaining
self._single_job_single_output = self.multiple_jobs_outputs
self.single_multi_io = self.many_to_many
# function which is decorated and does the actual work
self.user_defined_work_func = None
# functions which will be called when task completes
self.posttask_functions = []
# give makedir automatically made parent tasks unique names
self.cnt_task_mkdir = 0
# whether only task function itself knows what output it will produce
# i.e. output is a glob or something similar
self.indeterminate_output = 0
# cache output file names here
self.output_filenames = None
self.semaphore_name = module_name + "." + func_name
# do not test for whether task is active
self.active_if_checks = None
# extra flag for outputfiles
self.is_active = True
#_________________________________________________________________________________________
# init_for_pipeline
#_________________________________________________________________________________________
def init_for_pipeline (self):
"""
Initialize variables for pipeline run / printout
**********
BEWARE
**********
Because state is stored, ruffus is *not* reentrant.
**********
BEWARE
**********
"""
# cache output file names here
self.output_filenames = None
#_________________________________________________________________________________________
# set_action_type
#_________________________________________________________________________________________
def set_action_type (self, new_action_type):
"""
Save how this task
1) tests whether it is up-to-date and
2) handles input/output files
Checks that the task has not been defined with conflicting actions
"""
if self._action_type not in (_task.action_unspecified, _task.action_task):
old_action = _task.action_names[self._action_type]
new_action = _task.action_names[new_action_type]
actions = " and ".join(list(set((old_action, new_action))))
task_name = "def %s(...)" % self._name.replace("__main__.", "")
raise error_decorator_args((" %s\n has duplicate task specifications: (%s)\n") %
(task_name, actions))
self._action_type = new_action_type
self._action_type_desc = _task.action_names[new_action_type]
#_________________________________________________________________________________________
# get_job_name
#_________________________________________________________________________________________
def get_job_name(self, descriptive_param, runtime_data):
"""
Use job descriptor to return short name for job, including any parameters
runtime_data is not (yet) used but may be used to add context in future
"""
return self.job_descriptor(descriptive_param, runtime_data)[0]
#_________________________________________________________________________________________
# get_task_name
#_________________________________________________________________________________________
def get_task_name(self, in_func_format = False):
"""
Returns name of task function, removing __main__ namespace if necessary
if in_func_format is true, will return def task_func(...):
"""
task_name = self._name.replace("__main__.", "")
if self._action_type != _task.action_mkdir and in_func_format:
return "def %s(...):" % task_name
else:
return task_name
#_________________________________________________________________________________________
# update_active_state
#_________________________________________________________________________________________
def update_active_state (self):
#
# If has an @active_if decorator, check if the task needs to be run
# @active_if parameters may be call back functions or booleans
#
if (self.active_if_checks != None and
any( not arg() if isinstance(arg, collections.Callable) else not arg
for arg in self.active_if_checks)):
# flip is active to false.
# ( get_output_files() will return empty if inactive )
# Remember each iteration of pipeline_printout pipeline_run will have
# another bite at changing this value
self.is_active = False
else:
# flip is active to True so that downstream dependencies will be correct
# ( get_output_files() will return empty if inactive )
# Remember each iteration of pipeline_printout pipeline_run will have
# another bite at changing this value
self.is_active = True
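    #
    #   Illustrative sketch (hypothetical flag and file names): @active_if takes
    #   booleans and/or callables; the task is switched off if any evaluates to False
    #
    #       run_extra_qc = False
    #
    #       @active_if(run_extra_qc)
    #       @transform(["a.txt"], suffix(".txt"), ".qc")
    #       def extra_qc_step(input_file, output_file):
    #           pass
    #
    #   get_output_files() then returns [] for the inactive task, so downstream
    #   tasks see no input coming from it.
    #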
#_________________________________________________________________________________________
# printout
#_________________________________________________________________________________________
def printout (self, runtime_data, force_rerun, job_history, verbose=1, indent = 4):
"""
Print out all jobs for this task
verbose = 1 : print task name
2 : print task description if exists
3 : print job names for jobs to be run
                  4 : print job names for up-to-date jobs
"""
def get_job_names (param, indent_str):
job_names = self.job_descriptor(param, runtime_data)[1]
if len(job_names) > 1:
job_names = ([indent_str + job_names[0]] +
[indent_str + " " + jn for jn in job_names[1:]])
else:
job_names = ([indent_str + job_names[0]])
return job_names
if not verbose:
return []
indent_str = ' ' * indent
messages = []
messages.append("Task = " + self.get_task_name() + (" >>Forced to rerun<<" if force_rerun else ""))
if verbose ==1:
return messages
if verbose >= 2 and len(self._description):
messages.append(indent_str + '"' + self._description + '"')
#
# single job state
#
if verbose > 5:
if self._single_job_single_output == self.single_job_single_output:
messages.append(" Single job single output")
elif self._single_job_single_output == self.multiple_jobs_outputs:
messages.append(" Multiple jobs Multiple outputs")
else:
messages.append(" Single jobs status depends on %s" % self._single_job_single_output._name)
if verbose <= 2 :
return messages
# increase indent for jobs up to date status
indent_str += " " * 3
#
# If has an @active_if decorator, check if the task needs to be run
# @active_if parameters may be call back functions or booleans
#
if not self.is_active:
if verbose <= 3:
return messages
messages.append(indent_str + "Task is inactive")
# add spacer line
messages.append("")
return messages
#
# No parameters: just call task function
#
if self.param_generator_func == None:
if verbose <= 3:
return messages
#
# needs update func = None: always needs update
#
if not self.needs_update_func:
messages.append(indent_str + "Task needs update: No function to check if up-to-date or not")
return messages
if self.needs_update_func == needs_update_check_modify_time:
needs_update, msg = self.needs_update_func (task=self, job_history = job_history)
else:
needs_update, msg = self.needs_update_func ()
if needs_update:
messages.append(indent_str + "Task needs update: %s" % msg)
else:
messages.append(indent_str + "Task up-to-date")
else:
runtime_data["MATCH_FAILURE"] = []
#
# return messages description per job
#
cnt_jobs = 0
for param, descriptive_param in self.param_generator_func(runtime_data):
cnt_jobs += 1
#
# needs update func = None: always needs update
#
if not self.needs_update_func:
messages.extend(get_job_names (descriptive_param, indent_str))
messages.append(indent_str + " Jobs needs update: No function to check if up-to-date or not")
continue
if self.needs_update_func == needs_update_check_modify_time:
needs_update, msg = self.needs_update_func (*param, task=self, job_history = job_history)
else:
needs_update, msg = self.needs_update_func (*param)
if needs_update:
messages.extend(get_job_names (descriptive_param, indent_str))
per_job_messages = [(indent_str + s) for s in (" Job needs update: %s" % msg).split("\n")]
messages.extend(per_job_messages)
else:
if verbose > 4:
messages.extend(get_job_names (descriptive_param, indent_str))
messages.append(indent_str + " Job up-to-date")
if cnt_jobs == 0:
messages.append(indent_str + "!!! No jobs for this task. "
"Are you sure there is not a error in your "
"code / regular expression?")
if verbose >= 3 or (verbose and cnt_jobs == 0):
if runtime_data and "MATCH_FAILURE" in runtime_data:
for s in runtime_data["MATCH_FAILURE"]:
messages.append(indent_str + "Warning: File match failure: " + s)
runtime_data["MATCH_FAILURE"] = []
messages.append("")
return messages
#_____________________________________________________________________________________
# signal
#
# returns whether up to date
#
#_____________________________________________________________________________________
def signal (self, verbose_logger_job_history):
"""
If up to date: signal = true
If true, depth first search will not pass through this node
"""
if not verbose_logger_job_history:
raise Exception("verbose_logger_job_history is None")
verbose_logger = verbose_logger_job_history[0]
job_history = verbose_logger_job_history[1]
try:
logger = verbose_logger.logger
verbose = verbose_logger.verbose
runtime_data = verbose_logger.runtime_data
log_at_level (logger, 4, verbose,
" Task = " + self.get_task_name())
#
# If job is inactive, always consider it up-to-date
#
if (self.active_if_checks != None and
any( not arg() if isinstance(arg, collections.Callable) else not arg
for arg in self.active_if_checks)):
log_at_level (logger, 4, verbose,
" Inactive task: treat as Up to date")
#print 'signaling that the inactive task is up to date'
return True
#
# Always needs update if no way to check if up to date
#
if self.needs_update_func == None:
log_at_level (logger, 4, verbose,
" No update function: treat as out of date")
return False
#
# if no parameters, just return the results of needs update
#
if self.param_generator_func == None:
if self.needs_update_func:
if self.needs_update_func == needs_update_check_modify_time:
needs_update, msg = self.needs_update_func (task=self, job_history = job_history)
else:
needs_update, msg = self.needs_update_func ()
log_at_level (logger, 4, verbose,
" Needs update = %s" % needs_update)
return not needs_update
else:
return True
else:
#
# return not up to date if ANY jobs needs update
#
for param, descriptive_param in self.param_generator_func(runtime_data):
if self.needs_update_func == needs_update_check_modify_time:
needs_update, msg = self.needs_update_func (*param, task=self, job_history = job_history)
else:
needs_update, msg = self.needs_update_func (*param)
if needs_update:
if verbose >= 4:
job_name = self.get_job_name(descriptive_param, runtime_data)
log_at_level (logger, 4, verbose,
" Needing update:\n %s" % job_name)
return False
#
# Percolate warnings from parameter factories
#
if (verbose >= 1 and "ruffus_WARNING" in runtime_data and
self.param_generator_func in runtime_data["ruffus_WARNING"]):
for msg in runtime_data["ruffus_WARNING"][self.param_generator_func]:
logger.warning(" 'In Task %s' %s " % (self.get_task_name(True), msg))
log_at_level (logger, 4, verbose, " All jobs up to date")
return True
#
# removed for compatibility with python 3.x
#
# rethrow exception after adding task name
#except error_task, inst:
# inst.specify_task(self, "Exceptions in dependency checking")
# raise
except:
exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
#
# rethrow exception after adding task name
#
            if exceptionType == error_task:
                exceptionValue.specify_task(self, "Exceptions in dependency checking")
                raise
exception_stack = traceback.format_exc(exceptionTraceback)
exception_name = exceptionType.__module__ + '.' + exceptionType.__name__
exception_value = str(exceptionValue)
if len(exception_value):
exception_value = "(%s)" % exception_value
errt = RethrownJobError([(self._name,
"",
exception_name,
exception_value,
exception_stack)])
errt.specify_task(self, "Exceptions generating parameters")
raise errt
#_____________________________________________________________________________________
# get_output_files
#
#
#_____________________________________________________________________________________
def get_output_files (self, do_not_expand_single_job_tasks, runtime_data):
"""
Cache output files
If flattened is True, returns file as a list of strings,
flattening any nested structures and discarding non string names
        Normally returns a list with one item for each job or just a list of file names.
For "single_job_single_output" i.e. @merge and @files with single jobs,
returns the output of a single job (i.e. can be a string)
"""
#
# N.B. active_if_checks is called once per task
# in make_job_parameter_generator() for consistency
#
# self.is_active can be set using self.active_if_checks in that function,
# and therefore can be changed BETWEEN invocations of pipeline_run
#
# self.is_active is not used anywhere else
#
if (not self.is_active):
return []
#
# This looks like the wrong place to flatten
#
flattened = False
if self.output_filenames == None:
self.output_filenames = []
# skip tasks which don't have parameters
if self.param_generator_func != None:
cnt_jobs = 0
for param, descriptive_param in self.param_generator_func(runtime_data):
cnt_jobs += 1
# skip tasks which don't have output parameters
if len(param) >= 2:
# make sure each @split or @subdivide or @originate returns a list of jobs
# i.e. each @split or @subdivide or @originate is always a ->many operation
# even if len(many) can be 1 (or zero)
if self.indeterminate_output and not non_str_sequence(param[1]):
self.output_filenames.append([param[1]])
else:
self.output_filenames.append(param[1])
if self._single_job_single_output == self.single_job_single_output:
if cnt_jobs > 1:
raise error_task_get_output(self,
"Task which is supposed to produce a single output "
"somehow has more than one job.")
#
# The output of @split should be treated as multiple jobs
#
# The output of @split is always a list of lists:
# 1) There is a list of @split jobs
# A) For advanced (regex) @split
# this is a many -> many more operation
# So len(list) == many (i.e. the number of jobs
# B) For normal @split
# this is a 1 -> many operation
# So len(list) = 1
#
# 2) The output of each @split job is a list
# The items in this list of lists are each a job in subsequent tasks
#
#
# So we need to concatenate these separate lists into a single list of output
#
# For example:
# @split(["a.1", "b.1"], regex(r"(.)\.1"), r"\1.*.2")
# def example(input, output):
# # JOB 1
# # a.1 -> a.i.2
# # -> a.j.2
#
# # JOB 2
# # b.1 -> b.i.2
# # -> b.j.2
#
# output_filenames = [ [a.i.2, a.j.2], [b.i.2, b.j.2] ]
#
# we want [ a.i.2, a.j.2, b.i.2, b.j.2 ]
#
# This also works for simple @split
#
# @split("a.1", r"a.*.2")
# def example(input, output):
# # only job
# # a.1 -> a.i.2
# # -> a.j.2
#
# output_filenames = [ [a.i.2, a.j.2] ]
#
# we want [ a.i.2, a.j.2 ]
#
if len(self.output_filenames) and self.indeterminate_output:
self.output_filenames = reduce(lambda x,y: x + y, self.output_filenames)
if flattened:
# if single file name, return that
# accepts unicode
if (do_not_expand_single_job_tasks and
len(self.output_filenames) and
isinstance(self.output_filenames[0], basestring)):
return self.output_filenames
# if it is flattened, might as well sort it
return sorted(get_strings_in_nested_sequence(self.output_filenames))
else:
# special handling for jobs which have a single task,
if (do_not_expand_single_job_tasks and
self._single_job_single_output and
len(self.output_filenames) ):
return self.output_filenames[0]
#
            #   sort by jobs so it is just a weeny little bit more deterministic
#
return sorted(self.output_filenames)
#_____________________________________________________________________________________
# completed
#
#
#_____________________________________________________________________________________
def completed (self, logger, jobs_uptodate = False):
"""
called even when all jobs are up to date
"""
if not self.is_active:
logger.info("Inactive Task = " + self.get_task_name())
self.output_filenames = None
return
for f in self.posttask_functions:
f()
if jobs_uptodate:
logger.info("Uptodate Task = " + self.get_task_name())
else:
logger.info("Completed Task = " + self.get_task_name())
#
        #   indeterminate output. Check actual output again if some other task's job function depends on it
# used for @split
#
if self.indeterminate_output:
self.output_filenames = None
#_________________________________________________________________________________________
# handle_tasks_globs_in_inputs
#_________________________________________________________________________________________
def handle_tasks_globs_in_inputs(self, input_params):
"""
Helper function for tasks which
1) Notes globs and tasks
2) Replaces tasks names and functions with actual tasks
3) Adds task dependencies automatically via task_follows
"""
#
# get list of function/function names and globs
#
function_or_func_names, globs, runtime_data_names = get_nested_tasks_or_globs(input_params)
#
# replace function / function names with tasks
#
tasks = self.task_follows(function_or_func_names)
functions_to_tasks = dict(zip(function_or_func_names, tasks))
input_params = replace_func_names_with_tasks(input_params, functions_to_tasks)
return t_params_tasks_globs_run_time_data(input_params, tasks, globs, runtime_data_names)
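    #
    #   Illustrative sketch (hypothetical names): an input parameter mixing a task
    #   function, a glob and a plain file name, e.g.
    #
    #       [previous_task, "*.fastq", "reference.fa"]
    #
    #   yields a t_params_tasks_globs_run_time_data whose tasks hold the _task for
    #   previous_task (which is also added as a dependency via task_follows), whose
    #   globs hold "*.fastq", and whose params have previous_task replaced by its
    #   _task object.
    #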
#8888888888888888888888888888888888888888888888888888888888888888888888888888888888888
# task handlers
# sets
# 1) action_type
# 2) param_generator_func
# 3) needs_update_func
# 4) job wrapper
#8888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#_________________________________________________________________________________________
# do_task_subdivide
#_________________________________________________________________________________________
def do_task_subdivide (self, orig_args, decorator_name, error_type):
"""
        @subdivide and the advanced (regex) form of @split are synonyms.
        Common code is here.
"""
if len(orig_args) < 3:
raise error_type(self, "Too few arguments for %s" % decorator_name)
#
# replace function / function names with tasks
#
input_files_task_globs = self.handle_tasks_globs_in_inputs(orig_args[0])
# allows split to take a single file or task
input_files_task_globs.single_file_to_list()
# how to transform input to output file name
file_names_transform = self.choose_file_names_transform (orig_args[1], error_type, decorator_name)
orig_args = orig_args[2:]
# inputs can also be defined by pattern match
extra_inputs, replace_inputs, output_pattern, extra_params = self.get_extra_inputs_outputs_extra (orig_args, error_type, decorator_name)
#
# output globs will be replaced with files. But there should not be tasks here!
#
output_files_task_globs = self.handle_tasks_globs_in_inputs(output_pattern)
if len(output_files_task_globs.tasks):
raise error_type(self, ("%s cannot output to another task. "
"Do not include tasks in output parameters.") % decorator_name)
self.param_generator_func = subdivide_param_factory ( input_files_task_globs,
False, # flatten input
file_names_transform,
extra_inputs,
replace_inputs,
output_files_task_globs,
*extra_params)
self.needs_update_func = self.needs_update_func or needs_update_check_modify_time
self.job_wrapper = job_wrapper_io_files
#self.job_descriptor = io_files_job_descriptor # (orig_args[0], output_runtime_data_names)
self.job_descriptor = io_files_one_to_many_job_descriptor
# output is a glob
self.indeterminate_output = 2
self.single_multi_io = self.many_to_many
#_________________________________________________________________________________________
# task_split
#_________________________________________________________________________________________
def do_task_simple_split (self, orig_args, decorator_name, error_type):
#check enough arguments
if len(orig_args) < 2:
raise error_type(self, "Too few arguments for %s" % decorator_name)
#
# replace function / function names with tasks
#
input_files_task_globs = self.handle_tasks_globs_in_inputs(orig_args[0])
#
# replace output globs with files
#
output_files_task_globs = self.handle_tasks_globs_in_inputs(orig_args[1])
if len(output_files_task_globs.tasks):
raise error_type(self, ("%s cannot output to another task. "
"Do not include tasks in output parameters.") % decorator_name)
extra_params = orig_args[2:]
self.param_generator_func = split_param_factory (input_files_task_globs, output_files_task_globs, *extra_params)
self.needs_update_func = self.needs_update_func or needs_update_check_modify_time
self.job_wrapper = job_wrapper_io_files
#self.job_descriptor = io_files_job_descriptor# (orig_args[1], output_runtime_data_names)
self.job_descriptor = io_files_one_to_many_job_descriptor
# output is a glob
self.indeterminate_output = 1
self.single_multi_io = self.one_to_many
#_________________________________________________________________________________________
# task_split
#_________________________________________________________________________________________
def task_split (self, orig_args):
"""
Splits a single set of input files into multiple output file names,
where the number of output files may not be known beforehand.
"""
decorator_name = "@split"
error_type = error_task_split
self.set_action_type (_task.action_task_split)
#
# This is actually @subdivide
#
if isinstance(orig_args[1], regex):
self.do_task_subdivide(orig_args, decorator_name, error_type)
#
# This is actually @split
#
else:
self.do_task_simple_split(orig_args, decorator_name, error_type)
#_________________________________________________________________________________________
# task_originate
#_________________________________________________________________________________________
def task_originate (self, orig_args):
"""
        Creates ("originates") multiple output file names from scratch,
where the number of output files may or may not be known beforehand.
This is a synonym for @split(None,...)
"""
decorator_name = "@originate"
error_type = error_task_originate
self.set_action_type (_task.action_task_originate)
if len(orig_args) < 1:
raise error_type(self, "%s takes a single argument" % decorator_name)
output_params = orig_args[0]
# make sure output_params is a list.
# Each of these will be called as an output
if not non_str_sequence (output_params):
output_params = [output_params]
#
# output globs will be replaced with files. But there should not be tasks here!
#
list_output_files_task_globs = [self.handle_tasks_globs_in_inputs(oo) for oo in output_params]
for oftg in list_output_files_task_globs:
if len(oftg.tasks):
raise error_type(self, ("%s cannot output to another task. "
"Do not include tasks in output parameters.") % decorator_name)
self.param_generator_func = originate_param_factory (list_output_files_task_globs, orig_args[1:])
self.needs_update_func = self.needs_update_func or needs_update_check_modify_time
self.job_wrapper = job_wrapper_output_files
self.job_descriptor = io_files_one_to_many_job_descriptor
# output is not a glob
self.indeterminate_output = 0
self.single_multi_io = self.many_to_many
#_________________________________________________________________________________________
# task_subdivide
#_________________________________________________________________________________________
def task_subdivide (self, orig_args):
"""
Splits a single set of input files into multiple output file names,
where the number of output files may not be known beforehand.
"""
decorator_name = "@subdivide"
error_type = error_task_subdivide
self.set_action_type (_task.action_task_subdivide)
self.do_task_subdivide(orig_args, decorator_name, error_type)
#_________________________________________________________________________________________
# get_extra_inputs
#_________________________________________________________________________________________
def get_extra_inputs_outputs_extra (self, orig_args, error_type, decorator_name):
"""
shared code for subdivide, transform, product etc for parsing orig_args into
add_inputs/inputs, output, extra
"""
#
# inputs can also be defined by pattern match
#
if isinstance(orig_args[0], inputs):
if len(orig_args) < 2:
raise error_type(self, "Too few arguments for %s" % decorator_name)
if len(orig_args[0].args) != 1:
raise error_task_transform_inputs_multiple_args(self,
"inputs(...) expects only a single argument. "
"This can be, for example, a file name, "
"a regular expression pattern, or any "
"nested structure. If the intention was to "
"specify a tuple as the input parameter, "
"please wrap the elements of the tuple "
"in brackets in the decorator\n\n"
"%s(..., inputs(...), ...)\n" % (decorator_name))
replace_inputs = t_extra_inputs.REPLACE_INPUTS
extra_inputs = self.handle_tasks_globs_in_inputs(orig_args[0].args[0])
output_pattern = orig_args[1]
extra_params = orig_args[2:]
elif isinstance(orig_args[0], add_inputs):
if len(orig_args) < 2:
raise error_type(self, "Too few arguments for %s" % decorator_name)
replace_inputs = t_extra_inputs.ADD_TO_INPUTS
extra_inputs = self.handle_tasks_globs_in_inputs(orig_args[0].args)
output_pattern = orig_args[1]
extra_params = orig_args[2:]
else:
replace_inputs = t_extra_inputs.KEEP_INPUTS
extra_inputs = None
output_pattern = orig_args[0]
extra_params = orig_args[1:]
return extra_inputs, replace_inputs, output_pattern, extra_params
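    #
    #   Illustrative sketch (hypothetical arguments): once the input and
    #   regex/suffix/formatter arguments have been stripped off,
    #
    #       orig_args = (add_inputs("genome.fa"), ".sorted.bam", "extra_arg")
    #
    #   is parsed as
    #       replace_inputs = t_extra_inputs.ADD_TO_INPUTS
    #       extra_inputs   = handle_tasks_globs_in_inputs(("genome.fa",))
    #       output_pattern = ".sorted.bam"
    #       extra_params   = ("extra_arg",)
    #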
#_________________________________________________________________________________________
# choose_file_names_transform
#_________________________________________________________________________________________
def choose_file_names_transform (self, file_name_transform_tag, error_type, decorator_name, valid_tags = (regex, suffix, formatter)):
"""
shared code for subdivide, transform, product etc for choosing method for transform input file to output files
"""
        valid_tag_names = []
# regular expression match
if (regex in valid_tags):
valid_tag_names.append("regex()")
if isinstance(file_name_transform_tag, regex):
return t_regex_file_names_transform(self, file_name_transform_tag, error_type, decorator_name)
# simulate end of string (suffix) match
if (suffix in valid_tags):
valid_tag_names.append("suffix()")
if isinstance(file_name_transform_tag, suffix):
return t_suffix_file_names_transform(self, file_name_transform_tag, error_type, decorator_name)
# new style string.format()
if (formatter in valid_tags):
valid_tag_names.append("formatter()")
if isinstance(file_name_transform_tag, formatter):
return t_formatter_file_names_transform(self, file_name_transform_tag, error_type, decorator_name)
raise error_type(self, "%s expects one of %s as the second argument" % (decorator_name, ", ".join(valid_tag_names)))
#_________________________________________________________________________________________
# task_product
#_________________________________________________________________________________________
def task_product(self, orig_args):
"""
all versus all
"""
decorator_name = "@product"
error_type = error_task_product
if len(orig_args) < 3:
raise error_type(self, "Too few arguments for %s" % decorator_name)
#
# get all pairs of tasks / globs and formatter()
#
list_input_files_task_globs = []
list_formatter = []
while len(orig_args) >= 3:
if isinstance(orig_args[1], formatter):
list_input_files_task_globs .append(orig_args[0])
list_formatter .append(orig_args[1])
orig_args = orig_args[2:]
else:
break
if not len(list_formatter):
raise error_task_product(self, "@product expects formatter() as the second argument")
self.set_action_type (_task.action_task_product)
#
# replace function / function names with tasks
#
list_input_files_task_globs = [self.handle_tasks_globs_in_inputs(ii) for ii in list_input_files_task_globs]
# list of new style string.format()
file_names_transform = t_nested_formatter_file_names_transform(self, list_formatter, error_task_product, decorator_name)
#
# inputs can also be defined by pattern match
#
extra_inputs, replace_inputs, output_pattern, extra_params = self.get_extra_inputs_outputs_extra (orig_args, error_type, decorator_name)
self.param_generator_func = product_param_factory ( list_input_files_task_globs,
False, # flatten input
file_names_transform,
extra_inputs,
replace_inputs,
output_pattern,
*extra_params)
self.needs_update_func = self.needs_update_func or needs_update_check_modify_time
self.job_wrapper = job_wrapper_io_files
self.job_descriptor = io_files_job_descriptor
self.single_multi_io = self.many_to_many
#_________________________________________________________________________________________
# task_combinatorics
#_________________________________________________________________________________________
def task_combinatorics (self, orig_args, combinatorics_type, decorator_name, error_type):
"""
Common code for task_permutations, task_combinations_with_replacement, task_combinations
"""
if len(orig_args) < 4:
raise error_type(self, "Too few arguments for %s" % decorator_name)
if not isinstance(orig_args[1], formatter):
raise error_task_product(self, "%s expects formatter() as the second argument" % decorator_name)
#
# replace function / function names with tasks
#
input_files_task_globs = self.handle_tasks_globs_in_inputs(orig_args[0])
k_tuple = orig_args[2]
# how to transform input to output file name: len(k-tuples) of (identical) formatters
file_names_transform = t_nested_formatter_file_names_transform(self, [orig_args[1]] * k_tuple, error_type, decorator_name)
self.set_action_type (_task.action_task_permutations)
if not isinstance(orig_args[2], int):
raise error_task_product(self, "%s expects an integer number as the third argument specifying the number of elements in each tuple." % decorator_name)
orig_args = orig_args[3:]
#
# inputs can also be defined by pattern match
#
extra_inputs, replace_inputs, output_pattern, extra_params = self.get_extra_inputs_outputs_extra (orig_args, error_type, decorator_name)
self.param_generator_func = combinatorics_param_factory ( input_files_task_globs,
False, # flatten input
combinatorics_type,
k_tuple,
file_names_transform,
extra_inputs,
replace_inputs,
output_pattern,
*extra_params)
self.needs_update_func = self.needs_update_func or needs_update_check_modify_time
self.job_wrapper = job_wrapper_io_files
self.job_descriptor = io_files_job_descriptor
self.single_multi_io = self.many_to_many
#_________________________________________________________________________________________
# task_permutations
#_________________________________________________________________________________________
def task_permutations(self, orig_args):
"""
k-permutations of n
k-length tuples, all possible orderings, no self vs self
"""
decorator_name = "@permutations"
error_type = error_task_permutations
combinatorics_type = t_combinatorics_type.COMBINATORICS_PERMUTATIONS
self.task_combinatorics (orig_args, combinatorics_type, decorator_name, error_type)
#_________________________________________________________________________________________
# task_combinations
#_________________________________________________________________________________________
def task_combinations(self, orig_args):
"""
k-length tuples
Single (sorted) ordering, i.e. AB is the same as BA,
No repeats. No AA, BB
E.g.
combinations("ABCD", 3) = ['ABC', 'ABD', 'ACD', 'BCD']
combinations("ABCD", 2) = ['AB', 'AC', 'AD', 'BC', 'BD', 'CD']
"""
decorator_name = "@combinations"
error_type = error_task_combinations
combinatorics_type = t_combinatorics_type.COMBINATORICS_COMBINATIONS
self.task_combinatorics (orig_args, combinatorics_type, decorator_name, error_type)
#_________________________________________________________________________________________
# task_combinations_with_replacement
#_________________________________________________________________________________________
def task_combinations_with_replacement(self, orig_args):
"""
k-length tuples
Single (sorted) ordering, i.e. AB is the same as BA,
Repeats. AA, BB, AAC etc.
E.g.
combinations_with_replacement("ABCD", 3) = ['AAA', 'AAB', 'AAC', 'AAD',
'ABB', 'ABC', 'ABD',
'ACC', 'ACD',
'ADD',
'BBB', 'BBC', 'BBD',
'BCC', 'BCD',
'BDD',
'CCC', 'CCD',
'CDD',
'DDD']
combinations_with_replacement("ABCD", 2) = ['AA', 'AB', 'AC', 'AD',
'BB', 'BC', 'BD',
'CC', 'CD',
'DD']
"""
decorator_name = "@combinations_with_replacement"
error_type = error_task_combinations_with_replacement
combinatorics_type = t_combinatorics_type.COMBINATORICS_COMBINATIONS_WITH_REPLACEMENT
self.task_combinatorics (orig_args, combinatorics_type, decorator_name, error_type)
#_________________________________________________________________________________________
# task_transform
#_________________________________________________________________________________________
def task_transform (self, orig_args):
"""
        Transforms each set of input files into corresponding output files.
"""
decorator_name = "@transform"
error_type = error_task_transform
if len(orig_args) < 3:
raise error_type(self, "Too few arguments for %s" % decorator_name)
self.set_action_type (_task.action_task_transform)
#
# replace function / function names with tasks
#
input_files_task_globs = self.handle_tasks_globs_in_inputs(orig_args[0])
#_________________________________________________________________________________
#
# single_job_single_output is bad policy. Can we remove it?
# What does this actually mean in Ruffus semantics?
#
#
# allows transform to take a single file or task
if input_files_task_globs.single_file_to_list():
self._single_job_single_output = self.single_job_single_output
#
# whether transform generates a list of jobs or not will depend on the parent task
#
elif isinstance(input_files_task_globs.params, _task):
self._single_job_single_output = input_files_task_globs.params
#_________________________________________________________________________________
# how to transform input to output file name
file_names_transform = self.choose_file_names_transform (orig_args[1], error_task_transform, decorator_name)
orig_args = orig_args[2:]
#
# inputs can also be defined by pattern match
#
extra_inputs, replace_inputs, output_pattern, extra_params = self.get_extra_inputs_outputs_extra (orig_args, error_type, decorator_name)
self.param_generator_func = transform_param_factory ( input_files_task_globs,
False, # flatten input
file_names_transform,
extra_inputs,
replace_inputs,
output_pattern,
*extra_params)
self.needs_update_func = self.needs_update_func or needs_update_check_modify_time
self.job_wrapper = job_wrapper_io_files
self.job_descriptor = io_files_job_descriptor
self.single_multi_io = self.many_to_many
#_________________________________________________________________________________________
# task_collate
#_________________________________________________________________________________________
def task_collate (self, orig_args):
"""
        Collates (groups) sets of input files, generating one output per group.
"""
decorator_name = "@collate"
error_type = error_task_collate
if len(orig_args) < 3:
raise error_type(self, "Too few arguments for %s" % decorator_name)
self.set_action_type (_task.action_task_collate)
#
# replace function / function names with tasks
#
input_files_task_globs = self.handle_tasks_globs_in_inputs(orig_args[0])
# how to transform input to output file name
file_names_transform = self.choose_file_names_transform (orig_args[1], error_task_collate, decorator_name, (regex, formatter))
orig_args = orig_args[2:]
#
# inputs also defined by pattern match
#
extra_inputs, replace_inputs, output_pattern, extra_params = self.get_extra_inputs_outputs_extra (orig_args, error_type, decorator_name)
self.single_multi_io = self.many_to_many
self.param_generator_func = collate_param_factory ( input_files_task_globs,
False, # flatten input
file_names_transform,
extra_inputs,
replace_inputs,
output_pattern,
*extra_params)
self.needs_update_func = self.needs_update_func or needs_update_check_modify_time
self.job_wrapper = job_wrapper_io_files
self.job_descriptor = io_files_job_descriptor
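# Illustrative usage sketch for the @collate decorator handled above.
# Hedged example: task and file names are hypothetical; assumes
# "from ruffus import *" in a user pipeline script.
#
#   @collate(previous_task, regex(r"(.+)\.\d+\.chunk"), r"\1.summary")
#   def example_collate(input_files, summary_file):
#       # inputs whose regex substitution gives the same output
#       # are grouped into a single job
#       open(summary_file, "w").writelines(open(f).read() for f in input_files)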
#_________________________________________________________________________________________
# task_merge
#_________________________________________________________________________________________
def task_merge (self, orig_args):
"""
Merges multiple input files into a single output.
"""
#
# check enough arguments
#
if len(orig_args) < 2:
raise error_task_merge(self, "Too few arguments for @merge")
self.set_action_type (_task.action_task_merge)
#
# replace function / function names with tasks
#
input_files_task_globs = self.handle_tasks_globs_in_inputs(orig_args[0])
extra_params = orig_args[1:]
self.param_generator_func = merge_param_factory (input_files_task_globs,
*extra_params)
# self._single_job_single_output = self.multiple_jobs_outputs
self._single_job_single_output = self.single_job_single_output
self.single_multi_io = self.many_to_one
self.needs_update_func = self.needs_update_func or needs_update_check_modify_time
self.job_wrapper = job_wrapper_io_files
self.job_descriptor = io_files_job_descriptor
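# Illustrative usage sketch for the @merge decorator handled above.
# Hedged example: "previous_task" and the output file name are hypothetical.
#
#   @merge(previous_task, "all.summary")
#   def example_merge(input_files, summary_file):
#       # a single job combining every input into one output
#       open(summary_file, "w").writelines(open(f).read() for f in input_files)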
#_________________________________________________________________________________________
# task_parallel
#_________________________________________________________________________________________
def task_parallel (self, orig_args):
"""
Calls the user function in parallel,
either with each of a list of parameters
or with parameters generated by a custom function
"""
self.set_action_type (_task.action_parallel)
# unmodified from __init__
#
# self.needs_update_func = None
# self.job_wrapper = job_wrapper_generic
# self.job_descriptor = io_files_job_descriptor
if len(orig_args) == 0:
raise error_task_parallel(self, "Too few arguments for @parallel")
# Use parameters generated by a custom function
if len(orig_args) == 1 and isinstance(orig_args[0], collections.Callable):
#if len(orig_args) == 1 and type(orig_args[0]) == types.FunctionType:
self.param_generator_func = args_param_factory(orig_args[0]())
# list of params
else:
if len(orig_args) > 1:
# single jobs
params = copy.copy([orig_args])
self._single_job_single_output = self.single_job_single_output
else:
# multiple jobs with input/output parameters etc.
params = copy.copy(orig_args[0])
check_parallel_parameters (self, params, error_task_parallel)
self.param_generator_func = args_param_factory (params)
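# Illustrative usage sketch for the @parallel decorator handled above.
# Hedged example: the parameter values are hypothetical.
#
#   @parallel([["A", 1], ["B", 2]])
#   def example_parallel(name, value):
#       # one job per parameter set; no input/output file checking
#       pass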
#_________________________________________________________________________________________
# task_files
#_________________________________________________________________________________________
def task_files (self, orig_args):
"""
Calls the user function in parallel,
either with each of a list of parameters
or with parameters generated by a custom function.
In the parameter list,
the first two items of each set of parameters must
be input/output files, lists of files, or None
"""
self.needs_update_func = self.needs_update_func or needs_update_check_modify_time
self.job_wrapper = job_wrapper_io_files
self.job_descriptor = io_files_job_descriptor
if len(orig_args) == 0:
raise error_task_files(self, "Too few arguments for @files")
# Use parameters generated by a custom function
if len(orig_args) == 1 and isinstance(orig_args[0], collections.Callable):
#if len(orig_args) == 1 and type(orig_args[0]) == types.FunctionType:
self.set_action_type (_task.action_task_files_func)
self.param_generator_func = files_custom_generator_param_factory(orig_args[0])
# assume
self.single_multi_io = self.many_to_many
# Use parameters in supplied list
else:
self.set_action_type (_task.action_task_files)
if len(orig_args) > 1:
# single jobs
# This is true even if the previous task has multiple output
# These will all be joined together at the hip (like @merge)
# If you want different behavior, use @transform
params = copy.copy([orig_args])
self._single_job_single_output = self.single_job_single_output
self.single_multi_io = self.one_to_one
else:
# multiple jobs with input/output parameters etc.
params = copy.copy(orig_args[0])
self._single_job_single_output = self.multiple_jobs_outputs
self.single_multi_io = self.many_to_many
check_files_io_parameters (self, params, error_task_files)
#
# get list of function/function names and globs for all job params
#
#
# replace function / function names with tasks
#
input_patterns = [j[0] for j in params]
input_files_task_globs = self.handle_tasks_globs_in_inputs(input_patterns)
#
# extra params
#
output_extra_params = [tuple(j[1:]) for j in params]
self.param_generator_func = files_param_factory (input_files_task_globs,
False, # flatten input
True, # do_not_expand_single_job_tasks
output_extra_params)
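# Illustrative usage sketch for the @files decorator handled above.
# Hedged example: the file names are hypothetical.
#
#   @files([["a.in", "a.out"], ["b.in", "b.out"]])
#   def example_files(input_file, output_file):
#       # the first two items of each parameter set are the input / output files
#       open(output_file, "w").write(open(input_file).read())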
#_________________________________________________________________________________________
# task_files_re
#_________________________________________________________________________________________
def task_files_re (self, old_args):
"""
Calls the user function in parallel
with input_files, output_files, parameters.
These need to be generated on the fly
from all file names in the supplied list/glob pattern
There are two variations:
1) inputfiles = all files in glob which match the regular expression
outputfile = generated from the replacement string
2) inputfiles = all files in glob which match the regular expression and
generated from the "from" replacement string
outputfiles = all files in glob which match the regular expression and
generated from the "to" replacement string
"""
#
# check enough arguments
#
if len(old_args) < 3:
raise error_task_files_re(self, "Too few arguments for @files_re")
self.set_action_type (_task.action_task_files_re)
# check if parameters wrapped in combine
combining_all_jobs, orig_args = is_file_re_combining(old_args)
#
# replace function / function names with tasks
#
input_files_task_globs = self.handle_tasks_globs_in_inputs(orig_args[0])
file_names_transform = t_regex_file_names_transform(self, regex(orig_args[1]), error_task_files_re, "@files_re")
# if the input file term is missing, just use the original
if len(orig_args) == 3:
extra_input_files_task_globs = None
output_and_extras = [orig_args[2]]
else:
extra_input_files_task_globs = self.handle_tasks_globs_in_inputs(orig_args[2])
output_and_extras = orig_args[3:]
if combining_all_jobs:
self.single_multi_io = self.many_to_many
self.param_generator_func = collate_param_factory (input_files_task_globs,
False, # flatten
file_names_transform,
extra_input_files_task_globs,
t_extra_inputs.REPLACE_INPUTS,
*output_and_extras)
else:
self.single_multi_io = self.many_to_many
self.param_generator_func = transform_param_factory (input_files_task_globs,
False, # flatten
file_names_transform,
extra_input_files_task_globs,
t_extra_inputs.REPLACE_INPUTS,
*output_and_extras)
self.needs_update_func = self.needs_update_func or needs_update_check_modify_time
self.job_wrapper = job_wrapper_io_files
self.job_descriptor = io_files_job_descriptor
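# Illustrative usage sketch for the (older style) @files_re decorator handled
# above. Hedged example: the glob and patterns are hypothetical.
#
#   @files_re("*.gz", r"(.*)\.gz$", r"\1.unzipped")
#   def example_files_re(input_file, output_file):
#       # inputs are files matching the glob and regular expression;
#       # outputs come from the replacement string
#       pass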
#_________________________________________________________________________________________
# task_mkdir
# only called within task_follows
#_________________________________________________________________________________________
def task_mkdir (self, orig_args):
self.cnt_task_mkdir += 1
# give unique name to this instance of mkdir
unique_name = r"(mkdir %d) before " % self.cnt_task_mkdir + self._name
new_node = _task(self._module_name, unique_name)
self.add_child(new_node)
new_node.do_task_mkdir(orig_args)
new_node.display_name = new_node._description
def do_task_mkdir (self, orig_args):
"""
list of directory names, or a single argument which is a list of directory names
Creates directory if missing
"""
decorator_name = "mkdir"
error_type = error_task_mkdir
# jump through hoops
self.set_action_type (_task.action_mkdir)
self.needs_update_func = self.needs_update_func or needs_update_check_directory_missing
self._description = "Make directories %s" % (shorten_filenames_encoder(orig_args))
self.job_wrapper = job_wrapper_mkdir
self.job_descriptor = mkdir_job_descriptor
# doesn't have a real function
# use job_wrapper just so it is not None
self.user_defined_work_func = self.job_wrapper
#
# @transform like behaviour with regex / suffix or formatter
#
if len(orig_args) > 1 and isinstance(orig_args[1], (formatter, suffix, regex)):
self.single_multi_io = self.many_to_many
if len(orig_args) < 3:
raise error_type(self, "Too few arguments for %s" % decorator_name)
#
# replace function / function names with tasks
#
input_files_task_globs = self.handle_tasks_globs_in_inputs(orig_args[0])
# how to transform input to output file name
file_names_transform = self.choose_file_names_transform (orig_args[1], error_type, decorator_name)
orig_args = orig_args[2:]
#
# inputs can also be defined by pattern match
#
extra_inputs, replace_inputs, output_pattern, extra_params = self.get_extra_inputs_outputs_extra (orig_args, error_type, decorator_name)
if len(extra_params):
raise error_type(self, "Too many arguments for %s" % decorator_name)
self.param_generator_func = transform_param_factory ( input_files_task_globs,
False, # flatten input
file_names_transform,
extra_inputs,
replace_inputs,
output_pattern,
*extra_params)
#
# simple behaviour: just make directories in list of strings
#
# the mkdir decorator accepts one string, multiple strings or a list of strings
else:
self.single_multi_io = self.one_to_one
#
#
#
# if a single argument collection of parameters, keep that as is
if len(orig_args) == 0:
mkdir_params = []
elif len(orig_args) > 1:
mkdir_params = orig_args
# len(orig_args) == 1: unpack orig_args[0]
elif non_str_sequence (orig_args[0]):
mkdir_params = orig_args[0]
# single string or other non collection types
else:
mkdir_params = orig_args
# all directories created in one job to reduce race conditions
# so we are converting [a,b,c] into [ [(a, b,c)] ]
# where orig_args = (a,b,c)
# i.e. one job whose solitary argument is a tuple/list of directory names
self.param_generator_func = args_param_factory([[sorted(mkdir_params)]])
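# Illustrative usage sketch for mkdir, as used via @follows.
# Hedged example: the directory names are hypothetical.
#
#   @follows(mkdir("working_dir", "results_dir"))
#   def example_task():
#       # both directories are created (in a single job) before this task runs
#       pass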
#8888888888888888888888888888888888888888888888888888888888888888888888888888888888888
# Other task handlers
#8888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#_________________________________________________________________________________________
# task_follows
#_________________________________________________________________________________________
def task_follows (self, args):
"""
Saved decorator arguments should be:
(string/task,...)
"""
new_tasks = []
for arg in args:
#
# specified by string: unicode or otherwise
#
if isinstance(arg, basestring):
# string looks up to defined task, use that
if node.is_node(arg):
arg = node.lookup_node_from_name(arg)
# string looks up to defined task in main module, use that
elif node.is_node("__main__." + arg):
arg = node.lookup_node_from_name("__main__." + arg)
#
# string does not look up to defined task: defer
#
else:
# no module: use same module as current task
names = arg.rsplit(".", 2)
if len(names) == 1:
arg = _task(self._module_name, arg)
else:
arg = _task(*names)
#
# add dependency
# duplicate dependencies are ignored automatically
#
self.add_child(arg)
new_tasks.append(arg)
#
# for mkdir, automatically generate task with unique name
#
elif isinstance(arg, mkdir):
self.cnt_task_mkdir += 1
# give unique name to this instance of mkdir
unique_name = r"(mkdir %d) before " % self.cnt_task_mkdir + self._name
new_node = _task(self._module_name, unique_name)
self.add_child(new_node)
new_node.do_task_mkdir(arg.args)
new_node.display_name = new_node._description
new_tasks.append(new_node)
#
# Is this a function?
# Turn this function into a task
# (add task as attribute of this function)
# Add self as dependent
else:
#if type(arg) != types.FunctionType:
if not isinstance(arg, collections.Callable):
raise error_decorator_args("Dependencies must be functions or function names in " +
"@task_follows %s:\n[%s]" %
(self._name, str(arg)))
# add task as attribute of this function
if not hasattr(arg, "pipeline_task"):
arg.pipeline_task = _task.create_task(arg)
self.add_child(arg.pipeline_task)
new_tasks.append(arg.pipeline_task)
return new_tasks
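# Illustrative usage sketch for the @follows decorator handled above.
# Hedged example: the task, module and directory names are hypothetical.
#
#   @follows(first_task, "module_name.other_task", mkdir("output_dir"))
#   def example_follows_task():
#       # runs only after the named dependencies have completed
#       pass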
#_________________________________________________________________________________________
# task_check_if_uptodate
#_________________________________________________________________________________________
def task_check_if_uptodate (self, args):
"""
Saved decorator arguments should be:
a function which takes the appropriate number of arguments for each job
"""
if len(args) != 1 or not isinstance(args[0], collections.Callable):
#if len(args) != 1 or type(args[0]) != types.FunctionType:
raise error_decorator_args("Expecting a single function in " +
"@task_check_if_uptodate %s:\n[%s]" %
(self._name, str(args)))
self.needs_update_func = args[0]
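# Illustrative usage sketch for the @check_if_uptodate decorator handled above.
# Hedged example: the checking function and file names are hypothetical; the
# function receives the job parameters and returns (needs_update, message).
#
#   import os
#
#   def example_is_out_of_date(input_file, output_file):
#       return not os.path.exists(output_file), "%s is missing" % output_file
#
#   @check_if_uptodate(example_is_out_of_date)
#   @files("a.in", "a.out")
#   def example_task(input_file, output_file):
#       pass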
#_________________________________________________________________________________________
# task_posttask
#_________________________________________________________________________________________
def task_posttask(self, args):
"""
Saved decorator arguments should be:
one or more functions which will be called if the task completes
"""
for arg in args:
if isinstance(arg, touch_file):
self.posttask_functions.append(touch_file_factory (arg.args, register_cleanup))
elif isinstance(arg, collections.Callable):
#elif type(arg) == types.FunctionType:
self.posttask_functions.append(arg)
else:
raise PostTaskArgumentError("Expecting simple functions or touch_file in " +
"@posttask(...)\n Task = %s" %
(self._name))
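# Illustrative usage sketch for the @posttask decorator handled above.
# Hedged example: the flag and file names are hypothetical.
#
#   @posttask(touch_file("stage1.completed"))
#   @files("a.in", "a.out")
#   def example_task(input_file, output_file):
#       pass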
#_________________________________________________________________________________________
# task_jobs_limit
#_________________________________________________________________________________________
def task_jobs_limit(self, args):
"""
Limit the number of concurrent jobs
"""
maximum_jobs, name = (args + (None,))[0:2]
try:
maximum_jobs_num = int(maximum_jobs)
assert(maximum_jobs_num >= 1)
except:
limit_name = ", " + name if name else ""
raise JobsLimitArgumentError(('In @jobs_limit(%s%s), the limit '
'must be an integer number greater than or '
'equal to 1') %
(maximum_jobs, limit_name))
if name != None:
self.semaphore_name = name
if self.semaphore_name in self.job_limit_semaphores:
curr_maximum_jobs = self.job_limit_semaphores[self.semaphore_name]
if curr_maximum_jobs != maximum_jobs_num:
raise JobsLimitArgumentError(('@jobs_limit(%d, "%s") cannot be ' +
're-defined with a different limit of %d') %
(curr_maximum_jobs, self.semaphore_name,
maximum_jobs_num))
else:
#
# save semaphore and limit
#
self.job_limit_semaphores[self.semaphore_name] = maximum_jobs_num
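# Illustrative usage sketch for the @jobs_limit decorator handled above.
# Hedged example: the limit name and file names are hypothetical.
#
#   @jobs_limit(2, "shared_database")
#   @files([["a.in", "a.out"], ["b.in", "b.out"], ["c.in", "c.out"]])
#   def example_task(input_file, output_file):
#       # at most 2 jobs sharing the "shared_database" semaphore run at once
#       pass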
#_________________________________________________________________________________________
# task_active_if
#_________________________________________________________________________________________
def task_active_if (self, active_if_checks):
"""
If any of active_if_checks is False or returns False, then the task is
marked as "inactive" and its outputs removed.
"""
#print 'job is active:', active_checks, [
# arg() if isinstance(arg, collections.Callable) else arg
# for arg in active_checks]
if self.active_if_checks == None:
self.active_if_checks = []
self.active_if_checks.extend(active_if_checks)
#print self.active_if_checks
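# Illustrative usage sketch for the @active_if decorator handled above.
# Hedged example: the flag and file names are hypothetical; callables are
# re-evaluated at each pipeline_run / pipeline_printout.
#
#   run_optional_stage = False
#
#   @active_if(run_optional_stage)
#   @files("a.in", "a.out")
#   def example_optional_task(input_file, output_file):
#       pass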
class task_encoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, set):
return list(obj)
if isinstance(obj, defaultdict):
return dict(obj)
if isinstance(obj, _task):
return obj._name #, _task.action_names[obj.action_task], obj._description]
return json.JSONEncoder.default(self, obj)
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
# Functions
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#_________________________________________________________________________________________
# link_task_names_to_functions
#_________________________________________________________________________________________
def link_task_names_to_functions ():
"""
Make sure all tasks in dependency list are linked to real functions
Call this before running anything else
"""
for n in node._all_nodes:
if n.user_defined_work_func == None:
dependent_display_task_name = n._inward[0].get_task_name()
if n._module_name in sys.modules:
module = sys.modules[n._module_name]
if hasattr(module, n._func_name):
n.user_defined_work_func = getattr(module, n._func_name)
else:
raise error_decorator_args(("Module '%s' has no function '%s' in " +
"\n@task_follows('%s')\ndef %s...") %
(n._module_name, n._func_name, n.get_task_name(), dependent_display_task_name))
else:
raise error_decorator_args("Module '%s' not found in " +
"\n@task_follows('%s')\ndef %s..." %
(n._module_name, n.get_task_name(), dependent_display_task_name))
#
# some jobs' single-job status mirrors their parent task's state
# and the parent task is not known until now
#
if isinstance(n._single_job_single_output, _task):
n._single_job_single_output = n._single_job_single_output._single_job_single_output
#_________________________________________________________________________________________
# update_checksum_level_on_tasks
#_________________________________________________________________________________________
def update_checksum_level_on_tasks (checksum_level):
"""Reset the checksum level for all tasks"""
for n in node._all_nodes:
n.checksum_level = checksum_level
#_________________________________________________________________________________________
# update_active_states_for_all_tasks
#_________________________________________________________________________________________
def update_active_states_for_all_tasks ():
"""
@active_if decorated tasks can change their active state every time
pipeline_run / pipeline_printout / pipeline_printout_graph is called
update_active_states_for_all_tasks ()
"""
for n in node._all_nodes:
n.update_active_state()
#_________________________________________________________________________________________
# task_names_to_tasks
#_________________________________________________________________________________________
def task_names_to_tasks (task_description, task_names):
"""
Given a list of task names, look up the corresponding tasks
Will just pass through if the task_name is already a task
"""
#
# In case we are given a single item instead of a list
# accepts unicode
#
if isinstance(task_names, basestring) or isinstance(task_names, collections.Callable):
#if isinstance(task_names, basestring) or type(task_names) == types.FunctionType:
task_names = [task_names]
task_nodes = []
for task_name in task_names:
# If this is already a function, don't do the mapping: it is already a task
if isinstance(task_name, collections.Callable):
#if type(task_name) == types.FunctionType:
if hasattr(task_name, "pipeline_task"):
task_nodes.append(task_name.pipeline_task)
continue
else:
# blow up for unwrapped function
raise error_function_is_not_a_task(("Function def %s(...): is not a pipelined task in ruffus." %
task_name.__name__) +
" To include this, this function needs to have a ruffus "+
"decoration like '@parallel', '@files', or named as a dependent "+
"of some other Ruffus task function via '@follows'.")
# otherwise assume it is some kind of string
if not node.is_node(task_name):
if node.is_node("__main__." + task_name):
task_nodes.append(node.lookup_node_from_name("__main__." + task_name))
else:
raise error_node_not_task("%s task '%s' is not a pipelined task in Ruffus. Have you mis-spelt the function name?" % (
task_description, task_name))
else:
task_nodes.append(node.lookup_node_from_name(task_name))
return task_nodes
#_________________________________________________________________________________________
# pipeline_printout_in_dot_format
#_________________________________________________________________________________________
def pipeline_printout_graph (stream,
output_format = None,
target_tasks = [],
forcedtorun_tasks = [],
draw_vertically = True,
ignore_upstream_of_target = False,
skip_uptodate_tasks = False,
gnu_make_maximal_rebuild_mode = True,
test_all_task_for_update = True,
no_key_legend = False,
minimal_key_legend = True,
user_colour_scheme = None,
pipeline_name = "Pipeline:",
size = (11,8),
dpi = 120,
runtime_data = None,
checksum_level = None,
history_file = None):
# Remember to add further extra parameters here to "extra_pipeline_printout_graph_options" inside cmdline.py
# This will forward extra parameters from the command line to pipeline_printout_graph
"""
print out pipeline dependencies in various formats
:param stream: where to print to
:type stream: file-like object with ``write()`` function
:param output_format: ["dot", "jpg", "svg", "ps", "png"]. All but the first depend on the `dot <http://www.graphviz.org>`_ program.
:param target_tasks: target task functions which will be run if they are out-of-date.
:param forcedtorun_tasks: task functions which will be run whether or not they are out-of-date.
:param draw_vertically: Top to bottom instead of left to right.
:param ignore_upstream_of_target: Don't draw upstream tasks of targets.
:param skip_uptodate_tasks: Don't draw up-to-date tasks if possible.
:param gnu_make_maximal_rebuild_mode: Defaults to re-running *all* out-of-date tasks. Runs minimal
set to build targets if set to ``False``. Use with caution.
:param test_all_task_for_update: Ask all task functions if they are up-to-date.
:param no_key_legend: Don't draw key/legend for graph.
:param checksum_level: Several options for checking up-to-dateness are available: Default is level 1.
level 0 : Use only file timestamps
level 1 : above, plus timestamp of successful job completion
level 2 : above, plus a checksum of the pipeline function body
level 3 : above, plus a checksum of the pipeline function default arguments and the additional arguments passed in by task decorators
"""
if checksum_level is None:
checksum_level = get_default_checksum_level()
link_task_names_to_functions ()
update_checksum_level_on_tasks (checksum_level)
#
# @active_if decorated tasks can change their active state every time
# pipeline_run / pipeline_printout / pipeline_printout_graph is called
#
update_active_states_for_all_tasks ()
#
# run time data
#
if runtime_data == None:
runtime_data = {}
if not isinstance(runtime_data, dict):
raise Exception("pipeline_run parameter runtime_data should be a dictionary of "
"values passes to jobs at run time.")
#
# If we aren't using checksums, and history file hasn't been specified,
# we might be a bit surprised to find Ruffus writing to a sqlite db anyway.
# Let us just use an in-memory db which will be thrown away
# Of course, if history_file is specified, we presume you know what you are doing
#
if checksum_level == CHECKSUM_FILE_TIMESTAMPS and history_file == None:
history_file = ':memory:'
#
# load previous job history if it exists, otherwise create an empty history
#
job_history = open_job_history (history_file)
#
# target jobs
#
if target_tasks == None:
target_tasks = []
if forcedtorun_tasks == None:
forcedtorun_tasks = []
target_tasks = task_names_to_tasks ("Target", target_tasks)
forcedtorun_tasks = task_names_to_tasks ("Forced to run", forcedtorun_tasks)
(topological_sorted, ignore_param1, ignore_param2,
ignore_param3) = topologically_sorted_nodes(target_tasks, forcedtorun_tasks,
gnu_make_maximal_rebuild_mode,
extra_data_for_signal = [t_verbose_logger(0, None, runtime_data), job_history])
if not len(target_tasks):
target_tasks = topological_sorted[-1:]
# open file if (unicode?) string
if isinstance(stream, basestring):
stream = open(stream, "w")
# derive format automatically from name
if output_format == None:
output_format = os.path.splitext(stream.name)[1].lstrip(".")
graph_printout ( stream,
output_format,
target_tasks,
forcedtorun_tasks,
draw_vertically,
ignore_upstream_of_target,
skip_uptodate_tasks,
gnu_make_maximal_rebuild_mode,
test_all_task_for_update,
no_key_legend,
minimal_key_legend,
user_colour_scheme,
pipeline_name,
size,
dpi,
extra_data_for_signal = [t_verbose_logger(0, None, runtime_data), job_history])
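# Illustrative call sketch for pipeline_printout_graph.
# Hedged example: the output file name and "final_task" are hypothetical.
#
#   pipeline_printout_graph("flowchart.svg", "svg", [final_task],
#                           no_key_legend = True)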
#_________________________________________________________________________________________
# pipeline_printout
#_________________________________________________________________________________________
def pipeline_printout( output_stream = None,
target_tasks = [],
forcedtorun_tasks = [],
verbose = 1,
indent = 4,
gnu_make_maximal_rebuild_mode = True,
wrap_width = 100,
runtime_data = None,
checksum_level = None,
history_file = None):
# Remember to add further extra parameters here to "extra_pipeline_printout_options" inside cmdline.py
# This will forward extra parameters from the command line to pipeline_printout
"""
Prints out the parts of the pipeline which will be run
Because the parameters of some jobs depend on the results of previous tasks, this function
produces only the current snapshot of task jobs. In particular, tasks which generate
a variable number of inputs for the following tasks will not produce the full range of jobs.
::
verbose = 0 : nothing
verbose = 1 : print task name
verbose = 2 : print task description if exists
verbose = 3 : print job names for jobs to be run
verbose = 4 : print list of up-to-date tasks and job names for jobs to be run
verbose = 5 : print job names for all jobs whether up-to-date or not
:param output_stream: where to print to
:type output_stream: file-like object with ``write()`` function
:param target_tasks: target task functions which will be run if they are out-of-date
:param forcedtorun_tasks: task functions which will be run whether or not they are out-of-date
:param verbose: level 0 : nothing
level 1 : logs task names and warnings
level 2 : logs task description if exists
level 3 : logs job names for jobs to be run
level 4 : logs list of up-to-date tasks and job names for jobs to be run
level 5 : logs job names for all jobs whether up-to-date or not
level 10: logs messages useful only for debugging ruffus pipeline code
:param indent: How much indentation for pretty format.
:param gnu_make_maximal_rebuild_mode: Defaults to re-running *all* out-of-date tasks. Runs minimal
set to build targets if set to ``False``. Use with caution.
:param wrap_width: The maximum length of each line
:param runtime_data: Experimental feature for passing data to tasks at run time
:param checksum_level: Several options for checking up-to-dateness are available: Default is level 1.
level 0 : Use only file timestamps
level 1 : above, plus timestamp of successful job completion
level 2 : above, plus a checksum of the pipeline function body
level 3 : above, plus a checksum of the pipeline function default arguments and the additional arguments passed in by task decorators
"""
if verbose == 0:
return
if output_stream == None:
import sys
output_stream = sys.stdout
if not hasattr(output_stream, "write"):
raise Exception("The first parameter to pipeline_printout needs to be an output file, e.g. sys.stdout and not %s" % str(output_stream))
if runtime_data == None:
runtime_data = {}
if not isinstance(runtime_data, dict):
raise Exception("pipeline_run parameter runtime_data should be a dictionary of "
"values passes to jobs at run time.")
if checksum_level is None:
checksum_level = get_default_checksum_level()
link_task_names_to_functions ()
update_checksum_level_on_tasks(checksum_level)
#
# @active_if decorated tasks can change their active state every time
# pipeline_run / pipeline_printout / pipeline_printout_graph is called
#
update_active_states_for_all_tasks ()
#
# target jobs
#
target_tasks = task_names_to_tasks ("Target", target_tasks)
forcedtorun_tasks = task_names_to_tasks ("Forced to run", forcedtorun_tasks)
logging_strm = t_verbose_logger(verbose, t_stream_logger(output_stream), runtime_data)
#
# If we aren't using checksums, and history file hasn't been specified,
# we might be a bit surprised to find Ruffus writing to a sqlite db anyway.
# Let us just use an in-memory db which will be thrown away
# Of course, if history_file is specified, we presume you know what you are doing
#
if checksum_level == CHECKSUM_FILE_TIMESTAMPS and history_file == None:
history_file = ':memory:'
#
# load previous job history if it exists, otherwise create an empty history
#
job_history = open_job_history (history_file)
(topological_sorted,
self_terminated_nodes,
dag_violating_edges,
dag_violating_nodes) = topologically_sorted_nodes(target_tasks, forcedtorun_tasks,
gnu_make_maximal_rebuild_mode,
extra_data_for_signal = [t_verbose_logger(0, None, runtime_data), job_history])
#
# raise error if DAG violating nodes
#
if len(dag_violating_nodes):
dag_violating_tasks = ", ".join(t._name for t in dag_violating_nodes)
e = error_circular_dependencies("Circular dependencies found in the "
"pipeline involving one or more of (%s)" %
(dag_violating_tasks))
raise e
wrap_indent = " " * (indent + 11)
#
# Get updated nodes as all_nodes - nodes_to_run
#
if verbose >= 4:
(all_tasks, ignore_param1, ignore_param2,
ignore_param3) = topologically_sorted_nodes(target_tasks, True,
gnu_make_maximal_rebuild_mode,
extra_data_for_signal = [t_verbose_logger(0, None, runtime_data), job_history])
if len(all_tasks) > len(topological_sorted):
output_stream.write("\n" + "_" * 40 + "\nTasks which are up-to-date:\n\n")
pipelined_tasks_to_run = set(topological_sorted)
for t in all_tasks:
if t in pipelined_tasks_to_run:
continue
messages = t.printout(runtime_data, t in forcedtorun_tasks, job_history, verbose, indent)
for m in messages:
output_stream.write(textwrap.fill(m, subsequent_indent = wrap_indent, width = wrap_width) + "\n")
output_stream.write("\n" + "_" * 40 + "\nTasks which will be run:\n\n")
for t in topological_sorted:
messages = t.printout(runtime_data, t in forcedtorun_tasks, job_history, verbose, indent)
for m in messages:
output_stream.write(textwrap.fill(m, subsequent_indent = wrap_indent, width = wrap_width) + "\n")
if verbose:
output_stream.write("_" * 40 + "\n")
#_________________________________________________________________________________________
# get_semaphore
#_________________________________________________________________________________________
def get_semaphore (t, job_limit_semaphores, syncmanager):
"""
return semaphore to limit the number of concurrent jobs
"""
#
# Is this task limited in the number of jobs?
#
if t.semaphore_name not in t.job_limit_semaphores:
return None
#
# create semaphore if not yet created
#
if t.semaphore_name not in job_limit_semaphores:
maximum_jobs_num = t.job_limit_semaphores[t.semaphore_name]
job_limit_semaphores[t.semaphore_name] = syncmanager.BoundedSemaphore(maximum_jobs_num)
return job_limit_semaphores[t.semaphore_name]
#_________________________________________________________________________________________
#
# Parameter generator for all jobs / tasks
#
#________________________________________________________________________________________
def make_job_parameter_generator (incomplete_tasks, task_parents, logger, forcedtorun_tasks,
task_with_completed_job_q, runtime_data, verbose,
syncmanager,
touch_files_only, job_history):
inprogress_tasks = set()
job_limit_semaphores = dict()
def parameter_generator():
count_remaining_jobs = defaultdict(int)
log_at_level (logger, 10, verbose, " job_parameter_generator BEGIN")
while len(incomplete_tasks):
cnt_jobs_created_for_all_tasks = 0
cnt_tasks_processed = 0
#
# get rid of all completed tasks first
# Completion is signalled from pipeline_run
#
while True:
try:
item = task_with_completed_job_q.get_nowait()
job_completed_task, job_completed_task_name, job_completed_name = item
if not job_completed_task in incomplete_tasks:
raise Exception("Last job %s for %s. Missing from incomplete tasks in make_job_parameter_generator" % (job_completed_name, job_completed_task_name))
count_remaining_jobs[job_completed_task] = count_remaining_jobs[job_completed_task] - 1
#
# This is bad: something has gone very wrong
#
if count_remaining_jobs[job_completed_task] < 0:
raise Exception("job %s for %s causes job count < 0." % (job_completed_name, job_completed_task_name))
#
# This Task completed
#
if count_remaining_jobs[job_completed_task] == 0:
log_at_level (logger, 10, verbose, " Last job for %s. Retired from incomplete tasks in pipeline_run " % job_completed_task._name)
incomplete_tasks.remove(job_completed_task)
job_completed_task.completed (logger)
except Queue.Empty:
break
for t in list(incomplete_tasks):
#
# wrap in exception handler so that we know which task the exception
# came from
#
try:
log_at_level (logger, 10, verbose, " job_parameter_generator consider task = %s" % t._name)
# ignore tasks in progress
if t in inprogress_tasks:
continue
log_at_level (logger, 10, verbose, " job_parameter_generator task %s not in progress" % t._name)
# ignore tasks with incomplete dependencies
incomplete_parent = False
for parent in task_parents[t]:
if parent in incomplete_tasks:
incomplete_parent = True
break
if incomplete_parent:
continue
log_at_level (logger, 10, verbose, " job_parameter_generator start task %s (parents completed)" % t._name)
force_rerun = t in forcedtorun_tasks
#
# Only log active task
#
if t.is_active:
log_at_level (logger, 3, verbose, "Task enters queue = " + t.get_task_name() + (": Forced to rerun" if force_rerun else ""))
log_at_level (logger, 3, verbose, t._description)
inprogress_tasks.add(t)
cnt_tasks_processed += 1
#
# Use output parameters actually generated by running task
#
t.output_filenames = []
#
# If no parameters: just call task function (empty list)
#
#if (t.active_if_checks != None):
# t.is_active = all(arg() if isinstance(arg, collections.Callable) else arg
# for arg in t.active_if_checks)
if not t.is_active:
parameters = []
#
# If no parameters: just call task function (empty list)
#
elif t.param_generator_func == None:
parameters = ([[], []],)
else:
parameters = t.param_generator_func(runtime_data)
#
# iterate through parameters
#
cnt_jobs_created = 0
for param, descriptive_param in parameters:
#
# save output even if uptodate
#
if len(param) >= 2:
t.output_filenames.append(param[1])
job_name = t.get_job_name(descriptive_param, runtime_data)
#
# don't run if up to date unless force to run
#
if force_rerun:
log_at_level (logger, 3, verbose, " force task %s to rerun " % job_name)
else:
if not t.needs_update_func:
log_at_level (logger, 3, verbose, " %s no function to check if up-to-date " % job_name)
else:
# extra clunky hack to also pass task info--
# makes sure that there haven't been code or arg changes
if t.needs_update_func == needs_update_check_modify_time:
needs_update, msg = t.needs_update_func (*param, task=t, job_history = job_history)
else:
needs_update, msg = t.needs_update_func (*param)
if not needs_update:
log_at_level (logger, 2, verbose, " %s unnecessary: already up to date " % job_name)
continue
else:
log_at_level (logger, 3, verbose, " %s %s " % (job_name, msg))
#
# Clunky hack to make sure input files exists right before
# job is called for better error messages
#
if t.needs_update_func == needs_update_check_modify_time:
check_input_files_exist (*param)
# pause for one second before the first job of each task
# @originate tasks do not need to pause, because they depend on nothing!
if cnt_jobs_created == 0 and touch_files_only < 2:
if "ONE_SECOND_PER_JOB" in runtime_data and runtime_data["ONE_SECOND_PER_JOB"] and t._action_type != _task.action_task_originate:
log_at_level (logger, 10, verbose, " 1 second PAUSE in job_parameter_generator\n\n\n")
time.sleep(1.01)
else:
time.sleep(0.1)
count_remaining_jobs[t] += 1
cnt_jobs_created += 1
cnt_jobs_created_for_all_tasks += 1
yield (param,
t._name,
job_name,
t.job_wrapper,
t.user_defined_work_func,
get_semaphore (t, job_limit_semaphores, syncmanager),
touch_files_only)
# if no job came from this task, this task is complete
# we need to retire it here instead of at the normal completion point (the end of its jobs)
# precisely because it created no jobs
if cnt_jobs_created == 0:
incomplete_tasks.remove(t)
t.completed (logger, True)
log_at_level (logger, 10, verbose, " No jobs created for %s. Retired in parameter_generator " % t._name)
#
# Add extra warning if no regular expressions match:
# This is a common class of frustrating errors
#
if (verbose >= 1 and "ruffus_WARNING" in runtime_data and
t.param_generator_func in runtime_data["ruffus_WARNING"]):
for msg in runtime_data["ruffus_WARNING"][t.param_generator_func]:
logger.warning(" 'In Task def %s(...):' %s " % (t.get_task_name(), msg))
#
# GeneratorExit is thrown when this generator does not complete.
# I.e. there is a break in the pipeline_run loop.
# This happens when there are exceptions signalled from within a job
#
# This is not really an exception, more a way to exit the generator loop
# asynchronously so that cleanups can happen (e.g. the "with" statement
# or finally.)
#
# We could write except Exception: below which will catch everything but
# KeyboardInterrupt and StopIteration and GeneratorExit in python 2.6
#
# However, in python 2.5, GeneratorExit inherits from Exception. So
# we explicitly catch and rethrow GeneratorExit.
except GeneratorExit:
raise
except:
exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
exception_stack = traceback.format_exc(exceptionTraceback)
exception_name = exceptionType.__module__ + '.' + exceptionType.__name__
exception_value = str(exceptionValue)
if len(exception_value):
exception_value = "(%s)" % exception_value
errt = RethrownJobError([(t._name,
"",
exception_name,
exception_value,
exception_stack)])
errt.specify_task(t, "Exceptions generating parameters")
raise errt
# extra tests in case final tasks do not result in jobs
if len(incomplete_tasks) and (not cnt_tasks_processed or cnt_jobs_created_for_all_tasks):
log_at_level (logger, 10, verbose, " incomplete tasks = " +
",".join([t._name for t in incomplete_tasks] ))
yield waiting_for_more_tasks_to_complete()
yield all_tasks_complete()
# This function is done
log_at_level (logger, 10, verbose, " job_parameter_generator END")
return parameter_generator
#_________________________________________________________________________________________
#
# feed_job_params_to_process_pool
#
#
#________________________________________________________________________________________
def feed_job_params_to_process_pool_factory (parameter_q, logger, verbose):
"""
Process pool gets its parameters from this generator
Use factory function to save parameter_queue
"""
def feed_job_params_to_process_pool ():
log_at_level (logger, 10, verbose, " Send param to Pooled Process START")
while 1:
log_at_level (logger, 10, verbose, " Get next parameter size = %d" %
parameter_q.qsize())
if not parameter_q.qsize():
time.sleep(0.1)
param = parameter_q.get()
log_at_level (logger, 10, verbose, " Get next parameter done")
# all tasks done
if isinstance(param, all_tasks_complete):
break
log_at_level (logger, 10, verbose, " Send param to Pooled Process=>" + str(param[0]))
yield param
log_at_level (logger, 10, verbose, " Send param to Pooled Process END")
# return generator
return feed_job_params_to_process_pool
#_________________________________________________________________________________________
#
# fill_queue_with_job_parameters
#
#________________________________________________________________________________________
def fill_queue_with_job_parameters (job_parameters, parameter_q, POOL_SIZE, logger, verbose):
"""
Ensures queue is filled with number of parameters > jobs / slots (POOL_SIZE)
"""
log_at_level (logger, 10, verbose, " fill_queue_with_job_parameters START")
for param in job_parameters:
# stop if no more jobs available
if isinstance(param, waiting_for_more_tasks_to_complete):
log_at_level (logger, 10, verbose, " fill_queue_with_job_parameters WAITING for task to complete")
break
if not isinstance(param, all_tasks_complete):
log_at_level (logger, 10, verbose, " fill_queue_with_job_parameters=>" + str(param[0]))
# put into queue
parameter_q.put(param)
# queue size needs to be at least 2 so that the parameter queue never consists of a single
# waiting_for_more_tasks_to_complete entry which would cause
# a loop and everything to hang!
if parameter_q.qsize() > POOL_SIZE + 1:
break
log_at_level (logger, 10, verbose, " fill_queue_with_job_parameters END")
#
# How the job queue works:
#
# Main loop
# iterates pool.map using feed_job_params_to_process_pool()
# (calls parameter_q.get() until all_tasks_complete)
#
# if errors occur but we want to finish tasks already in the pipeline:
# parameter_q.put(all_tasks_complete())
# keep going
# else:
#
# loops through jobs until no more jobs in non-dependent tasks
# separate loop in generator so that list of incomplete_tasks does not
# get updated half way through
# causing race conditions
#
# parameter_q.put(param)
# until waiting_for_more_tasks_to_complete
# until queue is full (check *after*)
#
#_________________________________________________________________________________________
# pipeline_run
#_________________________________________________________________________________________
def pipeline_run(target_tasks = [],
forcedtorun_tasks = [],
multiprocess = 1,
logger = stderr_logger,
gnu_make_maximal_rebuild_mode = True,
verbose = 1,
runtime_data = None,
one_second_per_job = None,
touch_files_only = False,
exceptions_terminate_immediately = False,
log_exceptions = False,
checksum_level = None,
multithread = 0,
history_file = None):
# Remember to add further extra parameters here to "extra_pipeline_run_options" inside cmdline.py
# This will forward extra parameters from the command line to pipeline_run
"""
Run pipelines.
:param target_tasks: target task functions which will be run if they are out-of-date
:param forcedtorun_tasks: task functions which will be run whether or not they are out-of-date
:param multiprocess: The number of concurrent jobs running on different processes.
:param multithread: The number of concurrent jobs running as different threads. If > 1, ruffus will use multithreading *instead of* multiprocessing (and ignore the multiprocess parameter). Using multithreading is particularly useful for managing high performance clusters, which otherwise are prone to "processor storms" when a large number of cores finish jobs at the same time. (Thanks Andreas Heger)
:param logger: Where progress will be logged. Defaults to stderr output.
:type logger: `logging <http://docs.python.org/library/logging.html>`_ objects
:param verbose: level 0 : nothing
level 1 : logs task names and warnings
level 2 : logs task description if exists
level 3 : logs job names for jobs to be run
level 4 : logs list of up-to-date tasks and job names for jobs to be run
level 5 : logs job names for all jobs whether up-to-date or not
level 10: logs messages useful only for debugging ruffus pipeline code
:param touch_files_only: Create or update input/output files only to simulate running the pipeline. Do not run jobs. If set to CHECKSUM_REGENERATE, will regenerate the checksum history file to reflect the existing i/o files on disk.
:param exceptions_terminate_immediately: Exceptions cause immediate termination
rather than waiting for N jobs to finish where N = multiprocess
:param log_exceptions: Print exceptions to the logger as soon as they occur.
:param checksum_level: Several options for checking up-to-dateness are available: Default is level 1.
level 0 : Use only file timestamps
level 1 : above, plus timestamp of successful job completion
level 2 : above, plus a checksum of the pipeline function body
level 3 : above, plus a checksum of the pipeline function default arguments and the additional arguments passed in by task decorators
:param history_file: The database file which stores checksums and file timestamps for input/output files.
:param one_second_per_job: To work around poor file timestamp resolution on some file systems. Defaults to True if checksum_level is 0, forcing tasks to take a minimum of 1 second to complete.
:param runtime_data: Experimental feature for passing data to tasks at run time
:param gnu_make_maximal_rebuild_mode: Defaults to re-running *all* out-of-date tasks. Runs minimal
set to build targets if set to ``False``. Use with caution.
"""
if touch_files_only == False:
touch_files_only = 0
elif touch_files_only == True:
touch_files_only = 1
else:
touch_files_only = 2
# we are not running anything so do it as quickly as possible
one_second_per_job = False
syncmanager = multiprocessing.Manager()
if runtime_data == None:
runtime_data = {}
if not isinstance(runtime_data, dict):
raise Exception("pipeline_run parameter runtime_data should be a dictionary of "
"values passes to jobs at run time.")
#
# whether using multiprocessing or multithreading
#
if multithread:
pool = ThreadPool(multithread)
parallelism = multithread
elif multiprocess > 1:
pool = Pool(multiprocess)
parallelism = multiprocess
else:
parallelism = 1
pool = None
if checksum_level is None:
checksum_level = get_default_checksum_level()
#
# Supplement mtime with system clock if using CHECKSUM_HISTORY_TIMESTAMPS
# we don't need to default to adding 1 second delays between jobs
#
if one_second_per_job == None:
if checksum_level == CHECKSUM_FILE_TIMESTAMPS:
log_at_level (logger, 5, verbose, " Checksums rely on FILE TIMESTAMPS only and we don't know the system file time resolution: Pause 1 second...")
runtime_data["ONE_SECOND_PER_JOB"] = True
else:
log_at_level (logger, 5, verbose, " Checksums also use recorded job completion times: No 1 second pause...")
runtime_data["ONE_SECOND_PER_JOB"] = False
else:
log_at_level (logger, 5, verbose, " One second per job specified to be %s" % one_second_per_job)
runtime_data["ONE_SECOND_PER_JOB"] = one_second_per_job
if verbose == 0:
logger = black_hole_logger
elif verbose >= 11:
if hasattr(logger, "add_unique_prefix"):
logger.add_unique_prefix()
if touch_files_only and verbose >= 1:
logger.info("Touch output files instead of remaking them.")
link_task_names_to_functions ()
update_checksum_level_on_tasks (checksum_level)
#
# If we aren't using checksums, and history file hasn't been specified,
# we might be a bit surprised to find Ruffus writing to a sqlite db anyway.
# Let us just use an in-memory db which will be thrown away
# Of course, if history_file is specified, we presume you know what you are doing
#
if checksum_level == CHECKSUM_FILE_TIMESTAMPS and history_file == None:
history_file = ':memory:'
job_history = open_job_history (history_file)
#
# @active_if decorated tasks can change their active state every time
# pipeline_run / pipeline_printout / pipeline_printout_graph is called
#
update_active_states_for_all_tasks ()
#
# target jobs
#
target_tasks = task_names_to_tasks ("Target", target_tasks)
forcedtorun_tasks = task_names_to_tasks ("Forced to run", forcedtorun_tasks)
#
# To update the checksum file, we force all tasks to rerun but then don't actually call the task function...
#
# So starting with target_tasks and forcedtorun_tasks, we harvest all upstream dependencies willy-nilly
# and assign the results to forcedtorun_tasks
#
if touch_files_only == 2:
(forcedtorun_tasks, ignore_param1, ignore_param2,
ignore_param3) = topologically_sorted_nodes(target_tasks + forcedtorun_tasks, True,
gnu_make_maximal_rebuild_mode,
extra_data_for_signal = [t_verbose_logger(0, None, runtime_data), job_history])
(topological_sorted,
self_terminated_nodes,
dag_violating_edges,
dag_violating_nodes) = topologically_sorted_nodes( target_tasks, forcedtorun_tasks,
gnu_make_maximal_rebuild_mode,
extra_data_for_signal = [t_verbose_logger(verbose, logger, runtime_data), job_history])
if len(dag_violating_nodes):
dag_violating_tasks = ", ".join(t._name for t in dag_violating_nodes)
e = error_circular_dependencies("Circular dependencies found in the "
"pipeline involving one or more of (%s)" %
(dag_violating_tasks))
raise e
#
# get dependencies. Only include tasks which will be run
#
incomplete_tasks = set(topological_sorted)
task_parents = defaultdict(set)
for t in incomplete_tasks:
task_parents[t] = set()
for parent in t._outward:
if parent in incomplete_tasks:
task_parents[t].add(parent)
#print json.dumps(task_parents.items(), indent=4, cls=task_encoder)
# prepare tasks for pipeline run
# **********
# BEWARE
# **********
#
# Because state is stored, ruffus is *not* reentrant.
#
# **********
# BEWARE
# **********
for t in topological_sorted:
t.init_for_pipeline()
#
# prime queue with initial set of job parameters
#
parameter_q = Queue.Queue()
task_with_completed_job_q = Queue.Queue()
parameter_generator = make_job_parameter_generator (incomplete_tasks, task_parents,
logger, forcedtorun_tasks,
task_with_completed_job_q,
runtime_data, verbose,
syncmanager,
touch_files_only, job_history)
job_parameters = parameter_generator()
fill_queue_with_job_parameters(job_parameters, parameter_q, parallelism, logger, verbose)
#
# N.B.
# Handling keyboard interrupts with multiprocessing may require care:
# See http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool
#
# When waiting for a condition in threading.Condition.wait(), KeyboardInterrupt is never sent
# unless a timeout is specified
#
#
#
# #
# # whether using multiprocessing
# #
# pool = Pool(parallelism) if multiprocess > 1 else None
# if pool:
# pool_func = pool.imap_unordered
# job_iterator_timeout = []
# else:
# pool_func = imap
# job_iterator_timeout = [999999999999]
#
#
# ....
#
#
# it = pool_func(run_pooled_job_without_exceptions, feed_job_params_to_process_pool())
# while 1:
# try:
# job_result = it.next(*job_iterator_timeout)
#
# ...
#
# except StopIteration:
# break
if pool:
pool_func = pool.imap_unordered
else:
pool_func = imap
feed_job_params_to_process_pool = feed_job_params_to_process_pool_factory (parameter_q, logger, verbose)
#
# for each result from job
#
job_errors = RethrownJobError()
tasks_with_errors = set()
#
# job_result.job_name / job_result.return_value
# Reserved for returning result from job...
# How?
#
for job_result in pool_func(run_pooled_job_without_exceptions, feed_job_params_to_process_pool()):
t = node.lookup_node_from_name(job_result.task_name)
# remove failed jobs from history-- their output is bogus now!
if job_result.state in (JOB_ERROR, JOB_SIGNALLED_BREAK):
if len(job_result.params) > 1: # some jobs have no outputs
output_file_name = job_result.params[1]
if not isinstance(output_file_name, list): # some have multiple outputs from one job
output_file_name = [output_file_name]
#
# N.B. output parameters are not necessarily all strings
#
for o_f_n in get_strings_in_nested_sequence(output_file_name):
#
# use paths relative to working directory
#
o_f_n = os.path.relpath(o_f_n)
job_history.pop(o_f_n, None) # remove outfile from history if it exists
# only save poolsize number of errors
if job_result.state == JOB_ERROR:
log_at_level (logger, 6, verbose, " Exception caught for %s" % job_result.job_name)
job_errors.append(job_result.exception)
tasks_with_errors.add(t)
#
# print to logger immediately
#
if log_exceptions:
log_at_level (logger, 6, verbose, " Log Exception")
logger.error(job_errors.get_nth_exception_str())
#
# break if too many errors
#
if len(job_errors) >= parallelism or exceptions_terminate_immediately:
log_at_level (logger, 6, verbose, " Break loop %s %s %s " % (exceptions_terminate_immediately, len(job_errors), parallelism) )
parameter_q.put(all_tasks_complete())
break
# break immediately if the user says stop
elif job_result.state == JOB_SIGNALLED_BREAK:
job_errors.append(job_result.exception)
job_errors.specify_task(t, "Exceptions running jobs")
log_at_level (logger, 6, verbose, " Break loop JOB_SIGNALLED_BREAK %s %s " % (len(job_errors), parallelism) )
parameter_q.put(all_tasks_complete())
break
else:
if job_result.state == JOB_UP_TO_DATE:
if verbose > 1:
logger.info(" %s unnecessary: already up to date" % job_result.job_name)
else:
if verbose:
logger.info(" %s completed" % job_result.job_name)
# save this task name and the job (input and output files)
# alternatively, we could just save the output file and its
# completion time, or on the other end of the spectrum,
# we could save a checksum of the function that generated
# this file, something akin to:
# chksum = md5.md5(marshal.dumps(t.user_defined_work_func.func_code.co_code))
# we could even checksum the arguments to the function that
# generated this file:
# chksum2 = md5.md5(marshal.dumps(t.user_defined_work_func.func_defaults) +
# marshal.dumps(t.args))
if len(job_result.params) > 1: # some jobs have no outputs
output_file_name = job_result.params[1]
if not isinstance(output_file_name, list): # some have multiple outputs from one job
output_file_name = [output_file_name]
#
# N.B. output parameters are not necessarily all strings
# and not all files have been successfully created,
# even though the task apparently completed properly!
#
for o_f_n in get_strings_in_nested_sequence(output_file_name):
#
# use paths relative to working directory
#
o_f_n = os.path.relpath(o_f_n)
try:
log_at_level (logger, 6, verbose, " Job History for : " + o_f_n)
mtime = os.path.getmtime(o_f_n)
#
# use probably higher resolution time.time() over mtime
# which might have 1 or 2s resolutions, unless there is
# clock skew and the filesystem time > system time
# (e.g. for networks)
#
epoch_seconds = time.time()
# Aargh. Go back to inserting one second between jobs
if epoch_seconds < mtime:
if one_second_per_job == None and not runtime_data["ONE_SECOND_PER_JOB"]:
log_at_level (logger, 6, verbose, " Switch to one second per job")
runtime_data["ONE_SECOND_PER_JOB"] = True
elif epoch_seconds - mtime < 1.1:
mtime = epoch_seconds
chksum = JobHistoryChecksum(o_f_n, mtime, job_result.params[2:], t)
job_history[o_f_n] = chksum
except:
pass
##for output_file_name in t.output_filenames:
## # could use current time instead...
## if not isinstance(output_file_name, list):
## output_file_name = [output_file_name]
## for o_f_n in output_file_name:
## mtime = os.path.getmtime(o_f_n)
## chksum = JobHistoryChecksum(o_f_n, mtime, job_result.params[2:], t)
## job_history[o_f_n] = chksum
#
# signal completed task after checksumming
#
task_with_completed_job_q.put((t, job_result.task_name, job_result.job_name))
# make sure queue is still full after each job is retired
# do this after updating which jobs are incomplete
if len(job_errors):
#parameter_q.clear()
#if len(job_errors) == 1 and not parameter_q._closed:
parameter_q.put(all_tasks_complete())
else:
fill_queue_with_job_parameters(job_parameters, parameter_q, parallelism, logger, verbose)
syncmanager.shutdown()
if pool:
pool.close()
pool.terminate()
if len(job_errors):
raise job_errors
# use high resolution timestamps where available
# default in python 2.5 and greater
# N.B. File modify times / stat values have 1 second precision for many file systems
# and may not be accurate to boot, especially over the network.
os.stat_float_times(True)
if __name__ == '__main__':
import unittest
#
# debug parameter ignored if called as a module
#
if sys.argv.count("--debug"):
sys.argv.remove("--debug")
unittest.main()