simpler_with_shared_logging – How to share data across jobs

See here for a run-through of this example.

The code for this example is in the test subdirectory of the ruffus module:

ruffus/test/simpler_with_shared_logging.py

The shared data (a logging object: every job writes to a single log file through a proxy to one common logger; see the sketch after the options list below):

--log_file_name FILE, -L FILE
shared log file location

Programme options:

--help, -h
show help message

To specify tasks:

--target_tasks TARGET_TASK, -t TARGET_TASK

Target task(s) of pipeline. TARGET_TASK can be

  • task1
  • task2
  • task3
  • task4

For example:

simpler_with_shared_logging.py -t task1 -t task4

--forced_tasks FORCED_TASK, -f FORCED_TASK

Pipeline task(s) which will be run even if they are up to date.

See above for the list of pipeline tasks.

--jobs N, -j N
Run N jobs (processes) in parallel.
--minimal_rebuild_mode, -M
Rebuild only the minimum number of tasks necessary for the target. Ignore out-of-date upstream tasks if the intervening tasks are up to date.

To print flowchart:

--dependency FILE, -d FILE
Print a dependency graph of the pipeline that would be executed to FILE, but do not execute it.
--dependency_graph_format FORMAT, -F FORMAT

Format of the dependency graph file. Can be:

  • 'ps' (PostScript)
  • 'svg', 'svgz' (Scalable Vector Graphics)
  • 'png', 'gif' (bitmap graphics)

--just_print, -n
Print a description of the jobs that would be executed, but do not execute them.
--no_key_legend_in_graph, -K
Do not print out legend and key for dependency graph.
--draw_graph_horizontally, -H
Draw horizontal (left to right) dependency graph.
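
How the sharing works: the "real" logger is created inside a separate process started by a multiprocessing manager; each job only ever sees a proxy to it, and a shared lock stops concurrently running jobs from interleaving their log records. Below is a minimal, stand-alone sketch of that mechanism (the names make_shared_logger, LoggerProxy and the path /tmp/sketch.log are illustrative only, not part of the example); the full version appears in the code listing that follows.

    import logging
    import logging.handlers
    import multiprocessing
    import multiprocessing.managers

    def make_shared_logger(logging_level, log_file_name):
        """Runs once, inside the manager's own process, and creates the single real logger"""
        logger = logging.getLogger("shared_logger_sketch")
        logger.setLevel(logging_level)
        handler = logging.handlers.RotatingFileHandler(log_file_name,
                                                       maxBytes=100000, backupCount=5)
        logger.addHandler(handler)
        return logger

    class LoggerProxy(multiprocessing.managers.BaseProxy):
        """Forwards logging calls to the real logger living in the manager's process"""
        def info(self, message):
            return self._callmethod("info", [message])

    class LoggingManager(multiprocessing.managers.SyncManager):
        pass

    LoggingManager.register("make_shared_logger", make_shared_logger,
                            proxytype=LoggerProxy, exposed=("info",))

    if __name__ == "__main__":
        manager = LoggingManager()
        manager.start()                              # the real logger will live in this process
        logger_proxy  = manager.make_shared_logger(logging.DEBUG, "/tmp/sketch.log")
        logging_mutex = manager.Lock()               # stops log records from interleaving

        # every job receives (logger_proxy, logging_mutex) and logs like this:
        with logging_mutex:
            logger_proxy.info("hello from %s" % multiprocessing.current_process().name)

Ruffus itself only needs the proxy and the lock to be passed to each job as extra parameters, exactly as task1 to task4 do in the listing below.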

Code:

#!/usr/bin/env python
"""

    simpler_with_shared_logging.py

"""


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   options        


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

from optparse import OptionParser
import sys, os
import os.path
import StringIO

# add self to search path for testing
exe_path = os.path.split(os.path.abspath(sys.argv[0]))[0]
sys.path.insert(0,os.path.abspath(os.path.join(exe_path,"..", "..")))
if __name__ == '__main__':
    module_name = os.path.split(sys.argv[0])[1]
    module_name = os.path.splitext(module_name)[0]
else:
    module_name = __name__




parser = OptionParser(version="%prog 1.0")
parser.add_option("-t", "--target_tasks", dest="target_tasks",
                  action="append",
                  default = list(),
                  metavar="JOBNAME", 
                  type="string",
                  help="Target task(s) of pipeline.")
parser.add_option("-f", "--forced_tasks", dest="forced_tasks",
                  action="append",
                  default = list(),
                  metavar="JOBNAME", 
                  type="string",
                  help="Pipeline task(s) which will be included even if they are up to date.")
parser.add_option("-j", "--jobs", dest="jobs",
                  default=1,
                  metavar="jobs", 
                  type="int",
                  help="Specifies  the number of jobs (commands) to run simultaneously.")
parser.add_option("-v", "--verbose", dest = "verbose",
                  action="store_true", default=False,
                  help="Do not echo to shell but only print to log.")
parser.add_option("-d", "--dependency", dest="dependency_file",
                  #default="simple.svg",
                  metavar="FILE", 
                  type="string",
                  help="Print a dependency graph of the pipeline that would be executed "
                        "to FILE, but do not execute it.")
parser.add_option("-F", "--dependency_graph_format", dest="dependency_graph_format",
                  metavar="FORMAT", 
                  type="string",
                  default = 'svg',
                  help="format of dependency graph file. Can be 'ps' (PostScript), "+
                  "'svg' 'svgz' (Structured Vector Graphics), " +
                  "'png' 'gif' (bitmap  graphics) etc ")
parser.add_option("-n", "--just_print", dest="just_print",
                    action="store_true", default=False,
                    help="Print a description of the jobs that would be executed, "
                        "but do not execute them.")
parser.add_option("-M", "--minimal_rebuild_mode", dest="minimal_rebuild_mode",
                    action="store_true", default=False,
                    help="Rebuild a minimum of tasks necessary for the target. "
                    "Ignore upstream out of date tasks if intervening tasks are fine.")
parser.add_option("-K", "--no_key_legend_in_graph", dest="no_key_legend_in_graph",
                    action="store_true", default=False,
                    help="Do not print out legend and key for dependency graph.")
parser.add_option("-H", "--draw_graph_horizontally", dest="draw_horizontally",
                    action="store_true", default=False,
                    help="Draw horizontal dependency graph.")

parser.add_option("-L", "--log_file_name", dest="log_file_name",
                    default="/tmp/simple.log",
                    metavar="FILE", 
                    type="string",
                    help="log file.")
parameters = [  
                ]







#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   imports        


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

import StringIO
import re
import operator
import sys,os
from collections import defaultdict

sys.path.append(os.path.abspath(os.path.join(exe_path,"..", "..")))
from ruffus import *

# use simplejson in place of json for python < 2.6
try:
    import json
except ImportError:
    import simplejson
    json = simplejson

    
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Shared logging


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

import multiprocessing
import multiprocessing.managers


import logging
import logging.handlers



#
#   setup_logger 
#
def setup_shared_logger(LOGGING_LEVEL, LOG_FILENAME):
    """
    Function to setup logger shared between all processes
    The logger object will be created within a separate (special) process 
        run by multiprocessing.BaseManager.start()

    See "LoggingManager" below
    """

    #
    #   Log file name with logger level
    # 
    my_ruffus_logger = logging.getLogger('simpler_example_logger')
    my_ruffus_logger.setLevel(LOGGING_LEVEL)

    # 
    #   Add handler to print to file, with the specified format  
    #
    handler = logging.handlers.RotatingFileHandler(
                  LOG_FILENAME, maxBytes=100000, backupCount=5)
    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)6s - %(message)s")
    handler.setFormatter(formatter)
    my_ruffus_logger.addHandler(handler)


    #
    #   This log object will be wrapped in proxy 
    #
    return my_ruffus_logger


#
#   Proxy object for logging
#       Logging messages will be marshalled (forwarded) to the process where the 
#       shared log lives
#
class LoggerProxy(multiprocessing.managers.BaseProxy):
    def debug(self, message):
        return self._callmethod('debug', [message])
    def info(self, message):
        return self._callmethod('info', [message])
    def __str__ (self):
        return "Logging proxy"


# 
#   Register the setup_logger function as a proxy for setup_logger
#   
#   We use SyncManager as a base class so we can get a lock proxy for synchronising 
#       logging later on
#
class LoggingManager(multiprocessing.managers.SyncManager):
    """
    Logging manager sets up its own process and will create the real Log object there
    We refer to this (real) log via proxies
    """
    pass
LoggingManager.register('setup_logger', setup_shared_logger, proxytype=LoggerProxy, exposed = ('info', 'debug', '__str__'))
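#   Calling manager.setup_logger(...) will now run setup_shared_logger() inside the
#       manager's process and hand back a LoggerProxy exposing only 'info', 'debug' and '__str__'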




    
    
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Functions


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

def create_custom_file_func(params):
    """
    creates function which can be used as input to @files_func
    """
    def cust_func ():
        for job_param in params:
            yield job_param
    return cust_func

def is_job_uptodate (infiles, outfiles, *extra_params):
    """
    assumes first two parameters are files, checks if they are up to date
    """
    return task.needs_update_check_modify_time (infiles, outfiles, *extra_params)

def test_post_task_function ():
    print "Hooray"

import time
def test_job_io(infiles, outfiles, extra_params):
    """
    cat input files content to output files
        after writing out job parameters
    """
    
    # dump parameters
    params = (infiles, outfiles)# + extra_params[0:-3]

    logger_proxy, logging_mutex = extra_params
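    # hold the shared lock so that log records from concurrently running jobs
    #   do not interleave in the log file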
    with logging_mutex:
        logger_proxy.debug("job = %s, process name = %s" % 
                            (json.dumps(params),
                                multiprocessing.current_process().name))

    
    sys.stdout.write('    job = %s\n' % json.dumps(params))
    
    if isinstance(infiles, str):
        infiles = [infiles]
    elif infiles is None:
        infiles = []
    if isinstance(outfiles, str):
        outfiles = [outfiles]
    output_text = list()
    for f in infiles:
        output_text.append(open(f).read())
    output_text = "".join(sorted(output_text))
    output_text += json.dumps(infiles) + " -> " + json.dumps(outfiles) + "\n"
    for f in outfiles:
        open(f, "w").write(output_text)
    time.sleep(1)

    

#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Main logic


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888




if __name__ == '__main__':

    # get help string
    f =StringIO.StringIO()
    parser.print_help(f)
    helpstr = f.getvalue()
    
    #
    #   Get options 
    #
    (options, remaining_args) = parser.parse_args()


    #
    #   make shared log and proxy 
    #
    manager = LoggingManager()
    manager.register('setup_logger', setup_shared_logger, proxytype=LoggerProxy, exposed = ('info', 'debug'))
    
    manager.start()
    LOG_FILENAME  = options.log_file_name
    LOGGING_LEVEL = logging.DEBUG
    logger_proxy = manager.setup_logger(LOGGING_LEVEL, LOG_FILENAME)
    
    #
    #   make sure we are not logging at the same time in different processes
    #
    logging_mutex = manager.Lock()






#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Tasks


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888





#
#    task1
#
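# @files: no input file, a single output file 'a.1'; the logger proxy and the
#   logging mutex are passed to the job as extra parameters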
@files(None, 'a.1', logger_proxy, logging_mutex)
def task1(infiles, outfiles, *extra_params):
    """
    First task
    """
    test_job_io(infiles, outfiles, extra_params)



#
#    task2
#
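# @files_re: for every file matching the glob '*.1', the regular expression '(.*).1'
#   builds the input (r'\1.1') and output (r'\1.2') file names for each job; the
#   logger proxy and mutex are again passed through as extra parameters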
@files_re('*.1', '(.*).1', r'\1.1', r'\1.2', logger_proxy, logging_mutex)
@follows(task1)
def task2(infiles, outfiles, *extra_params):
    """
    Second task
    """
    test_job_io(infiles, outfiles, extra_params)



#
#    task3
#
@files_re('*.1', '(.*).1', r'\1.2', r'\1.3', logger_proxy, logging_mutex)
@follows(task2)
def task3(infiles, outfiles, *extra_params):
    """
    Third task
    """
    test_job_io(infiles, outfiles, extra_params)



#
#    task4
#
@files_re('*.1', '(.*).1', r'\1.3', r'\1.4', logger_proxy, logging_mutex)
@follows(task3)
def task4(infiles, outfiles, *extra_params):
    """
    Fourth task
    """
    test_job_io(infiles, outfiles, extra_params)

# 
#   Necessary to protect the "entry point" of the program under windows.
#       see: http://docs.python.org/library/multiprocessing.html#multiprocessing-programming
#
if __name__ == '__main__':
    try:
        if options.just_print:
            pipeline_printout(sys.stdout, options.target_tasks, options.forced_tasks, 
                                long_winded=True, 
                                gnu_make_maximal_rebuild_mode = not options.minimal_rebuild_mode)
        
        elif options.dependency_file:
            pipeline_printout_graph (     open(options.dependency_file, "w"),
                                 options.dependency_graph_format,
                                 options.target_tasks, 
                                 options.forced_tasks,
                                 draw_vertically = not options.draw_horizontally,
                                 gnu_make_maximal_rebuild_mode  = not options.minimal_rebuild_mode,
                                 no_key_legend  = options.no_key_legend_in_graph)
        else:    
            pipeline_run(options.target_tasks, options.forced_tasks, multiprocess = options.jobs, 
                            gnu_make_maximal_rebuild_mode  = not options.minimal_rebuild_mode,
                            logger = logger_proxy)
    except Exception, e:
        print e.args
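
For example, to run all four tasks with 5 jobs in parallel, writing to a shared log file of your choosing (the path here is illustrative):

    simpler_with_shared_logging.py -t task4 -j 5 -L /tmp/pipeline.log

Each job writes its parameters both to the shared log file (via the logger proxy) and to stdout.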