Code for Chapter 16: Logging progress through a pipeline

This example shows how to log messages from within each of your pipelined functions.
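
The key ingredients are a proxy to a single shared logger, created with
make_shared_logger_and_proxy(), and a mutex which each job holds while
logging so that messages from different processes do not interleave. A
minimal sketch of the pattern (using the same ruffus.proxy_logger calls
as the full code below):

    from ruffus.proxy_logger import *
    import logging

    (logger_proxy,
     logging_mutex) = make_shared_logger_and_proxy(setup_std_shared_logger,
                                                   "my_logger",
                                                   {"file_name": "pipeline.log",
                                                    "level": logging.DEBUG})

    with logging_mutex:
        logger_proxy.info("This message can come from any process")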

Code

from ruffus import *
from ruffus.proxy_logger import *
import logging

import sys
import multiprocessing
# use simplejson in place of json for python < 2.6
try:
    import json
except ImportError:
    import simplejson as json



#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Shared logging


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
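# Log to a rotating file: pipeline.log is rolled over once it exceeds
# maxBytes (20 kb), and up to backupCount (10) old copies are kept, in
# the manner of logging.handlers.RotatingFileHandler.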

logger_args = {}
logger_args["file_name"] = "pipeline.log"
logger_args["level"] = logging.DEBUG
logger_args["rotating"] = True
logger_args["maxBytes"] = 20000
logger_args["backupCount"] = 10
logger_args["formatter"] = "%(asctime)s - %(name)s - %(levelname)6s - %(message)s"

(logger_proxy,
 logging_mutex) = make_shared_logger_and_proxy(setup_std_shared_logger,
                                               "my_logger", logger_args)





#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Helper Function which writes to a shared log


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

def test_job_io(infiles, outfiles, extra_params):
    """
    cat input files content to output files
        after writing out job parameters
    """
    #
    # dump parameters
    params = (infiles, outfiles)
    #
    logger_proxy, logging_mutex = extra_params
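    # acquire the shared mutex before logging so that messages from
    # concurrent jobs are written one at a time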
    with logging_mutex:
        logger_proxy.debug("job = %s, process name = %s" %
                            (json.dumps(params),
                                multiprocessing.current_process().name))
    #
    #
    sys.stdout.write('    job = %s\n' % json.dumps(params))
    #
    if isinstance(infiles, str):
        infiles = [infiles]
    elif infiles is None:
        infiles = []
    if isinstance(outfiles, str):
        outfiles = [outfiles]
    output_text = list()
    for f in infiles:
        with open(f) as in_file:
            output_text.append(in_file.read())
    output_text = "".join(sorted(output_text))
    output_text += json.dumps(infiles) + " -> " + json.dumps(outfiles) + "\n"
    for f in outfiles:
        with open(f, "w") as out_file:
            out_file.write(output_text)




#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Tasks


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#
#    task1
#
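# logger_proxy and logging_mutex follow the output file name in @files,
# so ruffus hands them to each job as extra parameters.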
@files(None, 'a.1', logger_proxy, logging_mutex)
def task1(infiles, outfiles, *extra_params):
    """
    First task
    """
    test_job_io(infiles, outfiles, extra_params)



#
#    task2
#
@transform(task1, regex(r'\.1$'), '.2', logger_proxy, logging_mutex)
def task2(infiles, outfiles, *extra_params):
    """
    Second task
    """
    test_job_io(infiles, outfiles, extra_params)



#
#    task3
#
@transform(task2, regex(r'\.2$'), '.3', logger_proxy, logging_mutex)
def task3(infiles, outfiles, *extra_params):
    """
    Third task
    """
    test_job_io(infiles, outfiles, extra_params)



#
#    task4
#
@transform(task3, regex(r'\.3$'), '.4', logger_proxy, logging_mutex)
def task4(infiles, outfiles, *extra_params):
    """
    Fourth task
    """
    test_job_io(infiles, outfiles, extra_params)


#
#   Necessary to protect the "entry point" of the program under windows.
#       see: http://docs.python.org/library/multiprocessing.html#multiprocessing-programming
#
if __name__ == '__main__':
    pipeline_run([task4], multiprocess = 10, logger = logger_proxy)

Resulting Output

>>> pipeline_run([task4], multiprocess = 10, logger = logger_proxy)
    job = [null, "a.1"]
    job = ["a.1", "a.2"]
    job = ["a.2", "a.3"]
    job = ["a.3", "a.4"]

pipeline.log will contain our unimaginative log messages:

2009-11-15 03:04:55,884 - my_logger -  DEBUG - job = [null, "a.1"], process name = PoolWorker-2
2009-11-15 03:04:56,941 - my_logger -   INFO -     Job = [None -> a.1, <LoggingProxy>, <thread.lock>] completed
2009-11-15 03:04:56,942 - my_logger -   INFO - Completed Task = task1
2009-11-15 03:04:56,945 - my_logger -  DEBUG - job = ["a.1", "a.2"], process name = PoolWorker-4
2009-11-15 03:04:57,962 - my_logger -   INFO -     Job = [a.1 -> a.2, <LoggingProxy>, <thread.lock>] completed
2009-11-15 03:04:57,962 - my_logger -   INFO - Completed Task = task2
2009-11-15 03:04:57,965 - my_logger -  DEBUG - job = ["a.2", "a.3"], process name = PoolWorker-3
2009-11-15 03:04:59,009 - my_logger -   INFO -     Job = [a.2 -> a.3, <LoggingProxy>, <thread.lock>] completed
2009-11-15 03:04:59,010 - my_logger -   INFO - Completed Task = task3
2009-11-15 03:04:59,013 - my_logger -  DEBUG - job = ["a.3", "a.4"], process name = PoolWorker-5
2009-11-15 03:05:00,024 - my_logger -   INFO -     Job = [a.3 -> a.4, <LoggingProxy>, <thread.lock>] completed
2009-11-15 03:05:00,025 - my_logger -   INFO - Completed Task = task4
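
The "rotating", "maxBytes" and "backupCount" arguments make the shared
logger behave, in effect, like the standard library's rotating file
handler. For comparison, a plain single-process sketch of an equivalent
configuration (without the cross-process proxy and mutex) would be:

    import logging
    import logging.handlers

    logger = logging.getLogger("my_logger")
    logger.setLevel(logging.DEBUG)
    handler = logging.handlers.RotatingFileHandler(
        "pipeline.log", maxBytes=20000, backupCount=10)
    handler.setFormatter(logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)6s - %(message)s"))
    logger.addHandler(handler)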