# Source code for gridmap.runner

# -*- coding: utf-8 -*-

# Written (W) 2008-2012 Christian Widmer
# Written (W) 2008-2010 Cheng Soon Ong
# Written (W) 2012-2013 Daniel Blanchard, dblanchard@ets.org
# Copyright (C) 2008-2012 Max-Planck-Society, 2012-2013 ETS

# This file is part of GridMap.

# GridMap is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# GridMap is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with GridMap.  If not, see <http://www.gnu.org/licenses/>.

"""
This module executes pickled jobs on the cluster.

@author: Christian Widmer
@author: Cheng Soon Ong
@author: Dan Blanchard (dblanchard@ets.org)
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import argparse
import logging
import multiprocessing
import os
import random
import socket
import sys
import time
from io import open
from subprocess import check_output

import zmq

from gridmap.conf import HEARTBEAT_FREQUENCY
from gridmap.data import zloads, zdumps


def _send_zmq_msg(job_id, command, data, address):
    """
    Send a message about the given job back to the submitting host over 0MQ
    and return the (unpickled) reply.

    :param job_id: Unique ID of the job this message concerns.
    :param command: Name of the command being sent (e.g. "heart_beat").
    :param data: Payload to attach to the message.
    :param address: 0MQ address of the submitting host to connect to.

    :returns: The unpickled reply from the submitting host.
    """
    context = zmq.Context()
    zsocket = context.socket(zmq.REQ)
    zsocket.connect(address)

    local_host = socket.gethostname()

    # Identify ourselves (job + machine) alongside the actual payload.
    msg_container = {
        "job_id": job_id,
        "host_name": local_host,
        "ip_address": socket.gethostbyname(local_host),
        "command": command,
        "data": data,
    }

    zsocket.send(zdumps(msg_container))
    return zloads(zsocket.recv())


def _heart_beat(job_id, address, parent_pid=-1, log_file="", wait_sec=45):
    """
    Repeatedly send status reports about the worker process back to the
    submitting host.  Never returns; intended to run as the target of a
    separate ``multiprocessing.Process``.

    :param job_id: Unique ID of the job being monitored.
    :param address: 0MQ address of the submitting host.
    :param parent_pid: PID of the worker process to report on (-1 for none).
    :param log_file: Path of the worker's log file, included in each report.
    :param wait_sec: Seconds to sleep between consecutive reports.
    """
    while True:
        report = get_job_status(parent_pid)
        report["log_file"] = log_file
        _send_zmq_msg(job_id, "heart_beat", report, address)
        time.sleep(wait_sec)


def _VmB(VmKey, pid):
    """
    get various mem usage properties of process with id pid in MB
    """

    _proc_status = '/proc/%d/status' % pid

    _scale = {'kB': 1.0/1024.0, 'mB': 1.0,
              'KB': 1.0/1024.0, 'MB': 1.0}

     # get pseudo file  /proc/<pid>/status
    try:
        with open(_proc_status) as t:
            v = t.read()
    except:
        return 0.0  # non-Linux?
     # get VmKey line e.g. 'VmRSS:  9999  kB\n ...'
    i = v.index(VmKey)
    v = v[i:].split(None, 3)  # whitespace
    if len(v) < 3:
        return 0.0  # invalid format?
     # convert Vm value to bytes
    return float(v[1]) * _scale[v[2]]


def get_memory_usage(pid):
    """
    Return the memory usage (``VmSize``) of the process with the given ID
    in MB, as read from ``/proc``.

    :param pid: ID of the process to inspect.
    :type pid: int

    :returns: Memory usage in MB (0.0 if it could not be determined).
    :rtype: float
    """
    return _VmB('VmSize:', pid)
def get_cpu_load(pid):
    """
    Return the CPU usage of the process with the given ID, as reported by
    the ``ps`` command.

    :param pid: ID of the process to inspect.
    :type pid: int

    :returns: The percent-CPU figure from ``ps`` as a stripped string, or
              ``"Unknown"`` if the information could not be retrieved.
    :rtype: str
    """
    command = ["ps", "h", "-o", "pcpu", "-p", "%d" % (pid)]
    try:
        # Run ps directly (no shell); raises CalledProcessError on failure.
        # The original mixed a bad zero-argument check_output() call with an
        # os.popen() call handed a list, so it always fell into the except
        # branch and returned "Unknown".
        info = check_output(command)
        # check_output returns bytes; decode before stripping.
        cpu_load = info.decode('utf-8', 'replace').strip()
    except Exception:
        logger = logging.getLogger(__name__)
        logger.warning('Getting CPU info failed.', exc_info=True)
        cpu_load = "Unknown"
    return cpu_load
def get_job_status(parent_pid):
    """
    Determine the status of the current worker and its machine (currently
    not cross-platform).

    :param parent_pid: PID of the worker process to report on; -1 means
                       no process, yielding an empty report.
    :type parent_pid: int

    :returns: Dict with ``"memory"`` (MB) and ``"cpu_load"`` entries, or
              an empty dict when ``parent_pid`` is -1.
    :rtype: dict
    """
    if parent_pid == -1:
        return {}
    return {"memory": get_memory_usage(parent_pid),
            "cpu_load": get_cpu_load(parent_pid)}
def _run_job(job_id, address):
    """
    Execute the pickled job and produce pickled output.

    :param job_id: Unique ID of job
    :type job_id: str
    :param address: IP address of submitting host.
    :type address: str
    """
    # Stagger start-up so many workers launched at once do not all hit the
    # submitting host at the same instant.
    wait_sec = random.randint(0, 5)
    logger = logging.getLogger(__name__)
    logger.info("waiting %i seconds before starting", wait_sec)
    time.sleep(wait_sec)

    try:
        job = _send_zmq_msg(job_id, "fetch_input", None, address)
    except Exception as e:
        # here we will catch errors caused by pickled objects
        # of classes defined in modules not in PYTHONPATH
        logger.error('Could not retrieve input for job {0}'.format(job_id),
                     exc_info=True)

        # send back exception
        thank_you_note = _send_zmq_msg(job_id, "store_output", e, address)
        logger.info('Sending reply: {0}'.format(thank_you_note))
        return

    logger.info("input arguments loaded, starting computation %s", job.args)

    # create heart beat process
    parent_pid = os.getpid()
    heart = multiprocessing.Process(target=_heart_beat,
                                    args=(job_id, address, parent_pid,
                                          job.log_stderr_fn,
                                          HEARTBEAT_FREQUENCY))
    logger.info("starting heart beat")
    heart.start()

    # Ensure the heartbeat child is terminated even if job execution or the
    # result hand-off raises (previously a failing job left it running).
    try:
        # change working directory
        logger.info("changing working directory")
        if job.working_dir is not None:
            logger.info("Changing working directory: %s", job.working_dir)
            os.chdir(job.working_dir)

        # run job
        logger.info("executing job")
        job.execute()

        # send back result
        thank_you_note = _send_zmq_msg(job_id, "store_output", job, address)
        logger.info(thank_you_note)
    finally:
        # stop heartbeat
        heart.terminate()


def _main():
    """
    Parse the command line inputs and call _run_job.
    """
    # Get command line arguments
    parser = argparse.ArgumentParser(description="This wrapper script will \
run a pickled Python function on some pickled retrieved data via 0MQ. You \
almost never want to run this yourself.")
    parser.add_argument('job_id',
                        help='Which job should be run.')
    parser.add_argument('home_address',
                        help='IP address of submitting host.')
    parser.add_argument('module_dir',
                        help='Directory that contains module containing \
pickled function. This will get added to PYTHONPATH temporarily.')
    args = parser.parse_args()

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    print("Appended {0} to PYTHONPATH".format(args.module_dir),
          file=sys.stderr)
    # NOTE(review): the original called an undefined clean_path() here, which
    # raised NameError at runtime; append the normalized path directly.
    sys.path.append(os.path.abspath(args.module_dir))

    # Process the database and get job started
    _run_job(args.job_id, args.home_address)


if __name__ == "__main__":
    _main()