# -*- coding: utf-8 -*-
# Written (W) 2008-2012 Christian Widmer
# Written (W) 2008-2010 Cheng Soon Ong
# Written (W) 2012-2013 Daniel Blanchard, dblanchard@ets.org
# Copyright (C) 2008-2012 Max-Planck-Society, 2012-2013 ETS
# This file is part of GridMap.
# GridMap is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# GridMap is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with GridMap. If not, see <http://www.gnu.org/licenses/>.
"""
This module executes pickled jobs on the cluster.
@author: Christian Widmer
@author: Cheng Soon Ong
@author: Dan Blanchard (dblanchard@ets.org)
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import argparse
import logging
import multiprocessing
import os
import random
import socket
import sys
import time
from io import open
from subprocess import check_output
import zmq
from gridmap.conf import HEARTBEAT_FREQUENCY
from gridmap.data import zloads, zdumps
def _send_zmq_msg(job_id, command, data, address):
    """
    Send a message to the submitting host and block until its reply arrives.

    :param job_id: Unique ID of the job this message concerns.
    :param command: Command name for the master (the code in this module uses
                    "fetch_input", "store_output", and "heart_beat").
    :param data: Payload attached to the message (may be None).
    :param address: 0MQ address of the submitting host to connect to.

    :returns: The unpickled reply from the host.
    """
    context = zmq.Context()
    zsocket = context.socket(zmq.REQ)
    try:
        zsocket.connect(address)

        host_name = socket.gethostname()
        ip_address = socket.gethostbyname(host_name)

        # Identify the sending node so the master can track which worker
        # (host/IP) each message came from.
        msg_container = {}
        msg_container["job_id"] = job_id
        msg_container["host_name"] = host_name
        msg_container["ip_address"] = ip_address
        msg_container["command"] = command
        msg_container["data"] = data

        msg_string = zdumps(msg_container)
        zsocket.send(msg_string)
        # REQ sockets are strict send/recv lock-step; this blocks for the reply.
        msg = zloads(zsocket.recv())
        return msg
    finally:
        # The original leaked one socket and one Context per call (heartbeats
        # call this every HEARTBEAT_FREQUENCY seconds); always clean up.
        zsocket.close()
        context.term()
def _heart_beat(job_id, address, parent_pid=-1, log_file="", wait_sec=45):
    """
    Periodically report this worker's state back to the submitting host.

    Loops forever: collects resource usage for ``parent_pid`` via
    ``get_job_status``, attaches the log file path, ships the report with a
    "heart_beat" message, then sleeps ``wait_sec`` seconds before repeating.

    :param job_id: Unique ID of the job being monitored.
    :param address: 0MQ address of the submitting host.
    :param parent_pid: PID of the worker process to inspect (-1 skips stats).
    :param log_file: Path to the worker's log file, included in each report.
    :param wait_sec: Seconds to sleep between reports.
    """
    while True:
        report = get_job_status(parent_pid)
        report["log_file"] = log_file
        _send_zmq_msg(job_id, "heart_beat", report, address)
        time.sleep(wait_sec)
def _VmB(VmKey, pid):
"""
get various mem usage properties of process with id pid in MB
"""
_proc_status = '/proc/%d/status' % pid
_scale = {'kB': 1.0/1024.0, 'mB': 1.0,
'KB': 1.0/1024.0, 'MB': 1.0}
# get pseudo file /proc/<pid>/status
try:
with open(_proc_status) as t:
v = t.read()
except:
return 0.0 # non-Linux?
# get VmKey line e.g. 'VmRSS: 9999 kB\n ...'
i = v.index(VmKey)
v = v[i:].split(None, 3) # whitespace
if len(v) < 3:
return 0.0 # invalid format?
# convert Vm value to bytes
return float(v[1]) * _scale[v[2]]
def get_memory_usage(pid):
    """
    Return the memory usage (``VmSize`` from ``/proc``) of the process with
    ID ``pid`` in MB.

    :param pid: Process ID to inspect.

    :returns: Memory usage in MB, or 0.0 if it cannot be determined.
    """
    # NOTE: the original line carried a stray "[docs]" Sphinx-export artifact
    # fused onto the def, which made the file a SyntaxError.
    return _VmB('VmSize:', pid)
def get_cpu_load(pid):
    """
    Return the CPU usage of the process with ID ``pid``, as reported by
    ``ps -o pcpu``.

    :param pid: Process ID to inspect.

    :returns: CPU percentage as a string, or ``"Unknown"`` if it could not
              be determined (e.g. no such process, or no ``ps`` available).
    """
    command = ["ps", "h", "-o", "pcpu", "-p", "%d" % (pid)]
    try:
        # The original called check_output() with no arguments (always a
        # TypeError) and then passed this list to os.popen, which expects a
        # shell string — so it always fell through to "Unknown".
        info = check_output(command)
        # check_output returns bytes on Python 3.
        if isinstance(info, bytes):
            info = info.decode('utf-8')
        cpu_load = info.strip()
    except Exception:
        # Broad by design: ps may be missing, the pid may be gone, etc.
        logger = logging.getLogger(__name__)
        logger.warning('Getting CPU info failed.', exc_info=True)
        cpu_load = "Unknown"
    return cpu_load
def get_job_status(parent_pid):
    """
    Determine the status of the current worker and its machine (currently not
    cross-platform: relies on ``/proc`` and ``ps`` via the helpers).

    :param parent_pid: PID of the worker process to inspect, or -1 to skip
                       resource collection entirely.

    :returns: dict with "memory" (MB) and "cpu_load" keys; empty when
              ``parent_pid`` is -1.
    """
    # NOTE: the original line carried a stray "[docs]" Sphinx-export artifact
    # fused onto the def, which made the file a SyntaxError.
    status_container = {}
    if parent_pid != -1:
        status_container["memory"] = get_memory_usage(parent_pid)
        status_container["cpu_load"] = get_cpu_load(parent_pid)
    return status_container
def _run_job(job_id, address):
    """
    Execute the pickled job and produce pickled output.

    Fetches the job from the submitting host, starts a heartbeat child
    process, runs the job (optionally in its requested working directory),
    and ships the executed job object back with "store_output".

    :param job_id: Unique ID of job
    :type job_id: str
    :param address: IP address of submitting host.
    :type address: str
    """
    # Stagger start-up so a batch of simultaneously scheduled workers does
    # not all hit the master at the same instant.
    wait_sec = random.randint(0, 5)
    logger = logging.getLogger(__name__)
    logger.info("waiting %i seconds before starting", wait_sec)
    time.sleep(wait_sec)

    try:
        job = _send_zmq_msg(job_id, "fetch_input", None, address)
    except Exception as e:
        # here we will catch errors caused by pickled objects
        # of classes defined in modules not in PYTHONPATH
        logger.error('Could not retrieve input for job {0}'.format(job_id),
                     exc_info=True)
        # send back exception
        thank_you_note = _send_zmq_msg(job_id, "store_output", e, address)
        logger.info('Sending reply: {0}'.format(thank_you_note))
        return

    logger.info("input arguments loaded, starting computation %s", job.args)

    # Heartbeat runs in a separate process so the master keeps receiving
    # status reports even while job.execute() is busy.
    parent_pid = os.getpid()
    heart = multiprocessing.Process(target=_heart_beat,
                                    args=(job_id, address, parent_pid,
                                          job.log_stderr_fn,
                                          HEARTBEAT_FREQUENCY))
    logger.info("starting heart beat")
    heart.start()

    try:
        # change working directory (original wrapped this in a dead "if 1:")
        logger.info("changing working directory")
        if job.working_dir is not None:
            logger.info("Changing working directory: %s", job.working_dir)
            os.chdir(job.working_dir)

        # run job
        logger.info("executing job")
        job.execute()

        # send back result
        thank_you_note = _send_zmq_msg(job_id, "store_output", job, address)
        logger.info(thank_you_note)
    finally:
        # Stop the heartbeat even if execution or the result hand-off raised;
        # the original leaked the child process on any exception here.
        heart.terminate()
def _main():
    """
    Parse the command line inputs and call _run_job.
    """
    # Get command line arguments.  Adjacent-string concatenation avoids the
    # original backslash continuations, which baked long runs of indentation
    # whitespace into the help text.
    parser = argparse.ArgumentParser(description="This wrapper script will "
                                                 "run a pickled Python "
                                                 "function on some pickled "
                                                 "retrieved data via 0MQ. "
                                                 "You almost never want to "
                                                 "run this yourself.")
    parser.add_argument('job_id',
                        help='Which job should be run.')
    parser.add_argument('home_address',
                        help='IP address of submitting host.')
    parser.add_argument('module_dir',
                        help='Directory that contains module containing '
                             'pickled function. This will get added to '
                             'PYTHONPATH temporarily.')
    args = parser.parse_args()

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    print("Appended {0} to PYTHONPATH".format(args.module_dir), file=sys.stderr)
    # The original called clean_path(), which is not defined or imported in
    # this module and raised NameError; normalize the path with os.path
    # instead (expand ~ and make absolute) before adding it to sys.path.
    sys.path.append(os.path.abspath(os.path.expanduser(args.module_dir)))

    # Process the database and get job started
    _run_job(args.job_id, args.home_address)
# Standard script entry point: run the wrapper when invoked directly.
if __name__ == "__main__":
    _main()