1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
"""
This module provides wrappers that simplify submission and collection of jobs,
in a more 'pythonic' fashion.

@author: Christian Widmer
@author: Cheng Soon Ong
@author: Dan Blanchard (dblanchard@ets.org)

@var REDIS_DB: The index of the database to select on the Redis server; can be
               overridden by setting the GRID_MAP_REDIS_DB environment variable.
@var REDIS_PORT: The port of the Redis server to use; can be overridden by
                 setting the GRID_MAP_REDIS_PORT environment variable.
@var USE_MEM_FREE: Does your cluster support specifying how much memory a job
                   will use via mem_free? Can be overridden by setting the
                   GRID_MAP_USE_MEM_FREE environment variable.
@var DEFAULT_QUEUE: The default job scheduling queue to use; can be overridden
                    via the GRID_MAP_DEFAULT_QUEUE environment variable.
"""
41
from __future__ import absolute_import, print_function, unicode_literals

import inspect
import os
import subprocess
import sys
import traceback
import uuid
import warnings
from socket import gethostname
from time import sleep

from drmaa import Session
from drmaa.errors import InvalidJobException
from redis import StrictRedis
from redis.exceptions import ConnectionError as RedisConnectionError

from gridmap.data import clean_path, zload_db, zsave_db
60
# Python 2 compatibility: make ``range`` lazy, matching Python 3 semantics.
if sys.version_info < (3, 0):
    range = xrange


# Index of the database to select on the Redis server; override with the
# GRID_MAP_REDIS_DB environment variable.
REDIS_DB = int(os.getenv('GRID_MAP_REDIS_DB', '2'))

# Port of the Redis server to use; override with GRID_MAP_REDIS_PORT.
REDIS_PORT = int(os.getenv('GRID_MAP_REDIS_PORT', '7272'))

# Whether the cluster supports requesting memory via mem_free; override with
# GRID_MAP_USE_MEM_FREE (anything other than a case-insensitive "true" is
# treated as False).
USE_MEM_FREE = os.getenv('GRID_MAP_USE_MEM_FREE', 'False').upper() == 'TRUE'

# Default job scheduling queue; override with GRID_MAP_DEFAULT_QUEUE.
DEFAULT_QUEUE = os.getenv('GRID_MAP_DEFAULT_QUEUE', 'all.q')
76
77
class Job(object):
    """
    Central entity that wraps a function and its data. Basically, a job consists
    of a function, its argument list, its keyword list and a field "ret" which
    is filled, when the execute method gets called.

    @note: This can only be used to wrap picklable functions (i.e., those that
           are defined at the module or class level).
    """

    # __slots__ keeps per-instance memory low when many jobs are created and
    # prevents accidental creation of misspelled attributes.
    __slots__ = ('_f', 'args', 'jobid', 'kwlist', 'cleanup', 'ret', 'exception',
                 'environment', 'replace_env', 'working_dir', 'num_slots',
                 'mem_free', 'white_list', 'path', 'uniq_id', 'name', 'queue')

    def __init__(self, f, args, kwlist=None, cleanup=True, mem_free="1G",
                 name='gridmap_job', num_slots=1, queue=DEFAULT_QUEUE):
        """
        Initializes a new Job.

        @param f: a function, which should be executed.
        @type f: function
        @param args: argument list of function f
        @type args: list
        @param kwlist: dictionary of keyword arguments for f
        @type kwlist: dict
        @param cleanup: flag that determines the cleanup of input and log file
        @type cleanup: boolean
        @param mem_free: Estimate of how much memory this job will need (for
                         scheduling)
        @type mem_free: C{basestring}
        @param name: Name to give this job
        @type name: C{basestring}
        @param num_slots: Number of slots this job should use.
        @type num_slots: C{int}
        @param queue: SGE queue to schedule job on.
        @type queue: C{basestring}
        """
        self.path = None
        self._f = None
        # Goes through the property setter below, which resolves functions
        # defined in __main__ to an importable (and therefore picklable) ref.
        self.function = f
        self.args = args
        self.jobid = -1
        # Create a fresh dict here rather than using a mutable default arg.
        self.kwlist = kwlist if kwlist is not None else {}
        self.cleanup = cleanup
        self.ret = None
        self.environment = None
        self.replace_env = False
        self.working_dir = os.getcwd()
        self.num_slots = num_slots
        self.mem_free = mem_free
        self.white_list = []
        # SGE job names may not contain spaces.
        self.name = name.replace(' ', '_')
        self.queue = queue

    @property
    def function(self):
        ''' Function this job will execute. '''
        return self._f

    @function.setter
    def function(self, f):
        """
        setter for function that carefully takes care of
        namespace, avoiding __main__ as a module
        """
        m = inspect.getmodule(f)
        try:
            self.path = clean_path(os.path.dirname(os.path.abspath(
                inspect.getsourcefile(f))))
        except TypeError:
            # Builtins / C functions have no source file.
            self.path = ''

        if m.__name__ != "__main__":
            self._f = f
        else:
            # The function lives in the __main__ script; re-import that script
            # under its real module name so the worker can unpickle the ref.
            mn = os.path.splitext(os.path.basename(m.__file__))[0]
            __import__(mn)
            mod = sys.modules[mn]
            self._f = getattr(mod, f.__name__)

    def execute(self):
        """
        Executes function f with given arguments
        and writes return value to field ret.
        If an exception is encountered during execution, ret will
        contain a pickled version of it.
        Input data is removed after execution to save space.
        """
        try:
            self.ret = self.function(*self.args, **self.kwlist)
        except Exception as exception:
            self.ret = exception
            traceback.print_exc()
        # Drop the (potentially large) inputs now that the job has run.
        del self.args
        del self.kwlist

    @property
    def native_specification(self):
        """
        Native SGE specification string for this job: scheduling flags built
        from name, mem_free, num_slots, white_list, and queue.

        NOTE(review): the body of this property was lost in extraction and has
        been reconstructed from the attributes this class maintains and its use
        in _append_job_to_session -- confirm the exact flags against the
        original source.
        """
        ret = "-shell yes -b yes"
        if self.name:
            ret += " -N {0}".format(self.name)
        if self.mem_free and USE_MEM_FREE:
            ret += " -l mem_free={0}".format(self.mem_free)
        if self.num_slots and self.num_slots > 1:
            ret += " -pe smp {0}".format(self.num_slots)
        if self.white_list:
            ret += " -l h={0}".format('|'.join(self.white_list))
        if self.queue:
            ret += " -q {0}".format(self.queue)
        return ret
207
def _submit_jobs(jobs, uniq_id, temp_dir='/scratch', white_list=None,
                 quiet=True):
    """
    Method used to send a list of jobs onto the cluster.

    @param jobs: list of jobs to be executed
    @type jobs: c{list} of L{Job}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: List of acceptable nodes to use for scheduling job. If
                       None, all are used.
    @type white_list: C{list} of C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}

    @return: Tuple of the DRMAA session contact string and the list of grid
             job ids, in submission order.
    """
    session = Session()
    session.initialize()
    jobids = []

    # Ensure the DRMAA session is always torn down, even if a submission
    # raises; the original leaked the session in that case.
    try:
        for job_num, job in enumerate(jobs):
            # Restrict each job to the acceptable nodes (None = no limit).
            job.white_list = white_list

            jobid = _append_job_to_session(session, job, uniq_id, job_num,
                                           temp_dir=temp_dir, quiet=quiet)
            jobids.append(jobid)

        # The contact string lets _collect_jobs reconnect to this session.
        sid = session.contact
    finally:
        session.exit()

    return (sid, jobids)
245
def _append_job_to_session(session, job, uniq_id, job_num, temp_dir='/scratch',
                           quiet=True):
    """
    For an active session, append new job based on information stored in job
    object. Also sets job.job_id to the ID of the job on the grid.

    NOTE(review): the original ``def`` line was lost in extraction; this
    signature was reconstructed from the call in _submit_jobs -- confirm the
    defaults against the original source.

    @param session: The current DRMAA session with the grid engine.
    @type session: C{Session}
    @param job: The Job to add to the queue.
    @type job: L{Job}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param job_num: The row in the table to store/retrieve data on. This is only
                    non-zero for jobs created via grid_map.
    @type job_num: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}
    """
    jt = session.createJobTemplate()

    # Build the environment the remote job will see.  The original code merged
    # job.environment into os.environ *in place*, leaking each job's variables
    # into this process (and into every later job); copy it instead.
    if job.environment and job.replace_env:
        # Use only the job's explicit environment.
        jt.jobEnvironment = job.environment
    elif job.environment and not job.replace_env:
        # Layer the job's variables on top of a copy of the shell environment.
        env = dict(os.environ)
        env.update(job.environment)
        jt.jobEnvironment = env
    else:
        # No overrides: inherit the current shell environment.
        jt.jobEnvironment = os.environ

    # The worker runs the gridmap runner module with the same interpreter; it
    # fetches job data from Redis via uniq_id/job_num and reports back to the
    # submitting host.
    jt.remoteCommand = sys.executable
    jt.args = ['-m', 'gridmap.runner', '{0}'.format(uniq_id),
               '{0}'.format(job_num), job.path, temp_dir, gethostname()]
    jt.nativeSpecification = job.native_specification
    # DRMAA path syntax is "[host]:path".
    jt.outputPath = ":" + temp_dir
    jt.errorPath = ":" + temp_dir

    jobid = session.runJob(jt)

    # Remember the grid id so results can be matched up later.
    job.jobid = jobid

    if not quiet:
        print('Your job {0} has been submitted with id {1}'.format(job.name,
                                                                   jobid),
              file=sys.stderr)

    session.deleteJobTemplate(jt)

    return jobid
311
312
def _collect_jobs(sid, jobids, joblist, redis_server, uniq_id,
                  temp_dir='/scratch/', wait=True):
    """
    Collect the results from the jobids, returns a list of Jobs

    @param sid: session identifier
    @type sid: string returned by cluster
    @param jobids: list of job identifiers returned by the cluster
    @type jobids: list of strings
    @param joblist: list of submitted Jobs, in the same order as jobids.
    @type joblist: C{list} of L{Job}
    @param redis_server: Open connection to the database where the results will
                         be stored.
    @type redis_server: L{StrictRedis}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param wait: Wait for jobs to finish?
    @type wait: bool
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}

    @raise ValueError: If jobids and joblist do not line up.
    """
    # Validate with a real exception instead of assert, which is stripped
    # when running under ``python -O``.
    for jobid, job in zip(jobids, joblist):
        if jobid != job.jobid:
            raise ValueError('Job id mismatch: {0} != {1}'.format(jobid,
                                                                  job.jobid))

    with Session(sid) as session:

        if wait:
            drmaaWait = Session.TIMEOUT_WAIT_FOREVER
        else:
            drmaaWait = Session.TIMEOUT_NO_WAIT

        # Block until all submitted jobs have finished (when wait is True).
        session.synchronize(jobids, drmaaWait, False)

        job_output_list = []
        for ix, job in enumerate(joblist):

            # SGE log files are named <job name>.o<job id> / .e<job id>.
            log_stdout_fn = os.path.join(temp_dir, job.name + '.o' + jobids[ix])
            log_stderr_fn = os.path.join(temp_dir, job.name + '.e' + jobids[ix])

            # Reap the job and gather its exit information for diagnostics.
            job_info = session.wait(job.jobid, drmaaWait)

            try:
                job_output = zload_db(redis_server,
                                      'output_{0}'.format(uniq_id),
                                      ix)
            except Exception as detail:
                # Could not unpickle the job's output -- most likely the job
                # crashed before writing it.  Dump everything we know.
                print(("Error while unpickling output for gridmap job {1} " +
                       "stored with key output_{0}_{1}").format(uniq_id, ix),
                      file=sys.stderr)
                print("This usually happens when a job has crashed before " +
                      "writing its output to the database.",
                      file=sys.stderr)
                print("\nHere is some information about the problem job:",
                      file=sys.stderr)
                print("stdout:", log_stdout_fn, file=sys.stderr)
                print("stderr:", log_stderr_fn, file=sys.stderr)
                if job_info.hasExited:
                    print("Exit status: {0}".format(job_info.exitStatus),
                          file=sys.stderr)
                if job_info.hasSignal:
                    print(("Terminating signal: " +
                           "{0}").format(job_info.terminatedSignal),
                          file=sys.stderr)
                    print("Core dumped: {0}".format(job_info.hasCoreDump),
                          file=sys.stderr)
                print(("Job aborted before it ran: " +
                       "{0}").format(job_info.wasAborted),
                      file=sys.stderr)
                print("Job resources: {0}".format(job_info.resourceUsage),
                      file=sys.stderr)
                try:
                    print(("Job SGE status: " +
                           "{0}").format(session.jobStatus(job.jobid)),
                          file=sys.stderr)
                except InvalidJobException:
                    # Job already reaped; no status available.
                    pass
                print("Unpickling exception: {0}".format(detail),
                      file=sys.stderr)
                sys.exit(2)

            # The worker stores raised exceptions as the output; surface them.
            if isinstance(job_output, Exception):
                print("Exception encountered in job with log file:",
                      file=sys.stderr)
                print(log_stdout_fn, file=sys.stderr)
                print(job_output, file=sys.stderr)
                print(file=sys.stderr)

            job_output_list.append(job_output)

    return job_output_list
406
407
def process_jobs(jobs, temp_dir='/scratch/', wait=True, white_list=None,
                 quiet=True):
    """
    Take a list of jobs and process them on the cluster.

    @param jobs: list of jobs to be executed
    @type jobs: C{list} of L{Job}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param wait: Should we wait for jobs to finish? (Should only be false if the
                 function you're running doesn't return anything)
    @type wait: C{bool}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}
    """
    redis_server = StrictRedis(host=gethostname(), db=REDIS_DB, port=REDIS_PORT)

    # Check if Redis is reachable; if not, start a local redis-server daemon.
    try:
        redis_server.set('connection_test', True)
    except RedisConnectionError:
        # Open the null device for *writing*: the original opened it read-only,
        # so any output from redis-server would fail with EBADF.
        with open(os.devnull, 'w') as null_file:
            redis_process = subprocess.Popen(['redis-server', '-'],
                                             stdout=null_file,
                                             stdin=subprocess.PIPE,
                                             stderr=null_file)
            redis_config = '''daemonize yes
pidfile {0}
port {1}
'''.format(os.path.join(temp_dir,
                        'redis{0}.pid'.format(REDIS_PORT)),
           REDIS_PORT)
            # The pipe is binary; encode explicitly so this works on both
            # Python 2 and Python 3.
            redis_process.stdin.write(redis_config.encode('utf-8'))
            redis_process.stdin.close()
        # Give the daemon a moment to start accepting connections.
        sleep(5)

    # Unique run id keeps this run's Redis keys separate from other runs.
    uniq_id = uuid.uuid4()

    # Save the job inputs to the database so the workers can retrieve them.
    for job_id, job in enumerate(jobs):
        zsave_db(job, redis_server, 'job_{0}'.format(uniq_id), job_id)

    # Submit to the grid, then gather the results.
    sids, jobids = _submit_jobs(jobs, uniq_id, white_list=white_list,
                                temp_dir=temp_dir, quiet=quiet)

    job_outputs = _collect_jobs(sids, jobids, jobs, redis_server, uniq_id,
                                temp_dir=temp_dir, wait=wait)

    # Internal invariant: one output per job.
    assert(len(jobs) == len(job_outputs))

    # Remove this run's keys from the database.  Guard against empty key
    # lists: redis-py's delete() raises when called with no arguments.
    job_keys = redis_server.keys('job_{0}_*'.format(uniq_id))
    if job_keys:
        redis_server.delete(*job_keys)
    output_keys = redis_server.keys('output_{0}_*'.format(uniq_id))
    if output_keys:
        redis_server.delete(*output_keys)
    return job_outputs
469
470
471
472
473
def grid_map(f, args_list, cleanup=True, mem_free="1G", name='gridmap_job',
             num_slots=1, temp_dir='/scratch/', white_list=None,
             queue=DEFAULT_QUEUE, quiet=True):
    """
    Maps a function onto the cluster.

    @note: This can only be used with picklable functions (i.e., those that are
           defined at the module or class level).

    @param f: The function to map on args_list
    @type f: C{function}
    @param args_list: List of arguments to pass to f
    @type args_list: C{list}
    @param cleanup: Should we remove the stdout and stderr temporary files for
                    each job when we're done? (They are left in place if there's
                    an error.)
    @type cleanup: C{bool}
    @param mem_free: Estimate of how much memory each job will need (for
                     scheduling). (Not currently used, because our cluster does
                     not have that setting enabled.)
    @type mem_free: C{basestring}
    @param name: Base name to give each job (will have a number add to end)
    @type name: C{basestring}
    @param num_slots: Number of slots each job should use.
    @type num_slots: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param queue: The SGE queue to use for scheduling.
    @type queue: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}
    """
    # Wrap each element of args_list in a numbered Job.  A bare (non-list)
    # argument is treated as a single positional argument.
    jobs = []
    for job_num, arg in enumerate(args_list):
        job_args = arg if isinstance(arg, list) else [arg]
        jobs.append(Job(f, job_args, cleanup=cleanup, mem_free=mem_free,
                        name='{0}{1}'.format(name, job_num),
                        num_slots=num_slots, queue=queue))

    # Run everything on the cluster and hand back the outputs in job order.
    return process_jobs(jobs, temp_dir=temp_dir, white_list=white_list,
                        quiet=quiet)
522
523
def pg_map(f, args_list, cleanup=True, mem_free="1G", name='gridmap_job',
           num_slots=1, temp_dir='/scratch/', white_list=None,
           queue=DEFAULT_QUEUE, quiet=True):
    """
    @deprecated: This function has been renamed grid_map.

    @param f: The function to map on args_list
    @type f: C{function}
    @param args_list: List of arguments to pass to f
    @type args_list: C{list}
    @param cleanup: Should we remove the stdout and stderr temporary files for
                    each job when we're done? (They are left in place if there's
                    an error.)
    @type cleanup: C{bool}
    @param mem_free: Estimate of how much memory each job will need (for
                     scheduling). (Not currently used, because our cluster does
                     not have that setting enabled.)
    @type mem_free: C{basestring}
    @param name: Base name to give each job (will have a number add to end)
    @type name: C{basestring}
    @param num_slots: Number of slots each job should use.
    @type num_slots: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param queue: The SGE queue to use for scheduling.
    @type queue: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}
    """
    # The docstring declares this deprecated, but callers were never warned at
    # runtime; emit the standard DeprecationWarning pointing at the caller.
    warnings.warn('pg_map has been renamed grid_map; please update your code.',
                  DeprecationWarning, stacklevel=2)
    return grid_map(f, args_list, cleanup=cleanup, mem_free=mem_free, name=name,
                    num_slots=num_slots, temp_dir=temp_dir,
                    white_list=white_list, queue=queue, quiet=quiet)
560