Source code for renku.core.models.workflow.run

# -*- coding: utf-8 -*-
#
# Copyright 2018-2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Represents a workflow template."""

import os
import pathlib
import posixpath
import urllib.parse
from bisect import bisect
from copy import copy
from functools import total_ordering
from pathlib import Path

import attr
from marshmallow import EXCLUDE

from renku.core.models.calamus import Nested, fields, prov, renku
from renku.core.models.cwl.types import PATH_OBJECTS
from renku.core.models.entities import Collection, CommitMixin, \
    CommitMixinSchema, Entity
from renku.core.models.workflow.parameters import CommandArgument, \
    CommandArgumentSchema, CommandInput, CommandInputSchema, CommandOutput, \
    CommandOutputSchema, MappedIOStream


def _entity_from_path(client, path, commit):
    """Return the entity (or dataset) found at ``path``.

    The path is first resolved to the commit that last touched it, starting
    from ``commit``, and then to the client/commit/path triple returned by
    ``client.resolve_in_submodules``.

    :param client: client used to look up commits and resolve submodules.
    :param path: path of the file or directory to wrap.
    :param commit: commit whose ``hexsha`` is used as the starting revision.
    :returns: the result of ``client.load_dataset_from_path`` when the path
        starts with the client's datasets directory, otherwise a
        ``Collection`` for a directory or an ``Entity`` for anything else.
    """
    previous_commit = client.find_previous_commit(
        path, revision=commit.hexsha
    )
    # from here on client, commit and path are the resolved ones
    client, commit, path = client.resolve_in_submodules(
        previous_commit, path
    )

    is_directory = (client.path / path).is_dir()

    datasets_root = os.path.join(client.renku_home, client.DATASETS)
    if str(path).startswith(datasets_root):
        return client.load_dataset_from_path(path, commit=commit)

    entity_cls = Collection if is_directory else Entity
    return entity_cls(commit=commit, client=client, path=str(path))


def _convert_cmd_binding(binding, client, commit):
    """Turn a cwl command line binding into a ``CommandArgument``.

    :param binding: cwl binding; its ``position`` and ``valueFrom`` are read.
    :param client: client passed on to ``Run.generate_id``.
    :param commit: commit passed on to ``Run.generate_id``.
    :returns: a ``CommandArgument`` whose id is derived from the run id and
        the binding position.
    """
    run_id = Run.generate_id(client, commit)
    argument_id = CommandArgument.generate_id(run_id, binding.position)

    return CommandArgument(
        id=argument_id,
        position=binding.position,
        value=binding.valueFrom,
    )


def _convert_cmd_input(input, client, commit):
    """Turn a cwl input into a ``CommandInput`` or a ``CommandArgument``.

    Inputs whose type is one of ``PATH_OBJECTS`` and that carry a truthy
    default become a ``CommandInput`` consuming the entity at
    ``input.default.path``; every other input becomes a plain
    ``CommandArgument`` holding the default value.

    :param input: cwl input parameter.
    :param client: client used for id generation and entity lookup.
    :param commit: commit used for id generation and entity lookup.
    :returns: ``CommandInput`` for path inputs, ``CommandArgument`` otherwise.
    """
    value = input.default
    if isinstance(value, list):
        # list defaults are flattened with the binding's item separator
        value = input.inputBinding.itemSeparator.join(value)

    run_id = Run.generate_id(client, commit)

    is_path_input = input.type in PATH_OBJECTS and input.default
    if not is_path_input:
        binding = input.inputBinding
        prefix = binding.prefix
        if prefix and binding.separate:
            prefix += ' '
        argument_id = CommandArgument.generate_id(run_id, binding.position)
        return CommandArgument(
            id=argument_id,
            position=binding.position,
            value=value,
            prefix=prefix,
        )

    if input.inputBinding:
        # path given on the command line
        binding = input.inputBinding
        prefix = binding.prefix
        if prefix and binding.separate:
            prefix += ' '
        input_id = CommandInput.generate_id(run_id, binding.position)
        consumed = _entity_from_path(client, input.default.path, commit)
        return CommandInput(
            id=input_id,
            position=binding.position,
            prefix=prefix,
            consumes=consumed,
        )

    # path input without a binding: only ``input_stdin`` is mapped to a stream
    stream_id = MappedIOStream.generate_id(run_id, 'stdin')
    is_stdin = input.id == 'input_stdin'

    input_id = CommandInput.generate_id(
        run_id, 'stdin' if is_stdin else None
    )
    consumed = _entity_from_path(client, input.default.path, commit)
    stream = None
    if is_stdin:
        stream = MappedIOStream(id=stream_id, stream_type='stdin')

    return CommandInput(id=input_id, consumes=consumed, mapped_to=stream)


def _convert_cmd_output(output, factory, client, commit):
    """Turn a cwl output into a ``CommandOutput``.

    :param output: cwl output parameter.
    :param factory: tool factory; its ``inputs``, ``existing_directories``
        and stream attributes (looked up by ``output.type``) are read.
    :param client: client used for id generation and entity lookup.
    :param commit: commit used for id generation and entity lookup.
    :returns: a ``(CommandOutput, input)`` pair. ``input`` is the factory
        input the output's glob refers to, or ``None`` when the glob is a
        literal path or there is no output binding.
    :raises StopIteration: when the glob names an input id that is not in
        ``factory.inputs``.
    """
    input_marker = '$(inputs.'
    path = None
    stream = None
    position = None
    prefix = None
    referenced_input = None

    run_id = Run.generate_id(client, commit)

    binding = output.outputBinding
    if binding:
        glob = binding.glob
        if not glob.startswith(input_marker):
            path = glob
        else:
            # glob looks like ``$(inputs.<id>)``: strip marker and last char
            input_id = glob[len(input_marker):-1]
            referenced_input = next(
                filter(lambda candidate: candidate.id == input_id,
                       factory.inputs)
            )
            path = referenced_input.default
            input_binding = referenced_input.inputBinding
            position = input_binding.position
            prefix = input_binding.prefix
            if prefix and input_binding.separate:
                prefix += ' '

    stream_type = output.type
    if stream_type in MappedIOStream.STREAMS:
        # stream outputs take their path from the factory attribute
        path = getattr(factory, stream_type)
        stream = MappedIOStream(
            id=MappedIOStream.generate_id(run_id, stream_type),
            stream_type=stream_type,
        )

    # directories are checked themselves, anything else via its parent
    if (client.path / path).is_dir():
        folder = path
    else:
        folder = str(Path(path).parent)
    create_folder = folder in factory.existing_directories

    output_id = CommandOutput.generate_id(run_id, position)
    produced = _entity_from_path(client, path, commit)

    command_output = CommandOutput(
        id=output_id,
        produces=produced,
        mapped_to=stream,
        position=position,
        prefix=prefix,
        create_folder=create_folder,
    )
    return command_output, referenced_input


@total_ordering
@attr.s(
    cmp=False,
)
class Run(CommitMixin):
    """Represents a `renku run` execution template."""

    # command line of the run as a single space-separated string
    command = attr.ib(
        default=None,
        type=str,
        kw_only=True,
    )

    # position of this run among the subprocesses of a parent run
    process_order = attr.ib(
        default=None,
        type=int,
        kw_only=True,
    )

    successcodes = attr.ib(kw_only=True, type=list, factory=list)

    # child runs, kept sorted by ``process_order`` (see ``add_subprocess``)
    subprocesses = attr.ib(kw_only=True, factory=list)

    arguments = attr.ib(kw_only=True, factory=list)

    inputs = attr.ib(kw_only=True, factory=list)

    outputs = attr.ib(kw_only=True, factory=list)

    @staticmethod
    def generate_id(client, commit):
        """Generate the id of the run that belongs to ``commit``.

        The host defaults to ``localhost``, is replaced by the ``host`` entry
        of ``client.remote`` when a client is given and that entry is truthy,
        and is finally overridden by the ``RENKU_DOMAIN`` environment
        variable when that is set to a non-empty value.

        :param client: client providing ``remote``; may be falsy.
        :param commit: commit whose ``hexsha`` becomes the last path segment.
        :returns: ``https://<host>/runs/commit/<quoted hexsha>``.
        """
        host = 'localhost'
        if client:
            host = client.remote.get('host') or host
        host = os.environ.get('RENKU_DOMAIN') or host

        # ``posixpath`` is used directly: ``pathlib.posixpath`` is only an
        # implementation detail of ``pathlib`` and not part of its API.
        return urllib.parse.urljoin(
            'https://{host}'.format(host=host),
            posixpath.join(
                '/runs/commit', urllib.parse.quote(commit.hexsha, safe='')
            )
        )

    @classmethod
    def from_factory(cls, factory, client, commit, path):
        """Creates a ``Run`` from a ``CommandLineToolFactory``.

        Note that ``factory.inputs`` is modified in place: inputs that turn
        out to describe outputs are removed from it.

        :param factory: factory describing the executed command.
        :param client: client passed on to the conversion helpers.
        :param commit: commit passed on to the conversion helpers.
        :param path: path stored on the created run.
        :returns: a new ``Run``.
        """
        inputs = []
        arguments = []
        outputs = [
            _convert_cmd_output(o, factory, client, commit)
            for o in factory.outputs
        ]  # TODO: handle stream!

        if outputs:
            # split the (output, input to remove) pairs
            outputs, inputs_to_remove = zip(*outputs)
            outputs = list(outputs)

            for i in inputs_to_remove:
                # remove inputs that are actually outputs
                # note: a single input can represent multiple outputs
                # in case of repetition in the cli
                if not i:
                    continue
                if i in factory.inputs:
                    factory.inputs.remove(i)

        for i in factory.inputs:
            res = _convert_cmd_input(i, client, commit)

            if isinstance(res, CommandInput):
                inputs.append(res)
            else:
                arguments.append(res)

        return cls(
            client=client,
            commit=commit,
            path=path,
            command=' '.join(factory.baseCommand),
            successcodes=factory.successCodes,
            arguments=[
                _convert_cmd_binding(a, client, commit)
                for a in factory.arguments
            ] + arguments,
            inputs=inputs,
            outputs=outputs
        )

    @property
    def activity(self):
        """Return the activity object.

        The value is obtained by calling ``self._activity``, which is not
        defined in this class.
        """
        return self._activity()

    def to_argv(self):
        """Convert run into argv list.

        The command is split on single spaces; it is followed by the argv
        fragments of all inputs, outputs and arguments that have a truthy
        ``position``, ordered by that position.
        """
        argv = []

        if self.command:
            argv.extend(self.command.split(' '))

        parameters = self.inputs + self.outputs + self.arguments
        positioned = sorted(
            (p for p in parameters if p.position),
            key=lambda p: p.position,
        )
        argv.extend(e for p in positioned for e in p.to_argv())

        return argv

    def to_stream_repr(self):
        """Input/output stream representation.

        :returns: list with the stream representation of every input and
            then every output that has a truthy ``mapped_to``.
        """
        stream_repr = [
            input_.to_stream_repr()
            for input_ in self.inputs if input_.mapped_to
        ]
        stream_repr.extend(
            output.to_stream_repr()
            for output in self.outputs if output.mapped_to
        )
        return stream_repr

    def update_id_and_label_from_commit_path(self, client, commit, path):
        """Updates the _id and _label using supplied commit and path.

        The client is always replaced. Commit, path, id and label are only
        updated when the run has no commit yet. Subprocesses are updated
        recursively.
        """
        self.client = client

        # NOTE(review): nesting below was reconstructed from a flattened
        # listing of this file - confirm against the upstream source.
        if not self.commit:
            self.commit = commit

            path = Path(os.path.abspath(path)).relative_to(self.client.path)
            self.path = path
            self._id = self.generate_id(client, commit)
            self._label = self.default_label()

        if len(self.subprocesses) > 0:
            for s in self.subprocesses:
                s.update_id_and_label_from_commit_path(client, commit, path)

    @staticmethod
    def _step_parameter_id(parameter_id, process_order):
        """Return ``parameter_id`` rewritten to belong to a step.

        An id that already contains ``/step/`` gets its third segment from
        the end replaced by the new order; otherwise ``step/<order>`` is
        inserted in front of the last two segments.
        """
        parts = parameter_id.split('/')
        if '/step/' in parameter_id:
            parts[-3] = str(process_order)
        else:
            parts.insert(-2, 'step')
            parts.insert(-2, str(process_order))
        return '/'.join(parts)

    def set_process_order(self, process_order):
        """Updates the process_order on a subprocess and its parameters.

        The run id ends in ``/step/<process_order>`` afterwards and the ids
        of all inputs, outputs and arguments are adjusted to match.
        """
        self.process_order = process_order

        parts = self._id.split('/')
        if '/step/' in self._id:
            parts[-1] = str(process_order)
        else:
            parts.extend(['step', str(process_order)])
        self._id = '/'.join(parts)

        # adjust parameter ids to be subprocess ids
        for parameters in (self.inputs, self.outputs, self.arguments):
            for parameter in parameters:
                parameter._id = self._step_parameter_id(
                    parameter._id, process_order
                )

    def add_subprocess(self, subprocess, process_order=None):
        """Adds a subprocess to this run.

        Without a (truthy) ``process_order`` the position is found by
        bisecting the existing subprocesses using the dependency ordering of
        ``__lt__``; subprocesses at or after that position are shifted by
        one. Inputs and outputs of the subprocess that this run does not
        know yet are copied onto this run, without stream mapping.

        :param subprocess: the ``Run`` to add.
        :param process_order: explicit order for the subprocess.
        :raises ValueError: if the order is already used by a subprocess.
        """
        # NOTE(review): nesting below was reconstructed from a flattened
        # listing of this file - confirm against the upstream source.
        if not process_order:
            process_order = 0
            if self.subprocesses:
                # sort subprocesses by dependencies
                process_order = bisect(self.subprocesses, subprocess)
                if process_order < len(self.subprocesses):
                    # inserted before end, recalculate orders or rest
                    for s in self.subprocesses:
                        if s.process_order >= process_order:
                            s.set_process_order(s.process_order + 1)

        if any(s.process_order == process_order for s in self.subprocesses):
            raise ValueError(
                'process_order {} already exists'.format(process_order)
            )

        input_paths = [i.consumes.path for i in self.inputs]
        output_paths = [o.produces.path for o in self.outputs]

        for input_ in subprocess.inputs:
            if (
                input_.consumes.path not in input_paths and
                input_.consumes.path not in output_paths
            ):
                new_input = copy(input_)

                new_input.mapped_to = None

                matching_output = next((
                    o for o in self.outputs
                    if o.produces.path == new_input.consumes.path
                ), None)

                if not matching_output:
                    self.inputs.append(new_input)
                    input_paths.append(new_input.consumes.path)

        for output in subprocess.outputs:
            if output.produces.path not in output_paths:
                new_output = copy(output)

                new_output.mapped_to = None
                self.outputs.append(new_output)
                output_paths.append(new_output.produces.path)

                # a path produced inside this run is no longer an input
                matching_input = next((
                    i for i in self.inputs
                    if i.consumes.path == new_output.produces.path
                ), None)
                if matching_input:
                    self.inputs.remove(matching_input)
                    input_paths.remove(matching_input.consumes.path)

        subprocess.set_process_order(process_order)
        self.subprocesses.append(subprocess)

        self.subprocesses = sorted(
            self.subprocesses, key=lambda s: s.process_order
        )

    def __lt__(self, other):
        """Compares two subprocesses order based on their dependencies.

        :returns: ``True`` when at least one path produced by this run is
            consumed by ``other``, ``False`` otherwise.
        """
        consumed_by_other = {
            subentity.path
            for i in other.inputs
            for subentity in i.consumes.entities
        }
        produced_by_self = {
            subentity.path
            for o in self.outputs
            for subentity in o.produces.entities
        }

        # comparison operators are expected to return a bool, not the set
        return bool(consumed_by_other & produced_by_self)

    def __attrs_post_init__(self):
        """Calculate properties.

        Looks up the previous commit of ``self.path`` when there is no commit
        or the commit's hexsha is part of the id, provided a client is set
        and the path exists.
        """
        super().__attrs_post_init__()

        commit_not_set = not self.commit or self.commit.hexsha in self._id
        if commit_not_set and self.client and Path(self.path).exists():
            self.commit = self.client.find_previous_commit(self.path)

    @classmethod
    def from_jsonld(cls, data):
        """Create an instance from JSON-LD data.

        :param data: a ``dict`` of JSON-LD, or an existing instance which is
            returned unchanged.
        :raises ValueError: if ``data`` is neither an instance nor a dict.
        """
        if isinstance(data, cls):
            return data
        if not isinstance(data, dict):
            raise ValueError(data)

        return RunSchema().load(data)

    def as_jsonld(self):
        """Create JSON-LD."""
        return RunSchema().dump(self)
class RunSchema(CommitMixinSchema):
    """Run schema.

    Used by ``Run.from_jsonld`` and ``Run.as_jsonld`` to load and dump
    ``Run`` instances.
    """

    class Meta:
        """Meta class."""

        rdf_type = [renku.Run, prov.Plan, prov.Entity]
        model = Run
        # marshmallow's EXCLUDE option for fields the schema does not declare
        unknown = EXCLUDE

    command = fields.String(renku.command, missing=None)
    process_order = fields.Integer(renku.processOrder, missing=None)
    successcodes = fields.List(
        renku.successCodes, fields.Integer(), missing=[0]
    )
    # the schema is referenced by name because it nests itself
    subprocesses = Nested(
        renku.hasSubprocess, 'RunSchema', many=True, missing=None
    )
    arguments = Nested(
        renku.hasArguments, CommandArgumentSchema, many=True, missing=None
    )
    inputs = Nested(
        renku.hasInputs, CommandInputSchema, many=True, missing=None
    )
    outputs = Nested(
        renku.hasOutputs, CommandOutputSchema, many=True, missing=None
    )