Module vflow.pipeline

Class that stores the entire pipeline of steps in a data-science workflow

Expand source code
'''Class that stores the entire pipeline of steps in a data-science workflow
'''
import itertools

import joblib
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

from vflow.vset import PREV_KEY


class PCSPipeline:
    '''Ordered list of pipeline steps that can be run with optional joblib caching
    '''

    def __init__(self, steps: list = None, cache_dir=None):
        '''Stores the steps and sets up the joblib cache
        Params
        ------
        steps: list, default=None
            a list of Vset instances. None gives a fresh empty list; a list
            that is passed in is stored as-is (not copied), so run() replaces
            its entries in place.
        cache_dir: str, default=None
            The directory to use as data store by joblib. If None, won't do
            caching.
        '''
        # a literal [] default would be one list object shared by every
        # pipeline created without steps, and run() writes into it
        self.steps = [] if steps is None else steps
        # set up the cache
        self.memory = joblib.Memory(location=cache_dir)

    def run(self, *args, **kwargs):
        '''Runs the pipeline

        Every step is called through _run_step (wrapped by the joblib cache)
        with the same *args / **kwargs, and the step object that comes back
        replaces the entry in self.steps.
        '''
        run_step_cached = self.memory.cache(_run_step)
        for i, step in enumerate(self.steps):
            # steps without a name attribute get a positional label
            step_name = getattr(step, 'name', f'Step {i}')
            print(step_name)
            _, fitted_step = run_step_cached(step, *args, **kwargs)
            self.steps[i] = fitted_step

    def __getitem__(self, i):
        '''Accesses ith step of pipeline
        '''
        return self.steps[i]

    def __len__(self):
        '''Number of steps in the pipeline
        '''
        return len(self.steps)

    def generate_names(self, as_pandas=True):
        '''Enumerates every combination that picks one module from each step
        Params
        ------
        as_pandas: bool, default=True
            If True, return a DataFrame with one column per step (labelled
            with step.name) whose entries look like '{i}_{str(mod)[:8]}'.
            If False, return a list of tuples whose entries also carry the
            step name: '{step.name}_{i}_{str(mod)[:8]}'.

        Returns
        -------
        pd.DataFrame or list of tuples, one row / tuple per combination
        '''
        name_lists = []
        if as_pandas:
            for step in self.steps:
                name_lists.append([f'{i}_{str(mod)[:8]}'
                                   for i, mod in enumerate(step)])
            indexes = list(itertools.product(*name_lists))
            return pd.DataFrame(indexes, columns=[step.name for step in self.steps])
        else:
            for step in self.steps:
                name_lists.append([f'{step.name}_{i}_{str(mod)[:8]}'
                                   for i, mod in enumerate(step)])
            return list(itertools.product(*name_lists))


def build_graph(node, draw=True):
    '''Helper function that just calls build_graph_recur with an empty graph
    Params
    ------
    node: dict or Vset
        node to start from; the graph is built by walking backwards from it
    draw: bool, default=True
        if True, draw the graph with nx.draw and call plt.tight_layout

    Returns
    -------
    G: nx.DiGraph()
    '''

    def build_graph_recur(node, G):
        '''Builds a graph up using __prev__ and PREV_KEY pointers
        Params
        ------
        node: dict or Vset
        G: nx.DiGraph()
            mutated in place

        Returns
        -------
        G: nx.DiGraph()
            the same object that was passed in, for every kind of node
        '''
        # base case: reached starting node
        if isinstance(node, str):
            return G

        # initial case: starting at dict
        if isinstance(node, dict):
            for node_prev in node[PREV_KEY]:
                G.add_edge(node_prev, 'End')
                build_graph_recur(node_prev, G)

        # main case: at a moduleset
        elif 'Vset' in str(type(node)):
            for node_prev in node.__prev__:
                G.add_edge(node_prev, node)
                build_graph_recur(node_prev, G)

        # a node of any other type has no pointers to follow; still hand the
        # graph back so the caller never ends up with None in place of G
        return G

    G = nx.DiGraph()
    G = build_graph_recur(node, G)
    if draw:
        nx.draw(G, with_labels=True, node_color='#CCCCCC')
        plt.tight_layout()
    return G


def _run_step(step, *args, **kwargs):
    '''Runs one step unless it is already fitted
    Params
    ------
    step: callable with _fitted and modules attributes
    *args, **kwargs: forwarded to step when it has to be called

    Returns
    -------
    (outputs, step): step.modules if step._fitted is truthy, otherwise the
    result of step(*args, **kwargs); the step itself is always the second item
    '''
    already_fitted = step._fitted
    result = step.modules if already_fitted else step(*args, **kwargs)
    return result, step

Functions

def build_graph(node, draw=True)

Helper function that just calls build_graph_recur with an empty graph.

Params

node: dict or Vset

Returns

G : nx.DiGraph()
 
Expand source code
def build_graph(node, draw=True):
    '''Helper function that just calls build_graph_recur with an empty graph
    Params
    ------
    node: dict or Vset
        node to start from; the graph is built by walking backwards from it
    draw: bool, default=True
        if True, draw the graph with nx.draw and call plt.tight_layout

    Returns
    -------
    G: nx.DiGraph()
    '''

    def build_graph_recur(node, G):
        '''Builds a graph up using __prev__ and PREV_KEY pointers
        Params
        ------
        node: dict or Vset
        G: nx.DiGraph()
            mutated in place

        Returns
        -------
        G: nx.DiGraph()
            the same object that was passed in, for every kind of node
        '''
        # base case: reached starting node
        if isinstance(node, str):
            return G

        # initial case: starting at dict
        if isinstance(node, dict):
            for node_prev in node[PREV_KEY]:
                G.add_edge(node_prev, 'End')
                build_graph_recur(node_prev, G)

        # main case: at a moduleset
        elif 'Vset' in str(type(node)):
            for node_prev in node.__prev__:
                G.add_edge(node_prev, node)
                build_graph_recur(node_prev, G)

        # a node of any other type has no pointers to follow; still hand the
        # graph back so the caller never ends up with None in place of G
        return G

    G = nx.DiGraph()
    G = build_graph_recur(node, G)
    if draw:
        nx.draw(G, with_labels=True, node_color='#CCCCCC')
        plt.tight_layout()
    return G

Classes

class PCSPipeline (steps: list = [], cache_dir=None)

Stores the steps of a data-science workflow and sets up an optional joblib cache used when running them.

Params

steps: list
    a list of Vset instances
cache_dir: str, default=None
    The directory to use as data store by joblib. If None, won't do caching.

Returns

Nothing; the constructor does not build or return a graph.
 
Expand source code
class PCSPipeline:
    '''Ordered list of pipeline steps that can be run with optional joblib caching
    '''

    def __init__(self, steps: list = None, cache_dir=None):
        '''Stores the steps and sets up the joblib cache
        Params
        ------
        steps: list, default=None
            a list of Vset instances. None gives a fresh empty list; a list
            that is passed in is stored as-is (not copied), so run() replaces
            its entries in place.
        cache_dir: str, default=None
            The directory to use as data store by joblib. If None, won't do
            caching.
        '''
        # a literal [] default would be one list object shared by every
        # pipeline created without steps, and run() writes into it
        self.steps = [] if steps is None else steps
        # set up the cache
        self.memory = joblib.Memory(location=cache_dir)

    def run(self, *args, **kwargs):
        '''Runs the pipeline

        Every step is called through _run_step (wrapped by the joblib cache)
        with the same *args / **kwargs, and the step object that comes back
        replaces the entry in self.steps.
        '''
        run_step_cached = self.memory.cache(_run_step)
        for i, step in enumerate(self.steps):
            # steps without a name attribute get a positional label
            step_name = getattr(step, 'name', f'Step {i}')
            print(step_name)
            _, fitted_step = run_step_cached(step, *args, **kwargs)
            self.steps[i] = fitted_step

    def __getitem__(self, i):
        '''Accesses ith step of pipeline
        '''
        return self.steps[i]

    def __len__(self):
        '''Number of steps in the pipeline
        '''
        return len(self.steps)

    def generate_names(self, as_pandas=True):
        '''Enumerates every combination that picks one module from each step
        Params
        ------
        as_pandas: bool, default=True
            If True, return a DataFrame with one column per step (labelled
            with step.name) whose entries look like '{i}_{str(mod)[:8]}'.
            If False, return a list of tuples whose entries also carry the
            step name: '{step.name}_{i}_{str(mod)[:8]}'.

        Returns
        -------
        pd.DataFrame or list of tuples, one row / tuple per combination
        '''
        name_lists = []
        if as_pandas:
            for step in self.steps:
                name_lists.append([f'{i}_{str(mod)[:8]}'
                                   for i, mod in enumerate(step)])
            indexes = list(itertools.product(*name_lists))
            return pd.DataFrame(indexes, columns=[step.name for step in self.steps])
        else:
            for step in self.steps:
                name_lists.append([f'{step.name}_{i}_{str(mod)[:8]}'
                                   for i, mod in enumerate(step)])
            return list(itertools.product(*name_lists))

Methods

def generate_names(self, as_pandas=True)
Expand source code
def generate_names(self, as_pandas=True):
    '''Enumerates every combination that picks one module from each step
    Params
    ------
    as_pandas: bool, default=True
        If True, return a DataFrame with one column per step (labelled with
        step.name) whose entries look like '{i}_{str(mod)[:8]}'. If False,
        return a list of tuples whose entries also carry the step name:
        '{step.name}_{i}_{str(mod)[:8]}'.

    Returns
    -------
    pd.DataFrame or list of tuples, one row / tuple per combination
    '''
    if not as_pandas:
        labelled = [
            [f'{step.name}_{idx}_{str(module)[:8]}'
             for idx, module in enumerate(step)]
            for step in self.steps
        ]
        return list(itertools.product(*labelled))

    per_step = [
        [f'{idx}_{str(module)[:8]}' for idx, module in enumerate(step)]
        for step in self.steps
    ]
    combos = list(itertools.product(*per_step))
    column_names = [step.name for step in self.steps]
    return pd.DataFrame(combos, columns=column_names)
def run(self, *args, **kwargs)

Runs the pipeline

Expand source code
def run(self, *args, **kwargs):
    '''Runs the pipeline

    Every step is called through _run_step (wrapped by the joblib cache)
    with the same *args / **kwargs, and the step object that comes back
    replaces the entry in self.steps.
    '''
    run_step_cached = self.memory.cache(_run_step)
    for i, step in enumerate(self.steps):
        # steps without a name attribute get a positional label; only a
        # missing attribute triggers the fallback, other errors propagate
        step_name = getattr(step, 'name', f'Step {i}')
        print(step_name)
        _, fitted_step = run_step_cached(step, *args, **kwargs)
        self.steps[i] = fitted_step