Module vflow.vset
Set of modules to be parallelized over in a pipeline. Function arguments are each a list
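For orientation, here is a minimal usage sketch (not part of the module source below). It assumes the top-level vflow package exposes Vset and the init_args helper referenced by the error message in _apply_func_cached, and uses scikit-learn estimators as modules; treat it as a hedged sketch rather than the package's official example.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from vflow import Vset, init_args  # assumed top-level exports

X, y = np.random.randn(100, 5), np.random.randint(0, 2, 100)
# wrap raw data in the dicts that Vset methods expect
X_train, y_train = init_args((X, y), names=['X_train', 'y_train'])

modeling = Vset(name='modeling',
                modules=[LogisticRegression(), DecisionTreeClassifier()],
                module_keys=['LR', 'DT'])
modeling.fit(X_train, y_train)     # fits every module on the data dict
preds = modeling.predict(X_train)  # dict keyed by (data, module) subkey tuples

The fit and predict calls each go through _apply_func, so the returned dict is keyed by tuples of Subkeys that record where each output came from.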
'''Set of modules to be parallelized over in a pipeline.
Function arguments are each a list
'''
import numpy as np
import joblib
import ray
from mlflow.tracking import MlflowClient

from vflow.convert import *
from vflow.vfunc import Vfunc, AsyncModule
from vflow.subkey import Subkey

PREV_KEY = '__prev__'
class Vset:
def __init__(self, name: str, modules, module_keys: list = None,
is_async: bool = False, output_matching: bool = False,
cache_dir: str = None, tracking_dir: str = None):
'''
todo: include prev and next and change functions to include that.
Params
-------
name: str
name of this moduleset
        modules: list or dict
            list or dict of functions to parallelize over in this Vset
module_keys: list (optional)
list of names corresponding to each module
is_async: bool (optional)
if True, modules are computed asynchronously
output_matching: bool (optional)
if True, then output keys from this Vset will be matched when used
in other Vsets
cache_dir: str (optional)
if provided, do caching and use cache_dir as the data store for
joblib.Memory
tracking_dir: str (optional)
if provided, use the mlflow.tracking api to log outputs as metrics
with params determined by input keys
'''
self.name = name
self._fitted = False
self.out = None # outputs
self._async = is_async
self._output_matching = output_matching
self._memory = joblib.Memory(cache_dir)
if tracking_dir is not None:
self._mlflow = MlflowClient(tracking_uri=tracking_dir)
experiment = self._mlflow.get_experiment_by_name(name=self.name)
if experiment is None:
self._exp_id = self._mlflow.create_experiment(name=self.name)
else:
self._exp_id = experiment.experiment_id
else:
self._mlflow = None
# check if any of the modules are AsyncModules
        # if so, we'll make them all AsyncModules later on
if not self._async and np.any([isinstance(mod, AsyncModule) for mod in modules]):
self._async = True
if type(modules) is dict:
self.modules = modules
elif type(modules) is list:
            if module_keys is not None:
                assert type(module_keys) is list, 'modules passed as a list but module_keys is not a list'
                assert len(modules) == len(
                    module_keys), 'modules list and module_keys list do not have the same length'
                # TODO: add more checking of module_keys
                # convert module keys to tuples of Subkeys
                module_keys = [self.__create_subkey(k) if isinstance(k, tuple) else
                               (self.__create_subkey(k), ) for k in module_keys]
            else:
                module_keys = [(self.__create_subkey(f'{name}_{i}'), ) for i in range(len(modules))]
            # pair each module with its key tuple
            self.modules = dict(zip(module_keys, modules))
# if needed, wrap the modules in the Vfunc or AsyncModule class
for k, v in self.modules.items():
if self._async:
if not isinstance(v, AsyncModule):
self.modules[k] = AsyncModule(k[0], v)
elif not isinstance(v, Vfunc):
self.modules[k] = Vfunc(k[0], v)
def _apply_func(self, out_dict: dict=None, *args):
if out_dict is None:
out_dict = deepcopy(self.modules)
apply_func_cached = self._memory.cache(_apply_func_cached)
data_dict, out_dict = apply_func_cached(
out_dict, self._async, *args
)
self.__prev__ = data_dict[PREV_KEY]
if self._mlflow is not None:
run_dict = {}
# log subkeys as params and value as metric
for k, v in out_dict.items():
origins = np.array([subk.origin for subk in k])
# ignore init origins and the last origin (this Vset)
param_idx = [
i for i in range(len(k[:-1])) if origins[i] != 'init'
]
# get or create mlflow run
run_dict_key = tuple([subk.value for subk in k[:-1]])
if run_dict_key in run_dict:
run_id = run_dict[run_dict_key]
else:
run = self._mlflow.create_run(self._exp_id)
run_id = run.info.run_id
run_dict[run_dict_key] = run_id
# log params
for idx in param_idx:
subkey = k[idx]
param_name = subkey.origin
# check if the origin occurs multiple times
if np.sum(origins == param_name) > 1:
                        occurrence = np.sum(origins[:idx] == param_name)
                        param_name = param_name + str(occurrence)
self._mlflow.log_param(
run_id, param_name, subkey.value
)
self._mlflow.log_metric(run_id, k[-1].value, v)
out_dict[PREV_KEY] = (self,)
return out_dict
    def fit(self, *args, **kwargs):
        '''Fits each module in this Vset to the data dicts in *args via _apply_func.
        '''
if self._fitted:
return self
out_dict = {}
for k, v in self.modules.items():
out_dict[k] = v.fit
self.out = self._apply_func(out_dict, *args)
self._fitted = True
return self
    def transform(self, *args, **kwargs):
        '''todo: fix this method
        '''
        results = []
        # iterate over the fitted outputs, skipping entries without a transform method
        for out in self.out.values():
            if hasattr(out, 'transform'):
                results.append(out.transform(*args, **kwargs))
        return results
def predict(self, *args, **kwargs):
if not self._fitted:
raise AttributeError('Please fit the Vset object before calling the predict method.')
pred_dict = {}
for k, v in self.out.items():
if hasattr(v, 'predict'):
pred_dict[k] = v.predict
return self._apply_func(pred_dict, *args)
def predict_proba(self, *args, **kwargs):
if not self._fitted:
raise AttributeError('Please fit the Vset object before calling the predict_proba method.')
pred_dict = {}
for k, v in self.out.items():
if hasattr(v, 'predict_proba'):
pred_dict[k] = v.predict_proba
return self._apply_func(pred_dict, *args)
def evaluate(self, *args, **kwargs):
'''Combines dicts before calling _apply_func
'''
return self._apply_func(None, *args)
    def __call__(self, *args, n_out: int = None, **kwargs):
        '''Applies the modules to *args and splits the combined output into n_out dicts.
        '''
if n_out is None:
n_out = len(args)
out = sep_dicts(self._apply_func(None, *args), n_out=n_out)
return out
def __getitem__(self, i):
'''Accesses ith item in the module set
'''
return self.modules[i]
def __contains__(self, key):
'''Returns true if modules is a dict and key is one of its keys
'''
if isinstance(self.modules, dict):
return key in self.modules.keys()
return False
def keys(self):
if isinstance(self.modules, dict):
return self.modules.keys()
return {}.keys()
def __len__(self):
return len(self.modules)
def __str__(self):
return 'Vset(' + self.name + ')'
def __create_subkey(self, value):
return Subkey(value, self.name, self._output_matching)
def _apply_func_cached(out_dict: dict, is_async: bool, *args):
    '''
    Params
    ------
    out_dict: dict
        dictionary of functions to apply (e.g. a Vset's modules or their fit/predict methods).
    is_async: bool
        if True, the functions are computed asynchronously via ray.
    *args: List[Dict]
        multiple dicts that are combined into one; the functions in out_dict
        are then run on each item of the combined dict.
    Returns
    -------
    data_dict: dict
        the combined input dict, including its PREV_KEY entry.
    out_dict: dict
        with items determined by the functions in the module set.
        Functions and input dictionaries are currently matched using the
        matching = 'cartesian' format.
        e.g. inputs: module = {LR : logistic}, data = {train_1 : [X1, y1], train_2 : [X2, y2]}
        out: out_dict = {(train_1, LR) : fitted logistic, (train_2, LR) : fitted logistic}.
        Currently matching = 'subset' is not used.
    '''
# deepcopy args to avoid mutating them
args = deepcopy(args)
for ele in args:
if not isinstance(ele, dict):
raise Exception('Need to run init_args before calling module_set!')
if is_async:
# send data to the remote object store
for k, v in ele.items():
if k != PREV_KEY:
ele[k] = ray.put(v)
data_dict = combine_dicts(*args)
out_dict = apply_modules(out_dict, data_dict)
if is_async:
out_keys = list(out_dict.keys())
out_vals = ray.get(list(out_dict.values()))
out_dict = dict(zip(out_keys, out_vals))
return data_dict, out_dict
Classes
class Vset (name: str, modules, module_keys: list = None, is_async: bool = False, output_matching: bool = False, cache_dir: str = None, tracking_dir: str = None)
todo: include prev and next and change functions to include that.

Params

name: str
    name of this moduleset
modules: list or dict
    list or dict of functions to parallelize over in this Vset
module_keys: list (optional)
    list of names corresponding to each module
is_async: bool (optional)
    if True, modules are computed asynchronously
output_matching: bool (optional)
    if True, then output keys from this Vset will be matched when used in other Vsets
cache_dir: str (optional)
    if provided, do caching and use cache_dir as the data store for joblib.Memory
tracking_dir: str (optional)
    if provided, use the mlflow.tracking api to log outputs as metrics with params determined by input keys
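An illustrative constructor call follows; this is a hedged sketch, and the cache and tracking directories are hypothetical paths.

from sklearn.linear_model import LogisticRegression, Ridge
from vflow import Vset

models = Vset(name='models',
              modules=[LogisticRegression(), Ridge()],  # wrapped as Vfuncs internally
              module_keys=['LR', 'Ridge'],              # names used in the output subkeys
              is_async=False,                           # True would dispatch the modules via ray
              cache_dir='./joblib_cache',               # hypothetical joblib.Memory store
              tracking_dir='./mlruns')                  # hypothetical mlflow tracking URI

Passing cache_dir enables joblib.Memory caching of _apply_func_cached, and tracking_dir creates or reuses an mlflow experiment named after the Vset.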
Methods
def evaluate(self, *args, **kwargs)
Combines dicts before calling _apply_func
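A hedged sketch of a typical evaluate call: a metrics Vset combines a prediction dict with a label dict and applies each metric over the matched keys. It assumes the same top-level Vset and init_args exports as the sketch at the top of this page, and that evaluate is called as evaluate(predictions, labels).

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from vflow import Vset, init_args  # assumed top-level exports

X, y = np.random.randn(100, 5), np.random.randint(0, 2, 100)
X_tr, y_tr = init_args((X, y), names=['X_tr', 'y_tr'])

modeling = Vset(name='modeling', modules=[LogisticRegression()], module_keys=['LR'])
preds = modeling.fit(X_tr, y_tr).predict(X_tr)  # fit returns self, so the calls chain

metrics = Vset(name='metrics',
               modules=[accuracy_score, balanced_accuracy_score],
               module_keys=['Acc', 'Bal_Acc'])
scores = metrics.evaluate(preds, y_tr)  # one entry per (data, model, metric) key combination

If tracking_dir is set on the metrics Vset, each resulting value is also logged as an mlflow metric, with params taken from the non-init subkeys.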
def fit(self, *args, **kwargs)
Fits each module in this Vset to the data dicts in *args via _apply_func.
def keys(self)
Returns the keys of the modules dict (an empty key view if modules is not a dict).
def predict(self, *args, **kwargs)
Applies predict from each fitted module that has one; requires fit to have been called first.
def predict_proba(self, *args, **kwargs)
Applies predict_proba from each fitted module that has one; requires fit to have been called first.
def transform(self, *args, **kwargs)
todo: fix this method