# Load the train/test splits from CSV.
# FIX: pandas removed `error_bad_lines` in 2.0; `on_bad_lines="skip"` is the
# replacement (silently drop malformed rows, matching the old behavior).
data_dir = "../../_other/Enhancer/data/"


def _load_csv(filename, cols):
    """Read ``data_dir + filename`` and return the selected columns as ndarray.

    ``cols`` is passed to ``.iloc[:, cols]`` — the first CSV column is skipped
    because it appears to be a written-out pandas index (TODO confirm).
    """
    df = pd.read_csv(data_dir + filename, on_bad_lines="skip")
    return np.asarray(df.iloc[:, cols])


X_train = _load_csv("01_X_train.csv", slice(1, None))
X_test = _load_csv("02_X_test.csv", slice(1, None))
y_train = _load_csv("03_y_train.csv", 1)
y_test = _load_csv("04_y_test.csv", 1)
# Wrap the raw arrays as named pipeline inputs (vflow bookkeeping).
np.random.seed(14)
X_train, X_test, y_train, y_test = init_args(
    (X_train, X_test, y_train, y_test),
    names=['X_train', 'X_test', 'y_train', 'y_test'],
)

# Subsampling stage: three bootstrap resamples of the training data,
# one resampler per random seed.
subsampling_fns = [
    partial(sklearn.utils.resample, n_samples=1000, random_state=seed)
    for seed in range(3)
]
subsampling_set = Vset(modules=subsampling_fns, name='subsampling')
X_trains, y_trains = subsampling_set(X_train, y_train)

# Modeling stage: two candidate classifiers to compare.
modeling_set = Vset(
    modules=[RandomForestClassifier(n_estimators=50, max_depth=5), MLPClassifier()],
    module_keys=["RF", "MLP"],
    name='modeling',
)
# Fit every (subsample, model) combination, then predict on the held-out set.
modeling_set.fit(X_trains, y_trains)
preds = modeling_set.predict(X_test)

# Score each prediction set with two "hard" classification metrics.
hard_metrics_set = Vset(
    modules=[accuracy_score, balanced_accuracy_score],
    module_keys=["Acc", "Bal_Acc"],
    name='hard_metrics',
)
hard_metrics = hard_metrics_set.evaluate(preds, y_test)

# Permutation importance of the features for each fitted model
# (reads the fitted estimators off `modeling_set.out`).
feature_importance_set = Vset(modules=[permutation_importance], name='feature_importance')
importances = feature_importance_set.evaluate(modeling_set.out, X_test, y_test)

# Render the computation graph relating the pipeline stages.
G = build_graph(importances, draw=True)
plt.show()