Source code for ParamsEstimation

import numpy as np
import pandas as pd
import statsmodels.api as sm
import sklearn.covariance as skcov


[docs]def mean_vector(X, method="hist", d=0.94): r""" Calculate the expected returns vector using the selected method. Parameters ---------- X : DataFrame of shape (n_samples, n_features) Features matrix, where n_samples is the number of samples and n_features is the number of features. method : str, can be {'hist', 'ewma1' or 'ewma2'} The method used to estimate the expected returns. The default value is 'hist'. - 'hist': use historical estimates. - 'ewma1'': use ewma with adjust=True, see `EWM <https://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows>`_ for more details. - 'ewma2': use ewma with adjust=False, see `EWM <https://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows>`_ for more details. d : scalar The smoothing factor of ewma methods. The default is 0.94. Returns ------- mu : 1d-array The estimation of expected returns. Raises ------ ValueError When the value cannot be calculated. """ if not isinstance(X, pd.DataFrame): raise ValueError("X must be a DataFrame") assets = X.columns.tolist() if method == "hist": mu = np.matrix(X.mean()) elif method == "ewma1": mu = X.ewm(alpha=1 - d).mean().iloc[-1, :] elif method == "ewma2": mu = X.ewm(alpha=1 - d, adjust=False).mean().iloc[-1, :] mu = pd.DataFrame(np.matrix(mu), columns=assets) return mu
[docs]def covar_matrix(X, method="hist", d=0.94, **kwargs): r""" Calculate the covariance matrix using the selected method. Parameters ---------- X : DataFrame of shape (n_samples, n_features) Features matrix, where n_samples is the number of samples and n_features is the number of features. method : str, can be {'hist', 'ewma1', 'ewma2', 'ledoit', 'oas' or 'shrunk'} The default is 'hist'. The method used to estimate the covariance matrix: - 'hist': use historical estimates. - 'ewma1'': use ewma with adjust=True, see `EWM <https://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows>`_ for more details. - 'ewma2': use ewma with adjust=False, see `EWM <https://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows>`_ for more details. - 'ledoit': use the Ledoit and Wolf Shrinkage method. - 'oas': use the Oracle Approximation Shrinkage method. - 'shrunk': use the basic Shrunk Covariance method. d : scalar The smoothing factor of ewma methods. The default is 0.94. **kwargs: Other variables related to covariance estimation. See `Scikit Learn <https://scikit-learn.org/stable/modules/covariance.html>`_ for more details. Returns ------- cov : nd-array The estimation of covariance matrix. Raises ------ ValueError When the value cannot be calculated. """ if not isinstance(X, pd.DataFrame): raise ValueError("X must be a DataFrame") assets = X.columns.tolist() if method == "hist": cov = np.cov(X.T) elif method == "ewma1": cov = X.ewm(alpha=1 - d).cov() item = cov.iloc[-1, :].name[0] cov = cov.loc[(item, slice(None)), :] elif method == "ewma2": cov = X.ewm(alpha=1 - d, adjust=False).cov() item = cov.iloc[-1, :].name[0] cov = cov.loc[(item, slice(None)), :] elif method == "ledoit": lw = skcov.LedoitWolf(**kwargs) lw.fit(X) cov = lw.covariance_ elif method == "oas": oas = skcov.OAS(**kwargs) oas.fit(X) cov = oas.covariance_ elif method == "shrunk": sc = skcov.ShrunkCovariance(**kwargs) sc.fit(X) cov = sc.covariance_ cov = pd.DataFrame(np.matrix(cov), columns=assets, index=assets) return cov
[docs]def forward_regression(X, y, threshold=0.05, verbose=False): r""" Select the variables that estimate the best model using stepwise forward regression. Parameters ---------- X : DataFrame of shape (n_samples, n_features) Features matrix, where n_samples is the number of samples and n_features is the number of features. y : Series of shape (n_samples, 1) Target vector, where n_samples in the number of samples. thresholdt : scalar, optional Is the maximum p-value for each variable that will be accepted in the model. The default is 0.05. verbose : bool, optional Enable verbose output. The default is False. Returns ------- value : list A list of the variables that produce the best model. Raises ------ ValueError When the value cannot be calculated. """ if not isinstance(X, pd.DataFrame): raise ValueError("X must be a DataFrame") if not isinstance(y, pd.DataFrame) and not isinstance(y, pd.Series): raise ValueError("y must be a column DataFrame") if isinstance(y, pd.DataFrame): if y.shape[0] > 1 and y.shape[1] > 1: raise ValueError("y must be a column DataFrame") included = [] value = 0 while value <= threshold: excluded = list(set(X.columns) - set(included)) best_pvalue = 999999 new_feature = None for i in excluded: factors = included + [i] X1 = X[factors] X1 = sm.add_constant(X1) results = sm.OLS(y, X1).fit() new_pvalues = results.pvalues cond_1 = new_pvalues[new_pvalues.index != "const"].max() if best_pvalue > new_pvalues[i] and cond_1 <= threshold: best_pvalue = results.pvalues[i] new_feature = i pvalues = new_pvalues.copy() value = pvalues[pvalues.index != "const"].max() if new_feature is None: break included.append(new_feature) if verbose: print("Add {} with p-value {:.6}".format(new_feature, best_pvalue)) return included
[docs]def backward_regression(X, y, threshold=0.05, verbose=False): r""" Select the variables that estimate the best model using stepwise backward regression. Parameters ---------- X : DataFrame of shape (n_samples, n_features) Features matrix, where n_samples is the number of samples and n_features is the number of features. y : Series of shape (n_samples, 1) Target vector, where n_samples in the number of samples. threshold : scalar, optional Is the maximum p-value for each variable that will be accepted in the model. The default is 0.05. verbose : bool, optional Enable verbose output. The default is False. Returns ------- value : list A list of the variables that produce the best model. Raises ------ ValueError When the value cannot be calculated. """ if not isinstance(X, pd.DataFrame): raise ValueError("X must be a DataFrame") if not isinstance(y, pd.DataFrame) and not isinstance(y, pd.Series): raise ValueError("y must be a column DataFrame") if isinstance(y, pd.DataFrame): if y.shape[0] > 1 and y.shape[1] > 1: raise ValueError("y must be a column DataFrame") X1 = sm.add_constant(X) results = sm.OLS(y, X1).fit() pvalues = results.pvalues excluded = ["const"] while pvalues[pvalues.index != "const"].max() > threshold: # factors = pvalues[pvalues.index != 'const' ][pvalues != pvalues.max()].index.tolist() factors = pvalues[~pvalues.index.isin(excluded)].index.tolist() X1 = X[factors] X1 = sm.add_constant(X1) results = sm.OLS(y, X1).fit() pvalues = results.pvalues excluded = ["const", pvalues.idxmax()] if verbose and pvalues.max() > threshold: print("Drop {} with p-value {:.6}".format(pvalues.idxmax(), pvalues.max())) included = pvalues.index.tolist() included = included[1:] return included
[docs]def loadings_matrix(X, Y, threshold=0.05, verbose=False, stepwise="Forward"): r""" Estimate the loadings matrix using stepwise regression. Parameters ---------- X : DataFrame of shape (n_samples, n_features) Features matrix, where n_samples is the number of samples and n_features is the number of features. Y : DataFrame of shape (n_samples, n_assets) Target matrix, where n_samples in the number of samples and n_assets is the number of assets. thresholdt : scalar, optional Is the maximum p-value for each variable that will be accepted in the model. The default is 0.05. verbose : bool, optional Enable verbose output. The default is False. steepwise: str 'Forward' or 'Backward', optional Indicate the method used for stepwise regression. The default is 'Forward'. Returns ------- loadings : DataFrame A DataFrame with the loadings matrix. Raises ------ ValueError When the value cannot be calculated. """ if not isinstance(X, pd.DataFrame): raise ValueError("X must be a DataFrame") if not isinstance(Y, pd.DataFrame): raise ValueError("Y must be a DataFrame") rows = Y.columns.tolist() cols = X.columns.tolist() cols.insert(0, "const") loadings = np.zeros((len(rows), len(cols))) loadings = pd.DataFrame(loadings, index=rows, columns=cols) for i in rows: if stepwise == "Forward": included = forward_regression(X, Y[i], threshold, verbose) elif stepwise == "Backward": included = backward_regression(X, Y[i], threshold, verbose) else: raise ValueError("Choose and adecuate stepwise method") results = sm.OLS(Y[i], sm.add_constant(X[included])).fit() params = results.params loadings.loc[i, params.index.tolist()] = params.T return loadings
[docs]def risk_factors( X, Y, B=None, method_mu="hist", method_cov="hist", threshold=0.05, stepwise="Forward", error=True, **kwargs ): r""" Estimate the expected returns vector and covariance matrix based on risk factors models :cite:`b-Ross` :cite:`b-Fan`. .. math:: R = \alpha + B F + \epsilon .. math:: \mu_{f} = \alpha +BE(F) .. math:: \Sigma_{f} = B \Sigma_{F} B^{T} + \Sigma_{\epsilon} where: :math:`R` is the series returns. :math:`\alpha` is the intercept. :math:`B` is the loadings matrix. :math:`F` is the expected returns vector of the risk factors. :math:`\Sigma_{F}` is the covariance matrix of the risk factors. :math:`\Sigma_{\epsilon}` is the covariance matrix of error terms. :math:`\mu_{f}` is the expected returns vector obtained with the risk factor model. :math:`\Sigma_{f}` is the covariance matrix obtained with the risk factor model. Parameters ---------- X : DataFrame of shape (n_samples, n_features) Features matrix, where n_samples is the number of samples and n_features is the number of features. Y : DataFrame of shape (n_samples, n_assets) Target matrix, where n_samples in the number of samples and n_assets is the number of assets. B : DataFrame of shape (n_assets, n_features), optional Loadings matrix. If is not specified, is estimated using stepwise regression. The default is None. steepwise: str 'Forward' or 'Backward' Indicate the method used for stepwise regression. The default is 'Forward'. error : bool Indicate if diagonal covariance matrix of errors is included (only when B is estimated through a regression). **kwargs : dict Other variables related to the expected returns and covariance estimation. Returns ------- mu : DataFrame The mean vector of risk factors model. cov : DataFrame The covariance matrix of risk factors model. returns : DataFrame The returns based on a risk factor model. nav : DataFrame The cumulated uncompound returns based on a risk factor model. Raises ------ ValueError When the value cannot be calculated. """ if not isinstance(X, pd.DataFrame) and not isinstance(Y, pd.DataFrame): raise ValueError("X and Y must be DataFrames") if B is None: B = loadings_matrix(X, Y, threshold=threshold, verbose=False, stepwise=stepwise) elif not isinstance(B, pd.DataFrame): raise ValueError("B must be a DataFrame") assets = Y.columns.tolist() dates = X.index.tolist() X1 = sm.add_constant(X) mu_f = np.matrix(mean_vector(X1, method=method_mu, **kwargs)) S_f = np.matrix(covar_matrix(X1, method=method_cov, **kwargs)) B = np.matrix(B) returns = np.matrix(X1) * B.T mu = B * mu_f.T if error == True: e = np.matrix(Y) - returns S_e = np.diag(np.var(np.array(e), ddof=1, axis=0)) S = B * S_f * B.T + S_e else: S = B * S_f * B.T mu = pd.DataFrame(mu.T, columns=assets) cov = pd.DataFrame(S, index=assets, columns=assets) returns = pd.DataFrame(returns, index=dates, columns=assets) nav = returns.cumsum() return mu, cov, returns, nav
[docs]def black_litterman( X, w, P, Q, delta=1, rf=0, eq=True, method_mu="hist", method_cov="hist", **kwargs ): r""" Estimate the expected returns vector and covariance matrix based on the black litterman model :cite:`b-BlackLitterman` :cite:`b-Black1`. .. math:: \Pi = \delta \Sigma w .. math:: \Pi_{bl} = \left[(\tau\Sigma)^{-1}+ P \Omega^{-1}P \right]^{-1} \left[(\tau\Sigma)^{-1} \Pi + P \Omega^{-1}Q \right] .. math:: M = \left((\tau\Sigma)^{-1} + P'\Omega^{-1} P \right)^{-1} .. math:: \mu_{bl} = \Pi_{bl} + rf .. math:: \Sigma_{bl} = \Sigma + M where: :math:`rf` is the risk free rate. :math:`\delta` is the risk aversion factor. :math:`\Pi` is the equilibrium excess returns. :math:`\Sigma` is the covariance matrix. :math:`P` is the views matrix. :math:`Q` is the views returns matrix. :math:`\Omega` is covariance matrix of the error views. :math:`\mu_{bl}` is the mean vector obtained with the black litterman model. :math:`\Sigma_{bl}` is the covariance matrix obtained with the black litterman model. Parameters ---------- X : DataFrame of shape (n_samples, n_features) Assets matrix, where n_samples is the number of samples and n_features is the number of features. w : DataFrame of shape (n_assets, 1) Weights matrix, where n_assets is the number of assets. P : DataFrame of shape (n_views, n_assets) Analyst's views matrix, can be relative or absolute. Q : DataFrame of shape (n_views, 1) Expected returns of analyst's views. delta : float, optional Risk aversion factor. The default value is 1. rf : scalar, optional Risk free rate. The default is 0. eq : bool, optional Indicate if use equilibrum or historical excess returns. The default is True. **kwargs : dict Other variables related to the expected returns and covariance estimation. Returns ------- mu : DataFrame The mean vector of black litterman model. cov : DataFrame The covariance matrix of black litterman model. Raises ------ ValueError When the value cannot be calculated. """ if not isinstance(X, pd.DataFrame) and not isinstance(w, pd.DataFrame): raise ValueError("X and w must be DataFrames") if w.shape[0] > 1 and w.shape[1] > 1: raise ValueError("w must be a column DataFrame") assets = X.columns.tolist() w = np.matrix(w) if w.shape[0] == 1: w = w.T mu = np.matrix(mean_vector(X, method=method_mu, **kwargs)) S = np.matrix(covar_matrix(X, method=method_cov, **kwargs)) P = np.matrix(P) Q = np.matrix(Q) tau = 1 / X.shape[0] Omega = np.matrix(np.diag(np.diag(P * (tau * S) * P.T))) if eq == True: PI = delta * S * w elif eq == False: PI = mu.T - rf PI_ = ((tau * S).I + P.T * Omega.I * P).I * ((tau * S).I * PI + P.T * Omega.I * Q) M = ((tau * S).I + P.T * Omega.I * P).I # PI_1 = PI + (tau * S* P.T) * (P * tau * S * P.T + Omega).I * (Q - P * PI) # M = tau * S - (tau * S * P.T) * (P * tau * S * P.T + Omega).I * P * tau * S mu = PI_ + rf mu = mu.T cov = S + M w = (1 / (1 + tau)) * (delta * cov).I * PI_ mu = pd.DataFrame(mu, columns=assets) cov = pd.DataFrame(cov, index=assets, columns=assets) w = pd.DataFrame(w, index=assets) return mu, cov, w