Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3Cross-validation classes for GAM 

4 

5Author: Luca Puggini 

6 

7""" 

8 

9from abc import ABCMeta, abstractmethod 

10from statsmodels.compat.python import with_metaclass 

11import itertools 

12import numpy as np 

13from statsmodels.gam.smooth_basis import (GenericSmoothers, 

14 UnivariateGenericSmoother) 

15 

16 

class BaseCV(with_metaclass(ABCMeta)):
    """
    BaseCV class. It computes the cross validation error of a given model.
    All the cross validation classes can be derived by this one
    (e.g. GamCV, LassoCV,...)

    Parameters
    ----------
    cv_iterator : instance of cross-validation iterator
        Object with a ``split(exog, endog, label=None)`` method that
        produces ``(train_index, test_index)`` pairs.
    endog : array_like
        Stored as ``self.endog`` and passed to ``cv_iterator.split``.
    exog : array_like
        Stored as ``self.exog`` and passed to ``cv_iterator.split``.

    Attributes
    ----------
    train_test_cv_indices : list
        The ``(train_index, test_index)`` pairs produced by
        ``cv_iterator.split``, stored as a list.
    """

    def __init__(self, cv_iterator, endog, exog):
        self.cv_iterator = cv_iterator
        self.exog = exog
        self.endog = endog
        # TODO: cv_iterator.split only needs nobs from endog or exog
        # ``split`` may hand back a one-shot iterator (e.g. a generator).
        # Storing the folds in a list keeps ``fit`` repeatable: without it
        # a second call to ``fit`` would see an exhausted iterator and
        # silently return an empty array.
        self.train_test_cv_indices = list(
            self.cv_iterator.split(self.exog, self.endog, label=None))

    def fit(self, **kwargs):
        """Compute the error of every cross-validation fold.

        Parameters
        ----------
        **kwargs
            Forwarded unchanged to ``_error``; they are the input values
            for the fit method of the cross-validated object.

        Returns
        -------
        cv_err : ndarray
            One entry per ``(train_index, test_index)`` pair, in the order
            of ``train_test_cv_indices``.
        """
        cv_err = [self._error(train_index, test_index, **kwargs)
                  for train_index, test_index in self.train_test_cv_indices]

        return np.array(cv_err)

    @abstractmethod
    def _error(self, train_index, test_index, **kwargs):
        """Train the model on the train set and return the test-set error.

        Subclasses must implement this method.
        """
        pass

49 

50 

def _split_train_test_smoothers(x, smoother, train_index, test_index):
    """Split smoothers in test and train sets and create GenericSmoothers.

    Parameters
    ----------
    x : indexable
        Data handed to ``GenericSmoothers``; it is indexed with
        ``train_index`` and ``test_index``.
    smoother : additive smoother instance
        Must expose ``smoothers``, an iterable of univariate smoothers that
        each provide ``x``, ``basis``, ``der_basis``, ``der2_basis``,
        ``cov_der2`` and ``variable_name``.
    train_index : index
        Selects the training rows.
    test_index : index
        Selects the test rows.

    Returns
    -------
    train_multivariate_smoothers : GenericSmoothers
        Smoothers restricted to the training rows.
    test_multivariate_smoothers : GenericSmoothers
        Smoothers restricted to the test rows.

    Notes
    -----
    This does not take exog_linear into account.
    """

    def _subset(component, index, suffix):
        # Row-subset every per-observation array of one univariate smoother.
        # TODO: Double check this part. cov_der2 is calculated with all data
        # and is passed on without subsetting.
        return UnivariateGenericSmoother(
            component.x[index],
            component.basis[index],
            component.der_basis[index],
            component.der2_basis[index],
            component.cov_der2,
            component.variable_name + suffix)

    train_smoothers = []
    test_smoothers = []
    # The loop variable is deliberately not called ``smoother`` so that it
    # does not shadow the function argument.
    for component in smoother.smoothers:
        train_smoothers.append(_subset(component, train_index, ' train'))
        # The test smoother uses the *test* rows of der2_basis. Previously
        # the training rows were passed here, which gave the test smoother
        # a second-derivative basis for the wrong observations.
        test_smoothers.append(_subset(component, test_index, ' test'))

    train_multivariate_smoothers = GenericSmoothers(x[train_index],
                                                    train_smoothers)
    test_multivariate_smoothers = GenericSmoothers(x[test_index],
                                                   test_smoothers)

    return train_multivariate_smoothers, test_multivariate_smoothers

88 

89 

class MultivariateGAMCV(BaseCV):
    """Cross-validation error of a GAM for one fixed ``alphas`` value.

    Parameters
    ----------
    smoother : additive smoother instance
        Its ``basis`` is handed to ``BaseCV`` as ``exog`` and its ``x`` is
        used when splitting the smoothers into train and test parts.
    alphas : object
        Passed as ``alpha`` to ``gam`` when the model is created.
    gam : callable
        Called as ``gam(endog, exog=..., smoother=..., alpha=...)`` to
        create the model for each training fold.
    cost : callable
        Called as ``cost(endog_test, endog_est)``; its result is the error
        of one fold.
    endog : indexable
        Response; indexed with the train and test indices.
    exog : indexable or None
        Stored as ``exog_linear``. If not None it is indexed with the train
        and test indices, otherwise None is passed on in its place.
    cv_iterator : instance of cross-validation iterator
    """

    def __init__(self, smoother, alphas, gam, cost, endog, exog, cv_iterator):
        self.smoother = smoother
        self.alphas = alphas
        self.gam = gam
        self.cost = cost
        self.exog_linear = exog
        self.cv_iterator = cv_iterator
        # TODO: super does not do anything with endog, exog, except get nobs
        # refactor to clean up what where `exog` and `exog_linear` is attached
        # Note: the smoother basis, not `exog`, is what super receives as
        # its `exog` argument.
        super(MultivariateGAMCV, self).__init__(
            cv_iterator, endog, self.smoother.basis)

    def _error(self, train_index, test_index, **kwargs):
        """Fit on the training fold and return the cost on the test fold.

        ``kwargs`` are forwarded to the ``fit`` method of the created model.
        """
        smoother_train, smoother_test = _split_train_test_smoothers(
            self.smoother.x, self.smoother, train_index, test_index)

        endog_train = self.endog[train_index]
        endog_test = self.endog[test_index]

        linear = self.exog_linear
        has_linear = linear is not None
        exog_linear_train = linear[train_index] if has_linear else None
        exog_linear_test = linear[test_index] if has_linear else None

        model = self.gam(endog_train, exog=exog_linear_train,
                         smoother=smoother_train, alpha=self.alphas)
        results = model.fit(**kwargs)

        # exog_linear_test and smoother_test.basis will be column_stacked
        # but not transformed in predict
        endog_est = results.predict(exog_linear_test, smoother_test.basis,
                                    transform=False)

        return self.cost(endog_test, endog_est)

127 

128 

class BasePenaltiesPathCV(with_metaclass(ABCMeta)):
    """
    Base class for cross validation over a grid of parameters.

    The best parameter is saved in alpha_cv

    This class is currently not used

    Parameters
    ----------
    alphas : object
        Grid of parameters; stored as ``self.alphas`` and used as the
        x-values in ``plot_path``.

    Attributes
    ----------
    alpha_cv, cv_error, cv_std
        All initialized to None in ``__init__``.
    """

    def __init__(self, alphas):
        self.alphas = alphas
        self.alpha_cv = self.cv_error = self.cv_std = None

    def plot_path(self):
        """Plot ``cv_error`` and ``cv_error +/- 1.96 * cv_std`` over alphas.

        The center curve is drawn in black and the two band curves in blue.
        Nothing is returned.
        """
        from statsmodels.graphics.utils import _import_mpl
        plt = _import_mpl()

        # First pass (no format string) draws the curves with the default
        # format, second pass overlays the same three curves with 'o'.
        for fmt in ((), ('o',)):
            plt.plot(self.alphas, self.cv_error, *fmt, c='black')
            plt.plot(self.alphas, self.cv_error + 1.96 * self.cv_std, *fmt,
                     c='blue')
            plt.plot(self.alphas, self.cv_error - 1.96 * self.cv_std, *fmt,
                     c='blue')

        # TODO add return

161 

162 

class MultivariateGAMCVPath(object):
    """k-fold cross-validation for GAM

    Warning: The API of this class is preliminary and will change.

    Parameters
    ----------
    smoother : additive smoother instance
    alphas : list of iterables
        list of alpha for smooths. The product space will be used as alpha
        grid for cross-validation
    gam : model class
        model class for creating a model with k-fold training data
    cost : function
        cost function for the prediction error
    endog : ndarray
        dependent (response) variable of the model
    exog : object
        passed on unchanged as ``exog`` to ``MultivariateGAMCV``
    cv_iterator : instance of cross-validation iterator

    Attributes
    ----------
    alphas_grid : list of tuples
        ``itertools.product`` of ``alphas``
    cv_error, cv_std : ndarray
        one entry per grid point, zero until ``fit`` fills them with the
        mean and standard deviation of the fold errors
    alpha_cv : tuple or None
        grid point with the smallest ``cv_error``; None before ``fit``
    """

    def __init__(self, smoother, alphas, gam, cost, endog, exog, cv_iterator):
        self.smoother = smoother
        self.gam = gam
        self.cost = cost
        self.endog = endog
        self.exog = exog
        self.cv_iterator = cv_iterator

        self.alphas = alphas
        self.alphas_grid = list(itertools.product(*alphas))

        n_grid = len(self.alphas_grid)
        self.cv_error = np.zeros(n_grid)
        self.cv_std = np.zeros(n_grid)
        self.alpha_cv = None

    def fit(self, **kwargs):
        """Cross-validate every grid point and store the best one.

        ``kwargs`` are forwarded to ``MultivariateGAMCV.fit``.

        Returns
        -------
        self
        """
        for position, candidate in enumerate(self.alphas_grid):
            fold_errors = MultivariateGAMCV(
                smoother=self.smoother,
                alphas=candidate,
                gam=self.gam,
                cost=self.cost,
                endog=self.endog,
                exog=self.exog,
                cv_iterator=self.cv_iterator,
            ).fit(**kwargs)
            self.cv_error[position] = fold_errors.mean()
            self.cv_std[position] = fold_errors.std()

        best = np.argmin(self.cv_error)
        self.alpha_cv = self.alphas_grid[best]
        return self