# This file is part of Patsy
# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.

# Stateful transform protocol:
#   def __init__(self):
#       pass
#   def memorize_chunk(self, input_data):
#       return None
#   def memorize_finish(self):
#       return None
#   def transform(self, input_data):
#       return output_data
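#
# A minimal sketch of how a caller drives this protocol (added illustration;
# the names here are hypothetical, not part of the patsy API):
#
#     t = SomeStatefulTransform()
#     for chunk in data_chunks:      # first pass: accumulate statistics
#         t.memorize_chunk(chunk)
#     t.memorize_finish()
#     for chunk in data_chunks:      # second pass: apply the fitted transform
#         output_chunk = t.transform(chunk)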

# BETTER WAY: always run the first row of data through the builder alone, and
# check that it gives the same output row as when running the whole block of
# data through at once. This gives us the same information, but it's robust
# against people writing their own centering functions.
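#
# (Added illustration, hypothetical names: the check amounts to comparing
# something like ``build(data[:1])[0]`` against ``build(data)[0]`` with
# ``np.allclose``; a hand-rolled, non-stateful centering function fails it
# because centering a single row by its own mean always yields zeros.)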

# QUESTION: right now we refuse to even fit a model that contains a
# my_transform(x)-style function. Maybe we should allow it to be fit (with a
# warning), and only disallow making predictions with it? Need to revisit this
# question once it's clearer what exactly our public API will look like,
# because right now I'm not sure how to tell whether we are being called for
# fitting versus being called for prediction.

from functools import wraps
import numpy as np
from patsy.util import (atleast_2d_column_default,
                        asarray_or_pandas, pandas_friendly_reshape,
                        wide_dtype_for, safe_issubdtype,
                        no_pickling, assert_no_pickling)

# These are made available in the patsy.* namespace
__all__ = ["stateful_transform",
           "center", "standardize", "scale",
           ]

def stateful_transform(class_):
    """Create a stateful transform callable object from a class that fulfills
    the :ref:`stateful transform protocol <stateful-transform-protocol>`.
    """
    @wraps(class_)
    def stateful_transform_wrapper(*args, **kwargs):
        transform = class_()
        transform.memorize_chunk(*args, **kwargs)
        transform.memorize_finish()
        return transform.transform(*args, **kwargs)
    stateful_transform_wrapper.__patsy_stateful_transform__ = class_
    return stateful_transform_wrapper
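
# Added note (not from the original source): calling the wrapper directly
# memorizes and transforms the same arguments, so it behaves like a plain
# function. When evaluating a formula, patsy instead uses the
# ``__patsy_stateful_transform__`` attribute set above to recover the class,
# instantiate it, and drive memorize_chunk/memorize_finish/transform itself,
# so statistics memorized during fitting can be reused at prediction time.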

# class NonIncrementalStatefulTransform(object):
#     def __init__(self):
#         self._data = []
#
#     def memorize_chunk(self, input_data, *args, **kwargs):
#         self._data.append(input_data)
#         self._args = args
#         self._kwargs = kwargs
#
#     def memorize_finish(self):
#         all_data = np.row_stack(self._data)
#         args = self._args
#         kwargs = self._kwargs
#         del self._data
#         del self._args
#         del self._kwargs
#         self.memorize_all(all_data, *args, **kwargs)
#
#     def memorize_all(self, input_data, *args, **kwargs):
#         raise NotImplementedError
#
#     def transform(self, input_data, *args, **kwargs):
#         raise NotImplementedError
#
# class QuantileEstimatingTransform(NonIncrementalStatefulTransform):
#     def memorize_all(self, input_data, *args, **kwargs):

class Center(object):
    """center(x)

    A stateful transform that centers input data, i.e., subtracts the mean.

    If input has multiple columns, centers each column separately.

    Equivalent to ``standardize(x, rescale=False)``
    """
    def __init__(self):
        self._sum = None
        self._count = 0

    def memorize_chunk(self, x):
        x = atleast_2d_column_default(x)
        self._count += x.shape[0]
        this_total = np.sum(x, 0, dtype=wide_dtype_for(x))
        # This is to handle potentially multi-column x's:
        if self._sum is None:
            self._sum = this_total
        else:
            self._sum += this_total

    def memorize_finish(self):
        pass

    def transform(self, x):
        x = asarray_or_pandas(x)
        # This doesn't copy data unless our input is a DataFrame that has
        # heterogeneous types. And in that case we're going to be munging the
        # types anyway, so copying isn't a big deal.
        x_arr = np.asarray(x)
        if safe_issubdtype(x_arr.dtype, np.integer):
            dt = float
        else:
            dt = x_arr.dtype
        mean_val = np.asarray(self._sum / self._count, dtype=dt)
        centered = atleast_2d_column_default(x, preserve_pandas=True) - mean_val
        return pandas_friendly_reshape(centered, x.shape)

    __getstate__ = no_pickling

center = stateful_transform(Center)
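
# Added usage note (not from the original source): inside a formula string,
# e.g. ``patsy.dmatrix("center(x)", data)``, patsy drives the underlying
# Center instance through memorize_chunk()/memorize_finish() over the data
# used for fitting, so the mean memorized there is the one subtracted when
# the same design information is later applied to new data for prediction.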

# See:
# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
# or page 232 of Knuth vol. 2 (3rd ed.).
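#
# Added commentary (not from the original source): memorize_chunk below uses
# Welford's recurrence. After the n-th value x_n is seen,
#
#     delta  = x_n - mean_{n-1}
#     mean_n = mean_{n-1} + delta / n
#     M2_n   = M2_{n-1} + delta * (x_n - mean_n)
#
# so M2_n accumulates the sum of squared deviations from the running mean,
# and the variance estimate used in transform() is M2_n / (n - ddof).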

class Standardize(object):
    """standardize(x, center=True, rescale=True, ddof=0)

    A stateful transform that standardizes input data, i.e. it subtracts the
    mean and divides by the sample standard deviation.

    Either centering or rescaling or both can be disabled by use of keyword
    arguments. The `ddof` argument controls the delta degrees of freedom when
    computing the standard deviation (cf. :func:`numpy.std`). The default of
    ``ddof=0`` produces the maximum likelihood estimate; use ``ddof=1`` if you
    prefer the square root of the unbiased estimate of the variance.

    If input has multiple columns, standardizes each column separately.

    .. note:: This function computes the mean and standard deviation using a
       memory-efficient online algorithm, making it suitable for use with
       large incrementally processed data-sets.
    """
    def __init__(self):
        self.current_n = 0
        self.current_mean = None
        self.current_M2 = None

    def memorize_chunk(self, x, center=True, rescale=True, ddof=0):
        x = atleast_2d_column_default(x)
        if self.current_mean is None:
            self.current_mean = np.zeros(x.shape[1], dtype=wide_dtype_for(x))
            self.current_M2 = np.zeros(x.shape[1], dtype=wide_dtype_for(x))
        # XX this can surely be vectorized but I am feeling lazy:
        for i in range(x.shape[0]):
            self.current_n += 1
            delta = x[i, :] - self.current_mean
            self.current_mean += delta / self.current_n
            self.current_M2 += delta * (x[i, :] - self.current_mean)

    def memorize_finish(self):
        pass

    def transform(self, x, center=True, rescale=True, ddof=0):
        # XX: this forces all inputs to double-precision real, even if the
        # input is single- or extended-precision or complex. But I got all
        # tangled up in knots trying to do that without breaking something
        # else (e.g. by requiring an extra copy).
        x = asarray_or_pandas(x, copy=True, dtype=float)
        x_2d = atleast_2d_column_default(x, preserve_pandas=True)
        if center:
            x_2d -= self.current_mean
        if rescale:
            x_2d /= np.sqrt(self.current_M2 / (self.current_n - ddof))
        return pandas_friendly_reshape(x_2d, x.shape)

    __getstate__ = no_pickling

standardize = stateful_transform(Standardize)
# R compatibility:
scale = standardize
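
# Added usage sketch (not part of the original module): exercises ``center``
# and ``standardize`` on a tiny example when this file is run directly.
if __name__ == "__main__":
    data = [1.0, 2.0, 3.0]
    # The mean is 2, so centering subtracts 2 from every value:
    print(center(data))               # [-1.  0.  1.]
    # With ddof=0 the std is sqrt(2/3) ~= 0.8165:
    print(standardize(data))          # approximately [-1.2247  0.  1.2247]
    # With ddof=1 the sample std is exactly 1:
    print(standardize(data, ddof=1))  # [-1.  0.  1.]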