Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/patsy/missing.py : 19%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of Patsy
2# Copyright (C) 2013 Nathaniel Smith <njs@pobox.com>
3# See file LICENSE.txt for license information.
5# Missing data detection/handling
7# First, how do we represent missing data? (i.e., which values count as
8# "missing"?) In the long run, we want to use numpy's NA support... but that
9# doesn't exist yet. Until then, people use various sorts of ad-hoc
10# things. Some things that might be considered NA:
11# NA (eventually)
12# NaN (in float or object arrays)
13# None (in object arrays)
14# np.ma.masked (in numpy.ma masked arrays)
15# Pandas compatibility considerations:
16# For numeric arrays, None is unconditionally converted to NaN.
17# For object arrays (including string arrays!), None and NaN are preserved,
18# but pandas.isnull() returns True for both.
19# np.ma compatibility considerations:
20# Preserving array subtypes is a huge pain, because it means that we can't
21# just call 'asarray' and be done... we already jump through tons of hoops
22# to write code that can handle both ndarray's and pandas objects, and
23# just thinking about adding another item to this list makes me tired. So
24# for now we don't support np.ma missing values. Use pandas!
26# Next, what should be done once we find missing data? R's options:
27# -- throw away those rows (from all aligned matrices)
28# -- with or without preserving information on which rows were discarded
29# -- error out
30# -- carry on
31# The 'carry on' option requires that we have some way to represent NA in our
32# output array. To avoid further solidifying the use of NaN for this purpose,
33# we'll leave this option out for now, until real NA support is
34# available. Also, we always preserve information on which rows were
35# discarded, using the pandas index functionality (currently this is only
36# returned to the original caller if they used return_type="dataframe",
37# though).
39import numpy as np
40from patsy import PatsyError
41from patsy.util import (safe_isnan, safe_scalar_isnan,
42 no_pickling, assert_no_pickling)
44# These are made available in the patsy.* namespace
45__all__ = ["NAAction"]
# Rule names accepted in ``NA_types``: which kinds of value count as missing.
_valid_NA_types = [
    "None",
    "NaN",
]
# Strategy names accepted for ``on_NA``: what to do when a missing value is found.
_valid_NA_responses = [
    "raise",
    "drop",
]
def _desc_options(options):
    """Return the ``repr`` of each option, joined by ``", "``.

    Used to build the "should be one of ..." part of error messages.
    """
    quoted = map(repr, options)
    return ", ".join(quoted)
class NAAction(object):
    """An :class:`NAAction` object defines a strategy for handling missing
    data.

    "NA" is short for "Not Available", and is used to refer to any value which
    is somehow unmeasured or unavailable. In the long run, it is devoutly
    hoped that numpy will gain first-class missing value support. Until then,
    we work around this lack as best we're able.

    There are two parts to this: First, we have to determine what counts as
    missing data. For numerical data, the default is to treat NaN values
    (e.g., ``numpy.nan``) as missing. For categorical data, the default is to
    treat NaN values, and also the Python object None, as missing. (This is
    consistent with how pandas does things, so if you're already using
    None/NaN to mark missing data in your pandas DataFrames, you're good to
    go.)

    Second, we have to decide what to do with any missing data when we
    encounter it. One option is to simply discard any rows which contain
    missing data from our design matrices (``drop``). Another option is to
    raise an error (``raise``). A third option would be to simply let the
    missing values pass through into the returned design matrices. However,
    this last option is not yet implemented, because of the lack of any
    standard way to represent missing values in arbitrary numpy matrices;
    we're hoping numpy will get this sorted out before we standardize on
    anything ourselves.

    You can control how patsy handles missing data through the ``NA_action=``
    argument to functions like :func:`build_design_matrices` and
    :func:`dmatrix`. If all you want to do is to choose between ``drop`` and
    ``raise`` behaviour, you can pass one of those strings as the
    ``NA_action=`` argument directly. If you want more fine-grained control
    over how missing values are detected and handled, then you can create an
    instance of this class, or your own object that implements the same
    interface, and pass that as the ``NA_action=`` argument instead.
    """

    # The default for NA_types is an immutable tuple rather than a list so
    # that the default object can never be mutated and shared between calls;
    # the value is converted with tuple() below, so lists are still accepted.
    def __init__(self, on_NA="drop", NA_types=("None", "NaN")):
        """The :class:`NAAction` constructor takes the following arguments:

        :arg on_NA: How to handle missing values. The default is ``"drop"``,
          which removes all rows from all matrices which contain any missing
          values. Also available is ``"raise"``, which raises an exception
          when any missing values are encountered.
        :arg NA_types: Which rules are used to identify missing values, as a
          list (or tuple) of strings. Allowed values are:

          * ``"None"``: treat the ``None`` object as missing in categorical
            data.
          * ``"NaN"``: treat floating point NaN values as missing in
            categorical and numerical data.

        :raises ValueError: if `on_NA` is not a recognised action, if
          `NA_types` is a bare string, or if any entry of `NA_types` is not a
          recognised rule name.

        .. versionadded:: 0.2.0
        """
        self.on_NA = on_NA
        if self.on_NA not in _valid_NA_responses:
            raise ValueError("invalid on_NA action %r "
                             "(should be one of %s)"
                             % (on_NA, _desc_options(_valid_NA_responses)))
        # A bare string is itself iterable, so tuple("NaN") would quietly turn
        # into ('N', 'a', 'N'); reject it up front with a clearer message.
        if isinstance(NA_types, str):
            raise ValueError("NA_types should be a list of strings")
        self.NA_types = tuple(NA_types)
        for NA_type in self.NA_types:
            if NA_type not in _valid_NA_types:
                raise ValueError("invalid NA_type %r "
                                 "(should be one of %s)"
                                 % (NA_type, _desc_options(_valid_NA_types)))

    def is_categorical_NA(self, obj):
        """Return True if `obj` is a categorical NA value.

        Note that here `obj` is a single scalar value."""
        if "NaN" in self.NA_types and safe_scalar_isnan(obj):
            return True
        if "None" in self.NA_types and obj is None:
            return True
        return False

    def is_numerical_NA(self, arr):
        """Returns a 1-d mask array indicating which rows in an array of
        numerical values contain at least one NA value.

        Note that here `arr` is a numpy array or pandas DataFrame."""
        mask = np.zeros(arr.shape, dtype=bool)
        if "NaN" in self.NA_types:
            mask |= np.isnan(arr)
        # Collapse a per-element mask into a per-row mask.
        if mask.ndim > 1:
            mask = np.any(mask, axis=1)
        return mask

    def handle_NA(self, values, is_NAs, origins):
        """Takes a set of factor values that may have NAs, and handles them
        appropriately.

        :arg values: A list of `ndarray` objects representing the data.
          These may be 1- or 2-dimensional, and may be of varying dtype. All
          will have the same number of rows (or entries, for 1-d arrays).
        :arg is_NAs: A list with the same number of entries as `values`,
          containing boolean `ndarray` objects that indicate which rows
          contain NAs in the corresponding entry in `values`.
        :arg origins: A list with the same number of entries as
          `values`, containing information on the origin of each
          value. If we encounter a problem with some particular value, we use
          the corresponding entry in `origins` as the origin argument when
          raising a :class:`PatsyError`.
        :returns: A list of new values (which may have a differing number of
          rows.)
        """
        assert len(values) == len(is_NAs) == len(origins)
        if len(values) == 0:
            return values
        if self.on_NA == "raise":
            return self._handle_NA_raise(values, is_NAs, origins)
        elif self.on_NA == "drop":
            return self._handle_NA_drop(values, is_NAs, origins)
        else:  # pragma: no cover
            # on_NA is validated in __init__, so this is only reachable if
            # the attribute was reassigned afterwards. Raise explicitly: a
            # bare `assert False` is stripped under `python -O`, which would
            # make this method silently return None.
            raise AssertionError("invalid on_NA action %r" % (self.on_NA,))

    def _handle_NA_raise(self, values, is_NAs, origins):
        """Raise :class:`PatsyError` for the first entry whose mask has any
        True value, using the matching origin; otherwise return `values`
        unchanged."""
        for is_NA, origin in zip(is_NAs, origins):
            if np.any(is_NA):
                raise PatsyError("factor contains missing values", origin)
        return values

    def _handle_NA_drop(self, values, is_NAs, origins):
        """Return `values` with every row removed that is flagged in any of
        the `is_NAs` masks. `origins` is not used here."""
        total_mask = np.zeros(is_NAs[0].shape[0], dtype=bool)
        for is_NA in is_NAs:
            total_mask |= is_NA
        good_mask = ~total_mask
        # "..." to handle 1- versus 2-dim indexing
        return [v[good_mask, ...] for v in values]

    __getstate__ = no_pickling
def test_NAAction_basic():
    """Invalid constructor arguments raise ValueError, and an instance
    passes the assert_no_pickling check."""
    from nose.tools import assert_raises

    # Each entry is a set of keyword arguments the constructor must reject:
    # an unknown action, an unknown NA rule, and a bare string for NA_types.
    rejected_kwargs = (
        {"on_NA": "pord"},
        {"NA_types": ("NaN", "asdf")},
        {"NA_types": "NaN"},
    )
    for kwargs in rejected_kwargs:
        assert_raises(ValueError, NAAction, **kwargs)

    assert_no_pickling(NAAction())
def test_NAAction_NA_types_numerical():
    """is_numerical_NA flags NaN rows only when "NaN" is among NA_types,
    for 1-d, single-column and two-column float arrays."""
    nan_rows = [0, 2]
    for NA_types in ([], ["NaN"], ["None"], ["NaN", "None"]):
        action = NAAction(NA_types=NA_types)

        # The expected mask depends only on NA_types, not on the array shape.
        expected = np.zeros(4, dtype=bool)
        if "NaN" in NA_types:
            expected[nan_rows] = True

        for trailing_shape in ((), (1,), (2,)):
            data = np.ones((4,) + trailing_shape, dtype=float)
            has_two_columns = data.ndim > 1 and data.shape[1] > 1
            if has_two_columns:
                # Put the NaNs in different columns of rows 0 and 2.
                data[nan_rows, [0, 1]] = np.nan
            else:
                data[nan_rows] = np.nan
            observed = action.is_numerical_NA(data)
            assert np.array_equal(observed, expected)
def test_NAAction_NA_types_categorical():
    """is_categorical_NA treats None / NaN as missing only when the matching
    rule is enabled, and never flags ordinary values."""
    for NA_types in ([], ["NaN"], ["None"], ["NaN", "None"]):
        action = NAAction(NA_types=NA_types)

        for ordinary in ("a", 1):
            assert not action.is_categorical_NA(ordinary)

        none_is_missing = "None" in NA_types
        nan_is_missing = "NaN" in NA_types
        assert action.is_categorical_NA(None) == none_is_missing
        assert action.is_categorical_NA(np.nan) == nan_is_missing
def test_NAAction_drop():
    """With on_NA="drop", rows flagged in any mask are removed from every
    value array."""
    action = NAAction("drop")

    int_column = np.asarray([-1, 2, -1, 4, 5])
    float_column = np.asarray([10.0, 20.0, 30.0, 40.0, 50.0])
    matrix = np.asarray([[1.0, np.nan],
                         [3.0, 4.0],
                         [10.0, 5.0],
                         [6.0, 7.0],
                         [8.0, np.nan]])
    # Rows 0, 2 and 4 are flagged by at least one mask, leaving rows 1 and 3.
    masks = [
        np.asarray([True, False, True, False, False]),
        np.zeros(5, dtype=bool),
        np.asarray([True, False, False, False, True]),
    ]

    kept = action.handle_NA([int_column, float_column, matrix],
                            masks, [None] * 3)

    assert len(kept) == 3
    expected = ([2, 4], [20.0, 40.0], [[3.0, 4.0], [6.0, 7.0]])
    for got, want in zip(kept, expected):
        assert np.array_equal(got, want)
def test_NAAction_raise():
    """With on_NA="raise", NA-free input passes through unchanged and any NA
    raises PatsyError carrying the origin of the offending factor."""
    action = NAAction(on_NA="raise")

    # no-NA just passes through:
    in_arrs = [np.asarray([1.1, 1.2]),
               np.asarray([1, 2])]
    is_NAs = [np.asarray([False, False])] * 2
    got_arrs = action.handle_NA(in_arrs, is_NAs, [None, None])
    assert np.array_equal(got_arrs[0], in_arrs[0])
    assert np.array_equal(got_arrs[1], in_arrs[1])

    from patsy.origin import Origin
    o1 = Origin("asdf", 0, 1)
    o2 = Origin("asdf", 2, 3)

    # NA raises an error with a correct origin
    in_arrs = [np.asarray([1.1, 1.2]),
               np.asarray([1.0, np.nan])]
    is_NAs = [np.asarray([False, False]),
              np.asarray([False, True])]
    try:
        action.handle_NA(in_arrs, is_NAs, [o1, o2])
    except PatsyError as e:
        # Only the second factor is flagged, so its origin must be reported.
        assert e.origin is o2
    else:
        # Raise explicitly rather than `assert False`, so that a missing
        # exception is still detected when asserts are disabled (-O).
        raise AssertionError("handle_NA did not raise PatsyError")