Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/arrays/masked.py : 33%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from typing import TYPE_CHECKING
3import numpy as np
5from pandas._libs import lib, missing as libmissing
7from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype
8from pandas.core.dtypes.missing import isna, notna
10from pandas.core.algorithms import take
11from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
12from pandas.core.indexers import check_array_indexer
14if TYPE_CHECKING:
15 from pandas._typing import Scalar
class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin):
    """
    Base class for masked arrays (which use _data and _mask to store the data).

    numpy based

    ``_data`` holds the values and ``_mask`` is indexed in parallel with it;
    a true entry in ``_mask`` marks the element at that position as missing.

    The methods below rely on the concrete subclass providing a constructor
    callable as ``type(self)(data, mask, copy=...)`` and a ``dtype`` that
    exposes ``na_value``.
    """

    _data: np.ndarray
    _mask: np.ndarray

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value: "Scalar"

    def __getitem__(self, item):
        """
        Return a scalar for an integer ``item``, otherwise a new array.

        For an integer position the NA value of the dtype is returned when
        the position is masked. Any other indexer is validated with
        ``check_array_indexer`` and applied to both the data and the mask.
        """
        if is_integer(item):
            if self._mask[item]:
                return self.dtype.na_value
            return self._data[item]

        item = check_array_indexer(self, item)

        return type(self)(self._data[item], self._mask[item])

    def __iter__(self):
        """
        Yield each element, substituting the dtype's NA value where masked.
        """
        # Walk data and mask in lockstep rather than indexing by position.
        for value, is_missing in zip(self._data, self._mask):
            if is_missing:
                yield self.dtype.na_value
            else:
                yield value

    def __len__(self) -> int:
        """Return the number of elements (length of the data buffer)."""
        return len(self._data)

    def __invert__(self):
        """
        Return a new array with the data inverted and the same missing values.
        """
        # Copy the mask so the result does not share missing-value state
        # with this array; ``~self._data`` is already a fresh buffer.
        return type(self)(~self._data, self._mask.copy())

    def to_numpy(
        self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default,
    ):
        """
        Convert to a NumPy Array.

        By default converts to an object-dtype NumPy array. Specify the `dtype` and
        `na_value` keywords to customize the conversion.

        Parameters
        ----------
        dtype : dtype, default object
            The numpy dtype to convert to.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            the array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary. Avoiding a copy
            is typically only possible when no missing values are present
            and `dtype` is the equivalent numpy dtype.
        na_value : scalar, optional
            Scalar missing value indicator to use in numpy array. Defaults
            to the native missing value indicator of this array (pd.NA).

        Returns
        -------
        numpy.ndarray

        Raises
        ------
        ValueError
            If missing values are present, `dtype` is neither an object nor
            a string dtype, and `na_value` is left as :attr:`NA`.

        Examples
        --------
        An object-dtype is the default result

        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
        >>> a.to_numpy()
        array([True, False, NA], dtype=object)

        When no missing values are present, an equivalent dtype can be used.

        >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool")
        array([ True, False])
        >>> pd.array([1, 2], dtype="Int64").to_numpy("int64")
        array([1, 2])

        However, requesting such dtype will raise a ValueError if
        missing values are present and the default missing value :attr:`NA`
        is used.

        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
        >>> a
        <BooleanArray>
        [True, False, NA]
        Length: 3, dtype: boolean

        >>> a.to_numpy(dtype="bool")  # doctest: +ELLIPSIS
        Traceback (most recent call last):
        ...
        ValueError: cannot convert to 'bool'-dtype NumPy array with missing values...

        Specify a valid `na_value` instead

        >>> a.to_numpy(dtype="bool", na_value=False)
        array([ True, False, False])
        """
        if na_value is lib.no_default:
            na_value = libmissing.NA
        if dtype is None:
            dtype = object
        if self._hasna:
            if (
                not (is_object_dtype(dtype) or is_string_dtype(dtype))
                and na_value is libmissing.NA
            ):
                raise ValueError(
                    f"cannot convert to '{dtype}'-dtype NumPy array "
                    "with missing values. Specify an appropriate 'na_value' "
                    "for this dtype."
                )
            # don't pass copy to astype -> always need a copy since we are mutating
            data = self._data.astype(dtype)
            data[self._mask] = na_value
        else:
            data = self._data.astype(dtype, copy=copy)
        return data

    __array_priority__ = 1000  # higher than ndarray so ops dispatch to us

    def __array__(self, dtype=None) -> np.ndarray:
        """
        the array interface, return my values
        We return an object array here to preserve our scalar values
        """
        return self.to_numpy(dtype=dtype)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.
        """
        import pyarrow as pa

        return pa.array(self._data, mask=self._mask, type=type)

    @property
    def _hasna(self) -> bool:
        """Whether any element is marked as missing."""
        # Note: this is expensive right now! The hope is that we can
        # make this faster by having an optional mask, but not have to change
        # source code using it..
        # ``ndarray.any()`` returns ``numpy.bool_``; convert so the value
        # matches the annotated return type.
        return bool(self._mask.any())

    def isna(self):
        """
        Return a boolean ndarray that is True where an element is missing.
        """
        # Return a copy so that callers writing into the result cannot
        # change which elements of this array are considered missing.
        return self._mask.copy()

    @property
    def _na_value(self):
        """The missing-value scalar of this array's dtype."""
        return self.dtype.na_value

    @property
    def nbytes(self):
        """Total bytes consumed by the data buffer and the mask buffer."""
        return self._data.nbytes + self._mask.nbytes

    @classmethod
    def _concat_same_type(cls, to_concat):
        """
        Concatenate several arrays of this type into a single new array.

        Parameters
        ----------
        to_concat : sequence of this type

        Returns
        -------
        Instance of ``cls`` built from the concatenated data and masks.
        """
        data = np.concatenate([x._data for x in to_concat])
        mask = np.concatenate([x._mask for x in to_concat])
        return cls(data, mask)

    def take(self, indexer, allow_fill=False, fill_value=None):
        """
        Take elements from the array.

        Parameters
        ----------
        indexer : sequence of int
            Positions to take. When `allow_fill` is True, ``-1`` marks a
            position to be filled.
        allow_fill : bool, default False
            Passed through to the underlying ``take`` on data and mask.
        fill_value : scalar, optional
            Value placed at filled positions. When it is NA (the default
            ``None`` included) filled positions are marked missing instead.

        Returns
        -------
        Instance of the same type as ``self``.
        """
        # when the requested fill value is NA, fill the data buffer with
        # the internal fill value to avoid upcasting; the mask takes care
        # of marking those positions as missing
        data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value
        result = take(
            self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
        )

        mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)

        # if we are filling
        # we only fill where the indexer is null
        # not existing missing values
        # TODO(jreback) what if we have a non-na float as a fill value?
        if allow_fill and notna(fill_value):
            fill_mask = np.asarray(indexer) == -1
            result[fill_mask] = fill_value
            # filled positions were masked as True above; clear them since
            # they now hold a real value
            mask = mask ^ fill_mask

        return type(self)(result, mask, copy=False)

    def copy(self):
        """Return a new array backed by copies of the data and the mask."""
        data, mask = self._data, self._mask
        data = data.copy()
        mask = mask.copy()
        return type(self)(data, mask, copy=False)

    def value_counts(self, dropna=True):
        """
        Returns a Series containing counts of each unique value.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of missing values.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.value_counts
        """
        from pandas import Index, Series
        from pandas.arrays import IntegerArray

        # compute counts on the data with no nans
        data = self._data[~self._mask]
        value_counts = Index(data).value_counts()

        # TODO(extension)
        # if we have allow Index to hold an ExtensionArray
        # this is easier
        index = value_counts.index.values.astype(object)

        # if we want nans, count the mask
        if dropna:
            counts = value_counts.values
        else:
            # one extra trailing slot holds the number of missing values
            counts = np.empty(len(value_counts) + 1, dtype="int64")
            counts[:-1] = value_counts
            counts[-1] = self._mask.sum()

            index = Index(
                np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]),
                dtype=object,
            )

        # the counts themselves are never missing
        mask = np.zeros(len(counts), dtype="bool")
        counts = IntegerArray(counts, mask)

        return Series(counts, index=index)