# Source: pandas/core/arrays/sparse/array.py
1"""
2SparseArray data structure
3"""
4from collections import abc
5import numbers
6import operator
7from typing import Any, Callable
8import warnings
10import numpy as np
12from pandas._libs import index as libindex, lib
13import pandas._libs.sparse as splib
14from pandas._libs.sparse import BlockIndex, IntIndex, SparseIndex
15from pandas._libs.tslibs import NaT
16import pandas.compat as compat
17from pandas.compat.numpy import function as nv
18from pandas.errors import PerformanceWarning
20from pandas.core.dtypes.cast import (
21 astype_nansafe,
22 construct_1d_arraylike_from_scalar,
23 find_common_type,
24 infer_dtype_from_scalar,
25)
26from pandas.core.dtypes.common import (
27 is_array_like,
28 is_bool_dtype,
29 is_datetime64_any_dtype,
30 is_dtype_equal,
31 is_integer,
32 is_object_dtype,
33 is_scalar,
34 is_string_dtype,
35 pandas_dtype,
36)
37from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries, ABCSparseArray
38from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna
40import pandas.core.algorithms as algos
41from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
42from pandas.core.arrays.sparse.dtype import SparseDtype
43from pandas.core.base import PandasObject
44import pandas.core.common as com
45from pandas.core.construction import sanitize_array
46from pandas.core.indexers import check_array_indexer
47from pandas.core.missing import interpolate_2d
48import pandas.core.ops as ops
49from pandas.core.ops.common import unpack_zerodim_and_defer
51import pandas.io.formats.printing as printing

# ----------------------------------------------------------------------------
# Array


_sparray_doc_kwargs = dict(klass="SparseArray")

def _get_fill(arr: ABCSparseArray) -> np.ndarray:
    """
    Create a 0-dim ndarray containing the fill value

    Parameters
    ----------
    arr : SparseArray

    Returns
    -------
    fill_value : ndarray
        0-dim ndarray with just the fill value.

    Notes
    -----
    coerce fill_value to arr dtype if possible
    int64 SparseArray can have NaN as fill_value if there is no missing
    """
    try:
        return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)
    except ValueError:
        return np.asarray(arr.fill_value)
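
# Illustrative sketch (not part of the original module): `_get_fill` returns a
# 0-dim ndarray so that binary ops on fill values broadcast just like ops on
# sp_values, falling back to an uncast array when the fill value cannot be
# represented in the subtype. For example, in a REPL:
#
#   >>> import pandas as pd
#   >>> arr = pd.arrays.SparseArray([0, 0, 1], fill_value=0)
#   >>> _get_fill(arr)
#   array(0)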

def _sparse_array_op(
    left: ABCSparseArray, right: ABCSparseArray, op: Callable, name: str
) -> Any:
    """
    Perform a binary operation between two arrays.

    Parameters
    ----------
    left : Union[SparseArray, ndarray]
    right : Union[SparseArray, ndarray]
    op : Callable
        The binary operation to perform
    name : str
        Name of the callable.

    Returns
    -------
    SparseArray
    """
    if name.startswith("__"):
        # For lookups in _libs.sparse we need non-dunder op name
        name = name[2:-2]

    # dtype used to find corresponding sparse method
    ltype = left.dtype.subtype
    rtype = right.dtype.subtype

    if not is_dtype_equal(ltype, rtype):
        subtype = find_common_type([ltype, rtype])
        ltype = SparseDtype(subtype, left.fill_value)
        rtype = SparseDtype(subtype, right.fill_value)

        # TODO(GH-23092): pass copy=False. Need to fix astype_nansafe
        left = left.astype(ltype)
        right = right.astype(rtype)
        dtype = ltype.subtype
    else:
        dtype = ltype

    # dtype the result must have
    result_dtype = None

    if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
        with np.errstate(all="ignore"):
            result = op(left.to_dense(), right.to_dense())
            fill = op(_get_fill(left), _get_fill(right))

        if left.sp_index.ngaps == 0:
            index = left.sp_index
        else:
            index = right.sp_index
    elif left.sp_index.equals(right.sp_index):
        with np.errstate(all="ignore"):
            result = op(left.sp_values, right.sp_values)
            fill = op(_get_fill(left), _get_fill(right))
        index = left.sp_index
    else:
        if name[0] == "r":
            left, right = right, left
            name = name[1:]

        if name in ("and", "or", "xor") and dtype == "bool":
            opname = f"sparse_{name}_uint8"
            # to make template simple, cast here
            left_sp_values = left.sp_values.view(np.uint8)
            right_sp_values = right.sp_values.view(np.uint8)
            result_dtype = np.bool
        else:
            opname = f"sparse_{name}_{dtype}"
            left_sp_values = left.sp_values
            right_sp_values = right.sp_values

        sparse_op = getattr(splib, opname)

        with np.errstate(all="ignore"):
            result, index, fill = sparse_op(
                left_sp_values,
                left.sp_index,
                left.fill_value,
                right_sp_values,
                right.sp_index,
                right.fill_value,
            )

    if result_dtype is None:
        result_dtype = result.dtype

    return _wrap_result(name, result, index, fill, dtype=result_dtype)
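
# Illustrative sketch (not part of the original module): two SparseArrays with
# the same integer subtype but different sparse indices take the final branch
# above, dispatching to the generated `sparse_add_int64` kernel in
# pandas._libs.sparse; the fill values are combined with the same op:
#
#   >>> import numpy as np
#   >>> import pandas as pd
#   >>> left = pd.arrays.SparseArray([0, 0, 1, 2])
#   >>> right = pd.arrays.SparseArray([0, 1, 0, 2])
#   >>> np.asarray(left + right)   # fill: 0 + 0 == 0
#   array([0, 1, 1, 4])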

def _wrap_result(name, data, sparse_index, fill_value, dtype=None):
    """
    wrap op result to have correct dtype
    """
    if name.startswith("__"):
        # e.g. __eq__ --> eq
        name = name[2:-2]

    if name in ("eq", "ne", "lt", "gt", "le", "ge"):
        dtype = np.bool

    fill_value = lib.item_from_zerodim(fill_value)

    if is_bool_dtype(dtype):
        # fill_value may be np.bool_
        fill_value = bool(fill_value)
    return SparseArray(
        data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype
    )

class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin):
    """
    An ExtensionArray for storing sparse data.

    .. versionchanged:: 0.24.0

       Implements the ExtensionArray interface.

    Parameters
    ----------
    data : array-like
        A dense array of values to store in the SparseArray. This may contain
        `fill_value`.
    sparse_index : SparseIndex, optional
    index : Index
    fill_value : scalar, optional
        Elements in `data` that are `fill_value` are not stored in the
        SparseArray. For memory savings, this should be the most common value
        in `data`. By default, `fill_value` depends on the dtype of `data`:

        =========== ==========
        data.dtype  na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        False
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The fill value is potentially specified in three ways. In order of
        precedence, these are

        1. The `fill_value` argument
        2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
           a ``SparseDtype``
        3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
           is not a ``SparseDtype`` and `data` is a ``SparseArray``.

    kind : {'integer', 'block'}, default 'integer'
        The type of storage for sparse locations.

        * 'block': Stores a `block` and `block_length` for each
          contiguous *span* of sparse values. This is best when
          sparse data tends to be clumped together, with large
          regions of ``fill_value`` values between sparse values.
        * 'integer': uses an integer to store the location of
          each sparse value.

    dtype : np.dtype or SparseDtype, optional
        The dtype to use for the SparseArray. For numpy dtypes, this
        determines the dtype of ``self.sp_values``. For SparseDtype,
        this determines ``self.sp_values`` and ``self.fill_value``.
    copy : bool, default False
        Whether to explicitly copy the incoming `data` array.

    Attributes
    ----------
    None

    Methods
    -------
    None
    """

    _pandas_ftype = "sparse"
    _subtyp = "sparse_array"  # register ABCSparseArray
    _deprecations = PandasObject._deprecations | frozenset(["get_values"])
    _sparse_index: SparseIndex

    def __init__(
        self,
        data,
        sparse_index=None,
        index=None,
        fill_value=None,
        kind="integer",
        dtype=None,
        copy=False,
    ):

        if fill_value is None and isinstance(dtype, SparseDtype):
            fill_value = dtype.fill_value

        if isinstance(data, type(self)):
            # disable normal inference on dtype, sparse_index, & fill_value
            if sparse_index is None:
                sparse_index = data.sp_index
            if fill_value is None:
                fill_value = data.fill_value
            if dtype is None:
                dtype = data.dtype
            # TODO: make kind=None, and use data.kind?
            data = data.sp_values

        # Handle user-provided dtype
        if isinstance(dtype, str):
            # Two options: dtype='int', regular numpy dtype
            # or dtype='Sparse[int]', a sparse dtype
            try:
                dtype = SparseDtype.construct_from_string(dtype)
            except TypeError:
                dtype = pandas_dtype(dtype)

        if isinstance(dtype, SparseDtype):
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        if index is not None and not is_scalar(data):
            raise Exception("must only pass scalars with an index")

        if is_scalar(data):
            if index is not None:
                if data is None:
                    data = np.nan

            if index is not None:
                npoints = len(index)
            elif sparse_index is None:
                npoints = 1
            else:
                npoints = sparse_index.length

            dtype = infer_dtype_from_scalar(data)[0]
            data = construct_1d_arraylike_from_scalar(data, npoints, dtype)

        if dtype is not None:
            dtype = pandas_dtype(dtype)

        # TODO: disentangle the fill_value dtype inference from
        # dtype inference
        if data is None:
            # XXX: What should the empty dtype be? Object or float?
            data = np.array([], dtype=dtype)

        if not is_array_like(data):
            try:
                # probably shared code in sanitize_series

                data = sanitize_array(data, index=None)
            except ValueError:
                # NumPy may raise a ValueError on data like [1, []]
                # we retry with object dtype here.
                if dtype is None:
                    dtype = object
                    data = np.atleast_1d(np.asarray(data, dtype=dtype))
                else:
                    raise

        if copy:
            # TODO: avoid double copy when dtype forces cast.
            data = data.copy()

        if fill_value is None:
            fill_value_dtype = data.dtype if dtype is None else dtype
            if fill_value_dtype is None:
                fill_value = np.nan
            else:
                fill_value = na_value_for_dtype(fill_value_dtype)

        if isinstance(data, type(self)) and sparse_index is None:
            sparse_index = data._sparse_index
            sparse_values = np.asarray(data.sp_values, dtype=dtype)
        elif sparse_index is None:
            sparse_values, sparse_index, fill_value = make_sparse(
                data, kind=kind, fill_value=fill_value, dtype=dtype
            )
        else:
            sparse_values = np.asarray(data, dtype=dtype)
            if len(sparse_values) != sparse_index.npoints:
                raise AssertionError(
                    f"Non array-like type {type(sparse_values)} must "
                    "have the same length as the index"
                )
        self._sparse_index = sparse_index
        self._sparse_values = sparse_values
        self._dtype = SparseDtype(sparse_values.dtype, fill_value)
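
    # Illustrative sketch (not part of the original module): `kind` selects
    # the sparse index layout built by `make_sparse` below; `npoints` counts
    # the stored (non-fill) values either way:
    #
    #   >>> import pandas as pd
    #   >>> sp = pd.arrays.SparseArray([1, 1, 0, 0, 1], kind="block")
    #   >>> sp.kind
    #   'block'
    #   >>> sp.npoints
    #   3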

    @classmethod
    def _simple_new(
        cls, sparse_array: np.ndarray, sparse_index: SparseIndex, dtype: SparseDtype
    ) -> "SparseArray":
        new = cls([])
        new._sparse_index = sparse_index
        new._sparse_values = sparse_array
        new._dtype = dtype
        return new

    @classmethod
    def from_spmatrix(cls, data):
        """
        Create a SparseArray from a scipy.sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.sp_matrix
            This should be a SciPy sparse matrix where the size
            of the second dimension is 1. In other words, a
            sparse matrix with a single column.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.coo_matrix((4, 1))
        >>> pd.arrays.SparseArray.from_spmatrix(mat)
        [0.0, 0.0, 0.0, 0.0]
        Fill: 0.0
        IntIndex
        Indices: array([], dtype=int32)
        """
        length, ncol = data.shape

        if ncol != 1:
            raise ValueError(f"'data' must have a single column, not '{ncol}'")

        # our sparse index classes require that the positions be strictly
        # increasing. So we need to sort loc, and arr accordingly.
        arr = data.data
        idx, _ = data.nonzero()
        loc = np.argsort(idx)
        arr = arr.take(loc)
        idx.sort()

        zero = np.array(0, dtype=arr.dtype).item()
        dtype = SparseDtype(arr.dtype, zero)
        index = IntIndex(length, idx)

        return cls._simple_new(arr, index, dtype)

    def __array__(self, dtype=None, copy=True) -> np.ndarray:
        fill_value = self.fill_value

        if self.sp_index.ngaps == 0:
            # Compat for na dtype and int values.
            return self.sp_values
        if dtype is None:
            # Can NumPy represent this type?
            # If not, `np.result_type` will raise. We catch that
            # and return object.
            if is_datetime64_any_dtype(self.sp_values.dtype):
                # However, we *do* special-case the common case of
                # a datetime64 with pandas NaT.
                if fill_value is NaT:
                    # Can't put pd.NaT in a datetime64[ns]
                    fill_value = np.datetime64("NaT")
            try:
                dtype = np.result_type(self.sp_values.dtype, type(fill_value))
            except TypeError:
                dtype = object

        out = np.full(self.shape, fill_value, dtype=dtype)
        out[self.sp_index.to_int_index().indices] = self.sp_values
        return out

    def __setitem__(self, key, value):
        # I suppose we could allow setting of non-fill_value elements.
        # TODO(SparseArray.__setitem__): remove special cases in
        # ExtensionBlock.where
        msg = "SparseArray does not support item assignment via setitem"
        raise TypeError(msg)

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        return cls(scalars, dtype=dtype)

    @classmethod
    def _from_factorized(cls, values, original):
        return cls(values, dtype=original.dtype)

    # ------------------------------------------------------------------------
    # Data
    # ------------------------------------------------------------------------
    @property
    def sp_index(self):
        """
        The SparseIndex containing the location of non- ``fill_value`` points.
        """
        return self._sparse_index

    @property
    def sp_values(self):
        """
        An ndarray containing the non- ``fill_value`` values.

        Examples
        --------
        >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
        >>> s.sp_values
        array([1, 2])
        """
        return self._sparse_values

    @property
    def dtype(self):
        return self._dtype

    @property
    def fill_value(self):
        """
        Elements in `data` that are `fill_value` are not stored.

        For memory savings, this should be the most common value in the array.
        """
        return self.dtype.fill_value

    @fill_value.setter
    def fill_value(self, value):
        self._dtype = SparseDtype(self.dtype.subtype, value)

    @property
    def kind(self) -> str:
        """
        The kind of sparse index for this array. One of {'integer', 'block'}.
        """
        if isinstance(self.sp_index, IntIndex):
            return "integer"
        else:
            return "block"

    @property
    def _valid_sp_values(self):
        sp_vals = self.sp_values
        mask = notna(sp_vals)
        return sp_vals[mask]

    def __len__(self) -> int:
        return self.sp_index.length

    @property
    def _null_fill_value(self):
        return self._dtype._is_na_fill_value

    def _fill_value_matches(self, fill_value):
        if self._null_fill_value:
            return isna(fill_value)
        else:
            return self.fill_value == fill_value

    @property
    def nbytes(self) -> int:
        return self.sp_values.nbytes + self.sp_index.nbytes

    @property
    def density(self):
        """
        The percent of non- ``fill_value`` points, as decimal.

        Examples
        --------
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.density
        0.6
        """
        r = float(self.sp_index.npoints) / float(self.sp_index.length)
        return r

    @property
    def npoints(self) -> int:
        """
        The number of non- ``fill_value`` points.

        Examples
        --------
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.npoints
        3
        """
        return self.sp_index.npoints

    def isna(self):
        # If null fill value, we want SparseDtype[bool, true]
        # to preserve the same memory usage.
        dtype = SparseDtype(bool, self._null_fill_value)
        return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)

    def fillna(self, value=None, method=None, limit=None):
        """
        Fill missing values with `value`.

        Parameters
        ----------
        value : scalar, optional
        method : str, optional

            .. warning::

               Using 'method' will result in high memory use,
               as the entire array is converted to an in-memory
               ndarray before filling.

        limit : int, optional

        Returns
        -------
        SparseArray

        Notes
        -----
        When `value` is specified, the result's ``fill_value`` depends on
        ``self.fill_value``. The goal is to maintain low-memory use.

        If ``self.fill_value`` is NA, the result dtype will be
        ``SparseDtype(self.dtype, fill_value=value)``. This will preserve
        the amount of memory used before and after filling.

        When ``self.fill_value`` is not NA, the result dtype will be
        ``self.dtype``. Again, this preserves the amount of memory used.
        """
        if (method is None and value is None) or (
            method is not None and value is not None
        ):
            raise ValueError("Must specify one of 'method' or 'value'.")

        elif method is not None:
            msg = "fillna with 'method' requires high memory usage."
            warnings.warn(msg, PerformanceWarning)
            filled = interpolate_2d(np.asarray(self), method=method, limit=limit)
            return type(self)(filled, fill_value=self.fill_value)

        else:
            new_values = np.where(isna(self.sp_values), value, self.sp_values)

            if self._null_fill_value:
                # This is essentially just updating the dtype.
                new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
            else:
                new_dtype = self.dtype

            return self._simple_new(new_values, self._sparse_index, new_dtype)
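
    # Illustrative sketch (not part of the original module): with a null fill
    # value, `fillna` only rewrites the stored sp_values and the dtype's
    # fill_value, so density is unchanged:
    #
    #   >>> import numpy as np
    #   >>> import pandas as pd
    #   >>> arr = pd.arrays.SparseArray([np.nan, np.nan, 1.0])
    #   >>> filled = arr.fillna(0.0)
    #   >>> filled.fill_value
    #   0.0
    #   >>> np.asarray(filled)
    #   array([0., 0., 1.])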

    def shift(self, periods=1, fill_value=None):

        if not len(self) or periods == 0:
            return self.copy()

        if isna(fill_value):
            fill_value = self.dtype.na_value

        subtype = np.result_type(fill_value, self.dtype.subtype)

        if subtype != self.dtype.subtype:
            # just coerce up front
            arr = self.astype(SparseDtype(subtype, self.fill_value))
        else:
            arr = self

        empty = self._from_sequence(
            [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype
        )

        if periods > 0:
            a = empty
            b = arr[:-periods]
        else:
            a = arr[abs(periods) :]
            b = empty
        return arr._concat_same_type([a, b])

    def _first_fill_value_loc(self):
        """
        Get the location of the first fill value.

        Returns
        -------
        int
        """
        if len(self) == 0 or self.sp_index.npoints == len(self):
            return -1

        indices = self.sp_index.to_int_index().indices
        if not len(indices) or indices[0] > 0:
            return 0

        diff = indices[1:] - indices[:-1]
        return np.searchsorted(diff, 2) + 1
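
    # Worked example of the searchsorted trick above (illustrative, not part
    # of the original module): for sp_index indices [0, 1, 3], the first fill
    # position in the dense array is 2. diff == [1, 2]; searchsorted finds the
    # first gap of size >= 2, and the +1 maps from the diff slot back to a
    # dense position:
    #
    #   >>> import numpy as np
    #   >>> indices = np.array([0, 1, 3])
    #   >>> np.searchsorted(indices[1:] - indices[:-1], 2) + 1
    #   2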

    def unique(self):
        uniques = list(algos.unique(self.sp_values))
        fill_loc = self._first_fill_value_loc()
        if fill_loc >= 0:
            uniques.insert(fill_loc, self.fill_value)
        return type(self)._from_sequence(uniques, dtype=self.dtype)

    def _values_for_factorize(self):
        # Still override this for hash_pandas_object
        return np.asarray(self), self.fill_value

    def factorize(self, na_sentinel=-1):
        # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
        # The sparsity on this is backwards from what Sparse would want. Want
        # ExtensionArray.factorize -> Tuple[EA, EA]
        # Given that we have to return a dense array of codes, why bother
        # implementing an efficient factorize?
        codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
        uniques = SparseArray(uniques, dtype=self.dtype)
        return codes, uniques

    def value_counts(self, dropna=True):
        """
        Returns a Series containing counts of unique values.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of NaN, even if NaN is in sp_values.

        Returns
        -------
        counts : Series
        """
        from pandas import Index, Series

        keys, counts = algos._value_counts_arraylike(self.sp_values, dropna=dropna)
        fcounts = self.sp_index.ngaps
        if fcounts > 0:
            if self._null_fill_value and dropna:
                pass
            else:
                if self._null_fill_value:
                    mask = isna(keys)
                else:
                    mask = keys == self.fill_value

                if mask.any():
                    counts[mask] += fcounts
                else:
                    keys = np.insert(keys, 0, self.fill_value)
                    counts = np.insert(counts, 0, fcounts)

        if not isinstance(keys, ABCIndexClass):
            keys = Index(keys)
        result = Series(counts, index=keys)
        return result

    # --------
    # Indexing
    # --------

    def __getitem__(self, key):
        # avoid mypy issues when importing at the top-level
        from pandas.core.indexing import check_bool_indexer

        if isinstance(key, tuple):
            if len(key) > 1:
                raise IndexError("too many indices for array.")
            key = key[0]

        if is_integer(key):
            return self._get_val_at(key)
        elif isinstance(key, tuple):
            data_slice = self.to_dense()[key]
        elif isinstance(key, slice):
            # special case to preserve dtypes
            if key == slice(None):
                return self.copy()
            # TODO: this logic is surely elsewhere
            # TODO: this could be more efficient
            indices = np.arange(len(self), dtype=np.int32)[key]
            return self.take(indices)
        else:
            # TODO: I think we can avoid densifying when masking a
            # boolean SparseArray with another. Need to look at the
            # key's fill_value for True / False, and then do an intersection
            # on the indices of the sp_values.
            if isinstance(key, SparseArray):
                if is_bool_dtype(key):
                    key = key.to_dense()
                else:
                    key = np.asarray(key)

            key = check_array_indexer(self, key)

            if com.is_bool_indexer(key):
                key = check_bool_indexer(self, key)

                return self.take(np.arange(len(key), dtype=np.int32)[key])
            elif hasattr(key, "__len__"):
                return self.take(key)
            else:
                raise ValueError(f"Cannot slice with '{key}'")

        return type(self)(data_slice, kind=self.kind)
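
    # Illustrative sketch (not part of the original module): integer keys hit
    # `_get_val_at`, while non-trivial slices are routed through `take`:
    #
    #   >>> import numpy as np
    #   >>> import pandas as pd
    #   >>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
    #   >>> arr[2]
    #   1
    #   >>> np.asarray(arr[::2])
    #   array([0, 1])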

    def _get_val_at(self, loc):
        n = len(self)
        if loc < 0:
            loc += n

        if loc >= n or loc < 0:
            raise IndexError("Out of bounds access")

        sp_loc = self.sp_index.lookup(loc)
        if sp_loc == -1:
            return self.fill_value
        else:
            return libindex.get_value_at(self.sp_values, sp_loc)

    def take(self, indices, allow_fill=False, fill_value=None):
        if is_scalar(indices):
            raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.")
        indices = np.asarray(indices, dtype=np.int32)

        if indices.size == 0:
            result = []
            kwargs = {"dtype": self.dtype}
        elif allow_fill:
            result = self._take_with_fill(indices, fill_value=fill_value)
            kwargs = {}
        else:
            result = self._take_without_fill(indices)
            kwargs = {"dtype": self.dtype}

        return type(self)(result, fill_value=self.fill_value, kind=self.kind, **kwargs)

    def _take_with_fill(self, indices, fill_value=None):
        if fill_value is None:
            fill_value = self.dtype.na_value

        if indices.min() < -1:
            raise ValueError(
                "Invalid value in 'indices'. Must be between -1 "
                "and the length of the array."
            )

        if indices.max() >= len(self):
            raise IndexError("out of bounds value in 'indices'.")

        if len(self) == 0:
            # Empty... Allow taking only if all empty
            if (indices == -1).all():
                dtype = np.result_type(self.sp_values, type(fill_value))
                taken = np.empty_like(indices, dtype=dtype)
                taken.fill(fill_value)
                return taken
            else:
                raise IndexError("cannot do a non-empty take from an empty axes.")

        sp_indexer = self.sp_index.lookup_array(indices)

        if self.sp_index.npoints == 0:
            # Avoid taking from the empty self.sp_values
            taken = np.full(
                sp_indexer.shape,
                fill_value=fill_value,
                dtype=np.result_type(type(fill_value)),
            )
        else:
            taken = self.sp_values.take(sp_indexer)

            # sp_indexer may be -1 for two reasons
            # 1.) we took for an index of -1 (new)
            # 2.) we took a value that was self.fill_value (old)
            new_fill_indices = indices == -1
            old_fill_indices = (sp_indexer == -1) & ~new_fill_indices

            # Fill in two steps.
            # Old fill values
            # New fill values
            # potentially coercing to a new dtype at each stage.

            m0 = sp_indexer[old_fill_indices] < 0
            m1 = sp_indexer[new_fill_indices] < 0

            result_type = taken.dtype

            if m0.any():
                result_type = np.result_type(result_type, type(self.fill_value))
                taken = taken.astype(result_type)
                taken[old_fill_indices] = self.fill_value

            if m1.any():
                result_type = np.result_type(result_type, type(fill_value))
                taken = taken.astype(result_type)
                taken[new_fill_indices] = fill_value

        return taken
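
    # Illustrative sketch (not part of the original module): with
    # allow_fill=True, a -1 index requests the "new" fill (NaN by default),
    # while in-bounds indices that land on "old" fill positions recover
    # self.fill_value; the result dtype is promoted as needed:
    #
    #   >>> import numpy as np
    #   >>> import pandas as pd
    #   >>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
    #   >>> np.asarray(arr.take([-1, 0, 3], allow_fill=True))
    #   array([nan,  0.,  2.])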

    def _take_without_fill(self, indices):
        to_shift = indices < 0
        indices = indices.copy()

        n = len(self)

        if (indices.max() >= n) or (indices.min() < -n):
            if n == 0:
                raise IndexError("cannot do a non-empty take from an empty axes.")
            else:
                raise IndexError("out of bounds value in 'indices'.")

        if to_shift.any():
            indices[to_shift] += n

        if self.sp_index.npoints == 0:
            # edge case in take...
            # I think just return
            out = np.full(
                indices.shape,
                self.fill_value,
                dtype=np.result_type(type(self.fill_value)),
            )
            arr, sp_index, fill_value = make_sparse(out, fill_value=self.fill_value)
            return type(self)(arr, sparse_index=sp_index, fill_value=fill_value)

        sp_indexer = self.sp_index.lookup_array(indices)
        taken = self.sp_values.take(sp_indexer)
        fillable = sp_indexer < 0

        if fillable.any():
            # TODO: may need to coerce array to fill value
            result_type = np.result_type(taken, type(self.fill_value))
            taken = taken.astype(result_type)
            taken[fillable] = self.fill_value

        return taken

    def searchsorted(self, v, side="left", sorter=None):
        msg = "searchsorted requires high memory usage."
        warnings.warn(msg, PerformanceWarning, stacklevel=2)
        if not is_scalar(v):
            v = np.asarray(v)
        v = np.asarray(v)
        return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter)

    def copy(self):
        values = self.sp_values.copy()
        return self._simple_new(values, self.sp_index, self.dtype)

    @classmethod
    def _concat_same_type(cls, to_concat):
        fill_values = [x.fill_value for x in to_concat]

        fill_value = fill_values[0]

        # np.nan isn't a singleton, so we may end up with multiple
        # NaNs here, so we ignore the all-NA case too.
        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
            warnings.warn(
                "Concatenating sparse arrays with multiple fill "
                f"values: '{fill_values}'. Picking the first and "
                "converting the rest.",
                PerformanceWarning,
                stacklevel=6,
            )
            keep = to_concat[0]
            to_concat2 = [keep]

            for arr in to_concat[1:]:
                to_concat2.append(cls(np.asarray(arr), fill_value=fill_value))

            to_concat = to_concat2

        values = []
        length = 0

        if to_concat:
            sp_kind = to_concat[0].kind
        else:
            sp_kind = "integer"

        if sp_kind == "integer":
            indices = []

            for arr in to_concat:
                idx = arr.sp_index.to_int_index().indices.copy()
                idx += length  # TODO: wraparound
                length += arr.sp_index.length

                values.append(arr.sp_values)
                indices.append(idx)

            data = np.concatenate(values)
            indices = np.concatenate(indices)
            sp_index = IntIndex(length, indices)

        else:
            # when concatenating block indices, we don't claim that you'll
            # get an identical index as concatenating the values and then
            # creating a new index. We don't want to spend the time trying
            # to merge blocks across arrays in `to_concat`, so the resulting
            # BlockIndex may have more blocks.
            blengths = []
            blocs = []

            for arr in to_concat:
                idx = arr.sp_index.to_block_index()

                values.append(arr.sp_values)
                blocs.append(idx.blocs.copy() + length)
                blengths.append(idx.blengths)
                length += arr.sp_index.length

            data = np.concatenate(values)
            blocs = np.concatenate(blocs)
            blengths = np.concatenate(blengths)

            sp_index = BlockIndex(length, blocs, blengths)

        return cls(data, sparse_index=sp_index, fill_value=fill_value)
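
    # Illustrative sketch (not part of the original module): concatenating
    # arrays that share a fill value just shifts and joins their sparse
    # indices:
    #
    #   >>> import numpy as np
    #   >>> import pandas as pd
    #   >>> a = pd.arrays.SparseArray([0, 1])
    #   >>> b = pd.arrays.SparseArray([1, 0])
    #   >>> np.asarray(pd.arrays.SparseArray._concat_same_type([a, b]))
    #   array([0, 1, 1, 0])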

    def astype(self, dtype=None, copy=True):
        """
        Change the dtype of a SparseArray.

        The output will always be a SparseArray. To convert to a dense
        ndarray with a certain dtype, use :meth:`numpy.asarray`.

        Parameters
        ----------
        dtype : np.dtype or ExtensionDtype
            For SparseDtype, this changes the dtype of
            ``self.sp_values`` and the ``self.fill_value``.

            For other dtypes, this only changes the dtype of
            ``self.sp_values``.

        copy : bool, default True
            Whether to ensure a copy is made, even if not necessary.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> arr = SparseArray([0, 0, 1, 2])
        >>> arr
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        >>> arr.astype(np.dtype('int32'))
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a NumPy dtype with a different kind (e.g. float) will coerce
        just ``self.sp_values``.

        >>> arr.astype(np.dtype('float64'))
        ... # doctest: +NORMALIZE_WHITESPACE
        [0, 0, 1.0, 2.0]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Use a SparseDtype if you wish to change the fill value as well.

        >>> arr.astype(SparseDtype("float64", fill_value=np.nan))
        ... # doctest: +NORMALIZE_WHITESPACE
        [nan, nan, 1.0, 2.0]
        Fill: nan
        IntIndex
        Indices: array([2, 3], dtype=int32)
        """
        dtype = self.dtype.update_dtype(dtype)
        subtype = dtype._subtype_with_str
        sp_values = astype_nansafe(self.sp_values, subtype, copy=copy)
        if sp_values is self.sp_values and copy:
            sp_values = sp_values.copy()

        return self._simple_new(sp_values, self.sp_index, dtype)

    def map(self, mapper):
        """
        Map categories using input correspondence (dict, Series, or function).

        Parameters
        ----------
        mapper : dict, Series, callable
            The correspondence from old values to new.

        Returns
        -------
        SparseArray
            The output array will have the same density as the input.
            The output fill value will be the result of applying the
            mapping to ``self.fill_value``.

        Examples
        --------
        >>> arr = pd.arrays.SparseArray([0, 1, 2])
        >>> arr.map(lambda x: x + 10)
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)

        >>> arr.map({0: 10, 1: 11, 2: 12})
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)

        >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)
        """
        # this is used in apply.
        # We get hit since we're an "is_extension_type" but regular extension
        # types are not hit. This may be worth adding to the interface.
        if isinstance(mapper, ABCSeries):
            mapper = mapper.to_dict()

        if isinstance(mapper, abc.Mapping):
            fill_value = mapper.get(self.fill_value, self.fill_value)
            sp_values = [mapper.get(x, None) for x in self.sp_values]
        else:
            fill_value = mapper(self.fill_value)
            sp_values = [mapper(x) for x in self.sp_values]

        return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value)

    def to_dense(self):
        """
        Convert SparseArray to a NumPy array.

        Returns
        -------
        arr : NumPy array
        """
        return np.asarray(self, dtype=self.sp_values.dtype)

    _internal_get_values = to_dense

    # ------------------------------------------------------------------------
    # IO
    # ------------------------------------------------------------------------
    def __setstate__(self, state):
        """Necessary for making this object picklable"""
        if isinstance(state, tuple):
            # Compat for pandas < 0.24.0
            nd_state, (fill_value, sp_index) = state
            sparse_values = np.array([])
            sparse_values.__setstate__(nd_state)

            self._sparse_values = sparse_values
            self._sparse_index = sp_index
            self._dtype = SparseDtype(sparse_values.dtype, fill_value)
        else:
            self.__dict__.update(state)

    def nonzero(self):
        if self.fill_value == 0:
            return (self.sp_index.to_int_index().indices,)
        else:
            return (self.sp_index.to_int_index().indices[self.sp_values != 0],)

    # ------------------------------------------------------------------------
    # Reductions
    # ------------------------------------------------------------------------

    def _reduce(self, name, skipna=True, **kwargs):
        method = getattr(self, name, None)

        if method is None:
            raise TypeError(f"cannot perform {name} with type {self.dtype}")

        if skipna:
            arr = self
        else:
            arr = self.dropna()

        # we don't support these kwargs.
        # They should only be present when called via pandas, so do it here
        # instead of in `any` / `all` (which will raise if they're present,
        # thanks to nv.validate).
        kwargs.pop("filter_type", None)
        kwargs.pop("numeric_only", None)
        kwargs.pop("op", None)
        return getattr(arr, name)(**kwargs)

    def all(self, axis=None, *args, **kwargs):
        """
        Tests whether all elements evaluate True

        Returns
        -------
        all : bool

        See Also
        --------
        numpy.all
        """
        nv.validate_all(args, kwargs)

        values = self.sp_values

        if len(values) != len(self) and not np.all(self.fill_value):
            return False

        return values.all()

    def any(self, axis=0, *args, **kwargs):
        """
        Tests whether at least one element evaluates True

        Returns
        -------
        any : bool

        See Also
        --------
        numpy.any
        """
        nv.validate_any(args, kwargs)

        values = self.sp_values

        if len(values) != len(self) and np.any(self.fill_value):
            return True

        return values.any().item()

    def sum(self, axis=0, *args, **kwargs):
        """
        Sum of non-NA/null values

        Returns
        -------
        sum : float
        """
        nv.validate_sum(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        if self._null_fill_value:
            return sp_sum
        else:
            nsparse = self.sp_index.ngaps
            return sp_sum + self.fill_value * nsparse
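
    # Worked example (illustrative, not part of the original module): with a
    # non-null fill value, the gaps contribute `fill_value * ngaps`:
    #
    #   >>> import pandas as pd
    #   >>> arr = pd.arrays.SparseArray([1, 0, 0, 2], fill_value=0)
    #   >>> arr.sum()   # (1 + 2) + 0 * 2
    #   3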

    def cumsum(self, axis=0, *args, **kwargs):
        """
        Cumulative sum of non-NA/null values.

        When performing the cumulative summation, any NA/null values are
        skipped. The resulting SparseArray will preserve the locations of
        NaN values, but the fill value will be `np.nan` regardless.

        Parameters
        ----------
        axis : int or None
            Axis over which to perform the cumulative summation. If None,
            perform cumulative summation over flattened array.

        Returns
        -------
        cumsum : SparseArray
        """
        nv.validate_cumsum(args, kwargs)

        if axis is not None and axis >= self.ndim:  # Mimic ndarray behaviour.
            raise ValueError(f"axis(={axis}) out of bounds")

        if not self._null_fill_value:
            return SparseArray(self.to_dense()).cumsum()

        return SparseArray(
            self.sp_values.cumsum(),
            sparse_index=self.sp_index,
            fill_value=self.fill_value,
        )

    def mean(self, axis=0, *args, **kwargs):
        """
        Mean of non-NA/null values

        Returns
        -------
        mean : float
        """
        nv.validate_mean(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        ct = len(valid_vals)

        if self._null_fill_value:
            return sp_sum / ct
        else:
            nsparse = self.sp_index.ngaps
            return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)
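
    # Worked example (illustrative, not part of the original module): the
    # gaps enter both the numerator and the denominator:
    #
    #   >>> import pandas as pd
    #   >>> arr = pd.arrays.SparseArray([1, 0, 0, 2], fill_value=0)
    #   >>> arr.mean()  # (3 + 0 * 2) / (2 + 2)
    #   0.75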

    def transpose(self, *axes):
        """
        Returns the SparseArray.
        """
        return self

    @property
    def T(self):
        """
        Returns the SparseArray.
        """
        return self

    # ------------------------------------------------------------------------
    # Ufuncs
    # ------------------------------------------------------------------------

    _HANDLED_TYPES = (np.ndarray, numbers.Number)

    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
        out = kwargs.get("out", ())

        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
                return NotImplemented

        # for binary ops, use our custom dunder methods
        result = ops.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        if len(inputs) == 1:
            # No alignment necessary.
            sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
            fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)

            if isinstance(sp_values, tuple):
                # multiple outputs. e.g. modf
                arrays = tuple(
                    self._simple_new(
                        sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
                    )
                    for sp_value, fv in zip(sp_values, fill_value)
                )
                return arrays
            elif is_scalar(sp_values):
                # e.g. reductions
                return sp_values

            return self._simple_new(
                sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
            )

        result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs], **kwargs)
        if out:
            if len(out) == 1:
                out = out[0]
            return out

        if type(result) is tuple:
            return tuple(type(self)(x) for x in result)
        elif method == "at":
            # no return value
            return None
        else:
            return type(self)(result)
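
    # Illustrative sketch (not part of the original module): a unary ufunc is
    # applied separately to sp_values and to the fill value, so sparsity is
    # preserved without densifying:
    #
    #   >>> import numpy as np
    #   >>> import pandas as pd
    #   >>> np.asarray(np.abs(pd.arrays.SparseArray([-1, 0, 1])))
    #   array([1, 0, 1])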

    def __abs__(self):
        return np.abs(self)

    # ------------------------------------------------------------------------
    # Ops
    # ------------------------------------------------------------------------

    @classmethod
    def _create_unary_method(cls, op) -> Callable[["SparseArray"], "SparseArray"]:
        def sparse_unary_method(self) -> "SparseArray":
            fill_value = op(np.array(self.fill_value)).item()
            values = op(self.sp_values)
            dtype = SparseDtype(values.dtype, fill_value)
            return cls._simple_new(values, self.sp_index, dtype)

        name = f"__{op.__name__}__"
        return compat.set_function_name(sparse_unary_method, name, cls)

    @classmethod
    def _create_arithmetic_method(cls, op):
        op_name = op.__name__

        @unpack_zerodim_and_defer(op_name)
        def sparse_arithmetic_method(self, other):

            if isinstance(other, SparseArray):
                return _sparse_array_op(self, other, op, op_name)

            elif is_scalar(other):
                with np.errstate(all="ignore"):
                    fill = op(_get_fill(self), np.asarray(other))
                    result = op(self.sp_values, other)

                if op_name == "divmod":
                    left, right = result
                    lfill, rfill = fill
                    return (
                        _wrap_result(op_name, left, self.sp_index, lfill),
                        _wrap_result(op_name, right, self.sp_index, rfill),
                    )

                return _wrap_result(op_name, result, self.sp_index, fill)

            else:
                other = np.asarray(other)
                with np.errstate(all="ignore"):
                    # TODO: look into _wrap_result
                    if len(self) != len(other):
                        raise AssertionError(
                            f"length mismatch: {len(self)} vs. {len(other)}"
                        )
                    if not isinstance(other, SparseArray):
                        dtype = getattr(other, "dtype", None)
                        other = SparseArray(
                            other, fill_value=self.fill_value, dtype=dtype
                        )
                    return _sparse_array_op(self, other, op, op_name)

        name = f"__{op.__name__}__"
        return compat.set_function_name(sparse_arithmetic_method, name, cls)

    @classmethod
    def _create_comparison_method(cls, op):
        op_name = op.__name__
        if op_name in {"and_", "or_"}:
            op_name = op_name[:-1]

        @unpack_zerodim_and_defer(op_name)
        def cmp_method(self, other):

            if not is_scalar(other) and not isinstance(other, type(self)):
                # convert list-like to ndarray
                other = np.asarray(other)

            if isinstance(other, np.ndarray):
                # TODO: make this more flexible than just ndarray...
                if len(self) != len(other):
                    raise AssertionError(
                        f"length mismatch: {len(self)} vs. {len(other)}"
                    )
                other = SparseArray(other, fill_value=self.fill_value)

            if isinstance(other, SparseArray):
                return _sparse_array_op(self, other, op, op_name)
            else:
                with np.errstate(all="ignore"):
                    fill_value = op(self.fill_value, other)
                    result = op(self.sp_values, other)

                return type(self)(
                    result,
                    sparse_index=self.sp_index,
                    fill_value=fill_value,
                    dtype=np.bool_,
                )

        name = f"__{op.__name__}__"
        return compat.set_function_name(cmp_method, name, cls)

    @classmethod
    def _add_unary_ops(cls):
        cls.__pos__ = cls._create_unary_method(operator.pos)
        cls.__neg__ = cls._create_unary_method(operator.neg)
        cls.__invert__ = cls._create_unary_method(operator.invert)

    @classmethod
    def _add_comparison_ops(cls):
        cls.__and__ = cls._create_comparison_method(operator.and_)
        cls.__or__ = cls._create_comparison_method(operator.or_)
        cls.__xor__ = cls._create_arithmetic_method(operator.xor)
        super()._add_comparison_ops()

    # ----------
    # Formatting
    # ----------
    def __repr__(self) -> str:
        pp_str = printing.pprint_thing(self)
        pp_fill = printing.pprint_thing(self.fill_value)
        pp_index = printing.pprint_thing(self.sp_index)
        return f"{pp_str}\nFill: {pp_fill}\n{pp_index}"

    def _formatter(self, boxed=False):
        # Defer to the formatter from the GenericArrayFormatter calling us.
        # This will infer the correct formatter from the dtype of the values.
        return None

SparseArray._add_arithmetic_ops()
SparseArray._add_comparison_ops()
SparseArray._add_unary_ops()

def make_sparse(arr, kind="block", fill_value=None, dtype=None, copy=False):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value
    dtype : np.dtype, optional
    copy : bool, default False

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """

    arr = com.values_from_object(arr)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    if isna(fill_value):
        mask = notna(arr)
    else:
        # cast to object comparison to be safe
        if is_string_dtype(arr):
            arr = arr.astype(object)

        if is_object_dtype(arr.dtype):
            # numpy's element-wise equality check doesn't distinguish
            # element types: e.g. 0, 0.0, and False are all treated as
            # equal, so we have to check both the type and the value.
            mask = splib.make_mask_object_ndarray(arr, fill_value)
        else:
            mask = arr != fill_value

    length = len(arr)
    if length != len(mask):
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = _make_index(length, indices, kind)
    sparsified_values = arr[mask]
    if dtype is not None:
        sparsified_values = astype_nansafe(sparsified_values, dtype=dtype)
    # TODO: copy
    return sparsified_values, index, fill_value
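
# Illustrative sketch (not part of the original module): `make_sparse` splits
# a dense ndarray into the stored values, a sparse index, and the fill value:
#
#   >>> import numpy as np
#   >>> values, index, fill = make_sparse(
#   ...     np.array([0, 0, 1, 2]), kind="integer", fill_value=0
#   ... )
#   >>> values
#   array([1, 2])
#   >>> index
#   IntIndex
#   Indices: array([2, 3], dtype=int32)
#   >>> fill
#   0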

def _make_index(length, indices, kind):

    if kind == "block" or isinstance(kind, BlockIndex):
        locs, lens = splib.get_blocks(indices)
        index = BlockIndex(length, locs, lens)
    elif kind == "integer" or isinstance(kind, IntIndex):
        index = IntIndex(length, indices)
    else:  # pragma: no cover
        raise ValueError("must be block or integer type")
    return index
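
# Illustrative sketch (not part of the original module): kind="block" groups
# consecutive indices into (location, length) runs, which the loop in
# `_concat_same_type` reads back via `idx.blocs` / `idx.blengths`:
#
#   >>> import numpy as np
#   >>> idx = _make_index(10, np.array([0, 1, 5, 6], dtype=np.int32), "block")
#   >>> idx.blocs
#   array([0, 5], dtype=int32)
#   >>> idx.blengths
#   array([2, 2], dtype=int32)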