Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/arrays/string_.py : 35%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import operator
2from typing import Type
4import numpy as np
6from pandas._libs import lib, missing as libmissing
8from pandas.core.dtypes.base import ExtensionDtype
9from pandas.core.dtypes.common import pandas_dtype
10from pandas.core.dtypes.dtypes import register_extension_dtype
11from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
12from pandas.core.dtypes.inference import is_array_like
14from pandas import compat
15from pandas.core import ops
16from pandas.core.arrays import PandasArray
17from pandas.core.construction import extract_array
18from pandas.core.indexers import check_array_indexer
19from pandas.core.missing import isna
@register_extension_dtype
class StringDtype(ExtensionDtype):
    """
    Extension dtype for string data.

    .. versionadded:: 1.0.0

    .. warning::

       StringDtype is considered experimental. The implementation and
       parts of the API may change without warning.

       In particular, StringDtype.na_value may change to no longer be
       ``pandas.NA``.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.StringDtype()
    StringDtype
    """

    name = "string"

    #: StringDtype.na_value uses pandas.NA
    na_value = libmissing.NA

    @property
    def type(self) -> Type:
        """Return the scalar type of the elements, which is ``str``."""
        return str

    @classmethod
    def construct_array_type(cls) -> "Type[StringArray]":
        """Return the array class associated with this dtype."""
        return StringArray

    def __repr__(self) -> str:
        return "StringDtype"

    def __from_arrow__(self, array):
        """
        Construct StringArray from passed pyarrow Array/ChunkedArray.

        Parameters
        ----------
        array : pyarrow.Array or pyarrow.ChunkedArray
            A plain ``Array`` is treated as a single chunk; anything else is
            expected to expose its pieces through a ``chunks`` attribute.

        Returns
        -------
        StringArray
            The chunks converted one by one and concatenated. An input with
            no chunks at all yields an empty StringArray.
        """
        import pyarrow

        if isinstance(array, pyarrow.Array):
            chunks = [array]
        else:
            # pyarrow.ChunkedArray
            chunks = array.chunks

        # using _from_sequence to ensure None is converted to NA
        results = [StringArray._from_sequence(np.array(arr)) for arr in chunks]

        if not results:
            # A ChunkedArray may hold zero chunks. There is then nothing to
            # concatenate (concatenating an empty list is presumably an
            # error), so build the empty result directly.
            return StringArray(np.array([], dtype="object"))
        return StringArray._concat_same_type(results)
class StringArray(PandasArray):
    """
    Extension array for string data.

    .. versionadded:: 1.0.0

    .. warning::

       StringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : array-like
        The array of data.

        .. warning::

           Currently, this expects an object-dtype ndarray
           where the elements are Python strings or :attr:`pandas.NA`.
           This may change without warning in the future. Use
           :meth:`pandas.array` with ``dtype="string"`` for a stable way of
           creating a `StringArray` from any sequence.

    copy : bool, default False
        Whether to copy the array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    array
        The recommended function for creating a StringArray.
    Series.str
        The string methods are available on Series backed by
        a StringArray.

    Notes
    -----
    StringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
    <StringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string

    Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string
    values.

    >>> pd.array(['1', 1], dtype="string")
    Traceback (most recent call last):
    ...
    ValueError: StringArray requires a sequence of strings or pandas.NA

    For comparison methods, this returns a :class:`pandas.BooleanArray`

    >>> pd.array(["a", None, "c"], dtype="string") == "a"
    <BooleanArray>
    [True, <NA>, False]
    Length: 3, dtype: boolean
    """

    # undo the PandasArray hack
    _typ = "extension"

    def __init__(self, values, copy=False):
        values = extract_array(values)
        # Values that already are an instance of this class are not
        # validated a second time.
        skip_validation = isinstance(values, type(self))

        super().__init__(values, copy=copy)
        self._dtype = StringDtype()
        if not skip_validation:
            self._validate()

    def _validate(self):
        """
        Validate that we only store NA or strings.

        Raises
        ------
        ValueError
            If the non-empty data contains anything other than strings or
            missing values, or if the underlying ndarray is not object dtype.
        """
        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
        if self._ndarray.dtype != "object":
            raise ValueError(
                "StringArray requires a sequence of strings or pandas.NA. Got "
                f"'{self._ndarray.dtype}' dtype instead."
            )

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """
        Build a StringArray from a sequence of scalars.

        The input is converted to an object-dtype ndarray and every
        missing-like value is replaced by ``StringDtype.na_value``. The
        caller's array is never modified in place.
        """
        if dtype:
            assert dtype == "string"

        result = np.asarray(scalars, dtype="object")
        if copy and result is scalars:
            result = result.copy()

        # Standardize all missing-like values to NA
        # TODO: it would be nice to do this in _validate / lib.is_string_array
        # We are already doing a scan over the values there.
        na_values = isna(result)
        if na_values.any():
            if result is scalars:
                # force a copy now, if we haven't already
                result = result.copy()
            result[na_values] = StringDtype.na_value

        return cls(result)

    @classmethod
    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
        """Build a StringArray from strings; identical to ``_from_sequence``."""
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.

        Parameters
        ----------
        type : pyarrow.DataType, optional
            Target arrow type; ``pyarrow.string()`` is used when omitted.
        """
        import pyarrow as pa

        if type is None:
            type = pa.string()

        # Work on a copy so the stored values keep their NA markers.
        values = self._ndarray.copy()
        values[self.isna()] = None
        return pa.array(values, type=type, from_pandas=True)

    def _values_for_factorize(self):
        """Return a copy of the data with missing entries set to -1, and -1."""
        arr = self._ndarray.copy()
        mask = self.isna()
        arr[mask] = -1
        return arr, -1

    def __setitem__(self, key, value):
        """
        Set one or more elements after checking the new values.

        Raises
        ------
        ValueError
            If a sequence is assigned to a scalar key, or if the value (or
            any element of an array-like value) is neither a string nor
            missing.
        """
        value = extract_array(value, extract_numpy=True)
        if isinstance(value, type(self)):
            # extract_array doesn't extract PandasArray subclasses
            value = value._ndarray

        key = check_array_indexer(self, key)
        scalar_key = lib.is_scalar(key)
        scalar_value = lib.is_scalar(value)
        if scalar_key and not scalar_value:
            raise ValueError("setting an array element with a sequence.")

        # validate new items
        if scalar_value:
            if isna(value):
                value = StringDtype.na_value
            elif not isinstance(value, str):
                raise ValueError(
                    f"Cannot set non-string value '{value}' into a StringArray."
                )
        else:
            if not is_array_like(value):
                value = np.asarray(value, dtype=object)
            if len(value) and not lib.is_string_array(value, skipna=True):
                raise ValueError("Must provide strings.")

        super().__setitem__(key, value)

    def fillna(self, value=None, method=None, limit=None):
        """Fill missing values by delegating to the parent implementation."""
        # TODO: validate dtype
        return super().fillna(value, method, limit)

    def astype(self, dtype, copy=True):
        """
        Cast to ``dtype``.

        A cast to StringDtype returns a copy when ``copy`` is true and
        ``self`` otherwise; every other dtype is handled by the parent class.
        """
        dtype = pandas_dtype(dtype)
        if isinstance(dtype, StringDtype):
            if copy:
                return self.copy()
            return self
        return super().astype(dtype, copy)

    def _reduce(self, name, skipna=True, **kwargs):
        """Reject every reduction: always raises TypeError."""
        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

    def value_counts(self, dropna=False):
        """Count the values of the underlying ndarray, cast to ``Int64``."""
        from pandas import value_counts

        return value_counts(self._ndarray, dropna=dropna).astype("Int64")

    # Override parent because we have different return types.
    @classmethod
    def _create_arithmetic_method(cls, op):
        """
        Build the dunder method implementing ``op`` for this class.

        Arithmetic operators produce a StringArray and comparison operators
        a BooleanArray. Positions where either operand is missing are
        excluded from the computation and come out as missing.
        """

        # Note: this handles both arithmetic and comparison methods.
        def method(self, other):
            from pandas.arrays import BooleanArray

            assert op.__name__ in ops.ARITHMETIC_BINOPS | ops.COMPARISON_BINOPS

            if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)):
                return NotImplemented

            elif isinstance(other, cls):
                other = other._ndarray

            other_is_scalar = lib.is_scalar(other)
            # Check lengths before the masks are combined below; a mismatch
            # would otherwise fail inside the ``|`` expression with a less
            # helpful error, or broadcast silently.
            if not other_is_scalar and len(other) != len(self):
                # prevent improper broadcasting when other is 2D
                raise ValueError(
                    f"Lengths of operands do not match: {len(self)} != {len(other)}"
                )

            mask = isna(self) | isna(other)
            valid = ~mask

            if not other_is_scalar:
                other = np.asarray(other)
                other = other[valid]

            if op.__name__ in ops.ARITHMETIC_BINOPS:
                result = np.empty_like(self._ndarray, dtype="object")
                result[mask] = StringDtype.na_value
                result[valid] = op(self._ndarray[valid], other)
                return StringArray(result)
            else:
                # logical
                result = np.zeros(len(self._ndarray), dtype="bool")
                result[valid] = op(self._ndarray[valid], other)
                return BooleanArray(result, mask)

        return compat.set_function_name(method, f"__{op.__name__}__", cls)

    @classmethod
    def _add_arithmetic_ops(cls):
        """Install ``__add__``, ``__radd__``, ``__mul__`` and ``__rmul__``."""
        cls.__add__ = cls._create_arithmetic_method(operator.add)
        cls.__radd__ = cls._create_arithmetic_method(ops.radd)

        cls.__mul__ = cls._create_arithmetic_method(operator.mul)
        cls.__rmul__ = cls._create_arithmetic_method(ops.rmul)

    # Comparisons share the arithmetic factory; it branches on the operator.
    _create_comparison_method = _create_arithmetic_method
# Attach the generated operator methods to StringArray. This has to run after
# the class body, because the factory classmethods only exist once the class
# is fully defined. _add_comparison_ops is not defined in this file; it is
# presumably inherited from a parent class.
StringArray._add_arithmetic_ops()
StringArray._add_comparison_ops()