Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2Constructor functions intended to be shared by pd.array, Series.__init__, 

3and Index.__new__. 

4 

5These should not depend on core.internals. 

6""" 

7from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast 

8 

9import numpy as np 

10import numpy.ma as ma 

11 

12from pandas._libs import lib 

13from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime 

14from pandas._typing import ArrayLike, Dtype 

15 

16from pandas.core.dtypes.cast import ( 

17 construct_1d_arraylike_from_scalar, 

18 construct_1d_ndarray_preserving_na, 

19 construct_1d_object_array_from_listlike, 

20 infer_dtype_from_scalar, 

21 maybe_cast_to_datetime, 

22 maybe_cast_to_integer_array, 

23 maybe_castable, 

24 maybe_convert_platform, 

25 maybe_upcast, 

26) 

27from pandas.core.dtypes.common import ( 

28 is_categorical_dtype, 

29 is_datetime64_ns_dtype, 

30 is_extension_array_dtype, 

31 is_float_dtype, 

32 is_integer_dtype, 

33 is_iterator, 

34 is_list_like, 

35 is_object_dtype, 

36 is_timedelta64_ns_dtype, 

37 pandas_dtype, 

38) 

39from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype, registry 

40from pandas.core.dtypes.generic import ( 

41 ABCExtensionArray, 

42 ABCIndexClass, 

43 ABCPandasArray, 

44 ABCSeries, 

45) 

46from pandas.core.dtypes.missing import isna 

47 

48import pandas.core.common as com 

49 

50if TYPE_CHECKING: 

51 from pandas.core.series import Series # noqa: F401 

52 from pandas.core.indexes.api import Index # noqa: F401 

53 

54 

def array(
    data: Sequence[object],
    dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None,
    copy: bool = True,
) -> ABCExtensionArray:
    """
    Create an array.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    data : Sequence of objects
        The scalars inside `data` should be instances of the
        scalar type for `dtype`. It's expected that `data`
        represents a 1-dimensional array of data.

        When `data` is an Index or Series, the underlying array
        will be extracted from `data`.

    dtype : str, np.dtype, or ExtensionDtype, optional
        The dtype to use for the array. This may be a NumPy
        dtype or an extension type registered with pandas using
        :meth:`pandas.api.extensions.register_extension_dtype`.

        If not specified, there are two possibilities:

        1. When `data` is a :class:`Series`, :class:`Index`, or
           :class:`ExtensionArray`, the `dtype` will be taken
           from the data.
        2. Otherwise, pandas will attempt to infer the `dtype`
           from the data.

        Note that when `data` is a NumPy array, ``data.dtype`` is
        *not* used for inferring the array type. This is because
        NumPy cannot represent all the types of data that can be
        held in extension arrays.

        Currently, pandas will infer an extension dtype for sequences of

        ============================== =====================================
        Scalar Type                    Array Type
        ============================== =====================================
        :class:`pandas.Interval`       :class:`pandas.arrays.IntervalArray`
        :class:`pandas.Period`         :class:`pandas.arrays.PeriodArray`
        :class:`datetime.datetime`     :class:`pandas.arrays.DatetimeArray`
        :class:`datetime.timedelta`    :class:`pandas.arrays.TimedeltaArray`
        :class:`int`                   :class:`pandas.arrays.IntegerArray`
        :class:`str`                   :class:`pandas.arrays.StringArray`
        :class:`bool`                  :class:`pandas.arrays.BooleanArray`
        ============================== =====================================

        For all other cases, NumPy's usual inference rules will be used.

        .. versionchanged:: 1.0.0

           Pandas infers nullable-integer dtype for integer data,
           string dtype for string data, and nullable-boolean dtype
           for boolean data.

    copy : bool, default True
        Whether to copy the data, even if not necessary. Depending
        on the type of `data`, creating the new array may require
        copying data, even if ``copy=False``.

    Returns
    -------
    ExtensionArray
        The newly created array.

    Raises
    ------
    ValueError
        When `data` is not 1-dimensional.

    See Also
    --------
    numpy.array : Construct a NumPy array.
    Series : Construct a pandas Series.
    Index : Construct a pandas Index.
    arrays.PandasArray : ExtensionArray wrapping a NumPy array.
    Series.array : Extract the array stored within a Series.

    Notes
    -----
    Omitting the `dtype` argument means pandas will attempt to infer the
    best array type from the values in the data. As new array types are
    added by pandas and 3rd party libraries, the "best" array type may
    change. We recommend specifying `dtype` to ensure that

    1. the correct array type for the data is returned
    2. the returned array type doesn't change as new extension types
       are added by pandas and third-party libraries

    Additionally, if the underlying memory representation of the returned
    array matters, we recommend specifying the `dtype` as a concrete object
    rather than a string alias or allowing it to be inferred. For example,
    a future version of pandas or a 3rd-party library may include a
    dedicated ExtensionArray for string data. In this event, the following
    would no longer return a :class:`arrays.PandasArray` backed by a NumPy
    array.

    >>> pd.array(['a', 'b'], dtype=str)
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    This would instead return the new ExtensionArray dedicated for string
    data. If you really need the new array to be backed by a NumPy array,
    specify that in the dtype.

    >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    Finally, Pandas has arrays that mostly overlap with NumPy

      * :class:`arrays.DatetimeArray`
      * :class:`arrays.TimedeltaArray`

    When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is
    passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray``
    rather than a ``PandasArray``. This is for symmetry with the case of
    timezone-aware data, which NumPy does not natively support.

    >>> pd.array(['2015', '2016'], dtype='datetime64[ns]')
    <DatetimeArray>
    ['2015-01-01 00:00:00', '2016-01-01 00:00:00']
    Length: 2, dtype: datetime64[ns]

    >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]')
    <TimedeltaArray>
    ['01:00:00', '02:00:00']
    Length: 2, dtype: timedelta64[ns]

    Examples
    --------
    If a dtype is not specified, pandas will infer the best dtype from the values.
    See the description of `dtype` for the types pandas infers for.

    >>> pd.array([1, 2])
    <IntegerArray>
    [1, 2]
    Length: 2, dtype: Int64

    >>> pd.array([1, 2, np.nan])
    <IntegerArray>
    [1, 2, NaN]
    Length: 3, dtype: Int64

    >>> pd.array(["a", None, "c"])
    <StringArray>
    ['a', nan, 'c']
    Length: 3, dtype: string

    >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
    <PeriodArray>
    ['2000-01-01', '2000-01-01']
    Length: 2, dtype: period[D]

    You can use the string alias for `dtype`

    >>> pd.array(['a', 'b', 'a'], dtype='category')
    [a, b, a]
    Categories (2, object): [a, b]

    Or specify the actual dtype

    >>> pd.array(['a', 'b', 'a'],
    ...          dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
    [a, b, a]
    Categories (3, object): [a < b < c]

    If pandas does not infer a dedicated extension type a
    :class:`arrays.PandasArray` is returned.

    >>> pd.array([1.1, 2.2])
    <PandasArray>
    [1.1, 2.2]
    Length: 2, dtype: float64

    As mentioned in the "Notes" section, new extension types may be added
    in the future (by pandas or 3rd party libraries), causing the return
    value to no longer be a :class:`arrays.PandasArray`. Specify the `dtype`
    as a NumPy dtype if you need to ensure there's no future change in
    behavior.

    >>> pd.array([1, 2], dtype=np.dtype("int32"))
    <PandasArray>
    [1, 2]
    Length: 2, dtype: int32

    `data` must be 1-dimensional. A ValueError is raised when the input
    has the wrong dimensionality.

    >>> pd.array(1)
    Traceback (most recent call last):
      ...
    ValueError: Cannot pass scalar '1' to 'pandas.array'.
    """
    # Imported at call time rather than at module import time.
    from pandas.core.arrays import (
        period_array,
        BooleanArray,
        IntegerArray,
        IntervalArray,
        PandasArray,
        DatetimeArray,
        TimedeltaArray,
        StringArray,
    )

    if lib.is_scalar(data):
        raise ValueError(f"Cannot pass scalar '{data}' to 'pandas.array'.")

    # Series / Index / ExtensionArray inputs supply the dtype when none is given.
    dtyped_containers = (ABCSeries, ABCIndexClass, ABCExtensionArray)
    if dtype is None and isinstance(data, dtyped_containers):
        dtype = data.dtype

    data = extract_array(data, extract_numpy=True)

    if isinstance(dtype, str):
        # registry.find returns None for an unknown alias; keep the string then.
        dtype = registry.find(dtype) or dtype

    if is_extension_array_dtype(dtype):
        array_cls = cast(ExtensionDtype, dtype).construct_array_type()
        return array_cls._from_sequence(data, dtype=dtype, copy=copy)

    if dtype is None:
        kind = lib.infer_dtype(data, skipna=True)

        # Inferred kinds that map straight onto a single array type.
        direct = {
            "string": StringArray,
            "integer": IntegerArray,
            "boolean": BooleanArray,
        }
        if kind in direct:
            return direct[kind]._from_sequence(data, copy=copy)

        if kind.startswith("timedelta"):
            # timedelta, timedelta64
            return TimedeltaArray._from_sequence(data, copy=copy)

        # The remaining kinds may fail to build a dedicated array; in that
        # case we deliberately fall through to the PandasArray result below
        # instead of raising.
        if kind == "period":
            try:
                return period_array(data, copy=copy)
            except IncompatibleFrequency:
                # We may have a mixture of frequencies.
                pass
        if kind == "interval":
            try:
                return IntervalArray(data, copy=copy)
            except ValueError:
                # We may have a mixture of `closed` here.
                pass
        if kind.startswith("datetime"):
            # datetime, datetime64
            try:
                return DatetimeArray._from_sequence(data, copy=copy)
            except ValueError:
                # Mixture of timezones, fall back to PandasArray
                pass

    # Pandas overrides NumPy for datetime64[ns] and timedelta64[ns] so that a
    # DatetimeArray / TimedeltaArray is returned instead of a PandasArray.
    if is_datetime64_ns_dtype(dtype):
        return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)
    if is_timedelta64_ns_dtype(dtype):
        return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)

    return PandasArray._from_sequence(data, dtype=dtype, copy=copy)

335 

336 

def extract_array(obj, extract_numpy=False):
    """
    Unbox the array held by a Series or Index.

    Anything that is not a Series or Index is handed back unchanged, except
    that a PandasArray is converted to its ndarray when ``extract_numpy``
    is true.

    Parameters
    ----------
    obj : object
        For Series / Index, the underlying ExtensionArray is unboxed.
        For Numpy-backed ExtensionArrays, the ndarray is extracted.

    extract_numpy : bool, default False
        Whether to extract the ndarray from a PandasArray

    Returns
    -------
    arr : object

    Examples
    --------
    >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category'))
    [a, b, c]
    Categories (3, object): [a, b, c]

    Other objects like lists, arrays, and DataFrames are just passed through.

    >>> extract_array([1, 2, 3])
    [1, 2, 3]

    For an ndarray-backed Series / Index a PandasArray is returned.

    >>> extract_array(pd.Series([1, 2, 3]))
    <PandasArray>
    [1, 2, 3]
    Length: 3, dtype: int64

    To extract all the way down to the ndarray, pass ``extract_numpy=True``.

    >>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True)
    array([1, 2, 3])
    """
    # Step 1: Series / Index -> the array they wrap; everything else as is.
    unboxed = obj.array if isinstance(obj, (ABCIndexClass, ABCSeries)) else obj

    # Step 2: optionally go one level further, PandasArray -> ndarray.
    if not extract_numpy:
        return unboxed
    if isinstance(unboxed, ABCPandasArray):
        return unboxed.to_numpy()
    return unboxed

386 

387 

def sanitize_array(
    data, index, dtype=None, copy: bool = False, raise_cast_failure: bool = False
):
    """
    Sanitize input data to an ndarray, copy if specified, coerce to the
    dtype if specified.

    Parameters
    ----------
    data : object
        Input to sanitize. Masked arrays, ndarrays, ExtensionArrays,
        Series / Index (unboxed via ``extract_array``), non-empty lists and
        tuples, ``range`` objects and scalars each take a dedicated path;
        anything else is handed to ``_try_cast``.
    index : object or None
        Only ``len(index)`` is used, to broadcast a scalar or a 1-element
        result to the index length. May be None.
    dtype : optional
        Target dtype; normalized with ``pandas_dtype`` when given.
    copy : bool, default False
        Whether to copy the data.
    raise_cast_failure : bool, default False
        Forwarded to ``_try_cast``.

    Returns
    -------
    ndarray or ExtensionArray
        For scalar-like results with ``index=None`` the value of
        ``subarr.item()`` is returned instead.

    Raises
    ------
    ValueError
        If `data` is an ndarray and the result has more than one dimension.
    """
    if dtype is not None:
        dtype = pandas_dtype(dtype)

    if isinstance(data, ma.MaskedArray):
        mask = ma.getmaskarray(data)
        if mask.any():
            # Masked entries are replaced by a fill value after upcasting.
            data, fill_value = maybe_upcast(data, copy=True)
            data.soften_mask()  # set hardmask False if it was True
            data[mask] = fill_value
        else:
            data = data.copy()

    # extract ndarray or ExtensionArray, ensure we have no PandasArray
    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype):
            # possibility of nan -> garbage
            try:
                subarr = _try_cast(data, dtype, copy, True)
            except ValueError:
                # The float -> int cast failed: keep the float data as is.
                if copy:
                    subarr = data.copy()
                else:
                    subarr = np.array(data, copy=False)
        else:
            # we will try to copy be-definition here
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    elif isinstance(data, ABCExtensionArray):
        # it is already ensured above this is not a PandasArray
        subarr = data

        if dtype is not None:
            subarr = subarr.astype(dtype, copy=copy)
        elif copy:
            subarr = subarr.copy()
        # ExtensionArrays skip all of the ndarray post-processing below.
        return subarr

    elif isinstance(data, (list, tuple)) and len(data) > 0:
        if dtype is not None:
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)
        else:
            subarr = maybe_convert_platform(data)

        subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        arr = np.arange(data.start, data.stop, data.step, dtype="int64")
        subarr = _try_cast(arr, dtype, copy, raise_cast_failure)
    else:
        subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    # scalar like, GH
    if getattr(subarr, "ndim", 0) == 0:
        if isinstance(data, list):  # pragma: no cover
            subarr = np.array(data, dtype=object)
        elif index is not None:
            value = data

            # figure out the dtype from the value (upcast if necessary)
            if dtype is None:
                dtype, value = infer_dtype_from_scalar(value)
            else:
                # need to possibly convert the value here
                value = maybe_cast_to_datetime(value, dtype)

            # Broadcast the scalar to the length of the index.
            subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype)

        else:
            return subarr.item()

    # the result that we want
    elif subarr.ndim == 1:
        if index is not None:

            # a 1-element ndarray
            if len(subarr) != len(index) and len(subarr) == 1:
                subarr = construct_1d_arraylike_from_scalar(
                    subarr[0], len(index), subarr.dtype
                )

    elif subarr.ndim > 1:
        if isinstance(data, np.ndarray):
            # ValueError (rather than a bare Exception) identifies this as an
            # invalid-input error; it is still caught by ``except Exception``.
            raise ValueError("Data must be 1-dimensional")
        else:
            subarr = com.asarray_tuplesafe(data, dtype=dtype)

    if not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)):
        # This is to prevent mixed-type Series getting all casted to
        # NumPy string type, e.g. NaN --> '-1#IND'.
        if issubclass(subarr.dtype.type, str):
            # GH#16605
            # If not empty convert the data to dtype
            # GH#19853: If data is a scalar, subarr has already the result
            if not lib.is_scalar(data):
                if not np.all(isna(data)):
                    data = np.array(data, dtype=dtype, copy=False)
                subarr = np.array(data, dtype=object, copy=copy)

        if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype):
            # Object data that is really interval / period data is promoted
            # to the dedicated extension array.
            inferred = lib.infer_dtype(subarr, skipna=False)
            if inferred in {"interval", "period"}:
                subarr = array(subarr)

    return subarr

504 

505 

def _try_cast(
    arr,
    dtype: Optional[Union[np.dtype, "ExtensionDtype"]],
    copy: bool,
    raise_cast_failure: bool,
):
    """
    Convert input to numpy ndarray and optionally cast to a given dtype.

    Parameters
    ----------
    arr : ndarray, list, tuple, iterator (catchall)
        Excludes: ExtensionArray, Series, Index.
    dtype : np.dtype, ExtensionDtype or None
    copy : bool
        If False, don't copy the data if not needed.
    raise_cast_failure : bool
        If True, and if a dtype is specified, raise errors during casting.
        Otherwise an object array is returned.

    Returns
    -------
    ndarray or ExtensionArray
        An ExtensionArray is produced when the regular cast fails with
        ValueError / TypeError and `dtype` is a categorical or other
        extension dtype.

    Raises
    ------
    OutOfBoundsDatetime
        Always propagated, regardless of `raise_cast_failure`.
    ValueError, TypeError
        Re-raised when the cast fails, `dtype` is a non-extension dtype
        and `raise_cast_failure` is True.
    """
    # perf shortcut as this is the most common case
    if (
        isinstance(arr, np.ndarray)
        and maybe_castable(arr)
        and not copy
        and dtype is None
    ):
        return arr

    try:
        # GH#15832: Check if we are requesting a numeric dype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            # Validation only: the return value is intentionally discarded
            # (it used to be stored and immediately overwritten below); the
            # call is kept for the errors it may raise.
            maybe_cast_to_integer_array(arr, dtype)

        subarr = maybe_cast_to_datetime(arr, dtype)
        # Take care in creating object arrays (but iterators are not
        # supported):
        if is_object_dtype(dtype) and (
            is_list_like(subarr)
            and not (is_iterator(subarr) or isinstance(subarr, np.ndarray))
        ):
            subarr = construct_1d_object_array_from_listlike(subarr)
        elif not is_extension_array_dtype(subarr):
            subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy)
    except OutOfBoundsDatetime:
        # in case of out of bound datetime64 -> always raise
        # (listed before the generic handler so it is never swallowed there)
        raise
    except (ValueError, TypeError):
        if is_categorical_dtype(dtype):
            # We *do* allow casting to categorical, since we know
            # that Categorical is the only array type for 'category'.
            dtype = cast(CategoricalDtype, dtype)
            subarr = dtype.construct_array_type()(
                arr, dtype.categories, ordered=dtype.ordered
            )
        elif is_extension_array_dtype(dtype):
            # create an extension array from its dtype
            dtype = cast(ExtensionDtype, dtype)
            array_type = dtype.construct_array_type()._from_sequence
            subarr = array_type(arr, dtype=dtype, copy=copy)
        elif dtype is not None and raise_cast_failure:
            raise
        else:
            # Best-effort fallback: keep the values in an object array.
            subarr = np.array(arr, dtype=object, copy=copy)
    return subarr

568 

569 

def is_empty_data(data: Any) -> bool:
    """
    Tell whether a Series is being instantiated with empty data,
    i.e. data that carries no dtype information.

    Parameters
    ----------
    data : array-like, Iterable, dict, or scalar value
        Contains data stored in Series.

    Returns
    -------
    bool
        True when `data` is None, or when it is list-like, has no ``dtype``
        attribute and is falsy; False otherwise.
    """
    if data is None:
        return True
    # Scalars, and anything exposing a ``dtype``, are never "empty" here.
    if not is_list_like(data) or hasattr(data, "dtype"):
        return False
    return not data

588 

589 

def create_series_with_explicit_dtype(
    data: Any = None,
    index: Optional[Union[ArrayLike, "Index"]] = None,
    dtype: Optional[Dtype] = None,
    name: Optional[str] = None,
    copy: bool = False,
    fastpath: bool = False,
    dtype_if_empty: Dtype = object,
) -> "Series":
    """
    Build a Series, supplying an explicit dtype when the data is empty.

    This silences a DeprecationWarning described in GitHub-17261.

    Parameters
    ----------
    data : Mirrored from Series.__init__
    index : Mirrored from Series.__init__
    dtype : Mirrored from Series.__init__
    name : Mirrored from Series.__init__
    copy : Mirrored from Series.__init__
    fastpath : Mirrored from Series.__init__
    dtype_if_empty : str, numpy.dtype, or ExtensionDtype
        This dtype will be passed explicitly if an empty Series will
        be instantiated.

    Returns
    -------
    Series
    """
    from pandas.core.series import Series

    # Fall back to ``dtype_if_empty`` only when the caller gave no dtype and
    # the data itself carries none.
    resolved_dtype = (
        dtype_if_empty if (is_empty_data(data) and dtype is None) else dtype
    )
    return Series(
        data=data,
        index=index,
        dtype=resolved_dtype,
        name=name,
        copy=copy,
        fastpath=fastpath,
    )