Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1"""Sparse Dtype""" 

2 

3import re 

4from typing import Any, Tuple 

5 

6import numpy as np 

7 

8from pandas._typing import Dtype 

9 

10from pandas.core.dtypes.base import ExtensionDtype 

11from pandas.core.dtypes.cast import astype_nansafe 

12from pandas.core.dtypes.common import ( 

13 is_bool_dtype, 

14 is_object_dtype, 

15 is_scalar, 

16 is_string_dtype, 

17 pandas_dtype, 

18) 

19from pandas.core.dtypes.dtypes import register_extension_dtype 

20from pandas.core.dtypes.missing import isna, na_value_for_dtype 

21 

22 

@register_extension_dtype
class SparseDtype(ExtensionDtype):
    """
    Dtype for data stored in :class:`SparseArray`.

    This dtype implements the pandas ExtensionDtype interface.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
        The dtype of the underlying array storing the non-fill values.
    fill_value : scalar, optional
        The scalar value not stored in the SparseArray. By default, this
        depends on `dtype`.

        =========== ==========
        dtype       na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        ``False``
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The default value may be overridden by specifying a `fill_value`.

    Attributes
    ----------
    None

    Methods
    -------
    None
    """

    # We include `_is_na_fill_value` in the metadata to avoid hash collisions
    # between SparseDtype(float, 0.0) and SparseDtype(float, nan).
    # Without is_na_fill_value in the comparison, those would be equal since
    # hash(nan) is (sometimes?) 0.
    _metadata = ("_dtype", "_fill_value", "_is_na_fill_value")

    def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None):
        """
        Normalize ``dtype`` and ``fill_value`` and store them on the instance.

        Raises
        ------
        ValueError
            If the resolved ``fill_value`` is not a scalar.
        """
        # Passing an existing SparseDtype unwraps it: its subtype becomes the
        # dtype, and its fill value is reused unless one was given explicitly.
        if isinstance(dtype, type(self)):
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        dtype = pandas_dtype(dtype)
        # String-like dtypes are stored with an object subtype.
        if is_string_dtype(dtype):
            dtype = np.dtype("object")

        if fill_value is None:
            fill_value = na_value_for_dtype(dtype)

        if not is_scalar(fill_value):
            raise ValueError(f"fill_value must be a scalar. Got {fill_value} instead")
        self._dtype = dtype
        self._fill_value = fill_value

    def __hash__(self):
        # Python3 doesn't inherit __hash__ when a base class overrides
        # __eq__, so we explicitly do it here.
        return super().__hash__()

    def __eq__(self, other: Any) -> bool:
        """
        Compare with another SparseDtype or a string naming one.

        Two SparseDtypes are equal when their subtypes are equal and their
        fill values match. NA fill values match only when *both* are NA and
        one is an instance of the other's type.
        """
        # We have to override __eq__ to handle NA values in _metadata.
        # The base class does simple == checks, which fail for NA.
        if isinstance(other, str):
            try:
                other = self.construct_from_string(other)
            except TypeError:
                return False

        if not isinstance(other, type(self)):
            return False

        if not self.subtype == other.subtype:
            return False

        if self._is_na_fill_value:
            # this case is complicated by two things:
            # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
            # SparseDtype(float, np.nan)     != SparseDtype(float, pd.NaT)
            # i.e. we want to treat any floating-point NaN as equal, but
            # not a floating-point NaN and a datetime NaT.
            #
            # The isinstance checks are grouped explicitly: `and` binds
            # tighter than `or`, so without the parentheses a non-NA fill
            # value of a compatible type (e.g. 0.0 against nan) would
            # compare equal.
            fill_value = other._is_na_fill_value and (
                isinstance(self.fill_value, type(other.fill_value))
                or isinstance(other.fill_value, type(self.fill_value))
            )
        else:
            fill_value = self.fill_value == other.fill_value

        return bool(fill_value)

    @property
    def fill_value(self):
        """
        The fill value of the array.

        Converting the SparseArray to a dense ndarray will fill the
        array with this value.

        .. warning::

           It's possible to end up with a SparseArray that has ``fill_value``
           values in ``sp_values``. This can occur, for example, when setting
           ``SparseArray.fill_value`` directly.
        """
        return self._fill_value

    @property
    def _is_na_fill_value(self):
        """Whether ``fill_value`` is considered missing by ``isna``."""
        return isna(self.fill_value)

    @property
    def _is_numeric(self):
        """Whether the subtype is anything other than an object dtype."""
        return not is_object_dtype(self.subtype)

    @property
    def _is_boolean(self):
        """Whether the subtype is a boolean dtype."""
        return is_bool_dtype(self.subtype)

    @property
    def kind(self):
        """
        The ``kind`` attribute of the subtype.

        For a NumPy subtype this is its one-character kind code
        (e.g. ``'i'`` or ``'f'``).
        """
        return self.subtype.kind

    @property
    def type(self):
        """The ``type`` attribute of the subtype."""
        return self.subtype.type

    @property
    def subtype(self):
        """The dtype of the stored (non-fill) values."""
        return self._dtype

    @property
    def name(self):
        """String form of the dtype: ``Sparse[<subtype name>, <fill_value>]``."""
        return f"Sparse[{self.subtype.name}, {self.fill_value}]"

    def __repr__(self) -> str:
        return self.name

    @classmethod
    def construct_array_type(cls):
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        # Imported at call time rather than at module import.
        from pandas.core.arrays.sparse.array import SparseArray

        return SparseArray

    @classmethod
    def construct_from_string(cls, string):
        """
        Construct a SparseDtype from a string form.

        Parameters
        ----------
        string : str
            Must start with ``'Sparse'``. Can take the following forms.

            ======================== ==============================
            string                   dtype
            ======================== ==============================
            'Sparse'                 SparseDtype with float64, nan
            'Sparse[int]'            SparseDtype with int, 0
            'Sparse[float64, nan]'   SparseDtype with float64, nan
            ======================== ==============================

            It is not possible to specify non-default fill values
            with a string. When a fill value is present, the whole
            string must equal ``str()`` of the dtype built from the
            subtype alone; otherwise a ``TypeError`` is raised
            (e.g. ``'Sparse[int, 1]'``).

        Returns
        -------
        SparseDtype

        Raises
        ------
        TypeError
            If `string` is not a ``str``, does not start with ``'Sparse'``,
            cannot be parsed, or carries a non-default fill value.
        """
        # Reject non-strings up front so callers get the TypeError they
        # expect from this constructor instead of an AttributeError.
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )

        msg = f"Cannot construct a 'SparseDtype' from '{string}'"
        if not string.startswith("Sparse"):
            raise TypeError(msg)

        try:
            sub_type, has_fill_value = cls._parse_subtype(string)
        except ValueError as err:
            raise TypeError(msg) from err

        result = SparseDtype(sub_type)
        msg = (
            f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt "
            "looks like the fill_value in the string is not "
            "the default for the dtype. Non-default fill_values "
            "are not supported. Use the 'SparseDtype()' "
            "constructor instead."
        )
        # A fill value is only accepted when the string round-trips exactly.
        if has_fill_value and str(result) != string:
            raise TypeError(msg)
        return result

    @staticmethod
    def _parse_subtype(dtype: str) -> Tuple[str, bool]:
        """
        Parse a string to get the subtype

        Parameters
        ----------
        dtype : str
            A string like

            * Sparse[subtype]
            * Sparse[subtype, fill_value]

        Returns
        -------
        subtype : str
            The text of the subtype; ``"float64"`` for the bare
            string ``"Sparse"``.
        has_fill_value : bool
            Whether a non-empty fill value was present in the string.

        Raises
        ------
        ValueError
            When the subtype cannot be extracted.
        """
        xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$")
        m = xpr.match(dtype)
        has_fill_value = False
        if m:
            subtype = m.groupdict()["subtype"]
            has_fill_value = bool(m.groupdict()["fill_value"])
        elif dtype == "Sparse":
            subtype = "float64"
        else:
            raise ValueError(f"Cannot parse {dtype}")
        return subtype, has_fill_value

    @classmethod
    def is_dtype(cls, dtype):
        """
        Check whether `dtype` is treated as a sparse dtype.

        `dtype` is first replaced by its own ``dtype`` attribute when it
        has one. Instances of this class and strings starting with
        ``"Sparse"`` whose subtype NumPy understands give True.
        """
        # NOTE(review): the final check returns True for any np.dtype
        # instance, and an unparsable "Sparse..." string lets the
        # ValueError / TypeError propagate. Confirm callers rely on this
        # before tightening it.
        dtype = getattr(dtype, "dtype", dtype)
        if isinstance(dtype, str) and dtype.startswith("Sparse"):
            sub_type, _ = cls._parse_subtype(dtype)
            dtype = np.dtype(sub_type)
        elif isinstance(dtype, cls):
            return True
        return isinstance(dtype, np.dtype) or dtype == "Sparse"

    def update_dtype(self, dtype):
        """
        Convert the SparseDtype to a new dtype.

        This takes care of converting the ``fill_value``.

        Parameters
        ----------
        dtype : Union[str, numpy.dtype, SparseDtype]
            The new dtype to use.

            * For a SparseDtype, it is simply returned
            * For a NumPy dtype (or str), the current fill value
              is converted to the new dtype, and a SparseDtype
              with `dtype` and the new fill value is returned.

        Returns
        -------
        SparseDtype
            A new SparseDtype with the correct `dtype` and fill value
            for that `dtype`.

        Raises
        ------
        ValueError
            When the current fill value cannot be converted to the
            new `dtype` (e.g. trying to convert ``np.nan`` to an
            integer dtype).

        Examples
        --------
        >>> SparseDtype(int, 0).update_dtype(float)
        Sparse[float64, 0.0]

        >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
        Sparse[float64, nan]
        """
        cls = type(self)
        dtype = pandas_dtype(dtype)

        if not isinstance(dtype, cls):
            # Cast the fill value through a 0-d array, then unwrap it
            # back to a scalar with ``.item()``.
            fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
            dtype = cls(dtype, fill_value=fill_value)

        return dtype

    @property
    def _subtype_with_str(self):
        """
        Whether the SparseDtype's subtype should be considered ``str``.

        Typically, pandas will store string data in an object-dtype array.
        When converting values to a dtype, e.g. in ``.astype``, we need to
        be more specific, we need the actual underlying type.

        Returns
        -------
        type or dtype
            The type of ``fill_value`` when the fill value is a ``str``
            instance, otherwise ``subtype``.

        Examples
        --------
        >>> SparseDtype(int, 1)._subtype_with_str
        dtype('int64')

        >>> SparseDtype(object, 1)._subtype_with_str
        dtype('O')

        >>> dtype = SparseDtype(str, '')
        >>> dtype.subtype
        dtype('O')

        >>> dtype._subtype_with_str
        str
        """
        if isinstance(self.fill_value, str):
            return type(self.fill_value)
        return self.subtype

348 return self.subtype