1""" 

2data hash pandas / numpy objects 

3""" 

import itertools
from typing import Optional

import numpy as np

from pandas._libs import Timestamp
import pandas._libs.hashing as hashing

from pandas.core.dtypes.cast import infer_dtype_from_scalar
from pandas.core.dtypes.common import (
    is_categorical_dtype,
    is_extension_array_dtype,
    is_list_like,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCIndexClass,
    ABCMultiIndex,
    ABCSeries,
)
from pandas.core.dtypes.missing import isna

# 16 byte long hashing key
_default_hash_key = "0123456789123456"


def _combine_hash_arrays(arrays, num_items: int):
    """
    Combine a sequence of uint64 hash arrays into a single uint64 array.

    Parameters
    ----------
    arrays : generator of ndarray[uint64]
    num_items : int
        Number of arrays the generator is expected to yield.

    Returns
    -------
    ndarray[uint64]

    Notes
    -----
    The mixing scheme should be the same as CPython's tupleobject.c.
    """
    try:
        first = next(arrays)
    except StopIteration:
        return np.array([], dtype=np.uint64)

    arrays = itertools.chain([first], arrays)

    mult = np.uint64(1000003)
    out = np.zeros_like(first) + np.uint64(0x345678)
    for i, a in enumerate(arrays):
        inverse_i = num_items - i
        out ^= a
        out *= mult
        mult += np.uint64(82520 + inverse_i + inverse_i)
    assert i + 1 == num_items, "Fed in wrong num_items"
    out += np.uint64(97531)
    return out
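
# Illustrative sketch (not part of the original module): how the helper above
# is fed in practice. It expects a generator of equal-length uint64 arrays and
# the exact number of arrays it will yield; the values below are made up and
# only the dtype/shape of the result is asserted.
#
#   >>> import numpy as np
#   >>> a = np.array([1, 2, 3], dtype="uint64")
#   >>> b = np.array([4, 5, 6], dtype="uint64")
#   >>> combined = _combine_hash_arrays(iter([a, b]), 2)
#   >>> combined.dtype, combined.shape
#   (dtype('uint64'), (3,))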


def hash_pandas_object(
    obj,
    index: bool = True,
    encoding: str = "utf8",
    hash_key: Optional[str] = _default_hash_key,
    categorize: bool = True,
):
    """
    Return a data hash of the Index/Series/DataFrame.

    Parameters
    ----------
    obj : Index, Series, or DataFrame
    index : bool, default True
        Include the index in the hash (if Series/DataFrame).
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash key used to encode string data.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    Series of uint64, same length as the object
    """
    from pandas import Series

    if hash_key is None:
        hash_key = _default_hash_key

    if isinstance(obj, ABCMultiIndex):
        return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)

    elif isinstance(obj, ABCIndexClass):
        h = hash_array(obj.values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        h = Series(h, index=obj, dtype="uint64", copy=False)

    elif isinstance(obj, ABCSeries):
        h = hash_array(obj.values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        if index:
            index_iter = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                ).values
                for _ in [None]
            )
            arrays = itertools.chain([h], index_iter)
            h = _combine_hash_arrays(arrays, 2)

        h = Series(h, index=obj.index, dtype="uint64", copy=False)

    elif isinstance(obj, ABCDataFrame):
        hashes = (hash_array(series.values) for _, series in obj.items())
        num_items = len(obj.columns)
        if index:
            index_hash_generator = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                ).values  # noqa
                for _ in [None]
            )
            num_items += 1

            # keep `hashes` specifically a generator to keep mypy happy
            _hashes = itertools.chain(hashes, index_hash_generator)
            hashes = (x for x in _hashes)

        h = _combine_hash_arrays(hashes, num_items)

        h = Series(h, index=obj.index, dtype="uint64", copy=False)
    else:
        raise TypeError(f"Unexpected type for hashing {type(obj)}")
    return h
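
# Illustrative sketch (not part of the original module): typical use of
# hash_pandas_object. The frame below is made up; only the properties shown
# (a uint64 Series aligned with the original index) are asserted, not any
# specific hash values.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
#   >>> h = hash_pandas_object(df)
#   >>> h.dtype
#   dtype('uint64')
#   >>> h.index.equals(df.index)
#   True
#   >>> # Excluding the index hashes only the column values, so the result
#   >>> # generally differs from ``h`` above.
#   >>> h_noidx = hash_pandas_object(df, index=False)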


def hash_tuples(vals, encoding: str = "utf8", hash_key: str = _default_hash_key):
    """
    Hash a MultiIndex / list-of-tuples efficiently

    Parameters
    ----------
    vals : MultiIndex, list-of-tuples, or single tuple
    encoding : str, default 'utf8'
    hash_key : str, default _default_hash_key

    Returns
    -------
    ndarray of hashed values
    """
    is_tuple = False
    if isinstance(vals, tuple):
        vals = [vals]
        is_tuple = True
    elif not is_list_like(vals):
        raise TypeError("must be convertible to a list-of-tuples")

    from pandas import Categorical, MultiIndex

    if not isinstance(vals, ABCMultiIndex):
        vals = MultiIndex.from_tuples(vals)

    # create a list-of-Categoricals
    vals = [
        Categorical(vals.codes[level], vals.levels[level], ordered=False, fastpath=True)
        for level in range(vals.nlevels)
    ]

    # hash the list-of-ndarrays
    hashes = (
        _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals
    )
    h = _combine_hash_arrays(hashes, len(vals))
    if is_tuple:
        h = h[0]

    return h
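
# Illustrative sketch (not part of the original module): hash_tuples accepts a
# MultiIndex, a list of tuples, or a single tuple. The tuples below are made
# up; a list yields one hash per tuple, while a single tuple yields the same
# scalar as its position in the list result.
#
#   >>> mi_hashes = hash_tuples([(1, "a"), (2, "b")])
#   >>> mi_hashes.shape
#   (2,)
#   >>> hash_tuples((1, "a")) == mi_hashes[0]
#   True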


def hash_tuple(val, encoding: str = "utf8", hash_key: str = _default_hash_key):
    """
    Hash a single tuple efficiently

    Parameters
    ----------
    val : single tuple
    encoding : str, default 'utf8'
    hash_key : str, default _default_hash_key

    Returns
    -------
    hash : uint64
    """
    hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key) for v in val)

    h = _combine_hash_arrays(hashes, len(val))[0]

    return h
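
# Illustrative sketch (not part of the original module): hash_tuple hashes one
# tuple scalar-by-scalar via _hash_scalar and combines the pieces, which keeps
# it consistent with hashing the same tuple through hash_tuples. The tuple
# below is made up.
#
#   >>> import numpy as np
#   >>> h = hash_tuple((1, "a", np.nan))
#   >>> isinstance(h, np.uint64)
#   True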


def _hash_categorical(c, encoding: str, hash_key: str):
    """
    Hash a Categorical by hashing its categories, and then mapping the codes
    to the hashes

    Parameters
    ----------
    c : Categorical
    encoding : str
    hash_key : str

    Returns
    -------
    ndarray of hashed values, same length as c
    """
    # Convert ExtensionArrays to ndarrays
    values = np.asarray(c.categories.values)
    hashed = hash_array(values, encoding, hash_key, categorize=False)

    # we have uint64, as we don't directly support missing values.
    # we don't want to use take_nd which will coerce to float;
    # instead, directly construct the result with
    # max(np.uint64) as the missing value indicator
    #
    # TODO: GH 15362

    mask = c.isna()
    if len(hashed):
        result = hashed.take(c.codes)
    else:
        result = np.zeros(len(mask), dtype="uint64")

    if mask.any():
        result[mask] = np.iinfo(np.uint64).max

    return result
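
# Illustrative sketch (not part of the original module): only the categories
# are hashed, and the codes then index into those hashes, so the cost scales
# with the number of distinct values rather than the length of the data. The
# values below are made up; missing entries map to the max uint64 sentinel.
#
#   >>> import numpy as np
#   >>> import pandas as pd
#   >>> cat = pd.Categorical(["a", "b", "a", None])
#   >>> hashed = _hash_categorical(cat, encoding="utf8", hash_key=_default_hash_key)
#   >>> hashed[0] == hashed[2]                  # equal values share a hash
#   True
#   >>> hashed[3] == np.iinfo(np.uint64).max    # missing value sentinel
#   True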


def hash_array(
    vals,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
):
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray, Categorical
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash key used to encode string data.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    1d uint64 numpy array of hash values, same length as vals
    """

    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")
    dtype = vals.dtype

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke).
    if is_categorical_dtype(dtype):
        return _hash_categorical(vals, encoding, hash_key)
    elif is_extension_array_dtype(dtype):
        vals, _ = vals._values_for_factorize()
        dtype = vals.dtype

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(dtype, np.complex128):
        return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals))

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    elif dtype == bool:
        vals = vals.astype("u8")
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        vals = vals.view("i8").astype("u8", copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        vals = vals.view(f"u{vals.dtype.itemsize}").astype("u8")
    else:
        # With repeated values, it's MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            from pandas import factorize, Categorical, Index

            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        try:
            vals = hashing.hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            vals = hashing.hash_object_array(
                vals.astype(str).astype(object), hash_key, encoding
            )

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xBF58476D1CE4E5B9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94D049BB133111EB)
    vals ^= vals >> 31
    return vals
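
# Illustrative sketch (not part of the original module): hash_array is the
# element-wise building block used by hash_pandas_object. The input values
# are made up; equal inputs hash equally and the result is always uint64 with
# the same length as the input.
#
#   >>> import numpy as np
#   >>> vals = np.array([1, 2, 2, 3], dtype="int64")
#   >>> out = hash_array(vals)
#   >>> out.dtype, len(out)
#   (dtype('uint64'), 4)
#   >>> out[1] == out[2]                        # identical values, identical hashes
#   True
#   >>> np.array_equal(hash_array(vals), out)   # deterministic
#   True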


def _hash_scalar(
    val, encoding: str = "utf8", hash_key: str = _default_hash_key
) -> np.ndarray:
    """
    Hash a scalar value.

    Parameters
    ----------
    val : scalar
    encoding : str, default "utf8"
    hash_key : str, default _default_hash_key

    Returns
    -------
    1d uint64 numpy array of hash value, of length 1
    """

    if isna(val):
        # this is to be consistent with the _hash_categorical implementation
        return np.array([np.iinfo(np.uint64).max], dtype="u8")

    if getattr(val, "tzinfo", None) is not None:
        # for tz-aware datetimes, we need the underlying naive UTC value and
        # not the tz aware object or pd extension type (as
        # infer_dtype_from_scalar would do)
        if not isinstance(val, Timestamp):
            val = Timestamp(val)
        val = val.tz_convert(None)

    dtype, val = infer_dtype_from_scalar(val)
    vals = np.array([val], dtype=dtype)

    return hash_array(vals, hash_key=hash_key, encoding=encoding, categorize=False)
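
# Illustrative sketch (not part of the original module): _hash_scalar wraps a
# single scalar in a length-1 array and defers to hash_array, which is what
# keeps hash_tuple consistent with the array-based hashers. The values below
# are made up.
#
#   >>> import numpy as np
#   >>> _hash_scalar(1).shape
#   (1,)
#   >>> _hash_scalar(np.nan)[0] == np.iinfo(np.uint64).max
#   True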