Coverage for django_querycache/cacheman.py: 85%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

259 statements  

1""" 

2Fingerprinting and serializer caching for Django models 

3 

4 Typical usage example: 

5 

6 Fingerprinting(Model.objects.all()) 

7 CachedQuerySet(Model.objects.all()) 

8""" 

9 

10import datetime 

11import functools 

12import logging 

13import time 

14from functools import reduce 

15from hashlib import md5 

16from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union 

17 

18from django.apps import apps 

19from django.core.cache import cache as default_cache 

20from django.core.cache import caches 

21from django.core.cache.backends.dummy import DummyCache 

22from django.db import models 

23from django.db.models import F 

24from django.db.models.base import Model 

25from django.db.models.expressions import Func 

26from django.db.models.query import QuerySet 

27 

28from .type_annotations import Feature 

29 

# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# Alias used in annotations wherever a hex encoded string of length 8 is expected
hstring = str

# Input models to these functions can take a number of forms:
# a model, a queryset, or a tuple of two strings (resolved with `apps.get_model`)
InputModel = Union[Model, QuerySet, Tuple[str, str]]

34 

35 

def utcnow():
    """Return the current time as a timezone-aware datetime in UTC"""
    utc = datetime.timezone.utc
    return datetime.datetime.now(tz=utc)

38 

39 

def inputmodel_parse(inputthing: InputModel) -> Tuple[QuerySet, Model]:
    """
    Normalise a model-like input into a queryset and its model

    Parameters
    ----------
    inputthing
        One of: a model (anything with an `objects` attribute),
        a queryset (anything with a `model` attribute),
        or a tuple whose first two items are passed to `apps.get_model`

    Returns
    -------
    Queryset and model derived from the input

    Raises
    ------
    TypeError
        If the input is none of the accepted forms
    """
    # A whole model: every row from its `objects` manager
    if hasattr(inputthing, "objects"):
        return inputthing.objects.all(), inputthing  # type: ignore

    # A query: it already carries its model
    if hasattr(inputthing, "model"):
        return inputthing, inputthing.model  # type: ignore

    # A tuple: look the model up in the app registry
    if isinstance(inputthing, tuple):
        found_model = apps.get_model(inputthing[0], inputthing[1])  # type: ignore
        all_rows = found_model.objects.all()  # type: QuerySet
        return all_rows, found_model  # type: ignore

    raise TypeError(f"Could not determine the model or queryset from {inputthing}")

60 

61 

def query_to_key(query: QuerySet, suffix: str = "") -> str:
    """
    Build a cache key from a queryset's base table, SQL and parameters

    Parameters
    ----------
    query
        A queryset which will be parsed (table, sql, + placeholders) to create a key
    suffix
        Alter the key with extra text

    Returns
    -------
    str
        The base table name, an underscore, the first 4 hex characters of the
        md5 of the SQL, the first 4 hex characters of the md5 of the joined
        parameters, then `suffix`
    """
    # Compile the query once; the SQL text and its parameters are hashed separately
    sql, params = query.query.sql_with_params()
    query_hex = md5(sql.encode()).hexdigest()[:4]
    params_hex = md5("".join(map(str, params)).encode()).hexdigest()[:4]
    return f"{query.query.base_table}_{query_hex}{params_hex}{suffix}"

75 

76 

def timefunc(func):
    """
    Decorator function to log time taken by a function (in ms)

    The message is logged at INFO level after `func` returns;
    nothing is logged if `func` raises.
    """

    @functools.wraps(func)
    def time_closure(*args, **kwargs):
        """Wrapped function will log the ms the function took"""
        start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - start) * 1000
        # Lazy %-style arguments: the message is only rendered if INFO is enabled
        logger.info("Function: %s, Time: %.1f ms", func.__name__, elapsed_ms)
        return result

    return time_closure

90 

91 

def get_query_cache(cache_alias: str = "default"):
    """
    Return the Django cache registered under `cache_alias`

    For purposes of caching, particularly in local dev,
    Django's default cache (memory) is not really helpful
    It gets destroyed regularly (on each deployment at least)

    Specify an alternative cache here

    Suggested local config for this:

    CACHES['cacheman'] = {
        'BACKEND': 'diskcache.DjangoCache',
        'LOCATION': '/tmp/django-query-cache',
        'TIMEOUT': 7 * 24 * 60 * 60,  # 1 week
        'SHARDS': 8,
        'DATABASE_TIMEOUT': 0.010,  # 10 milliseconds
        'OPTIONS': {
            'size_limit': 2 ** 32  # 4 gigabyte
        },
    }

    Parameters
    ----------
    cache_alias
        The alias to look up in Django's `caches`

    Returns
    -------
    The cache for `cache_alias`; if that lookup raises for any reason
    a warning is logged and Django's default cache is returned instead
    """
    try:
        return caches[cache_alias]
    except Exception:
        # Deliberately broad: this is a best-effort lookup and any failure
        # for the requested alias falls back to the default cache
        logger.warning('No "%s" cache. Using default cache', cache_alias)
        return default_cache

118 

119 

class RowHash(Func):
    """
    Trick to return the md5 hash of a whole postgres row

    The row is cast to text, hashed with MD5 and the first 8 hex characters
    are kept. The template expects a `table` keyword (the db table name),
    e.g. `RowHash(table=Model._meta.db_table)`.
    """

    function = "MD5"
    # String positions in postgres are 1-based: `from 1 for 8` keeps the first
    # 8 characters, whereas `from 0 for 8` only returns 7 of them
    template = 'substring(%(function)s("%(table)s"::text) from 1 for 8)'
    output_field = models.TextField()  # type: models.Field[Any, Any]

128 

129 

class SomeColsHash(RowHash):
    """
    Trick to return the md5sum of only some columns

    Each source expression is cast to text, the pieces are joined with
    `arg_joiner` (`||` by default) and the first 8 hex characters of the
    MD5 of the result are kept.
    """

    # String positions in postgres are 1-based: `from 1 for 8` keeps the first
    # 8 characters, whereas `from 0 for 8` only returns 7 of them
    template = "substring(%(function)s(%(expressions)s) from 1 for 8)"

    def as_sql(self, compiler, connection, function=None, template=None, arg_joiner="||", **extra_context):
        """
        Override the superclass to always cast fields to text

        Returns
        -------
        A tuple of the SQL string and the list of query parameters
        """
        connection.ops.check_expression_support(self)
        sql_parts = []
        params = []
        for arg in self.source_expressions:
            arg_sql, arg_params = compiler.compile(arg)
            # Always cast to text for md5 sum of field. COALESCE guards against
            # NULL: `||` with a NULL operand gives NULL, so one empty column
            # would otherwise turn the whole row fingerprint into NULL
            sql_parts.append(f"COALESCE({arg_sql}::text, '')")
            params.extend(arg_params)
        data = {**self.extra, **extra_context}
        if function is not None:
            data["function"] = function
        else:
            data.setdefault("function", self.function)
        template = template or data.get("template", self.template)
        arg_joiner = arg_joiner or data.get("arg_joiner", self.arg_joiner)
        data["expressions"] = data["field"] = arg_joiner.join(sql_parts)
        return template % data, params

157 

158 

class Fingerprinting:
    """
    This class calculates and stores the fingerprint (hash) of a Django queryset.
    A fingerprint hash will be considered valid for `fingerprint_expiry` seconds.

    Parameters
    ----------
    query
        The model or query to hash
    cache_key
        The key to use for the fingerprint cache
        (defaults to a key derived from the query, with a `_hash` suffix)
    hashfields
        Fields to use for creating a row fingerprint (defaults to all)
    fingerprint_expiry
        Seconds to wait until assuming that the fingerprint needs recalculation
        (a falsy value, including None, means the default of 30)
    time_cache_key
        The key to use for the fingerprint's "last calculated" time
        (defaults to `<cache_key>_set_time`)
    cache_alias
        Optional keyword: alias of the Django cache in which fingerprints
        are stored (defaults to "default")
    """

    def __init__(
        self,
        query: InputModel,
        cache_key: Optional[str] = None,
        hashfields: Optional[Iterable[Union[str, F]]] = (),
        fingerprint_expiry: Optional[float] = 30,
        time_cache_key: Optional[str] = None,
        **kwargs,
    ):
        # Permit either a whole Model or a given Query to be used
        query, model = inputmodel_parse(query)
        self.query = query
        self.model = model

        self.cache = get_query_cache(kwargs.get("cache_alias", "default"))
        # Inspect the backend this instance really uses, not always the default one.
        # When `get_query_cache` fell back to `default_cache` the real default
        # backend is looked up, because `default_cache` is Django's proxy object
        # and not a backend instance itself
        backend = caches["default"] if self.cache is default_cache else self.cache
        self._cache_is_dummy = isinstance(backend, DummyCache)
        self.cache_key = cache_key or query_to_key(self.query, "_hash")
        self.time_cache_key = time_cache_key or f"{self.cache_key}_set_time"
        self.fingerprint_expiry = fingerprint_expiry or 30

        # Depending on whether certain rows are to be used or not
        # the hash function will be an md5 of whole table
        # or only some columns
        if hashfields:
            self.fingerprint: RowHash = SomeColsHash(*hashfields)
        else:
            self.fingerprint = RowHash(table=self.model._meta.db_table)

    @property
    def _cached_fingerprint(self):
        """
        Return the cached hash of the query's fingerprinting result
        """
        return self.cache.get(self.cache_key)

    @_cached_fingerprint.setter
    def _cached_fingerprint(self, value):
        """
        Sets the cached key and also the "sentinel" value of the last time the
        key was changed for time based validity checks
        """
        self.cache.set(self.time_cache_key, utcnow().timestamp())
        self.cache.set(self.cache_key, value)

    @_cached_fingerprint.deleter
    def _cached_fingerprint(self):
        """
        Remove both the fingerprint and its "last calculated" time from the cache
        """
        self.cache.delete(self.time_cache_key)
        self.cache.delete(self.cache_key)

    @property
    def _expired(self) -> bool:
        """
        Time based expiration of the fingerprint result
        prevents spamming the fingerprint function if it was recently
        called

        Returns
        -------
        bool
            True if there is no cached fingerprint, no record of when it was
            set, or it is at least `fingerprint_expiry` seconds old
        """
        if not self._cached_fingerprint:
            logger.debug("Hash is not in the cache")
            return True
        cached_fingerprint_timestamp = self.cache.get(self.time_cache_key)
        if not cached_fingerprint_timestamp:
            logger.debug("Unknown hash date")
            return True
        # Same clock helper as the setter uses when it records the time
        age = utcnow().timestamp() - cached_fingerprint_timestamp
        if age < self.fingerprint_expiry:
            logger.debug(
                "Fresh fingerprint: %s seconds expiring at %s seconds", round(age, 1), self.fingerprint_expiry
            )
            return False
        logger.debug("Old fingerprint: %s seconds", round(age, 1))
        return True

    def query_fingerprint(self) -> str:
        """
        Returns a single 8 character hex encoded string
        representing the database content of this query
        at this time

        Every row's fingerprint is read as a hexadecimal number and all of
        them are XORed together, so the result does not depend on row order.
        A query without rows gives "00000000".
        """
        # A flat sequence of hex strings, one fingerprint per row of the query
        row_fingerprints = self.query.annotate(fingerprint=self.fingerprint).values_list("fingerprint", flat=True)
        # XOR as integers and format once at the end, rather than going
        # back and forth through a hex string for every row
        combined = reduce(lambda acc, row_fp: acc ^ int(row_fp, 16), row_fingerprints, 0)
        # Note that 'X' = uppercase hex format
        return ("%X" % combined).zfill(8)

    @timefunc
    def update_required(self, force_check=False) -> bool:
        """
        Return whether the cached query is considered "dirty"

        Parameters
        ----------
        force_check
            Ignore the age of the fingerprint and check it even if not expired

        Returns
        -------
        bool
            True if the fingerprint changed, False if not
            or if the fingerprint was not expired
        """

        if not self._expired and not force_check:
            return False

        current_fp = self._cached_fingerprint
        new_fp = self.query_fingerprint()
        if current_fp == new_fp:
            self._cached_fingerprint = new_fp  # Still update to set the time key
            logger.debug("Fingerprint has not changed")
            return False
        logger.debug("Refreshing fingerprint for %s from %s to %s", self.cache_key, current_fp, new_fp)
        self._cached_fingerprint = new_fp
        logger.debug("Fingerprint has changed")
        return True

306 

307 

class TimeStampedFingerprint(Fingerprinting):
    """
    Where a class or query with an "auto_now" field is present
    we can use that field instead of calculating the md5sum of all rows

    Accepts the same arguments as `Fingerprinting`, plus an optional
    `timestamp_column` keyword naming the field to use. Without it the first
    model field with `auto_now` set is used.

    Raises
    ------
    ValueError
        If no timestamp column was given or found
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Work out whether the caller really asked for an expiry: it may arrive
        # as a keyword, or as the fourth positional argument of `Fingerprinting`
        if "fingerprint_expiry" in kwargs:
            requested_expiry = kwargs["fingerprint_expiry"]
        elif len(args) > 3:
            requested_expiry = args[3]
        else:
            requested_expiry = None

        if requested_expiry is None:
            # The default expiry of 30s is probably too much for
            # the very low cost of a single last-modified
            # field, here we reduce it to 100ms.
            # An explicit None counts as "not given" too
            self.fingerprint_expiry = 0.1

        # Set different hash keys for Timestamped vs hash key queries
        # Mostly of interest in testing
        self.cache_key = f"{self.cache_key}_ts"
        self.time_cache_key = f"{self.time_cache_key}_ts"

        # Use the specified column value, if provided;
        # otherwise search for a column with an 'auto_now' field

        self.timestamp_column = None
        if "timestamp_column" in kwargs:
            self.timestamp_column = kwargs["timestamp_column"]
        else:
            for field in self.model._meta.fields:
                if hasattr(field, "auto_now") and field.auto_now is True:
                    self.timestamp_column = field.name
                    break
        if not self.timestamp_column:
            raise ValueError("No timestamp column")
        logger.debug("using %s as timestamp column", self.timestamp_column)

    def query_fingerprint(self):
        """
        Returns the last updated time of the table or query rather than the
        hash of all query rows

        Returns
        -------
        str
            ISO formatted value of the latest timestamp in the query. When the
            query has no rows (or no timestamp value) the current time is
            returned instead, so the fingerprint is different on every call.
        """
        try:
            ordered_query = self.query.order_by(self.timestamp_column)
        except TypeError as E:
            logger.debug("Encountered exception: %s", E)
            logger.debug("Fall back to last_modified query for the whole model")
            ordered_query = self.model.objects.order_by(self.timestamp_column)
        # Select the timestamp column only: this avoids loading a whole row and
        # also works when the query is a `.values()` queryset, whose rows are
        # dicts and have no attribute named after the column
        last_timestamp = ordered_query.values_list(
            self.timestamp_column, flat=True
        ).last()  # type: Union[datetime.date, datetime.datetime, None]
        if not last_timestamp:
            logger.debug("Empty query")
            return utcnow().isoformat()
        # Expect a `isoformat` on this field
        return last_timestamp.isoformat()

361 

362 

class ModelTimeStampedFingerprint(TimeStampedFingerprint):
    """
    This class filters the "has_changed" return to check the last updated
    time for the query's whole model before running the fingerprint query
    which may have a slower result
    In many cases this should return faster than timestamp query over a few rows
    as it avoids the filtering steps; in the worst case it adds one additional
    but very fast query so it should probably be used as the default where
    a model has a timestamped column unless you have a huge table and
    a simple query
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Separate cache keys for the table-level fingerprint
        self.table_cache_key = f"{self.cache_key}_table"
        self.table_time_cache_key = f"{self.time_cache_key}_table"

    def _get_table_fingerprint(self):
        """
        Returns the latest timestamp over the whole model, ISO formatted

        This deliberately ignores the filters of `self.query`: it is the cheap
        table-level check made before the query itself is fingerprinted.
        When the table has no rows (or no timestamp value) the current time is
        returned instead, so the fingerprint is different on every call.
        """
        ordered_table = self.model.objects.order_by(self.timestamp_column)
        last_timestamp = ordered_table.values_list(
            self.timestamp_column, flat=True
        ).last()  # type: Union[datetime.date, datetime.datetime, None]
        if not last_timestamp:
            logger.debug("Empty query")
            return utcnow().isoformat()
        # Expect a `isoformat` on this field
        stamp = last_timestamp.isoformat()
        logger.debug(stamp)
        return stamp

    @property
    def _cached_table_fingerprint(self):
        """
        Return the cached table-level fingerprint
        """
        return self.cache.get(self.table_cache_key)

    @_cached_table_fingerprint.setter
    def _cached_table_fingerprint(self, value):
        """
        Sets the cached key and also the "sentinel" value of the last time the
        key was changed for time based validity checks
        """
        self.cache.set(self.table_time_cache_key, utcnow().timestamp())
        self.cache.set(self.table_cache_key, value)

    @_cached_table_fingerprint.deleter
    def _cached_table_fingerprint(self):
        """
        Remove both the table fingerprint and its "last calculated" time
        """
        self.cache.delete(self.table_time_cache_key)
        self.cache.delete(self.table_cache_key)

    def update_required(self, force_check=False) -> bool:
        """
        Shortcut if the table has not changed since last checked

        Parameters
        ----------
        force_check
            Passed on to the query-level check; it is not consulted
            when the table fingerprint is unchanged

        Returns
        -------
        bool
            False if the table fingerprint equals the cached one,
            otherwise the result of the query-level check
        """
        table_fp = self._get_table_fingerprint()
        if table_fp == self._cached_table_fingerprint:
            self._cached_table_fingerprint = table_fp  # Still update to set the time key
            logger.debug("Table not updated")
            return False
        self._cached_table_fingerprint = table_fp
        logger.debug("Table may have changed. Now checking if query has changed")
        return super().update_required(force_check=force_check)

425 

426 

class CachedQuerySet:
    """
    Cache the serialized results of a query, using a 'Fingerprinting'
    instance to detect changes before updating results in the cache

    Parameters
    ----------
    query
        The model or query to cache
    fp
        A ready made fingerprint instance. When omitted a
        `TimeStampedFingerprint` is tried first, falling back to
        `Fingerprinting` if that raises ValueError
    cache_key
        The key for the cached results (defaults to a key derived from the query)
    query_values
        Fields passed to the query's `values()`
    cache_alias
        Optional keyword: alias of the Django cache to use (defaults to "default")
    fingerprint_expiry
        Optional keyword: passed to the fingerprint when one is created here

    Attributes
    ----------
    cache
        An instance of a Django cache with get, set, delete methods
    _cache_is_dummy
        True if the cache will not do anything (Django's dummy cache)
    query: Queryset
        The queryset being cached, with `query_values` applied if given
    model: Model
        Django model. Derived from query if query is suitable.
    cache_key: str
        The key to use for this queryset
    fp
        Composition - a Fingerprint model

    """

    def __init__(
        self,
        query: InputModel,
        fp: Optional[Fingerprinting] = None,
        cache_key: Optional[str] = None,
        query_values: Iterable[Union[str, F]] = (),
        **kwargs,
    ):

        self.cache = get_query_cache(kwargs.get("cache_alias", "default"))
        # Inspect the backend this instance really uses, not always the default one:
        # a dummy alias next to a real default cache would otherwise be read
        # as if it held data. When `get_query_cache` fell back to `default_cache`
        # the real default backend is looked up, because `default_cache` is
        # Django's proxy object and not a backend instance itself
        backend = caches["default"] if self.cache is default_cache else self.cache
        self._cache_is_dummy = isinstance(backend, DummyCache)

        self.query, self.model = inputmodel_parse(query)

        self.query_values = query_values

        if self.query_values:
            self.query = self.query.values(*query_values)

        if cache_key:
            self.cache_key = cache_key
        else:
            self.cache_key = query_to_key(self.query)

        if fp:
            self.fp = fp
        else:
            fp_kwargs = {
                "query": self.query,
                "fingerprint_expiry": kwargs.get("fingerprint_expiry", None),
            }
            # Keep the fingerprint in the same cache as the results it guards
            if "cache_alias" in kwargs:
                fp_kwargs["cache_alias"] = kwargs["cache_alias"]
            try:
                self.fp = TimeStampedFingerprint(**fp_kwargs)
            except ValueError:
                # No timestamp column on this model: hash the rows instead
                self.fp = Fingerprinting(**fp_kwargs)

    @property
    def cached_query(self):
        """
        The serialized query from the cache; with a dummy cache
        the query is serialized afresh on every access
        """
        if self._cache_is_dummy:
            return self.get_serialized_query()
        return self.cache.get(self.cache_key)

    @cached_query.setter
    def cached_query(self, value):
        logger.debug("Refreshing cache for %s", self.cache_key)
        self.cache.set(self.cache_key, value)

    @timefunc
    def get_with_update(self) -> Any:
        """
        Return the cached query if fresh else
        update and then return the cached query

        Returns
        -------
        Any
            The serialized value from the cache
        """
        self.update_if_required()
        return self.cached_query

    def update_if_required(self):
        """
        Check whether the fingerprint is fresh, and if
        the fingerprint has expired and changed
        then update the cache for this class' query
        """
        if self.cache_key not in self.cache:
            self.update_cache()
            logger.info("No data in cache, updating")
            # This is called to refresh the fingerprint
            self.fp.update_required()
        elif self.fp.update_required():
            self.update_cache()
            logger.info("Fingerprinting showed database changes, updated")
        else:
            logger.info("Fingerprint was recent or unchanged")

    @timefunc
    def update_cache(self):
        """
        Serialize the query and store it in the cache
        (does nothing with a dummy cache)
        """
        if self._cache_is_dummy:
            return
        self.cached_query = self.get_serialized_query()

    def get_serialized_query(self) -> List[Dict]:
        """
        Convert the query's results into a serializable list of dicts
        """
        if self.query_values:
            return list(self.query.values(*self.query_values))
        return list(self.query.values())

542 

543 

class GeoJsonCachedQuerySet(CachedQuerySet):
    """
    Adds additional properties and methods to serialize
    GeoJSON features

    This intended to work alongside a class with `get_query_for_cache()`
    and `get_with_update()` methods
    """

    def __init__(
        self,
        model: InputModel,
        geojson_props: Optional[Iterable[str]] = None,
        geometry_field: str = "feature",
        *args,
        **kwargs,
    ):
        """
        Parameters
        ----------
        model
            The model or query to cache, passed on to `CachedQuerySet`
        geojson_props
            An iterable of strings, fields on the query, to turn into geojson "properties"
            (defaults to no properties)
        geometry_field
            The name of a field to use, it should be a GeoJSON annotation not a geometry field
        """
        # Copied into a fresh list: no default list is shared between instances,
        # and a one-shot iterable is not used up by the first feature
        self.geojson_props = list(geojson_props) if geojson_props is not None else []
        self.geometry_field = geometry_field
        super().__init__(model, *args, **kwargs)

    def feature_props(self, item: Model) -> dict:
        """
        You might wish to override the generation of
        geojson feature properties here

        Returns
        -------
        dict
            One entry per name in `geojson_props`, read as attributes of `item`
        """
        return {field: getattr(item, field) for field in self.geojson_props}

    @timefunc
    def get_serialized_query(self) -> List[Feature]:
        """
        Django has a built in geometry serializer
        It does not work here because it requires geom to be
        an actual field not an annotation

        Returns
        -------
        List[Feature]
            Serialized features in GeoJSON format
        """
        return [
            {
                "type": "Feature",
                "geometry": getattr(item, self.geometry_field),
                "properties": self.feature_props(item),
            }
            for item in self.query
        ]

    @timefunc
    def features(self) -> List[Feature]:
        """
        This will update the features in the cache if necessary and return them
        It's an alias of 'get_with_update'

        Returns
        -------
        List[Feature]
            Serialized features in GeoJSON format
        """
        features = super().get_with_update()
        return features