Coverage for django_querycache/cacheman.py: 85%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Fingerprinting and serializer caching for Django models
4 Typical usage example:
6 Fingerprinting(Model.objects.all())
7 CachedQuerySet(Model.objects.all())
8"""
10import datetime
11import functools
12import logging
13import time
14from functools import reduce
15from hashlib import md5
16from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union
18from django.apps import apps
19from django.core.cache import cache as default_cache
20from django.core.cache import caches
21from django.core.cache.backends.dummy import DummyCache
22from django.db import models
23from django.db.models import F
24from django.db.models.base import Model
25from django.db.models.expressions import Func
26from django.db.models.query import QuerySet
28from .type_annotations import Feature
# Module-level logger, named after this module
logger = logging.getLogger(__name__)

hstring = str  # Declare where we expect a hex encoded string of length 8
# Input models to these functions can take a number of forms; the tuple form
# is passed to `apps.get_model` as its first and second arguments
InputModel = Union[Model, QuerySet, Tuple[str, str]]
def utcnow():
    """Return the current moment as a timezone-aware datetime in UTC"""
    utc = datetime.timezone.utc
    return datetime.datetime.now(tz=utc)
def inputmodel_parse(inputthing: InputModel) -> Tuple[QuerySet, Type[Model]]:
    """
    Resolve any of the accepted input forms into a queryset and its model class

    Parameters
    ----------
    inputthing
        One of: a model class, a queryset, or a tuple whose first two items
        are handed to ``apps.get_model``

    Returns
    -------
    Queryset and model class derived from the input

    Raises
    ------
    TypeError
        If the input is none of the accepted forms
    """
    # Permit either a whole Model or a given Query to be used
    if hasattr(inputthing, "objects"):
        # A model class: use every row of its default `objects` manager
        return inputthing.objects.all(), inputthing  # type: ignore
    elif hasattr(inputthing, "model"):
        # A queryset (or anything exposing `.model`): use it unchanged
        return inputthing, inputthing.model  # type: ignore
    elif isinstance(inputthing, tuple):
        # A tuple: look the model up through the app registry
        _m = apps.get_model(inputthing[0], inputthing[1])  # type: ignore
        query = _m.objects.all()  # type: QuerySet
        return query, _m  # type: ignore
    raise TypeError(f"Could not determine the model or queryset from {inputthing}")
def query_to_key(query: QuerySet, suffix: str = "") -> str:
    """
    Build a cache key from a queryset's base table, SQL text and parameters

    Parameters
    ----------
    query
        A queryset which will be parsed (table, sql, + placeholders) to create a unique key
    suffix
        Alter the key with extra text

    Returns
    -------
    ``<base_table>_`` followed by the first 4 hex chars of the md5 of the SQL,
    the first 4 hex chars of the md5 of the concatenated parameters, and ``suffix``
    """
    # Render the SQL a single time and reuse both halves of the result
    sql, params = query.query.sql_with_params()
    query_hex = md5(sql.encode()).hexdigest()[:4]
    params_hex = md5("".join(map(str, params)).encode()).hexdigest()[:4]
    return f"{query.query.base_table}_{query_hex}{params_hex}{suffix}"
def timefunc(func):
    """Decorator which logs how long every call of ``func`` took (in ms)"""

    @functools.wraps(func)
    def time_closure(*args, **kwargs):
        """Call the wrapped function, log the elapsed milliseconds, return its result"""
        began = time.perf_counter()
        outcome = func(*args, **kwargs)
        time_elapsed = time.perf_counter() - began
        logger.info(f"Function: {func.__name__}, Time: {(time_elapsed * 1000):.1f} ms")
        return outcome

    return time_closure
def get_query_cache(cache_alias: str = "default"):
    """
    Return the Django cache registered under ``cache_alias``, falling back
    to the default cache when that alias cannot be loaded

    For purposes of caching, particularly in local dev,
    Django's default cache (memory) is not really helpful
    It gets destroyed regularly (on each deployment at least)

    Specify an alternative cache here

    Suggested local config for this:

    CACHES['cacheman'] = {
        'BACKEND': 'diskcache.DjangoCache',
        'LOCATION': '/tmp/django-query-cache',
        'TIMEOUT': 7 * 24 * 60 * 60,  # 1 week
        'SHARDS': 8,
        'DATABASE_TIMEOUT': 0.010,  # 10 milliseconds
        'OPTIONS': {
            'size_limit': 2 ** 32  # 4 gigabyte
        },
    }

    Parameters
    ----------
    cache_alias
        Key into Django's ``caches`` registry

    Returns
    -------
    The requested cache, or ``default_cache`` if looking it up raised
    """
    try:
        return caches[cache_alias]
    except Exception:
        # Deliberately best-effort: any failure to load the alias falls back
        # to the default cache rather than breaking the caller
        logger.warning('No "%s" cache. Using default cache', cache_alias)
        return default_cache
class RowHash(Func):
    """
    Trick to return the md5 hash of a whole postgres row

    The template casts the table reference (supplied as the ``table`` extra)
    to text, applies ``function`` to it and keeps a substring of the result.
    """

    function = "MD5"
    # NOTE(review): PostgreSQL string positions start at 1, so "from 0 for 8"
    # presumably yields 7 characters rather than 8 - confirm before relying on the length
    template = 'substring(%(function)s("%(table)s"::text) from 0 for 8)'
    output_field = models.TextField()  # type: models.Field[Any, Any]
class SomeColsHash(RowHash):
    """
    Trick to return the md5sum of only some columns
    """

    template = "substring(%(function)s(%(expressions)s) from 0 for 8)"

    def as_sql(self, compiler, connection, function=None, template=None, arg_joiner="||", **extra_context):
        """
        Override the superclass to always cast fields to text

        Every source expression is compiled, cast to text and wrapped in
        ``COALESCE(..., '')`` before the parts are joined with ``arg_joiner``.
        Without the COALESCE a single NULL column would turn the whole
        ``||`` concatenation (and therefore the hash) into NULL.

        Returns
        -------
        The rendered SQL string and the list of query parameters
        """
        connection.ops.check_expression_support(self)
        sql_parts = []
        params = []
        for arg in self.source_expressions:
            arg_sql, arg_params = compiler.compile(arg)
            # Always cast to text for md5 sum of field; NULL becomes an empty string
            sql_parts.append(f"COALESCE({arg_sql}::text, '')")
            params.extend(arg_params)
        data = {**self.extra, **extra_context}
        if function is not None:
            data["function"] = function
        else:
            data.setdefault("function", self.function)
        template = template or data.get("template", self.template)
        arg_joiner = arg_joiner or data.get("arg_joiner", self.arg_joiner)
        data["expressions"] = data["field"] = arg_joiner.join(sql_parts)
        return template % data, params
class Fingerprinting:
    """
    This class calculates and stores the fingerprint (hash) of a Django queryset.
    A fingerprint hash will be considered valid for `fingerprint_expiry` seconds.

    Parameters
    ----------
    query
        The model or query to hash
    cache_key
        The key to use for the fingerprint cache
    hashfields
        Fields to use for creating a row fingerprint (defaults to all)
    fingerprint_expiry
        Seconds to wait until assuming that the fingerprint needs recalculation.
        Falsy values (``None``, ``0``) fall back to 30 seconds.
    time_cache_key
        The key to use for the fingerprint's "last calculated" time
    cache_alias
        Optional keyword: alias of the Django cache used to store the
        fingerprint (defaults to "default")
    """

    def __init__(
        self,
        query: InputModel,
        cache_key: Optional[str] = None,
        hashfields: Optional[Iterable[Union[str, F]]] = (),
        fingerprint_expiry: Optional[float] = 30,
        time_cache_key: Optional[str] = None,
        **kwargs,
    ):
        # Permit either a whole Model or a given Query to be used
        query, model = inputmodel_parse(query)
        self.query = query
        self.model = model

        self.cache = get_query_cache(kwargs.get("cache_alias", "default"))
        # Inspect the cache actually in use. `get_query_cache` may hand back
        # `default_cache`, Django's proxy for the default backend, which
        # isinstance() cannot see through; resolve the real backend in that case.
        self._cache_is_dummy = isinstance(self.cache, DummyCache) or (
            self.cache is default_cache and isinstance(caches["default"], DummyCache)
        )
        self.cache_key = cache_key or query_to_key(self.query, "_hash")
        self.time_cache_key = time_cache_key or f"{self.cache_key}_set_time"
        self.fingerprint_expiry = fingerprint_expiry or 30

        # Depending on whether certain rows are to be used or not
        # the hash function will be an md5 of whole table
        # or only some columns
        if hashfields:
            self.fingerprint: RowHash = SomeColsHash(*hashfields)
        else:
            self.fingerprint = RowHash(table=self.model._meta.db_table)

    @property
    def _cached_fingerprint(self):
        """
        Return the cached hash of the query's fingerprinting result
        """
        return self.cache.get(self.cache_key)

    @_cached_fingerprint.setter
    def _cached_fingerprint(self, value):
        """
        Sets the cached key and also the "sentinel" value of the last time the
        key was changed for time based validity checks
        """
        self.cache.set(self.time_cache_key, utcnow().timestamp())
        self.cache.set(self.cache_key, value)

    @_cached_fingerprint.deleter
    def _cached_fingerprint(self):
        """
        Remove both the fingerprint and its "last calculated" time from the cache
        """
        self.cache.delete(self.time_cache_key)
        self.cache.delete(self.cache_key)

    @property
    def _expired(self) -> bool:
        """
        Time based expiration of the fingerprint result
        prevents spamming the fingerprint function if it was recently
        called

        Returns
        -------
        bool
            True when there is no cached fingerprint, no recorded set time,
            or the recorded set time is `fingerprint_expiry` seconds old or more
        """
        if not self._cached_fingerprint:
            logger.debug("Hash is not in the cache")
            return True
        cached_fingerprint_timestamp = self.cache.get(self.time_cache_key)
        if not cached_fingerprint_timestamp:
            logger.debug("Unknown hash date")
            return True
        # Same clock source as the setter, so both timestamps are comparable
        age = utcnow().timestamp() - cached_fingerprint_timestamp
        if age < self.fingerprint_expiry:
            logger.debug(
                "Fresh fingerprint: %s seconds expiring at %s seconds", round(age, 1), self.fingerprint_expiry
            )
            return False
        logger.debug("Old fingerprint: %s seconds", round(age, 1))
        return True

    def query_fingerprint(self) -> str:
        """
        Returns a single 8 character hex encoded string
        representing the database content of this query
        at this time

        The per-row fingerprints are folded together with XOR, starting
        from "00000000", so an empty query yields "00000000".
        """

        def row_fingerprints() -> List[str]:
            """
            Returns a flat list with the 'fingerprint' annotation
            of every row returned by this queryset
            """
            return list(self.query.annotate(fingerprint=self.fingerprint).values_list("fingerprint", flat=True))

        def hexxor(a: hstring, b: hstring) -> hstring:
            """
            a and b are hex strings of up to 8 characters
            The result is formatted with '%X' (uppercase hex) and zero-padded to 8 characters
            """
            return ("%X" % (int(a, 16) ^ int(b, 16))).zfill(8)

        return reduce(hexxor, row_fingerprints(), "00000000")

    @timefunc
    def update_required(self, force_check=False) -> bool:
        """
        Return whether the cached query is considered "dirty"

        The freshly calculated fingerprint is always written back to the
        cache, which also resets its "last calculated" time.

        Parameters
        ----------
        force_check
            Ignore the age of the fingerprint and recalculate it even if it has not expired

        Returns
        -------
        bool
            True if the fingerprint changed, False if not
            or if the fingerprint was not expired
        """

        if not self._expired and not force_check:
            return False

        current_fp = self._cached_fingerprint
        new_fp = self.query_fingerprint()
        if current_fp == new_fp:
            self._cached_fingerprint = new_fp  # Still update to set the time key
            logger.debug("Fingerprint has not changed")
            return False
        logger.debug("Refreshing fingerprint for %s from %s to %s", self.cache_key, current_fp, new_fp)
        self._cached_fingerprint = new_fp
        logger.debug("Fingerprint has changed")
        return True
class TimeStampedFingerprint(Fingerprinting):
    """
    Where a class or query with an "auto_now" field is present
    we can use that field instead of calculating the md5sum of all rows

    Accepts the same arguments as `Fingerprinting` plus an optional
    ``timestamp_column`` keyword naming the field to use.

    Raises
    ------
    ValueError
        If no timestamp column was given and the model has no "auto_now" field
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # `fingerprint_expiry` is the 4th positional argument of the parent
        # constructor. Treat an explicit None (keyword or positional) the
        # same as "not provided" so callers forwarding an optional value
        # still get the reduced default.
        expiry_given = kwargs.get("fingerprint_expiry") is not None or (len(args) > 3 and args[3] is not None)
        if not expiry_given:
            # The default expiry of 30s is probably too much for
            # the very low cost of a single last-modified
            # field, here we reduce it to 100ms
            self.fingerprint_expiry = 0.1

        # Set different hash keys for Timestamped vs hash key queries
        # Mostly of interest in testing
        self.cache_key = f"{self.cache_key}_ts"
        self.time_cache_key = f"{self.time_cache_key}_ts"

        # Use the specified column value, if provided;
        # otherwise search for a column with an 'auto_now' field

        self.timestamp_column = None
        if "timestamp_column" in kwargs:
            self.timestamp_column = kwargs.pop("timestamp_column")
        else:
            for field in self.model._meta.fields:
                if hasattr(field, "auto_now") and field.auto_now is True:
                    self.timestamp_column = field.name
                    break
        if not self.timestamp_column:
            raise ValueError("No timestamp column")
        logger.debug("using %s as timestamp column", self.timestamp_column)

    def query_fingerprint(self):
        """
        Returns the last updated time of the table or query rather than the
        hash of all query rows

        Returns
        -------
        str
            ISO formatted value of the newest timestamp in the query, or the
            ISO formatted current UTC time when the query yields no timestamp
            (so an empty query always counts as changed)
        """
        try:
            ordered_query = self.query.order_by(self.timestamp_column)
        except TypeError as E:
            logger.debug(f"Encountered exception: {E}")
            logger.debug("Fall back to last_modified query for the whole model")
            ordered_query = self.model.objects.order_by(self.timestamp_column)
        # Select only the timestamp column: this also works when the query is
        # a `.values()` queryset, whose rows are dicts rather than model instances
        last_timestamp = ordered_query.values_list(self.timestamp_column, flat=True).last()
        if last_timestamp is None:
            logger.debug("Empty query")
            return utcnow().isoformat()
        # Expect a `isoformat` on this field
        return last_timestamp.isoformat()
class ModelTimeStampedFingerprint(TimeStampedFingerprint):
    """
    This class filters the "has_changed" return to check the last updated
    time for the query's whole model before running the fingerprint query
    which may have a slower result
    In many cases this should return faster than timestamp query over a few rows
    as it avoids the filtering steps; in the worst case it adds one additional
    but very fast query so it should probably be used as the default where
    a model has a timestamped column unless you have a huge table and
    a simple query
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.table_cache_key = f"{self.cache_key}_table"
        self.table_time_cache_key = f"{self.time_cache_key}_table"

    def _get_table_fingerprint(self):
        """
        Return the newest timestamp of the whole model table (unfiltered) as
        an ISO formatted string, or the ISO formatted current UTC time when
        the table yields no timestamp
        """
        # Query the model's manager, not `self.query`, so no filters are applied
        ordered_query = self.model.objects.order_by(self.timestamp_column)
        last_timestamp = ordered_query.values_list(self.timestamp_column, flat=True).last()
        if last_timestamp is None:
            logger.debug("Empty query")
            return utcnow().isoformat()
        # Expect a `isoformat` on this field
        stamp = last_timestamp.isoformat()
        logger.debug(stamp)
        return stamp

    @property
    def _cached_table_fingerprint(self):
        """
        Return the cached fingerprint of the whole table
        """
        return self.cache.get(self.table_cache_key)

    @_cached_table_fingerprint.setter
    def _cached_table_fingerprint(self, value):
        """
        Sets the cached key and also the "sentinel" value of the last time the
        key was changed for time based validity checks
        """
        self.cache.set(self.table_time_cache_key, utcnow().timestamp())
        self.cache.set(self.table_cache_key, value)

    @_cached_table_fingerprint.deleter
    def _cached_table_fingerprint(self):
        """
        Remove both the table fingerprint and its set time from the cache
        """
        self.cache.delete(self.table_time_cache_key)
        self.cache.delete(self.table_cache_key)

    def update_required(self, force_check=False) -> bool:
        """
        Shortcut if the table has not changed since last checked

        Parameters
        ----------
        force_check
            Kept for interface compatibility with the parent class. An
            unchanged table always returns False; a changed table always
            forces the query fingerprint to be recalculated.

        Returns
        -------
        bool
            False if the table fingerprint is unchanged, otherwise the
            result of the parent's (forced) query fingerprint comparison
        """
        table_fp = self._get_table_fingerprint()
        if table_fp == self._cached_table_fingerprint:
            self._cached_table_fingerprint = table_fp
            logger.debug("Table not updated")
            return False
        self._cached_table_fingerprint = table_fp
        logger.debug("Table may have changed. Now checking if query has changed")
        # The new table fingerprint is already stored, so the next call would
        # take the shortcut above. The query fingerprint must therefore be
        # recalculated now, even if it is younger than `fingerprint_expiry`;
        # otherwise this change could go unnoticed.
        return super().update_required(force_check=True)
class CachedQuerySet:
    """
    Cache the serialized results of a query, using a 'Fingerprinting'
    instance to detect changes before updating results in the cache

    Parameters
    ----------
    query
        A model, queryset or tuple accepted by `inputmodel_parse`
    fp
        A ready-made fingerprint instance; when omitted a
        `TimeStampedFingerprint` is tried first, then `Fingerprinting`
    cache_key
        The key to store the serialized query under (derived from the query if omitted)
    query_values
        Field names / expressions passed to ``.values()``
    cache_alias
        Optional keyword: alias of the Django cache to use (defaults to "default")
    fingerprint_expiry
        Optional keyword: forwarded to the automatically created fingerprint

    Attributes
    ----------
    cache
        An instance of a Django cache with get, set, delete methods
    _cache_is_dummy
        True if the cache will not do anything (Django's dummy cache)
    query: Queryset
        The queryset whose results are cached
    model: Model
        Django model. Derived from query if query is suitable.
    cache_key: str
        The unique key to use for this queryset
    fp
        Composition - a Fingerprint model

    """

    def __init__(
        self,
        query: InputModel,
        fp: Optional[Fingerprinting] = None,
        cache_key: Optional[str] = None,
        query_values: Iterable[Union[str, F]] = (),
        **kwargs,
    ):

        cache_alias = kwargs.get("cache_alias", "default")
        self.cache = get_query_cache(cache_alias)
        # Inspect the cache actually in use. `get_query_cache` may hand back
        # `default_cache`, Django's proxy for the default backend, which
        # isinstance() cannot see through; resolve the real backend in that case.
        self._cache_is_dummy = isinstance(self.cache, DummyCache) or (
            self.cache is default_cache and isinstance(caches["default"], DummyCache)
        )

        self.query, self.model = inputmodel_parse(query)

        self.query_values = query_values

        if self.query_values:
            self.query = self.query.values(*query_values)

        if cache_key:
            self.cache_key = cache_key
        else:
            self.cache_key = query_to_key(self.query)

        if fp:
            self.fp = fp
        else:
            # Keep the fingerprint in the same cache as the serialized query
            fp_kwargs = {
                "query": self.query,
                "fingerprint_expiry": kwargs.get("fingerprint_expiry", None),
                "cache_alias": cache_alias,
            }
            try:
                self.fp = TimeStampedFingerprint(**fp_kwargs)
            except ValueError:
                # No timestamp column on this model: hash the rows instead
                self.fp = Fingerprinting(**fp_kwargs)

    @property
    def cached_query(self):
        """
        The serialized query from the cache, or a freshly serialized
        query when the cache is a dummy cache
        """
        if self._cache_is_dummy:
            return self.get_serialized_query()
        return self.cache.get(self.cache_key)

    @cached_query.setter
    def cached_query(self, value):
        logger.debug("Refreshing cache for %s", self.cache_key)
        self.cache.set(self.cache_key, value)

    @timefunc
    def get_with_update(self) -> Any:
        """
        Return the cached query if fresh else
        update and then return the cached query

        Returns
        -------
        Any
            The serialized value from the cache
        """
        self.update_if_required()
        return self.cached_query

    def update_if_required(self):
        """
        Check whether the fingerprint is fresh, and if
        the fingerprint has expired and changed
        then update the cache for this class' query
        """
        if self.cache_key not in self.cache:
            self.update_cache()
            logger.info("No data in cache, updating")
            # This is called to refresh the fingerprint
            self.fp.update_required()
        elif self.fp.update_required():
            self.update_cache()
            logger.info("Fingerprinting showed database changes, updated")
        else:
            logger.info("Fingerprint was recent or unchanged")

    @timefunc
    def update_cache(self):
        """
        Serialize the query and store it in the cache (no-op for a dummy cache)
        """
        if self._cache_is_dummy:
            return
        self.cached_query = self.get_serialized_query()

    def get_serialized_query(self) -> List[Dict]:
        """
        Convert the query results into a serializable list of dicts,
        restricted to `query_values` when those were given
        """
        if self.query_values:
            return list(self.query.values(*self.query_values))
        return list(self.query.values())
class GeoJsonCachedQuerySet(CachedQuerySet):
    """
    Adds additional properties and methods to serialize
    GeoJSON features

    This is intended to work alongside a class with `get_query_for_cache()`
    and `get_with_update()` methods
    """

    def __init__(
        self, model: InputModel, geojson_props: Iterable[str] = (), geometry_field: str = "feature", *args, **kwargs
    ):
        """
        Parameters
        ----------
        model
            A model, queryset or tuple; passed on to `CachedQuerySet` as its query
        geojson_props
            An iterable of strings, fields on the query, to turn into geojson "properties"
        geometry_field
            The name of a field to use, it should be a GeoJSON annotation not a geometry field
        """
        self.geojson_props = geojson_props
        self.geometry_field = geometry_field
        super().__init__(model, *args, **kwargs)

    def feature_props(self, item: Model) -> dict:
        """
        You might wish to override the generation of
        geojson feature properties here

        Returns a dict mapping each name in `geojson_props` to that attribute of `item`
        """
        return {field: getattr(item, field) for field in self.geojson_props}

    @timefunc
    def get_serialized_query(self) -> List[Feature]:
        """
        Django has a built in geometry serializer
        It does not work here because it requires geom to be
        an actual field not an annotation

        Returns
        -------
        List[Feature]
            Serialized features in GeoJSON format
        """
        return [
            {
                "type": "Feature",
                "geometry": getattr(item, self.geometry_field),
                "properties": self.feature_props(item),
            }
            for item in self.query
        ]

    @timefunc
    def features(self) -> List[Feature]:
        """
        This will update the features in the cache if necessary and return them
        It's an alias of 'get_with_update'

        Returns
        -------
        List[Feature]
            Serialized features in GeoJSON format
        """
        features = super().get_with_update()
        return features