Coverage for /Users/buh/.pyenv/versions/3.12.2/envs/pii/lib/python3.12/site-packages/es_pii_tool/helpers/utils.py: 71%
206 statements
coverage.py v7.5.0, created at 2025-03-18 12:25 -0600
1"""Helper Functions"""
3import typing as t
4import logging
5from os import environ
6import json
7from inspect import stack
8from datetime import datetime, timezone
9import re
10from es_client.exceptions import ConfigurationError as esc_ConfigError
11from es_client.helpers.schemacheck import SchemaCheck
12from es_client.helpers.utils import get_yaml
13from es_wait.exceptions import EsWaitFatal, EsWaitTimeout, IlmWaitError
14import es_pii_tool.exceptions as e
15from es_pii_tool.defaults import (
16 PHASES,
17 PAUSE_DEFAULT,
18 TIMEOUT_DEFAULT,
19 TIMINGS,
20 redaction_schema,
21)
23if t.TYPE_CHECKING:
24 from dotmap import DotMap # type: ignore
25 from voluptuous import Schema
26 from elasticsearch8 import Elasticsearch
27 from es_pii_tool.job import Job
28 from es_pii_tool.trackables import Task
30logger = logging.getLogger(__name__)


def build_script(message: str, fields: t.Sequence[str]) -> t.Dict[str, str]:
    """
    Build a painless script for redacting fields by way of an update_by_query operation

    :param message: The text to put in place of whatever is in a field
    :param fields: The list of field names to act on

    :type message: str
    :type fields: list

    :rtype: dict
    :returns: A dictionary of ``{"source": (assembled message), "lang": "painless"}``
    """
    msg = ""
    for field in fields:
        msg += f"ctx._source.{field} = '{message}'; "
    script = {"source": msg, "lang": "painless"}
    logger.debug('script = %s', script)
    return script
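
# Illustrative sketch (hypothetical field names, not part of the module):
# the assembled painless script for two fields would look like this.
#
#   build_script('REDACTED', ['user.name', 'user.email'])
#   => {'source': "ctx._source.user.name = 'REDACTED'; "
#                 "ctx._source.user.email = 'REDACTED'; ",
#       'lang': 'painless'}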


def check_dotted_fields(result: t.Dict, field: str, message: str) -> bool:
    """Iterate through the subkeys of a dotted field to verify redaction success

    :param result: The search result object
    :param field: The field with dotted notation
    :param message: The redaction message expected as the field's value

    :type result: dict
    :type field: str
    :type message: str

    :returns: Success (``True``) or Failure (``False``)
    :rtype: bool
    """
    success = False
    logger.debug('Dotted field detected: (%s) ...', field)
    fielder = result['hits']['hits'][0]['_source']
    iterations = len(field.split('.'))
    counter = 1
    for key in field.split('.'):
        # Walk down one subkey at a time until the leaf value is reached
        if key in fielder:
            fielder = fielder[key]
        else:
            break
        if counter == iterations:
            if fielder == message:
                success = True
        counter += 1
    return success
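
# Illustrative sketch (hypothetical data): the dotted field is resolved one
# subkey at a time against the first hit's _source.
#
#   result = {'hits': {'hits': [{'_source': {'user': {'name': 'REDACTED'}}}]}}
#   check_dotted_fields(result, 'user.name', 'REDACTED')   # => True
#   check_dotted_fields(result, 'user.email', 'REDACTED')  # => False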


def check_fields(result: t.Dict, job_config: t.Dict) -> bool:
    """Check document fields in result to ensure success

    :param result: The search result object
    :param job_config: The configuration settings for this job

    :type result: dict
    :type job_config: dict

    :returns: Success (``True``) or Failure (``False``)
    :rtype: bool
    """
    complete = True
    hit = result['hits']['hits'][0]['_source']
    for field in job_config['fields']:
        success = False
        if len(field.split('.')) > 1:
            success = check_dotted_fields(result, field, job_config['message'])

        elif field in hit:
            if hit[field] == job_config['message']:
                success = True

        else:
            logger.warning("Field %s not present in document", field)
            # The warning above already reports this failure, so record it and
            # break the loop rather than logging it a second time
            complete = False
            break

        if success:
            logger.info("Field %s is redacted correctly", field)
        else:
            # A single failure is enough to make it a complete failure.
            complete = False
            logger.error("Field %s is not redacted correctly", field)
    return complete


def chunk_index_list(indices: t.Sequence[str]) -> t.Sequence[t.Sequence[str]]:
    """
    Break a very large index list into chunks of roughly 3KB each. The size is
    measured as a CSV string, and each chunk is converted back into a list for
    the return value.

    :param indices: The list of indices

    :type indices: list

    :returns: A list of lists (each a piece of the original ``indices``)
    :rtype: list
    """
    chunks = []
    chunk = ""
    for index in indices:
        if len(chunk) < 3072:
            if not chunk:
                chunk = index
            else:
                chunk += "," + index
        else:
            chunks.append(chunk.split(','))
            chunk = index
    chunks.append(chunk.split(','))
    return chunks
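
# Illustrative sketch (hypothetical index names): a list well under the
# 3072-byte ceiling comes back as a single chunk.
#
#   chunk_index_list(['index-a', 'index-b', 'index-c'])
#   => [['index-a', 'index-b', 'index-c']]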


def configure_ilm_policy(task: 'Task', data: 'DotMap') -> None:
    """
    Prune phases we've already passed.

    If only_expunge_deletes is True in the job config, set any force_merge_index
    actions to False.
    """
    # Copy the existing policy to a new spot
    data.new.ilmpolicy = data.ilm.lifecycle.policy

    # Prune phases from the existing ILM policy that we've already surpassed
    for phase in list(data.new.ilmpolicy.phases.toDict().keys()):
        if PHASES.index(data.ilm.explain.phase) > PHASES.index(phase):
            del data.new.ilmpolicy.phases[phase]

    # Figure out whether we're doing a full force merge
    fmerge = True
    if 'forcemerge' in task.job.config:
        fmkwargs = task.job.config['forcemerge']
        if 'only_expunge_deletes' in fmkwargs and fmkwargs['only_expunge_deletes']:
            fmerge = False
    else:
        fmerge = False

    # Loop through the remaining phases and set 'force_merge_index' on any
    # searchable_snapshot action in the cold or frozen phases
    for phase in data.new.ilmpolicy.phases:
        if phase not in ['cold', 'frozen']:
            continue
        if 'searchable_snapshot' in data.new.ilmpolicy.phases[phase].actions:
            actions = data.new.ilmpolicy.phases[phase].actions
            actions.searchable_snapshot.force_merge_index = fmerge


def end_it(obj: t.Union['Job', 'Task'], success: bool) -> None:
    """Close out the object here to avoid code repetition"""
    # Record task success or fail here for THIS task_id
    # Each index in per_index has its own status tracker
    if not success:
        err = True
        log = 'Check application logs for detailed report'
    else:
        err = False
        log = 'DONE'
    obj.end(completed=success, errors=err, logmsg=log)


def exception_msgmaker(exc: t.Union[e.MissingIndex, e.BadClientResult]) -> str:
    """Most of the messages here are similar enough to warrant a single function"""
    msg = ''
    upstream = (
        f'The upstream exception type was {type(exc.upstream).__name__}, '
        f'with error message: {exc.upstream.args[0]}'
    )
    if isinstance(exc, e.MissingIndex):
        msg = (
            f'Exception raised because index {exc.missing} was not found. '
            f'{upstream}'
        )
    elif isinstance(exc, e.BadClientResult):
        msg = (
            f'Exception raised because of a bad or unexpected response or result '
            f'from the Elasticsearch cluster. {upstream}'
        )
    return msg


def get_alias_actions(oldidx: str, newidx: str, aliases: t.Dict) -> t.Sequence:
    """
    :param oldidx: The old index name
    :param newidx: The new index name
    :param aliases: The aliases

    :type oldidx: str
    :type newidx: str
    :type aliases: dict

    :returns: A list of actions suitable for the
        :py:meth:`~.elasticsearch.client.IndicesClient.update_aliases` ``actions``
        kwarg.
    :rtype: list
    """
    actions = []
    for alias in aliases.keys():
        actions.append({'remove': {'index': oldidx, 'alias': alias}})
        actions.append({'add': {'index': newidx, 'alias': alias}})
    return actions
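
# Illustrative sketch (hypothetical names): each alias on the old index yields
# a remove/add pair, so the alias flips atomically in one update_aliases call.
#
#   get_alias_actions('idx-old', 'idx-new', {'my-alias': {}})
#   => [{'remove': {'index': 'idx-old', 'alias': 'my-alias'}},
#       {'add': {'index': 'idx-new', 'alias': 'my-alias'}}]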


def get_field_matches(config: t.Dict, result: t.Dict) -> int:
    """Count docs which have the expected fields

    :param config: The config from the YAML file
    :param result: The query result dict

    :type config: dict
    :type result: dict

    :returns: The count of docs in ``result`` which have the identified fields
    :rtype: int
    """

    logger.debug('Extracting doc hit count from result')
    doc_count = result['hits']['total']['value']
    for element in range(0, result['hits']['total']['value']):
        for field in config['fields']:
            if len(field.split('.')) > 1:
                logger.debug('Dotted field "%s" detected...', field)
                fielder = result['hits']['hits'][element]['_source']
                for key in field.split('.'):
                    # Walk down one subkey at a time
                    if key in fielder:
                        fielder = fielder[key]
                    else:
                        doc_count -= 1
                        break
            elif field not in list(result['hits']['hits'][element]['_source'].keys()):
                logger.debug('Fieldname "%s" NOT detected...', field)
                doc_count -= 1
            else:
                logger.debug('Root-level fieldname "%s" detected...', field)
    return doc_count


def get_fname() -> str:
    """Return the name of the calling function"""
    return stack()[1].function


def get_inc_version(name: str) -> int:
    """Extract the incrementing version value from the end of name

    :param name: The name

    :type name: str

    :returns: The integer value of the current index revision, or 0 if no version
    :rtype: int
    """
    # Anchor the end as 3 dashes, a v, and 3 digits, e.g. ---v001
    match = re.search(r'^.*---v(\d{3})$', name)
    if match:
        return int(match.group(1))
    return 0
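
# Illustrative sketch (hypothetical index names): only a trailing ---v plus
# exactly three digits counts as a version suffix.
#
#   get_inc_version('redacted-mydata---v003')  # => 3
#   get_inc_version('mydata')                  # => 0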


def get_redactions(
    file: str = '', data: t.Union[t.Dict, None] = None
) -> t.Union['Schema', None]:
    """
    Return a valid dictionary of redactions from either ``file`` or ``data``
    after checking it against the schema

    :param file: YAML file with redactions to check
    :param data: Configuration data in dictionary format

    :type file: str
    :type data: dict

    :rtype: dict
    :returns: Redactions configuration data
    """
    if data is None:
        data = {}
    logger.debug('Getting redactions data...')
    if file:
        try:
            config = get_yaml(file)
        except esc_ConfigError as exc:
            msg = f'Unable to read and/or parse YAML REDACTIONS_FILE: {file} Exiting.'
            logger.critical(msg)
            raise e.ConfigError(msg, exc)
    elif data:
        config = data
    else:
        raise e.FatalError('No configuration file or dictionary provided.', Exception())
    logger.debug('Performing redaction schema check...')
    retval = None
    try:
        retval = SchemaCheck(
            config, redaction_schema(), 'Redaction Configuration', 'redactions'
        ).result()
    except Exception as exc:
        msg = 'Redaction configuration schema check failed. Exiting.'
        logger.critical(msg)
        raise exc
    return retval


def now_iso8601() -> str:
    """
    :returns: An ISO8601 timestamp based on datetime.now
    """
    # Python 3.12 deprecates naive UTC timestamps, so we affirmatively declare
    # timezone.utc, e.g. datetime.now(timezone.utc).isoformat() yields
    # 2024-04-16T16:00:00+00:00, with the +00:00 offset appended.
    #
    # This function prunes away a +00:00 offset and replaces it with Z, the
    # shorter Zulu notation for UTC (per Elasticsearch).
    #
    # Since we MANUALLY, FORCEFULLY declare timezone.utc, the offset should
    # ALWAYS be +00:00, but it could in theory show up as a Z, so we test for
    # that as well.

    parts = datetime.now(timezone.utc).isoformat().split('+')
    if len(parts) == 1:
        if parts[0][-1] == 'Z':
            return parts[0]  # Our ISO8601 already ends with a Z for Zulu/UTC time
        return f'{parts[0]}Z'  # It doesn't end with a Z so we put one there
    if parts[1] == '00:00':
        return f'{parts[0]}Z'  # It doesn't end with a Z so we put one there
    return f'{parts[0]}+{parts[1]}'  # Fallback publishes the +TZ, whatever that was
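
# Illustrative sketch of the transformation this function performs:
#
#   datetime.now(timezone.utc).isoformat()  => '2024-04-16T16:00:00.123456+00:00'
#   now_iso8601()                           => '2024-04-16T16:00:00.123456Z'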


def config_fieldmap(
    rw_val: t.Literal['read', 'write'],
    key: t.Literal[
        'pattern',
        'query',
        'fields',
        'message',
        'expected_docs',
        'restore_settings',
        'delete',
    ],
) -> t.Union[str, int, object]:
    """
    Return the appropriate (de)serialization callable for ``key`` from this
    read/write map
    """
    which = {
        'read': {
            'pattern': json.loads,
            'query': json.loads,
            'fields': json.loads,
            'message': str,
            'expected_docs': int,
            'restore_settings': json.loads,
            'delete': str,
        },
        'write': {
            'pattern': json.dumps,
            'query': json.dumps,
            'fields': json.dumps,
            'message': str,
            'expected_docs': int,
            'restore_settings': json.dumps,
            'delete': str,
        },
    }
    return which[rw_val][key]
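
# Illustrative sketch: reads decode JSON-escaped strings, writes encode them.
#
#   config_fieldmap('read', 'query')    # => json.loads
#   config_fieldmap('write', 'query')   # => json.dumps
#   config_fieldmap('read', 'message')  # => str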


def parse_job_config(config: t.Dict, behavior: t.Literal['read', 'write']) -> t.Dict:
    """Parse raw config from the index.

    Several fields are JSON escaped, so they must be encoded or decoded to move
    between the index and a Python dict.

    :param config: The raw config data
    :param behavior: ``read`` or ``write``

    :type config: dict
    :type behavior: str

    :rtype: dict

    :returns: JSON-(de)sanitized configuration dict
    """
    fields = [
        'pattern',
        'query',
        'fields',
        'message',
        'expected_docs',
        'restore_settings',
        'delete',
    ]
    doc = {}
    for field in fields:
        if field in config:
            func = config_fieldmap(behavior, field)  # type: ignore
            doc[field] = func(config[field])  # type: ignore
    return doc
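
# Illustrative sketch (hypothetical raw document): on 'read', JSON-escaped
# strings become native Python structures.
#
#   raw = {'fields': '["user.name", "user.email"]', 'expected_docs': '2'}
#   parse_job_config(raw, 'read')
#   => {'fields': ['user.name', 'user.email'], 'expected_docs': 2}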


def strip_ilm_name(name: str) -> str:
    """
    Strip leading ``pii-tool-``, and trailing ``---v000`` from ``name``

    :param name: The ILM lifecycle name

    :type name: str

    :returns: The "cleaned up" and stripped ILM name
    :rtype: str
    """
    retval = name.replace('pii-tool-', '')
    # Anchor the end as 3 dashes, a v, and 3 digits, e.g. ---v001
    match = re.search(r'^(.*)---v\d{3}$', retval)
    if match:
        retval = match.group(1)
    return retval


def strip_index_name(name: str) -> str:
    """
    Strip ``partial-``, ``restored-``, ``redacted-``, and trailing ``---v000`` from
    ``name``

    :param name: The index name

    :type name: str

    :returns: The "cleaned up" and stripped index name
    :rtype: str
    """
    retval = name.replace('partial-', '')
    retval = retval.replace('restored-', '')
    retval = retval.replace('redacted-', '')
    # Anchor the end as 3 dashes, a v, and 3 digits, e.g. ---v001
    match = re.search(r'^(.*)---v\d{3}$', retval)
    if match:
        retval = match.group(1)
    return retval
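
# Illustrative sketch (hypothetical names): known prefixes and the ---vNNN
# suffix are stripped, restoring the base name.
#
#   strip_ilm_name('pii-tool-mypolicy---v002')           # => 'mypolicy'
#   strip_index_name('partial-redacted-myindex---v001')  # => 'myindex'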


def es_waiter(client: 'Elasticsearch', cls, **kwargs) -> None:
    """Instantiate an es_wait waiter class and wait for its condition to be met"""
    try:
        waiter = cls(client, **kwargs)
        waiter.wait()
    except (
        IlmWaitError,
        EsWaitFatal,
        EsWaitTimeout,
    ) as wait_err:
        msg = f'{cls.__name__}: wait for completion failed: {kwargs}'
        logger.error('%s. Exception(s): - %s', msg, wait_err)
        raise e.BadClientResult(msg, wait_err)


def timing(kind: str) -> t.Tuple:
    """
    Return a tuple of two floats: the pause value and the timeout value

    :param kind: The kind of timing to look up (a key in ``TIMINGS``)

    :type kind: str

    :returns: A tuple of two floats
    :rtype: tuple
    """
    is_test = environ.get('PII_TOOL_TESTING', 'False') == 'True'
    pause = 1.0 if is_test else PAUSE_DEFAULT  # Fallback if TIMINGS lacks the key
    timeout = 30.0 if is_test else TIMEOUT_DEFAULT  # Fallback if TIMINGS lacks the key
    testkey = 'testing' if is_test else 'default'
    pause = TIMINGS[kind]['pause'].get(testkey, pause)
    timeout = TIMINGS[kind]['timeout'].get(testkey, timeout)
    # logger.debug(
    #     f'kind = {kind}, is_test = {is_test}, pause = {pause}, timeout = {timeout}'
    # )
    return pause, timeout
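
# Illustrative sketch, assuming a hypothetical TIMINGS entry such as
# {'ilm': {'pause': {'default': 15.0, 'testing': 1.0},
#          'timeout': {'default': 7200.0, 'testing': 30.0}}}:
#
#   timing('ilm')  # => (15.0, 7200.0), or (1.0, 30.0) when PII_TOOL_TESTING=True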