1"""Helper Functions"""
3import typing as t
4import logging
5import json
6from inspect import stack
7from datetime import datetime, timezone
8import re
9from elasticsearch8.exceptions import NotFoundError
10from es_client.exceptions import ConfigurationError as esc_ConfigError
11from es_client.helpers.schemacheck import SchemaCheck
12from es_client.helpers.utils import get_yaml
13from es_wait.exceptions import IlmWaitError
14import es_pii_tool.exceptions as e
15from es_pii_tool.defaults import PHASES, redaction_schema
17if t.TYPE_CHECKING:
18 from dotmap import DotMap # type: ignore
19 from voluptuous import Schema
20 from elasticsearch8 import Elasticsearch
21 from es_pii_tool.job import Job
22 from es_pii_tool.trackables import Task
24logger = logging.getLogger(__name__)


def build_script(message: str, fields: t.Sequence[str]) -> t.Dict[str, str]:
    """
    Build a painless script for redacting fields by way of an update_by_query operation

    :param message: The text to put in place of whatever is in a field
    :param fields: The list of field names to act on

    :type message: str
    :type fields: list

    :rtype: dict
    :returns: A dictionary of ``{"source": (assembled message), "lang": "painless"}``
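
    Example (illustrative; the redaction message and field name are made up):

    >>> build_script('REDACTED', ['message'])
    {'source': "ctx._source.message = 'REDACTED'; ", 'lang': 'painless'}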
39 """
40 msg = ""
41 for field in fields:
42 msg += f"ctx._source.{field} = '{message}'; "
43 script = {"source": msg, "lang": "painless"}
44 logger.debug('script = %s', script)
45 return script


def check_dotted_fields(result: t.Dict, field: str, message: str) -> bool:
    """Iterate through dotted fields to ensure success

    :param result: The search result object
    :param field: The field with dotted notation
    :param message: The expected redaction message

    :type result: dict
    :type field: str
    :type message: str

    :returns: Success (``True``) or Failure (``False``)
    :rtype: bool
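
    Example (illustrative; the document and field names are made up):

    >>> result = {'hits': {'hits': [{'_source': {'user': {'name': 'REDACTED'}}}]}}
    >>> check_dotted_fields(result, 'user.name', 'REDACTED')
    True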
59 """
60 success = False
61 logger.debug('Dotted field detected: (%s) ...', field)
62 fielder = result['hits']['hits'][0]['_source']
63 iterations = len(field.split('.'))
64 counter = 1
65 for key in field.split('.'):
66 # This should recursively look for each subkey
67 if key in fielder:
68 fielder = fielder[key]
69 else:
70 break
71 if counter == iterations:
72 if fielder == message:
73 success = True
74 counter += 1
75 return success


def check_fields(result: t.Dict, job_config: t.Dict) -> bool:
    """Check document fields in result to ensure success

    :param result: The search result object
    :param job_config: The configuration settings for this job

    :type result: dict
    :type job_config: dict

    :returns: Success (``True``) or Failure (``False``)
    :rtype: bool
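
    Example (illustrative; a single-doc result whose one field was redacted):

    >>> result = {'hits': {'hits': [{'_source': {'message': 'REDACTED'}}]}}
    >>> check_fields(result, {'fields': ['message'], 'message': 'REDACTED'})
    True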
89 """
90 complete = True
91 hit = result['hits']['hits'][0]['_source']
92 for field in job_config['fields']:
93 success = False
94 if len(field.split('.')) > 1:
95 success = check_dotted_fields(result, field, job_config['message'])
97 elif field in hit:
98 if hit[field] == job_config['message']:
99 success = True
101 else:
102 logger.warning("Field %s not present in document", field)
103 # Don't need to report the expected fail 2x, so we break the loop here
104 break
106 if success:
107 logger.info("Field %s is redacted correctly", field)
108 else:
109 # A single failure is enough to make it a complete failure.
110 complete = False
111 logger.error("Field %s is not redacted correctly", field)
112 return complete


def chunk_index_list(indices: t.Sequence[str]) -> t.Sequence[t.Sequence[str]]:
    """
    This utility chunks very large index lists into 3KB chunks. It measures the
    size as a CSV string, then converts back into a list for the return value.

    :param indices: The list of indices

    :type indices: list

    :returns: A list of lists (each a piece of the original ``indices``)
    :rtype: list
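
    Example (illustrative; two short names fit in a single chunk):

    >>> chunk_index_list(['index-a', 'index-b'])
    [['index-a', 'index-b']]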
127 """
128 chunks = []
129 chunk = ""
130 for index in indices:
131 if len(chunk) < 3072:
132 if not chunk:
133 chunk = index
134 else:
135 chunk += "," + index
136 else:
137 chunks.append(chunk.split(','))
138 chunk = index
139 chunks.append(chunk.split(','))
140 return chunks


def configure_ilm_policy(task: 'Task', data: 'DotMap') -> None:
    """
    Prune phases we've already passed.

    If only_expunge_deletes is True in the job config, set any force_merge_index
    actions to False.
    """
    # Copy the existing policy to a new spot
    data.new.ilmpolicy = data.ilm.lifecycle.policy

    # Prune phases from the existing ILM policy that we've already surpassed
    for phase in list(data.new.ilmpolicy.phases.toDict().keys()):
        if PHASES.index(data.ilm.explain.phase) > PHASES.index(phase):
            del data.new.ilmpolicy.phases[phase]

    # Figure out whether we're doing a force merge
    fmerge = True
    if 'forcemerge' in task.job.config:
        fmkwargs = task.job.config['forcemerge']
        if 'only_expunge_deletes' in fmkwargs and fmkwargs['only_expunge_deletes']:
            fmerge = False
    else:
        fmerge = False

    # Loop through the remaining phases and set 'force_merge_index' accordingly
    # on the searchable_snapshot action of the cold and frozen phases

    for phase in data.new.ilmpolicy.phases:
        if phase not in ['cold', 'frozen']:
            continue
        if 'searchable_snapshot' in data.new.ilmpolicy.phases[phase].actions:
            data.new.ilmpolicy.phases[
                phase
            ].actions.searchable_snapshot.force_merge_index = fmerge


def end_it(obj: t.Union['Job', 'Task'], success: bool) -> None:
    """Close out the object here to avoid code repetition"""
    # Record task success or failure here for THIS task_id
    # Each index in per_index has its own status tracker
    if not success:
        err = True
        log = 'Check application logs for detailed report'
    else:
        err = False
        log = 'DONE'
    obj.end(completed=success, errors=err, logmsg=log)


def exception_msgmaker(exc: t.Union[e.MissingIndex, e.BadClientResult]) -> str:
    """Most of the messages here are similar enough to warrant a single function"""
    msg = ''
    upstream = (
        f'The upstream exception type was {type(exc.upstream).__name__}, '
        f'with error message: {exc.upstream.args[0]}'
    )
    if isinstance(exc, e.MissingIndex):
        msg = (
            f'Exception raised because index {exc.missing} was not found. '
            f'{upstream}'
        )
    elif isinstance(exc, e.BadClientResult):
        msg = (
            'Exception raised because of a bad or unexpected response or result '
            f'from the Elasticsearch cluster. {upstream}'
        )
    return msg


def get_alias_actions(oldidx: str, newidx: str, aliases: t.Dict) -> t.Sequence:
    """
    :param oldidx: The old index name
    :param newidx: The new index name
    :param aliases: The aliases

    :type oldidx: str
    :type newidx: str
    :type aliases: dict

    :returns: A list of actions suitable for
        :py:meth:`~.elasticsearch.client.IndicesClient.update_aliases` ``actions``
        kwarg.
    :rtype: list
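
    Example (illustrative; index and alias names are made up):

    >>> get_alias_actions('idx-old', 'idx-new', {'myalias': {}})
    [{'remove': {'index': 'idx-old', 'alias': 'myalias'}}, {'add': {'index': 'idx-new', 'alias': 'myalias'}}]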
226 """
227 actions = []
228 for alias in aliases.keys():
229 actions.append({'remove': {'index': oldidx, 'alias': alias}})
230 actions.append({'add': {'index': newidx, 'alias': alias}})
231 return actions


def get_field_matches(config: t.Dict, result: t.Dict) -> int:
    """Count docs which have the expected fields

    :param config: The config from the YAML file
    :param result: The query result dict

    :type config: dict
    :type result: dict

    :returns: The count of docs in ``result`` which have the identified fields
    :rtype: int
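
    Example (illustrative; only the first of two docs has the named field):

    >>> result = {'hits': {'total': {'value': 2}, 'hits': [
    ...     {'_source': {'message': 'hi'}}, {'_source': {'other': 1}}]}}
    >>> get_field_matches({'fields': ['message']}, result)
    1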
245 """
247 logger.debug('Extracting doc hit count from result')
248 doc_count = result['hits']['total']['value']
249 for element in range(0, result['hits']['total']['value']):
250 for field in config['fields']:
251 if len(field.split('.')) > 1:
252 logger.debug('Dotted field "%s" detected...', field)
253 fielder = result['hits']['hits'][element]['_source']
254 for key in field.split('.'):
255 # This should recursively look for each subkey
256 if key in fielder:
257 fielder = fielder[key]
258 else:
259 doc_count -= 1
260 break
261 elif field not in list(result['hits']['hits'][element]['_source'].keys()):
262 logger.debug('Fieldname "%s" NOT detected...', field)
263 doc_count -= 1
264 else:
265 logger.debug('Root-level fieldname "%s" detected...', field)
266 return doc_count


def get_fname() -> str:
    """Return the name of the calling function"""
    return stack()[1].function


def get_inc_version(name: str) -> int:
    """Extract the incrementing version value from the end of name

    :param name: The name

    :type name: str

    :returns: The integer value of the current index revision, or 0 if no version
    :rtype: int
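
    Example (illustrative names):

    >>> get_inc_version('my-index---v003')
    3
    >>> get_inc_version('my-index')
    0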
283 """
284 # Anchor the end as 3 dashes, a v, and 3 digits, e.g. ---v001
285 match = re.search(r'^.*---v(\d{3})$', name)
286 if match:
287 return int(match.group(1))
288 return 0


def get_redactions(file: str = '', data: t.Union[t.Dict, None] = None) -> 'Schema':
    """
    Return valid dictionary of redactions from either ``file`` or from ``data``
    after checking Schema

    :param file: YAML file with redactions to check
    :param data: Configuration data in dictionary format

    :type file: str
    :type data: dict

    :rtype: dict
    :returns: Redactions configuration data
    """
    if data is None:
        data = {}
    logger.debug('Getting redactions data...')
    if file:
        try:
            config = get_yaml(file)
        except esc_ConfigError as exc:
            msg = f'Unable to read and/or parse YAML REDACTIONS_FILE: {file} Exiting.'
            logger.critical(msg)
            raise e.ConfigError(msg, exc)
    elif data:
        config = data
    else:
        raise e.FatalError('No configuration file or dictionary provided.', Exception())
    return SchemaCheck(
        config, redaction_schema(), 'Redaction Configuration', 'redactions'
    ).result()


def now_iso8601() -> str:
    """
    :returns: An ISO8601 timestamp based on datetime.now
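
    Example (the timestamp value varies, but UTC output always ends in ``Z``):

    >>> now_iso8601().endswith('Z')
    True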
327 """
328 # Because Python 3.12 now requires non-naive timezone declarations, we must change.
329 #
330 # ## Example:
331 # ## The new way:
332 # ## datetime.now(timezone.utc).isoformat()
333 # ## Result: 2024-04-16T16:00:00+00:00
334 # ## End Example
335 #
336 # Note that the +00:00 is appended now where we affirmatively declare the UTC
337 # timezone
338 #
339 # As a result, we will use this function to prune away the timezone if it is +00:00
340 # and replace it with Z, which is shorter Zulu notation for UTC (per Elasticsearch)
341 #
342 # We are MANUALLY, FORCEFULLY declaring timezone.utc, so it should ALWAYS be +00:00,
343 # but could in theory sometime show up as a Z, so we test for that.
345 parts = datetime.now(timezone.utc).isoformat().split('+')
346 if len(parts) == 1:
347 if parts[0][-1] == 'Z':
348 return parts[0] # Our ISO8601 already ends with a Z for Zulu/UTC time
349 return f'{parts[0]}Z' # It doesn't end with a Z so we put one there
350 if parts[1] == '00:00':
351 return f'{parts[0]}Z' # It doesn't end with a Z so we put one there
352 return f'{parts[0]}+{parts[1]}' # Fallback publishes the +TZ, whatever that was


def config_fieldmap(
    rw_val: t.Literal['read', 'write'],
    key: t.Literal[
        'pattern',
        'query',
        'fields',
        'message',
        'expected_docs',
        'restore_settings',
        'delete',
    ],
) -> t.Union[str, int, object]:
    """
    Return the function from this function/key map
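
    Example (illustrative; selects the deserializer for a stored field):

    >>> config_fieldmap('read', 'expected_docs')
    <class 'int'>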
369 """
370 which = {
371 'read': {
372 'pattern': json.loads,
373 'query': json.loads,
374 'fields': json.loads,
375 'message': str,
376 'expected_docs': int,
377 'restore_settings': json.loads,
378 'delete': str,
379 },
380 'write': {
381 'pattern': json.dumps,
382 'query': json.dumps,
383 'fields': json.dumps,
384 'message': str,
385 'expected_docs': int,
386 'restore_settings': json.dumps,
387 'delete': str,
388 },
389 }
390 return which[rw_val][key]


def parse_job_config(config: t.Dict, behavior: t.Literal['read', 'write']) -> t.Dict:
    """Parse raw config from the index.

    Several fields are JSON escaped, so we need to fix that to put them in a dict.

    :param config: The raw config data
    :param behavior: ``read`` or ``write``

    :type config: dict
    :type behavior: str

    :rtype: dict

    :returns: JSON-(de)sanitized configuration dict
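
    Example (illustrative; ``fields`` arrives as a JSON-escaped string):

    >>> parse_job_config({'fields': '["message"]', 'expected_docs': 1}, 'read')
    {'fields': ['message'], 'expected_docs': 1}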
407 """
408 fields = [
409 'pattern',
410 'query',
411 'fields',
412 'message',
413 'expected_docs',
414 'restore_settings',
415 'delete',
416 ]
417 doc = {}
418 for field in fields:
419 if field in config:
420 func = config_fieldmap(behavior, field) # type: ignore
421 doc[field] = func(config[field]) # type: ignore
422 return doc


def strip_ilm_name(name: str) -> str:
    """
    Strip leading ``pii-tool-``, and trailing ``---v000`` from ``name``

    :param name: The ILM lifecycle name

    :type name: str

    :returns: The "cleaned up" and stripped ILM name
    :rtype: str
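
    Example (illustrative name):

    >>> strip_ilm_name('pii-tool-mypolicy---v001')
    'mypolicy'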
435 """
436 retval = name.replace('pii-tool-', '')
437 # Anchor the end as 3 dashes, a v, and 3 digits, e.g. ---v001
438 match = re.search(r'^(.*)---v\d{3}$', retval)
439 if match:
440 retval = match.group(1)
441 return retval


def strip_index_name(name: str) -> str:
    """
    Strip ``partial-``, ``restored-``, ``redacted-``, and trailing ``---v000`` from
    ``name``

    :param name: The index name

    :type name: str

    :returns: The "cleaned up" and stripped index name
    :rtype: str
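
    Example (illustrative name):

    >>> strip_index_name('partial-restored-redacted-myindex---v002')
    'myindex'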
455 """
456 retval = name.replace('partial-', '')
457 retval = retval.replace('restored-', '')
458 retval = retval.replace('redacted-', '')
459 # Anchor the end as 3 dashes, a v, and 3 digits, e.g. ---v001
460 match = re.search(r'^(.*)---v\d{3}$', retval)
461 if match:
462 retval = match.group(1)
463 return retval


def es_waiter(client: 'Elasticsearch', cls, **kwargs) -> None:
    """Wait for an ``es_wait`` waiter (e.g. ILM phase & step) to complete"""
    try:
        waiter = cls(client, **kwargs)
        waiter.wait()
    except (
        KeyError,
        ValueError,
        TimeoutError,
        IlmWaitError,
        NotFoundError,
    ) as wait_err:
        msg = f'{cls.__name__}: wait for completion failed: {kwargs}'
        raise e.BadClientResult(msg, wait_err)