1"""Main app definition"""
3# pylint: disable=broad-exception-caught,R0913
4import typing as t
5import logging
6from es_pii_tool.exceptions import FatalError, MissingIndex
7from es_pii_tool.job import Job
8from es_pii_tool.redacters.index import RedactIndex
9from es_pii_tool.trackables import Task
10from es_pii_tool.helpers.elastic_api import get_hits
11from es_pii_tool.helpers.utils import end_it, get_redactions
13if t.TYPE_CHECKING:
14 from elasticsearch8 import Elasticsearch
16# pylint: disable=R0917
18logger = logging.getLogger(__name__)


class PiiTool:
    """Elasticsearch PII Tool"""

    def __init__(
        self,
        client: 'Elasticsearch',
        tracking_index: str,
        redaction_file: str = '',
        redaction_dict: t.Union[t.Dict, None] = None,
        dry_run: bool = False,
    ):
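        # A None default (normalized to {} here) avoids the classic
        # mutable-default-argument pitfall.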
        if redaction_dict is None:
            redaction_dict = {}
        logger.debug('Redactions file: %s', redaction_file)
        self.counter = 0
        self.client = client
        try:
            self.redactions = get_redactions(redaction_file, redaction_dict)
        except Exception as err:
            logger.critical('Unable to load redactions: %s', err)
            raise
        logger.debug('Loaded redactions: %s', self.redactions)
        self.tracking_index = tracking_index
        self.dry_run = dry_run

    def verify_doc_count(self, job: Job) -> bool:
        """Verify that expected_docs and the hits from the query have the same value

        :param job: The job object for the present redaction run

        :type job: :py:class:`~.app.tracking.Job`

        :rtype: bool
        :returns: Whether the query hit count matches ``expected_docs``
        """
        try:
            task = Task(job, task_id=f'PRE---{job.name}---DOC-COUNT-VERIFICATION')
        except Exception as err:
            logger.critical('Unable to create task: %s', err)
            raise FatalError('Unable to create task', err) from err
        success = False
        errors = False
        if task.finished():
            return True  # We're done already
        # Log task start
        task.begin()
        hits = 0
        try:
            hits = get_hits(self.client, job.config['pattern'], job.config['query'])
        except Exception as err:
            logger.critical('Unable to count query result hits: %s', err)
            raise
        msg = f'{hits} hit(s)'
        logger.debug(msg)
        task.add_log(msg)
        logger.info('Checking expected document count...')
        zeromsg = (
            f"For index pattern {job.config['pattern']}, with query "
            f"{job.config['query']} 'expected_docs' is {job.config['expected_docs']} "
            f"but the query returned {hits} matches."
        )
        if job.config['expected_docs'] == hits:
            msg = (
                f'Query result hits: {hits} matches expected_docs: '
                f'{job.config["expected_docs"]}'
            )
            logger.debug(msg)
            task.add_log(msg)
            success = True
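            # Zero hits is treated as a failure even when it matches
            # expected_docs: with nothing to redact, this block cannot proceed.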
            if hits == 0:
                logger.critical(zeromsg)
                logger.info('Continuing to next configuration block (if any)')
                success = False
        else:
            logger.critical(zeromsg)
            logger.info('Continuing to next configuration block (if any)')
        if not success:
            errors = True
            task.add_log(zeromsg)
        task.end(success, errors=errors)
        return success

    def iterate_indices(self, job: Job) -> bool:
        """Iterate over every index in job.indices"""
        all_succeeded = True
        for idx in job.indices:
            try:
                task = Task(job, index=idx, id_suffix='PARENT-TASK')
                # First check to see if idx has been touched as part of a previous run
                if task.finished():
                    continue  # This index has already been verified
                task.begin()
            except Exception as err:
                logger.critical('Unable to create task: %s', err)
                raise FatalError('Unable to create task', err) from err
            task_success = False
            try:
                msg = f'Iterating per index: Index {idx} of {job.indices}'
                logger.debug(msg)
                task.add_log(msg)
                redact = RedactIndex(idx, job, self.counter)
                redact.run()
                task_success = redact.success
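                # Carry the running document count forward so the next index
                # (and the job-level total) resumes from this one's total.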
                self.counter = redact.counter
                logger.debug('RESULT: %s', task_success)
            except MissingIndex as err:
                logger.critical(err)
                raise FatalError(f'Index {err.missing} not found.', err) from err
            except FatalError as err:
                logger.critical('Fatal upstream error encountered: %s', err.message)
                raise FatalError('We suffered a fatal upstream error', err) from err
            end_it(task, task_success)
            if not task.completed:
                all_succeeded = False
                job.add_log(f'Unable to complete task {task.task_id}')
        return all_succeeded

    def iterate_configuration(self) -> None:
        """Iterate over every configuration block in self.redactions"""
        logger.debug('Full redactions object from config: %s', self.redactions)
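        # Expected shape, inferred from the loop below rather than a published
        # schema: {'redactions': [{<job_name>: {<job config>}}, ...]}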
        for config_block in self.redactions['redactions']:  # type: ignore
            job_success = True
            # Reset counter to zero for each full iteration
            self.counter = 0
            if self.dry_run:
                logger.info('DRY-RUN MODE ENABLED. No data will be changed.')

            # There's really only one root-level key in each configuration
            # block, and that's the job name
            job_name = list(config_block.keys())[0]
            args = (self.client, self.tracking_index, job_name, config_block[job_name])
            job = Job(*args, dry_run=self.dry_run)
            if job.finished():
                continue
            job.begin()
            if not self.verify_doc_count(job):
                # This configuration block can't go further because of the mismatch
                job_success = False
                end_it(job, job_success)
                continue

            job_success = self.iterate_indices(job)
            # At this point, self.counter should equal the total, indicating that
            # we matched expected_docs. Register the job as successful, provided
            # no other errors interrupted the process.

            end_it(job, job_success)

    def run(self) -> None:
        """Do the thing"""
        logger.info('PII scrub initiated')
        self.iterate_configuration()
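

# Example usage (a minimal sketch; the connection URL, tracking index name,
# and file path below are illustrative assumptions, not values from this
# module):
#
#   from elasticsearch8 import Elasticsearch
#   from es_pii_tool.base import PiiTool
#
#   client = Elasticsearch('http://localhost:9200')
#   tool = PiiTool(
#       client,
#       tracking_index='pii-tool-tracking',
#       redaction_file='redactions.yml',
#       dry_run=True,  # log what would be redacted without changing data
#   )
#   tool.run()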