Coverage for /Users/buh/.pyenv/versions/3.12.2/envs/pii/lib/python3.12/site-packages/es_pii_tool/redacters/index.py: 67%
126 statements
« prev ^ index » next coverage.py v7.5.0, created at 2025-03-17 23:33 -0600
« prev ^ index » next coverage.py v7.5.0, created at 2025-03-17 23:33 -0600
1"""Redact data from an Elasticsearch index"""
3import typing as t
4import logging
5from dotmap import DotMap # type: ignore
6from es_pii_tool.exceptions import (
7 BadClientResult,
8 FatalError,
9 MissingIndex,
10)
11from es_pii_tool.trackables import Task
12from es_pii_tool.helpers.utils import (
13 exception_msgmaker,
14 get_field_matches,
15)
16from es_pii_tool.helpers import elastic_api as api
17from es_pii_tool.redacters.snapshot import RedactSnapshot
19if t.TYPE_CHECKING:
20 from es_pii_tool.job import Job
22logger = logging.getLogger(__name__)
class RedactIndex:
    """Redact the documents matching the job's query from a single index.

    One instance handles one index. It creates a ``Task`` for tracking, checks
    that the index is still present under the expected name, and, when
    :meth:`run` is called, searches for matching documents, confirms the
    configured fields are present, looks up the ILM phase and dispatches to
    either :meth:`snapshot_redact` (``cold``/``frozen`` phases) or
    :meth:`normal_redact` (anything else).

    After :meth:`run`, ``success`` tells the caller whether the index was
    handled without a recorded failure and ``counter`` holds the updated
    processed-index count.
    """

    def __init__(self, index: str, job: 'Job', counter: int):
        """Create the tracking task and verify the index.

        :param index: Name of the index to redact.
        :param job: The owning job; its ``client``, ``config``, ``dry_run``
            and ``total`` attributes are read via ``self.task.job``.
        :param counter: Running count of indices processed so far.

        :raises FatalError: If the ``Task`` cannot be created.
        :raises ValueError: If the index cannot be verified (see
            :meth:`verify_index`).
        """
        # Give ``success`` a defined value from the start so that reading it
        # before run() completes (or after run() raised) cannot fail with
        # AttributeError.
        self._success = False
        try:
            self.task = Task(job, index=index, id_suffix='REDACT-INDEX')
        except Exception as exc:
            logger.critical('Unable to create task: %s', exc)
            raise FatalError('Unable to create task', exc) from exc
        self.index = index
        self.counter = counter
        # Scratch space for the search result, hit count and ILM phase
        self.data = DotMap()
        self.verify_index()

    @property
    def success(self) -> bool:
        """Was the redaction a success?"""
        return self._success

    @success.setter
    def success(self, value: bool) -> None:
        self._success = value

    def end_in_failure(
        self,
        exception: t.Union[BadClientResult, MissingIndex],
        reraise: bool = False,
        func: t.Union[t.Callable, None] = None,
        kwargs: t.Union[t.Dict[str, t.Union[bool, str]], None] = None,
    ) -> None:
        """Common handling for steps and checks that end in failure.

        Logs a critical message built from ``exception`` by
        ``exception_msgmaker``, optionally calls ``func`` (typically the task
        ender) and optionally raises :class:`FatalError`.

        :param exception: The exception that caused the failure.
        :param reraise: If ``True``, raise ``FatalError`` chained to
            ``exception`` after any callback has run.
        :param func: Optional callable invoked as ``func(**kwargs)``.
        :param kwargs: Keyword arguments for ``func``. If a ``logmsg`` key is
            present its value is replaced by the generated message. The
            caller's dict is not modified.

        :raises FatalError: Only when ``reraise`` is ``True``.
        """
        msg = exception_msgmaker(exception)
        logger.critical(msg)
        if func:
            call_kwargs: t.Dict[str, t.Union[bool, str]]
            if kwargs is None:
                logger.error('Empty kwargs passed')
                call_kwargs = {}
            else:
                # Work on a copy so the caller's dict is left untouched
                call_kwargs = dict(kwargs)
            if 'logmsg' in call_kwargs:  # For the task ender
                call_kwargs['logmsg'] = msg
            func(**call_kwargs)
        if reraise:
            raise FatalError(msg, exception) from exception

    def verify_index(self) -> None:
        """Verify the index exists.

        :raises ValueError: If ``api.verify_index`` reports that the index is
            not present under the expected name. ``success`` is set to
            ``False`` before raising.
        """
        # If the index name changed because of an ILM phase shift from hot to cold
        # or cold to frozen, then we should verify the name change here. We should
        # raise an exception if the name of the index changed or it disappeared.
        if not api.verify_index(self.task.job.client, self.index):
            msg = f'Halting execution: Index {self.index} changed or is missing.'
            logger.critical(msg)
            self.success = False
            raise ValueError(msg, 'index not found as expected', self.index)

    def run_query(self) -> None:
        """Run the job's query against the index and record the hit count.

        Stores the search response in ``self.data.result`` and the total hit
        count in ``self.data.hits``. When there are no hits the index counts
        as processed: the counter is incremented and the task is ended as
        completed, which :meth:`run` then detects via ``self.task.completed``.
        """
        self.data.result = DotMap(
            dict(
                api.do_search(
                    self.task.job.client,
                    self.index,
                    self.task.job.config['query'],
                    size=10000,
                )
            )
        )
        self.data.hits = self.data.result.hits.total.value
        logger.debug('Checking document fields on index: %s...', self.index)
        if self.data.hits == 0:
            self.counter += 1
            msg = f'Documents matching redaction query not found on index: {self.index}'
            logger.debug(msg)
            msg = f'Index {self.counter} of {self.task.job.total} processed...'
            logger.info(msg)
            # Record success for this task but send msg to the log field
            # An index could be in the pattern but have no matches.
            self.task.end(True, logmsg=msg)
        self.task.add_log(f"Hits: {self.data.hits}")

    def verify_fields(self) -> None:
        """Verify the fields in the query results match what we expect.

        If ``get_field_matches`` does not report a positive count for the job
        configuration against the search result, the task is ended as
        completed with a warning. This is not treated as a fatal error.
        """
        matches = get_field_matches(self.task.job.config, self.data.result.toDict())
        if not matches > 0:
            msg = f'Fields required for redaction not found on index: {self.index}'
            logger.warning(msg)
            self.task.end(completed=True, logmsg=msg)
            logger.warning(
                'Not a fatal error. Index in pattern does not have the specified fields'
            )

    def get_phase(self) -> None:
        """Get the ILM phase (if any) for the index.

        Stores the phase in ``self.data.phase``, falling back to a
        placeholder string when ``api.get_phase`` returns a falsy value.

        :raises FatalError: If the lookup raises ``MissingIndex``; the task is
            ended with errors first.
        """
        nope = 'Not assigned an ILM Phase'
        try:
            self.data.phase = api.get_phase(self.task.job.client, self.index) or nope
        except MissingIndex as exc:
            # 'logmsg' is a placeholder; end_in_failure substitutes the real message
            kwargs = {'completed': False, 'errors': True, 'logmsg': 'replaceme'}
            self.end_in_failure(exc, reraise=True, func=self.task.end, kwargs=kwargs)
        logger.debug('Index in phase: %s', self.data.phase.upper())
        self.task.add_log(f'ILM Phase: {self.data.phase}')

    def normal_redact(self) -> bool:
        """Redact data from a normal (not searchable-snapshot) index.

        In dry-run mode nothing is redacted and only a log entry is written.

        :returns: ``False`` if redaction raised ``MissingIndex`` or
            ``BadClientResult`` (the task has then already been ended with
            errors), otherwise ``True``.
        """
        msg = 'Initiating redaction of data from writeable index...'
        logger.info(msg)
        self.task.add_log(msg)
        # As the redact_from_index function doesn't track dry-run, we have to do it
        if self.task.job.dry_run:
            msg = f'DRY-RUN: Will not redact data from {self.index}'
            logger.info(msg)
            self.task.add_log(msg)
            return True
        msg = f'Redacting data from {self.index}'
        logger.info(msg)
        self.task.add_log(msg)
        try:
            api.redact_from_index(
                self.task.job.client, self.index, self.task.job.config
            )
        except (MissingIndex, BadClientResult) as exc:
            # 'logmsg' is a placeholder; end_in_failure substitutes the real message
            kwargs = {'completed': False, 'errors': True, 'logmsg': 'replaceme'}
            self.end_in_failure(exc, reraise=False, func=self.task.end, kwargs=kwargs)
            # Tell run() not to overwrite the recorded failure with success
            return False
        return True

    def snapshot_redact(self) -> None:
        """Redact data from searchable snapshot-backed index.

        Builds a ``RedactSnapshot`` for this index and runs it. Any exception
        from either step is logged as critical and re-raised unchanged.
        """
        msg = 'Initiating redaction of data from mounted searchable snapshot...'
        logger.info(msg)
        self.task.add_log(msg)
        try:
            snp = RedactSnapshot(self.index, self.task.job, self.data.phase)
        except Exception as exc:
            logger.critical('Unable to build RedactSnapshot object. Exception: %s', exc)
            raise
        try:
            snp.run()
        except Exception as exc:
            logger.critical('Unable to run RedactSnapshot object. Exception: %s', exc)
            raise

    def run(self) -> None:
        """Do the actual run.

        Sets ``success`` to ``True`` when the task was already finished, when
        the query or field check ended the task as completed, or when
        redaction went through. Sets ``success`` to ``False`` and leaves the
        task in its failed state when :meth:`normal_redact` reports a failure.
        """
        if self.task.finished():
            self.success = True
            return
        # Log task start time
        self.task.begin()
        self.run_query()
        if self.task.completed:
            # No matching documents; run_query already ended the task
            self.success = True
            return
        self.verify_fields()
        if self.task.completed:
            # Required fields absent; verify_fields already ended the task
            self.success = True
            return
        self.get_phase()
        if self.data.phase in ('cold', 'frozen'):
            self.snapshot_redact()
        elif not self.normal_redact():
            # The task was already ended with errors. Do not count the index
            # as processed or mark the task as DONE.
            self.success = False
            return
        # If we have reached this point, we've succeeded.
        self.counter += 1
        msg = f'Index {self.counter} of {self.task.job.total} processed...'
        logger.info(msg)
        self.task.add_log(msg)
        self.task.end(completed=True, logmsg='DONE')
        self.success = True