Coverage for /Users/buh/.pyenv/versions/3.12.2/envs/pii/lib/python3.12/site-packages/es_pii_tool/redacters/index.py: 68%
122 statements
« prev ^ index » next coverage.py v7.5.0, created at 2024-10-01 16:39 -0600
« prev ^ index » next coverage.py v7.5.0, created at 2024-10-01 16:39 -0600
1"""Redact data from an Elasticsearch index"""
3import typing as t
4import logging
5from dotmap import DotMap # type: ignore
6from es_pii_tool.exceptions import (
7 BadClientResult,
8 FatalError,
9 MissingIndex,
10)
11from es_pii_tool.task import Task
12from es_pii_tool.helpers.utils import (
13 exception_msgmaker,
14 get_field_matches,
15)
16from es_pii_tool.helpers import elastic_api as api
17from es_pii_tool.redacters.snapshot import RedactSnapshot
19if t.TYPE_CHECKING:
20 from es_pii_tool.job import Job
22logger = logging.getLogger(__name__)
class RedactIndex:
    """Redact index per settings

    One instance handles one index for one job: it runs the job's query against
    the index, checks that the expected fields are present, looks up the ILM
    phase, and then redacts either directly (writeable index) or through
    :class:`RedactSnapshot` (``cold``/``frozen`` phase). The outcome is exposed
    through :attr:`success`.
    """

    # Default outcome, so ``success`` can be read safely before ``run()`` (or a
    # failed ``verify_index()``) has recorded a result.
    _success: bool = False

    def __init__(self, index: str, job: 'Job', counter: int):
        """Create the task record for ``index`` and verify the index exists.

        :param index: Name of the index to redact.
        :param job: The job this redaction belongs to (supplies client/config).
        :param counter: Running count of indices processed so far.
        :raises ValueError: via :meth:`verify_index` if the index is not found.
        """
        self.task = Task(job, index=index, id_suffix='REDACT-INDEX')
        self.index = index
        self.counter = counter
        self.data = DotMap()
        self.verify_index()

    @property
    def success(self) -> bool:
        """Was the redaction a success?"""
        return self._success

    @success.setter
    def success(self, value: bool) -> None:
        self._success = value

    def end_in_failure(
        self,
        exception: 't.Union[BadClientResult, MissingIndex]',
        reraise: bool = False,
        func: t.Union[t.Callable, None] = None,
        kwargs: t.Union[t.Dict[str, t.Union[bool, str]], None] = None,
    ) -> None:
        """For steps and checks that end in failure, we lump you into this method

        :param exception: The exception that caused the failure.
        :param reraise: If ``True``, raise :class:`FatalError` after handling.
        :param func: Optional callable to invoke (e.g. the task ender).
        :param kwargs: Keyword arguments for ``func``. If it contains a
            ``logmsg`` key, that value is replaced by the failure message.
        :raises FatalError: when ``reraise`` is ``True``.
        """
        msg = exception_msgmaker(exception)
        logger.critical(msg)
        if func:
            if kwargs is None:
                call_kwargs: t.Dict[str, t.Union[bool, str]] = {}
                logger.error('Empty kwargs passed')
            else:
                # Work on a copy so the caller's dict is not modified
                call_kwargs = dict(kwargs)
            if 'logmsg' in call_kwargs:  # For the task ender
                call_kwargs['logmsg'] = msg
            func(**call_kwargs)
        if reraise:
            # Chain the original exception so the traceback shows the cause
            raise FatalError(msg, exception) from exception

    def verify_index(self):
        """Verify the index exists

        :raises ValueError: if ``api.verify_index`` does not confirm the index.
        """
        # If the index name changed because of an ILM phase shift from hot to cold
        # or cold to frozen, then we should verify the name change here. We should
        # raise an exception if the name of the index changed or it disappeared.
        if not api.verify_index(self.task.job.client, self.index):
            msg = f'Halting execution: Index {self.index} changed or is missing.'
            logger.critical(msg)
            self.success = False
            raise ValueError(msg, 'index not found as expected', self.index)

    def run_query(self):
        """Run the query

        Stores the search response in ``self.data.result`` and the total hit
        count in ``self.data.hits``. When there are no hits, the counter is
        advanced and the task is ended as completed.
        """
        self.data.result = DotMap(
            dict(
                api.do_search(
                    self.task.job.client,
                    self.index,
                    self.task.job.config['query'],
                    size=10000,
                )
            )
        )
        self.data.hits = self.data.result.hits.total.value
        logger.debug('Checking document fields on index: %s...', self.index)
        if self.data.hits == 0:
            self.counter += 1
            msg = f'Documents matching redaction query not found on index: {self.index}'
            logger.debug(msg)
            msg = f'Index {self.counter} of {self.task.job.total} processed...'
            logger.info(msg)
            # Record success for this task but send msg to the log field
            # An index could be in the pattern but have no matches.
            self.task.end(True, logmsg=msg)
        self.task.add_log(f"Hits: {self.data.hits}")

    def verify_fields(self):
        """Verify the fields in the query results match what we expect

        If no field matches are found, the task is ended as completed; this is
        logged as a warning rather than treated as an error.
        """
        if not get_field_matches(self.task.job.config, self.data.result.toDict()) > 0:
            msg = f'Fields required for redaction not found on index: {self.index}'
            logger.warning(msg)
            self.task.end(completed=True, logmsg=msg)
            logger.warning(
                'Not a fatal error. Index in pattern does not have the specified fields'
            )

    def get_phase(self):
        """Get the ILM phase (if any) for the index

        Stores the phase (or a placeholder string when none is returned) in
        ``self.data.phase``.

        :raises FatalError: if the phase lookup raises :class:`MissingIndex`.
        """
        nope = 'Not assigned an ILM Phase'
        try:
            self.data.phase = api.get_phase(self.task.job.client, self.index) or nope
        except MissingIndex as exc:
            kwargs = {'completed': False, 'errors': True, 'logmsg': 'replaceme'}
            self.end_in_failure(exc, reraise=True, func=self.task.end, kwargs=kwargs)
        logger.debug('Index in phase: %s', self.data.phase.upper())
        self.task.add_log(f'ILM Phase: {self.data.phase}')

    def normal_redact(self) -> bool:
        """Redact data from a normal (not searchable-snapshot) index

        :returns: ``True`` if redaction succeeded or was skipped for a dry-run;
            ``False`` if redaction failed, in which case the task has already
            been ended with errors through :meth:`end_in_failure`.
        """
        msg = 'Initiating redaction of data from writeable index...'
        logger.info(msg)
        self.task.add_log(msg)
        # As the redact_from_index function doesn't track dry-run, we have to do it
        if self.task.job.dry_run:
            msg = f'DRY-RUN: Will not redact data from {self.index}'
            logger.info(msg)
            self.task.add_log(msg)
            return True
        msg = f'Redacting data from {self.index}'
        logger.info(msg)
        self.task.add_log(msg)
        try:
            api.redact_from_index(
                self.task.job.client, self.index, self.task.job.config
            )
        except (MissingIndex, BadClientResult) as exc:
            kwargs = {'completed': False, 'errors': True, 'logmsg': 'replaceme'}
            self.end_in_failure(exc, reraise=False, func=self.task.end, kwargs=kwargs)
            # Report the failure so run() does not mark the task as DONE
            return False
        return True

    def snapshot_redact(self):
        """Redact data from searchable snapshot-backed index

        Any exception from building or running the :class:`RedactSnapshot`
        object is logged at critical level and re-raised unchanged.
        """
        msg = 'Initiating redaction of data from mounted searchable snapshot...'
        logger.info(msg)
        self.task.add_log(msg)
        try:
            snp = RedactSnapshot(self.index, self.task.job, self.data.phase)
        except Exception as exc:
            logger.critical('Unable to build RedactSnapshot object. Exception: %s', exc)
            raise
        try:
            snp.run()
        except Exception as exc:
            logger.critical('Unable to run RedactSnapshot object. Exception: %s', exc)
            raise

    def run(self):
        """Do the actual run

        Steps: skip if the task already finished; otherwise begin the task, run
        the query, verify fields, look up the ILM phase, and redact. Each of
        the query/field steps may end the task early (nothing to redact), which
        counts as success. A failed redaction of a writeable index leaves
        ``success`` as ``False`` and does not overwrite the task's error record.
        """
        if self.task.finished():
            self.success = True
            return
        # Log task start time
        self.task.begin()
        self.run_query()
        if self.task.completed:
            self.success = True
            return
        self.verify_fields()
        if self.task.completed:
            self.success = True
            return
        self.get_phase()
        if self.data.phase in ('cold', 'frozen'):
            self.snapshot_redact()
        elif self.normal_redact() is False:
            # The task was already ended with errors; do not report it as done.
            self.success = False
            return
        # If we have reached this point, we've succeeded.
        self.counter += 1
        msg = f'Index {self.counter} of {self.task.job.total} processed...'
        logger.info(msg)
        self.task.add_log(msg)
        self.task.end(completed=True, logmsg='DONE')
        self.success = True