Coverage for /Users/buh/.pyenv/versions/3.12.2/envs/pii/lib/python3.12/site-packages/es_pii_tool/job.py: 72%
217 statements
« prev ^ index » next coverage.py v7.5.0, created at 2025-03-18 12:25 -0600
« prev ^ index » next coverage.py v7.5.0, created at 2025-03-18 12:25 -0600
1"""Functions for creating & updating the progress/status update doc in Elasticsearch"""
3import typing as t
4import logging
5from es_pii_tool.defaults import index_settings, status_mappings
6from es_pii_tool.exceptions import (
7 BadClientResult,
8 FatalError,
9 MissingDocument,
10 MissingIndex,
11)
12from es_pii_tool.helpers.elastic_api import (
13 create_index,
14 get_index,
15 get_tracking_doc,
16 update_doc,
17)
18from es_pii_tool.helpers.utils import now_iso8601, parse_job_config
20if t.TYPE_CHECKING:
21 from elasticsearch8 import Elasticsearch
23logger = logging.getLogger(__name__)
25# pylint: disable=R0902,R0904,R0913,R0917
class Job:
    """Manage a redaction job and the tracking document that records its progress.

    On construction the tracking index is created (if needed) and any prior
    history stored for the job name is loaded into the instance attributes
    named in :py:attr:`ATTRLIST`.
    """

    #: Names of the status attributes persisted to, and restored from, the
    #: tracking document.
    ATTRLIST = ['start_time', 'completed', 'end_time', 'errors', 'logs']

    def __init__(
        self,
        client: 'Elasticsearch',
        index: str,
        name: str,
        config: t.Dict,
        dry_run: bool = False,
    ):
        """
        :param client: A client connection object
        :param index: The name of the job tracking index
        :param name: The name of this job (also used as the tracking doc id)
        :param config: The job configuration read from the configuration file
        :param dry_run: If ``True``, record the run as a dry-run

        :raises FatalError: if the tracking index cannot be created, or the
            job history cannot be read
        """
        self.client = client
        self.index = index
        self.name = name
        self.file_config = config
        self.dry_run = dry_run
        # Set to True by get_status() when the stored record was a dry-run
        self.prev_dry_run = False
        self.cleanup: list[str] = []
        try:
            # If the index is already existent, this function will log that fact and
            # return cleanly
            args = (client, index)
            kwargs = {'settings': index_settings(), 'mappings': status_mappings()}
            create_index(*args, **kwargs)  # type: ignore
        except BadClientResult as exc:
            logger.critical(exc.message)
            raise FatalError(
                f'Unexpected, but fatal error trying to create index {index}', exc
            ) from exc
        self.get_history()

    @property
    def config(self) -> t.Dict:
        """
        :getter: Get the job configuration dictionary
        :setter: Set the job configuration dictionary
        :type: dict
        """
        return self._config

    @config.setter
    def config(self, value: t.Dict) -> None:
        self._config = value

    @property
    def indices(self) -> t.Sequence[str]:
        """
        :getter: Get the list of indices in this job
        :setter: Set the list of indices in this job
        :type: list
        """
        return self._indices

    @indices.setter
    def indices(self, value: t.Sequence[str]) -> None:
        self._indices = value

    @property
    def total(self) -> int:
        """
        :getter: Get the count of indices in this job
        :setter: Set the count of indices in this job
        :type: int
        """
        return self._total

    @total.setter
    def total(self, value: int) -> None:
        self._total = value

    @property
    def status(self) -> t.Dict:
        """
        :getter: Get the job status
        :setter: Set the job status
        :type: dict
        """
        return self._status

    @status.setter
    def status(self, value: t.Dict) -> None:
        self._status = value

    @property
    def start_time(self) -> str:
        """
        :getter: Get the ISO8601 string representing the start time of this job
        :setter: Set the ISO8601 string representing the start time of this job
        :type: str
        """
        return self._start_time

    @start_time.setter
    def start_time(self, value: str) -> None:
        self._start_time = value

    @property
    def end_time(self) -> str:
        """
        :getter: Get the ISO8601 string representing the end time of this job
        :setter: Set the ISO8601 string representing the end time of this job
        :type: str
        """
        return self._end_time

    @end_time.setter
    def end_time(self, value: str) -> None:
        self._end_time = value

    @property
    def completed(self) -> bool:
        """
        :getter: Get the job completion state
        :setter: Set the job completion state
        :type: bool
        """
        return self._completed

    @completed.setter
    def completed(self, value: bool) -> None:
        self._completed = value

    @property
    def errors(self) -> bool:
        """
        :getter: Get job error state
        :setter: Set job error state
        :type: bool
        """
        return self._errors

    @errors.setter
    def errors(self, value: bool) -> None:
        self._errors = value

    @property
    def logs(self) -> t.Sequence[str]:
        """
        :getter: Get job logs
        :setter: Set job logs
        :type: list
        """
        return self._logs

    @logs.setter
    def logs(self, value: t.Sequence[str]) -> None:
        self._logs = value

    def add_log(self, value: str) -> None:
        """Append another timestamped entry to :py:attr:`logs`

        :param value: The message to record. It is prefixed with the current
            ISO8601 timestamp.
        """
        # load_status() may have left logs as None; start a fresh list then.
        # An existing list is appended to in place, as before.
        entries = [] if self.logs is None else self.logs
        entries.append(f'{now_iso8601()} {value}')
        self.logs = entries

    def get_status(self, data: t.Dict) -> t.Dict:
        """Read the status keys from the data

        Also sets :py:attr:`prev_dry_run` to ``True`` when ``data`` records
        that the prior run was a dry-run.

        :param data: The raw contents of the job progress doc

        :returns: Dictionary with one entry per name in :py:attr:`ATTRLIST`;
            names absent from ``data`` map to ``None``
        """
        result = {key: data.get(key) for key in self.ATTRLIST}
        # Every key is always present in result, so "no status" means that
        # none of them carried a value.
        if all(value is None for value in result.values()):
            logger.info('No execution status for job %s', self.name)
        # The dry-run flag is not one of the ATTRLIST keys, so it has to be
        # read from the raw document rather than from result.
        if data.get('dry_run'):
            logger.info('Prior record of job %s was a dry-run', self.name)
            self.prev_dry_run = True
        return result

    def update_status(self) -> None:
        """Update :py:attr:`status` with the current attribute values"""
        self.status = {key: getattr(self, key) for key in self.ATTRLIST}

    def build_doc(self) -> t.Dict:
        """Build the dictionary which will be the written to the tracking doc

        :returns: The tracking doc dictionary
        """
        self.update_status()
        doc = {key: self.status[key] for key in self.ATTRLIST}
        doc['config'] = parse_job_config(self.config, 'write')
        doc['job'] = self.name
        doc['join_field'] = 'job'
        doc['dry_run'] = self.dry_run
        if not self.dry_run:
            # Cleanup candidates are only recorded for real runs
            doc['cleanup'] = self.cleanup
        return doc

    def get_job(self) -> None:
        """
        Get any job history that may exist for :py:attr:`name`

        Set :py:meth:`config` and :py:meth:`status` with the results. Both are
        set to empty dictionaries when no tracking doc exists yet.

        :raises FatalError: on any error other than a missing tracking doc
        """
        try:
            result = get_tracking_doc(self.client, self.index, self.name)
        except MissingDocument:
            logger.debug('Job tracking doc does not yet exist.')
            self.config = {}
            self.status = {}
            return
        # NOTE(review): this broad handler presumably also catches MissingIndex,
        # which would make the MissingIndex handler in get_history unreachable.
        # Confirm against es_pii_tool.exceptions before narrowing it.
        except Exception as exc:
            # The first arg is normally the message, but do not let an
            # exception raised without args hide the real failure.
            logger.critical(exc.args[0] if exc.args else repr(exc))
            raise FatalError('We experienced a fatal error', exc) from exc
        try:
            self.config = parse_job_config(result['config'], 'read')
        except KeyError:
            logger.info('No configuration data for job %s', self.name)
            self.config = {}
        self.status = self.get_status(result)

    def launch_prep(self) -> None:
        """
        We don't need to do these actions until :py:meth:`begin` calls this method

        1. Log dry-run status
        2. Set :py:meth:`indices` with the list of indices matching the search pattern
           in the configuration file.
        3. Set :py:meth:`total` with the count of indices.
        """
        if self.dry_run:
            msg = 'DRY-RUN: No changes will be made'
            logger.info(msg)
            self.add_log(msg)
        self.indices = list(get_index(self.client, self.config['pattern']))
        logger.debug('Indices from provided pattern: %s', self.indices)
        self.total = len(self.indices)
        logger.debug("Total number of indices to scrub: %s", self.total)

    def load_status(self) -> None:
        """Load prior status values (or not)

        Every attribute named in :py:attr:`ATTRLIST` is set. If the prior run
        was a dry-run, or a key is absent from :py:attr:`status`, the
        attribute is set to ``None``.
        """
        for key in self.ATTRLIST:
            # A prior dry-run made no changes, so none of its status is reused
            value = None if self.prev_dry_run else self.status.get(key)
            setattr(self, key, value)

    def get_history(self) -> None:
        """
        Get the history of a job, if any. Ensure all values are populated from the doc,
        or None

        :raises FatalError: if the tracking index is reported missing
        """
        logger.debug('Pulling any history for job: %s', self.name)
        try:
            self.get_job()
        except MissingIndex as exc:
            logger.critical('Missing index: %s', exc.missing)
            raise FatalError(
                f'Fatal error encountered. Index {exc.missing} was not found', exc
            ) from exc
        if not self.config:
            logger.info(
                'No stored config for job: %s. Using file-based config', self.name
            )
            self.config = self.file_config
        if not self.status:
            logger.debug('No event history for job: %s', self.name)
        self.load_status()

    def report_history(self) -> None:
        """
        Report the history of any prior attempt to run the Job
        Log aspects of the history here.
        """
        prefix = f'The prior run of job: {self.name}'
        if self.prev_dry_run:
            logger.info('%s was a dry_run', prefix)
        if self.start_time:
            logger.info('%s started at %s', prefix, self.start_time)
        if self.completed:
            if self.end_time:
                logger.info('%s completed at %s', prefix, self.end_time)
            else:
                msg = 'is marked completed but did not record an end time'
                logger.warning('%s started at %s and %s', prefix, self.start_time, msg)
        if self.errors:
            logger.warning('%s encountered errors.', prefix)
            if self.logs:
                # Only report the log if a error is True
                logger.warning('%s had log(s): %s', prefix, self.logs)

    def begin(self) -> None:
        """Begin the job and record the current status"""
        logger.info('Beginning job: %s', self.name)
        self.launch_prep()
        self.start_time = now_iso8601()
        self.completed = False
        self.record()

    def end(
        self,
        completed: bool = False,
        errors: bool = False,
        logmsg: t.Union[str, None] = None,
    ) -> None:
        """End the job and record the current status

        :param completed: Did the job complete successfully?
        :param errors: Were errors encountered doing the job?
        :param logmsg: Optional message to append to :py:attr:`logs`
        """
        if self.dry_run:
            msg = (
                f'DRY-RUN: Not recording snapshots that can be deleted: {self.cleanup}'
            )
            logger.info(msg)
            self.add_log(msg)
        self.end_time = now_iso8601()
        self.completed = completed
        self.errors = errors
        if logmsg:
            self.add_log(logmsg)
        self.record()
        logger.info('Job: %s ended. Completed: %s', self.name, completed)

    def record(self) -> None:
        """Record the current status of the job

        :raises FatalError: if the tracking document cannot be updated

        :rtype: None
        :returns: No return value
        """
        doc = self.build_doc()
        try:
            update_doc(self.client, self.index, self.name, doc)
        except Exception as exc:
            # The first arg is normally the message, but do not let an
            # exception raised without args hide the real failure.
            logger.critical(exc.args[0] if exc.args else repr(exc))
            raise FatalError('Unable to update document', exc) from exc

    def finished(self) -> bool:
        """Check if a prior run was recorded for this job and log accordingly

        :returns: ``True`` only when a prior run is marked completed and the
            current run is not a dry-run; ``False`` otherwise
        """
        if self.completed:
            if self.dry_run:
                logger.info(
                    'DRY-RUN: Ignoring previous successful run of job: %s', self.name
                )
            else:
                logger.info('Job %s was completed previously.', self.name)
                return True
        if self.start_time:
            self.report_history()
            logger.info('Restarting or resuming job: %s', self.name)
        return False