Coverage for /Users/buh/.pyenv/versions/3.12.2/envs/pii/lib/python3.12/site-packages/es_pii_tool/job.py: 72%

217 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2025-03-18 12:25 -0600

1"""Functions for creating & updating the progress/status update doc in Elasticsearch""" 

2 

3import typing as t 

4import logging 

5from es_pii_tool.defaults import index_settings, status_mappings 

6from es_pii_tool.exceptions import ( 

7 BadClientResult, 

8 FatalError, 

9 MissingDocument, 

10 MissingIndex, 

11) 

12from es_pii_tool.helpers.elastic_api import ( 

13 create_index, 

14 get_index, 

15 get_tracking_doc, 

16 update_doc, 

17) 

18from es_pii_tool.helpers.utils import now_iso8601, parse_job_config 

19 

20if t.TYPE_CHECKING: 

21 from elasticsearch8 import Elasticsearch 

22 

23logger = logging.getLogger(__name__) 

24 

25# pylint: disable=R0902,R0904,R0913,R0917 

26 

27 

class Job:
    """Class to manage a redaction job

    On construction the tracking index is created via ``create_index`` and any
    stored history for the job is loaded via :py:meth:`get_history`. The
    attributes named in :py:attr:`ATTRLIST` mirror the status fields written to
    and read from the job's tracking document.
    """

    # Status fields copied between the instance and the tracking document
    ATTRLIST = ['start_time', 'completed', 'end_time', 'errors', 'logs']

    def __init__(
        self,
        client: 'Elasticsearch',
        index: str,
        name: str,
        config: t.Dict,
        dry_run: bool = False,
    ):
        """
        :param client: Client passed to the Elasticsearch helper functions
        :param index: Name of the tracking index
        :param name: Name of the job; also used as the tracking document id
        :param config: File-based configuration, used when no stored config exists
        :param dry_run: Whether this run is a dry-run

        :raises FatalError: If creating the tracking index raises
            ``BadClientResult``, or if loading the job history fails
        """
        self.client = client
        self.index = index
        self.name = name
        self.file_config = config
        self.dry_run = dry_run
        self.prev_dry_run = False
        self.cleanup: list[str] = []
        try:
            # If the index is already existent, this function will log that fact
            # and return cleanly
            create_index(  # type: ignore
                client,
                index,
                settings=index_settings(),
                mappings=status_mappings(),
            )
        except BadClientResult as exc:
            logger.critical(exc.message)
            raise FatalError(
                f'Unexpected, but fatal error trying to create index {index}', exc
            ) from exc
        self.get_history()

    @property
    def config(self) -> t.Dict:
        """
        :getter: Get the job configuration dictionary
        :setter: Set the job configuration dictionary
        :type: dict
        """
        return self._config

    @config.setter
    def config(self, value: t.Dict) -> None:
        self._config = value

    @property
    def indices(self) -> t.Sequence[str]:
        """
        :getter: Get the list of indices in this job
        :setter: Set the list of indices in this job
        :type: list
        """
        return self._indices

    @indices.setter
    def indices(self, value: t.Sequence[str]) -> None:
        self._indices = value

    @property
    def total(self) -> int:
        """
        :getter: Get the count of indices in this job
        :setter: Set the count of indices in this job
        :type: int
        """
        return self._total

    @total.setter
    def total(self, value: int) -> None:
        self._total = value

    @property
    def status(self) -> t.Dict:
        """
        :getter: Get the job status
        :setter: Set the job status
        :type: dict
        """
        return self._status

    @status.setter
    def status(self, value: t.Dict) -> None:
        self._status = value

    @property
    def start_time(self) -> str:
        """
        :getter: Get the ISO8601 string representing the start time of this job
        :setter: Set the ISO8601 string representing the start time of this job
        :type: str
        """
        return self._start_time

    @start_time.setter
    def start_time(self, value: str) -> None:
        self._start_time = value

    @property
    def end_time(self) -> str:
        """
        :getter: Get the ISO8601 string representing the end time of this job
        :setter: Set the ISO8601 string representing the end time of this job
        :type: str
        """
        return self._end_time

    @end_time.setter
    def end_time(self, value: str) -> None:
        self._end_time = value

    @property
    def completed(self) -> bool:
        """
        :getter: Get the job completion state
        :setter: Set the job completion state
        :type: bool
        """
        return self._completed

    @completed.setter
    def completed(self, value: bool) -> None:
        self._completed = value

    @property
    def errors(self) -> bool:
        """
        :getter: Get job error state
        :setter: Set job error state
        :type: bool
        """
        return self._errors

    @errors.setter
    def errors(self, value: bool) -> None:
        self._errors = value

    @property
    def logs(self) -> t.Sequence[str]:
        """
        :getter: Get job logs
        :setter: Set job logs
        :type: list
        """
        return self._logs

    @logs.setter
    def logs(self, value: t.Sequence[str]) -> None:
        self._logs = value

    def add_log(self, value: str) -> None:
        """Append another entry to :py:attr:`logs`

        The entry is prefixed with the current ISO8601 timestamp. A new list is
        started when :py:attr:`logs` is ``None``; otherwise the existing
        sequence is appended to in place.

        :param value: The message to record
        """
        entries = [] if self.logs is None else self.logs
        entries.append(f'{now_iso8601()} {value}')
        self.logs = entries

    def get_status(self, data: t.Dict) -> t.Dict:
        """Read the status keys from the data

        Also sets :py:attr:`prev_dry_run` to ``True`` when ``data`` records a
        truthy ``dry_run`` value.

        :param data: The raw contents of the job progress doc

        :returns: Dictionary with one entry per :py:attr:`ATTRLIST` key; keys
            absent from ``data`` map to ``None``
        """
        result = {key: data.get(key) for key in self.ATTRLIST}
        # ``result`` always holds every ATTRLIST key, so emptiness must be
        # judged by what the raw document actually contained.
        if not any(key in data for key in self.ATTRLIST):
            logger.info('No execution status for job %s', self.name)
        # 'dry_run' is not an ATTRLIST key, so it never appears in ``result``;
        # it has to be read from the raw document.
        if data.get('dry_run'):
            logger.info('Prior record of job %s was a dry-run', self.name)
            self.prev_dry_run = True
        return result

    def update_status(self) -> None:
        """Set :py:attr:`status` from the current :py:attr:`ATTRLIST` attributes"""
        self.status = {key: getattr(self, key) for key in self.ATTRLIST}

    def build_doc(self) -> t.Dict:
        """Build the dictionary which will be written to the tracking doc

        Refreshes :py:attr:`status` first. The ``cleanup`` list is only included
        when this is not a dry-run.

        :returns: The tracking doc dictionary
        """
        self.update_status()
        doc = {key: self.status[key] for key in self.ATTRLIST}
        doc['config'] = parse_job_config(self.config, 'write')
        doc['job'] = self.name
        doc['join_field'] = 'job'
        doc['dry_run'] = self.dry_run
        if not self.dry_run:
            doc['cleanup'] = self.cleanup
        return doc

    def get_job(self) -> None:
        """
        Get any job history that may exist for :py:attr:`name`

        Set :py:meth:`config` and :py:meth:`status` with the results. Both are
        set to empty dictionaries when the tracking doc does not exist.

        :raises FatalError: If reading the tracking doc raises anything other
            than ``MissingDocument``
        """
        try:
            result = get_tracking_doc(self.client, self.index, self.name)
        except MissingDocument:
            logger.debug('Job tracking doc does not yet exist.')
            self.config = {}
            self.status = {}
            return
        except Exception as exc:
            # NOTE(review): if MissingIndex derives from Exception it is wrapped
            # here as well, so the MissingIndex handler in get_history would
            # never fire -- confirm which behavior is intended.
            # Guard against exceptions raised without arguments, which would
            # otherwise raise IndexError here and mask the FatalError.
            logger.critical(exc.args[0] if exc.args else exc)
            raise FatalError('We experienced a fatal error', exc) from exc
        try:
            self.config = parse_job_config(result['config'], 'read')
        except KeyError:
            logger.info('No configuration data for job %s', self.name)
            self.config = {}
        self.status = self.get_status(result)

    def launch_prep(self) -> None:
        """
        We don't need to do these actions until :py:meth:`begin` calls this method

        1. Log dry-run status
        2. Set :py:meth:`indices` with the list of indices matching the search
           pattern in the configuration.
        3. Set :py:meth:`total` with the count of indices.
        """
        if self.dry_run:
            msg = 'DRY-RUN: No changes will be made'
            logger.info(msg)
            self.add_log(msg)
        self.indices = list(get_index(self.client, self.config['pattern']))
        logger.debug('Indices from provided pattern: %s', self.indices)
        self.total = len(self.indices)
        logger.debug("Total number of indices to scrub: %s", self.total)

    def load_status(self) -> None:
        """Load prior status values (or not)

        Every :py:attr:`ATTRLIST` attribute is set from :py:attr:`status`, or to
        ``None`` when the key is absent or the prior run was a dry-run.
        """
        for key in self.ATTRLIST:
            # Nothing recorded by a dry-run carries over to this run
            value = None if self.prev_dry_run else self.status.get(key)
            setattr(self, key, value)

    def get_history(self) -> None:
        """
        Get the history of a job, if any. Ensure all values are populated from the
        doc, or None

        Falls back to the file-based configuration when no stored configuration
        was found.

        :raises FatalError: If :py:meth:`get_job` raises ``MissingIndex`` (or a
            ``FatalError`` of its own)
        """
        logger.debug('Pulling any history for job: %s', self.name)
        try:
            self.get_job()
        except MissingIndex as exc:
            logger.critical('Missing index: %s', exc.missing)
            raise FatalError(
                f'Fatal error encountered. Index {exc.missing} was not found', exc
            ) from exc
        if not self.config:
            logger.info(
                'No stored config for job: %s. Using file-based config', self.name
            )
            self.config = self.file_config
        if not self.status:
            logger.debug('No event history for job: %s', self.name)
        self.load_status()

    def report_history(self) -> None:
        """
        Report the history of any prior attempt to run the Job
        Log aspects of the history here.
        """
        prefix = f'The prior run of job: {self.name}'
        if self.prev_dry_run:
            logger.info('%s was a dry_run', prefix)
        if self.start_time:
            logger.info('%s started at %s', prefix, self.start_time)
        if self.completed:
            if self.end_time:
                logger.info('%s completed at %s', prefix, self.end_time)
            else:
                msg = 'is marked completed but did not record an end time'
                logger.warning('%s started at %s and %s', prefix, self.start_time, msg)
        if self.errors:
            logger.warning('%s encountered errors.', prefix)
            if self.logs:
                # Only report the logs when errors were recorded
                logger.warning('%s had log(s): %s', prefix, self.logs)

    def begin(self) -> None:
        """Begin the job and record the current status"""
        logger.info('Beginning job: %s', self.name)
        self.launch_prep()
        self.start_time = now_iso8601()
        self.completed = False
        self.record()

    def end(
        self,
        completed: bool = False,
        errors: bool = False,
        logmsg: t.Union[str, None] = None,
    ) -> None:
        """End the job and record the current status

        :param completed: Did the job complete successfully?
        :param errors: Were errors encountered doing the job?
        :param logmsg: Message appended to :py:attr:`logs` when provided
        """
        if self.dry_run:
            msg = (
                f'DRY-RUN: Not recording snapshots that can be deleted: {self.cleanup}'
            )
            logger.info(msg)
            self.add_log(msg)
        self.end_time = now_iso8601()
        self.completed = completed
        self.errors = errors
        if logmsg:
            self.add_log(logmsg)
        self.record()
        logger.info('Job: %s ended. Completed: %s', self.name, completed)

    def record(self) -> None:
        """Record the current status of the job

        :rtype: None
        :returns: No return value

        :raises FatalError: If updating the tracking doc raises any exception
        """
        doc = self.build_doc()
        try:
            update_doc(self.client, self.index, self.name, doc)
        except Exception as exc:
            # Guard against exceptions raised without arguments, which would
            # otherwise raise IndexError here and mask the FatalError.
            logger.critical(exc.args[0] if exc.args else exc)
            raise FatalError('Unable to update document', exc) from exc

    def finished(self) -> bool:
        """Check if a prior run was recorded for this job and log accordingly

        :returns: ``True`` when a prior run is marked completed and this is not
            a dry-run; ``False`` otherwise
        """
        if self.completed:
            if self.dry_run:
                logger.info(
                    'DRY-RUN: Ignoring previous successful run of job: %s', self.name
                )
            else:
                logger.info('Job %s was completed previously.', self.name)
                return True
        if self.start_time:
            self.report_history()
            logger.info('Restarting or resuming job: %s', self.name)
        return False