Coverage for /Users/buh/.pyenv/versions/3.12.2/envs/pii/lib/python3.12/site-packages/es_pii_tool/helpers/utils.py: 71%

206 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2025-03-18 12:25 -0600

1"""Helper Functions""" 

2 

3import typing as t 

4import logging 

5from os import environ 

6import json 

7from inspect import stack 

8from datetime import datetime, timezone 

9import re 

10from es_client.exceptions import ConfigurationError as esc_ConfigError 

11from es_client.helpers.schemacheck import SchemaCheck 

12from es_client.helpers.utils import get_yaml 

13from es_wait.exceptions import EsWaitFatal, EsWaitTimeout, IlmWaitError 

14import es_pii_tool.exceptions as e 

15from es_pii_tool.defaults import ( 

16 PHASES, 

17 PAUSE_DEFAULT, 

18 TIMEOUT_DEFAULT, 

19 TIMINGS, 

20 redaction_schema, 

21) 

22 

23if t.TYPE_CHECKING: 

24 from dotmap import DotMap # type: ignore 

25 from voluptuous import Schema 

26 from elasticsearch8 import Elasticsearch 

27 from es_pii_tool.job import Job 

28 from es_pii_tool.trackables import Task 

29 

# Module-level logger shared by every helper in this module
logger = logging.getLogger(__name__)

31 

32 

def build_script(message: str, fields: t.Sequence[str]) -> t.Dict[str, str]:
    """
    Build a painless script for redacting fields by way of an update_by_query operation

    :param message: The text to put in place of whatever is in a field
    :param fields: The list of field names to act on

    :type message: str
    :type fields: list

    :rtype: dict
    :returns: A dictionary of ``{"source": (assembled message), "lang": "painless"}``
    """
    # Escape backslashes and single quotes so the message cannot terminate the
    # painless string literal early (script syntax error / injection guard).
    # Messages without those characters produce the same script as before.
    safe = message.replace('\\', '\\\\').replace("'", "\\'")
    msg = ""
    for field in fields:
        msg += f"ctx._source.{field} = '{safe}'; "
    script = {"source": msg, "lang": "painless"}
    logger.debug('script = %s', script)
    return script

52 

53 

def check_dotted_fields(result: t.Dict, field: str, message: str) -> bool:
    """Walk a dotted field path in the first hit and verify the redacted value

    :param result: The search result object
    :param field: The field with dotted notation

    :type result: dict
    :type field: str

    :returns: Success (``True``) or Failure (``False``)
    :rtype: bool
    """
    logger.debug('Dotted field detected: (%s) ...', field)
    node = result['hits']['hits'][0]['_source']
    keys = field.split('.')
    for position, key in enumerate(keys, start=1):
        if key not in node:
            # Path broken partway down: the field was not found
            return False
        node = node[key]
        if position == len(keys):
            # Reached the leaf: success means it holds the redaction message
            return node == message
    return False

82 

83 

def check_fields(result: t.Dict, job_config: t.Dict) -> bool:
    """Check document fields in result to ensure success

    :param result: The search result object
    :param job_config: The configuration settings for this job

    :type result: dict
    :type job_config: dict

    :returns: Success (``True``) or Failure (``False``)
    :rtype: bool
    """
    complete = True
    source = result['hits']['hits'][0]['_source']
    for field in job_config['fields']:
        redacted = False
        if len(field.split('.')) > 1:
            redacted = check_dotted_fields(result, field, job_config['message'])
        elif field in source:
            redacted = source[field] == job_config['message']
        else:
            logger.warning("Field %s not present in document", field)
            # Don't need to report the expected fail 2x, so we break the loop here
            break
        if redacted:
            logger.info("Field %s is redacted correctly", field)
        else:
            # A single failure is enough to make it a complete failure.
            complete = False
            logger.error("Field %s is not redacted correctly", field)
    return complete

119 

120 

def chunk_index_list(indices: t.Sequence[str]) -> t.Sequence[t.Sequence[str]]:
    """
    This utility chunks very large index lists into 3KB chunks.
    It measures the size as a csv string, then converts back into a list for the return
    value.

    :param indices: The list of indices

    :type indices: list

    :returns: A list of lists (each a piece of the original ``indices``)
    :rtype: list
    """
    # Bug fix: an empty input used to fall through to ``''.split(',')`` and
    # return [['']]; an empty list is the correct result.
    if not indices:
        return []
    chunks = []
    chunk = ""
    for index in indices:
        # Start a new chunk only once the running csv string reaches 3KB
        if len(chunk) < 3072:
            chunk = index if not chunk else f'{chunk},{index}'
        else:
            chunks.append(chunk.split(','))
            chunk = index
    chunks.append(chunk.split(','))
    return chunks

147 

148 

def configure_ilm_policy(task: 'Task', data: 'DotMap') -> None:
    """
    Prune phases we've already passed.

    If only_expunge_deletes is True in the job config, set any force_merge_index
    actions to False.

    :param task: The current task; read for ``task.job.config``
    :param data: DotMap carrying the ILM explain/lifecycle data; mutated in place
        (the adjusted policy lands in ``data.new.ilmpolicy``)
    """
    # Copy the existing policy to a new spot
    # NOTE(review): this is an attribute assignment, not a deep copy -- DotMap
    # presumably shares the nested object, so the `del` below may also mutate
    # data.ilm.lifecycle.policy. Confirm that aliasing is intended.
    data.new.ilmpolicy = data.ilm.lifecycle.policy

    # Prune phases from existing ILM policy we've already surpassed.
    # PHASES is an ordered sequence; any phase with a lower index than the
    # currently-explained phase is already in the past for this index.
    # list() snapshots the keys so deletion during iteration is safe.
    for phase in list(data.new.ilmpolicy.phases.toDict().keys()):
        if PHASES.index(data.ilm.explain.phase) > PHASES.index(phase):
            del data.new.ilmpolicy.phases[phase]

    # Figure out if we're doing force merge: True only when a forcemerge job
    # config exists AND it does not request only_expunge_deletes
    fmerge = True
    if 'forcemerge' in task.job.config:
        fmkwargs = task.job.config['forcemerge']
        if 'only_expunge_deletes' in fmkwargs and fmkwargs['only_expunge_deletes']:
            fmerge = False
    else:
        fmerge = False

    # Loop through the remaining phases and apply the computed fmerge value to
    # force_merge_index on any cold/frozen searchable_snapshot action.

    for phase in data.new.ilmpolicy.phases:
        if phase not in ['cold', 'frozen']:
            continue
        if 'searchable_snapshot' in data.new.ilmpolicy.phases[phase].actions:
            data.new.ilmpolicy.phases[
                phase
            ].actions.searchable_snapshot.force_merge_index = fmerge

183 

184 

def end_it(obj: t.Union['Job', 'Task'], success: bool) -> None:
    """Close out the object here to avoid code repetition"""
    # Record task success or fail here for THIS task_id
    # Each index in per_index has its own status tracker
    if success:
        obj.end(completed=True, errors=False, logmsg='DONE')
    else:
        obj.end(
            completed=False,
            errors=True,
            logmsg='Check application logs for detailed report',
        )

196 

197 

def exception_msgmaker(exc: t.Union[e.MissingIndex, e.BadClientResult]) -> str:
    """Most of the messages here are similar enough to warrant a single function"""
    # Every message ends with the same upstream-exception summary
    upstream = (
        f'The upstream exception type was {type(exc.upstream).__name__}, '
        f'with error message: {exc.upstream.args[0]}'
    )
    if isinstance(exc, e.MissingIndex):
        return (
            f'Exception raised because index {exc.missing} was not found. '
            f'{upstream}'
        )
    if isinstance(exc, e.BadClientResult):
        return (
            f'Exception raised because of a bad or unexpected response or result '
            f'from the Elasticsearch cluster. {upstream}'
        )
    # Unrecognized exception types yield an empty message
    return ''

216 

217 

def get_alias_actions(oldidx: str, newidx: str, aliases: t.Dict) -> t.Sequence:
    """
    :param oldidx: The old index name
    :param newidx: The new index name
    :param aliases: The aliases

    :type oldidx: str
    :type newidx: str
    :type aliases: dict

    :returns: A list of actions suitable for
        :py:meth:`~.elasticsearch.client.IndicesClient.update_aliases` ``actions``
        kwarg.
    :rtype: list
    """
    # For each alias: detach it from the old index, then attach to the new one
    pairs = []
    for alias in aliases:
        pairs.extend(
            (
                {'remove': {'index': oldidx, 'alias': alias}},
                {'add': {'index': newidx, 'alias': alias}},
            )
        )
    return pairs

238 

239 

def _doc_has_field(source: t.Dict, field: str) -> bool:
    """Return True if ``field`` (dotted notation allowed) resolves in ``source``."""
    keys = field.split('.')
    if len(keys) > 1:
        logger.debug('Dotted field "%s" detected...', field)
        node = source
        for key in keys:
            # Guard against non-dict intermediate values on the path
            if not isinstance(node, dict) or key not in node:
                return False
            node = node[key]
        return True
    if field not in source:
        logger.debug('Fieldname "%s" NOT detected...', field)
        return False
    logger.debug('Root-level fieldname "%s" detected...', field)
    return True


def get_field_matches(config: t.Dict, result: t.Dict) -> int:
    """Count docs which have the expected fields

    :param config: The config from the YAML file
    :param result: The query result dict

    :type config: dict
    :type result: dict

    :returns: The count of docs in ``result`` which have the identified fields
    :rtype: int
    """
    logger.debug('Extracting doc hit count from result')
    # NOTE(review): assumes every hit counted in total.value is present in
    # result['hits']['hits'] -- confirm callers never pass truncated hits.
    doc_count = result['hits']['total']['value']
    for element in range(0, result['hits']['total']['value']):
        source = result['hits']['hits'][element]['_source']
        # Bug fix: decrement once per document lacking any expected field.
        # Previously each missing field decremented separately, so a doc
        # missing several fields was subtracted multiple times.
        if not all(_doc_has_field(source, field) for field in config['fields']):
            doc_count -= 1
    return doc_count

273 

274 

def get_fname() -> str:
    """Return the name of the calling function"""
    # Frame 0 is this function; frame 1 is whoever called us
    caller_frame = stack()[1]
    return caller_frame.function

278 

279 

def get_inc_version(name: str) -> int:
    """Extract the incrementing version value from the end of name

    :param name: The name

    :type name: str

    :returns: The integer value of the current index revision, or 0 if no version
    :rtype: int
    """
    # Anchor the end as 3 dashes, a v, and 3 digits, e.g. ---v001
    match = re.search(r'^.*---v(\d{3})$', name)
    return int(match.group(1)) if match else 0

295 

296 

def get_redactions(
    file: str = '', data: t.Union[t.Dict, None] = None
) -> t.Union['Schema', None]:
    """
    Return valid dictionary of redactions from either ``file`` or from ``data``
    after checking Schema

    :param file: YAML file with redactions to check
    :param data: Configuration data in dictionary format

    :type file: str
    :type data: dict

    :rtype: dict
    :returns: Redactions configuration data
    """
    # Avoid a mutable default argument: normalize None to an empty dict
    if data is None:
        data = {}
    logger.debug('Getting redactions data...')
    # A non-empty `file` takes precedence over `data`
    if file:
        try:
            config = get_yaml(file)
        except esc_ConfigError as exc:
            msg = f'Unable to read and/or parse YAML REDACTIONS_FILE: {file} Exiting.'
            logger.critical(msg)
            raise e.ConfigError(msg, exc)
    elif data:
        config = data
    else:
        # Neither source supplied (or data was empty): fatal
        raise e.FatalError('No configuration file or dictionary provided.', Exception())
    logger.debug('Performing redaction schema check...')
    retval = None
    try:
        # NOTE(review): the annotation says Schema|None but the docstring says
        # dict -- SchemaCheck.result() presumably yields the validated config;
        # confirm which it is.
        retval = SchemaCheck(
            config, redaction_schema(), 'Redaction Configuration', 'redactions'
        ).result()
    except Exception as exc:
        msg = 'Redaction configuration schema check failed. Exiting.'
        logger.critical(msg)
        raise exc
    return retval

338 

339 

def now_iso8601() -> str:
    """
    :returns: An ISO8601 timestamp based on datetime.now
    """
    # Python 3.12 favors timezone-aware datetimes, and
    # datetime.now(timezone.utc).isoformat() renders a '+00:00' suffix, e.g.
    # 2024-04-16T16:00:00+00:00. Elasticsearch prefers the shorter Zulu ('Z')
    # notation for UTC, so normalize '+00:00' (or a missing suffix) to 'Z'.
    # Since timezone.utc is forced here the offset should always be +00:00,
    # but a literal 'Z' and other offsets are tolerated just in case.
    stamp = datetime.now(timezone.utc).isoformat()
    timepart, _, offset = stamp.partition('+')
    if not offset:
        # No '+' found: either already Z-terminated, or append the Z
        return timepart if timepart.endswith('Z') else f'{timepart}Z'
    if offset == '00:00':
        return f'{timepart}Z'
    # Fallback publishes the +TZ, whatever that was
    return f'{timepart}+{offset}'

369 

370 

def config_fieldmap(
    rw_val: t.Literal['read', 'write'],
    key: t.Literal[
        'pattern',
        'query',
        'fields',
        'message',
        'expected_docs',
        'restore_settings',
        'delete',
    ],
) -> t.Union[str, int, object]:
    """
    Return the function from this function/key map
    """
    # The read and write maps differ only in JSON direction: JSON-encoded
    # fields deserialize on read and serialize on write, while the rest are
    # plain str/int coercions either way. Indexing (not .get) preserves the
    # KeyError on an unknown rw_val or key.
    jsonfunc = {'read': json.loads, 'write': json.dumps}[rw_val]
    funcmap = {
        'pattern': jsonfunc,
        'query': jsonfunc,
        'fields': jsonfunc,
        'message': str,
        'expected_docs': int,
        'restore_settings': jsonfunc,
        'delete': str,
    }
    return funcmap[key]

407 

408 

def parse_job_config(config: t.Dict, behavior: t.Literal['read', 'write']) -> t.Dict:
    """Parse raw config from the index.

    Several fields are JSON escaped, so we need to fix it to put it in a dict.

    :param config: The raw config data
    :param behavior: ``read`` or ``write``

    :type config: dict
    :type behavior: str

    :rtype: dict

    :returns: JSON-(de)sanitized configuration dict
    """
    tracked = (
        'pattern',
        'query',
        'fields',
        'message',
        'expected_docs',
        'restore_settings',
        'delete',
    )
    # Apply the per-field (de)serializer to every tracked field present
    return {
        field: config_fieldmap(behavior, field)(config[field])  # type: ignore
        for field in tracked
        if field in config
    }

439 

440 

def strip_ilm_name(name: str) -> str:
    """
    Strip leading ``pii-tool-``, and trailing ``---v000`` from ``name``

    :param name: The ILM lifecycle name

    :type name: str

    :returns: The "cleaned up" and stripped ILM name
    :rtype: str
    """
    # Bug fix: str.replace stripped 'pii-tool-' anywhere in the name, but the
    # contract is to strip only the leading prefix. removeprefix does exactly
    # that (Python 3.9+; this codebase targets 3.12).
    retval = name.removeprefix('pii-tool-')
    # Anchor the end as 3 dashes, a v, and 3 digits, e.g. ---v001
    match = re.search(r'^(.*)---v\d{3}$', retval)
    if match:
        retval = match.group(1)
    return retval

458 

459 

def strip_index_name(name: str) -> str:
    """
    Strip ``partial-``, ``restored-``, ``redacted-``, and trailing ``---v000`` from
    ``name``

    :param name: The index name

    :type name: str

    :returns: The "cleaned up" and stripped index name
    :rtype: str
    """
    # Remove each marker token (prefixes may be stacked in any order)
    retval = name
    for token in ('partial-', 'restored-', 'redacted-'):
        retval = retval.replace(token, '')
    # Anchor the end as 3 dashes, a v, and 3 digits, e.g. ---v001
    match = re.search(r'^(.*)---v\d{3}$', retval)
    if match:
        retval = match.group(1)
    return retval

480 

481 

def es_waiter(client: 'Elasticsearch', cls, **kwargs) -> None:
    """Instantiate an es_wait waiter class and wait for it to complete

    :param client: The Elasticsearch client
    :param cls: The es_wait waiter class to instantiate
    :param kwargs: Keyword args passed through to ``cls``

    :raises e.BadClientResult: When the wait fails, errors, or times out
    """
    try:
        waiter = cls(client, **kwargs)
        waiter.wait()
    except (
        IlmWaitError,
        EsWaitFatal,
        EsWaitTimeout,
    ) as wait_err:
        msg = f'{cls.__name__}: wait for completion failed: {kwargs}'
        # Idiom fix: lazy %-style logging args instead of an eager f-string
        logger.error('%s. Exception(s): - %s', msg, wait_err)
        raise e.BadClientResult(msg, wait_err)

495 

496 

def timing(kind: str) -> t.Tuple:
    """
    Return a tuple of two floats: the pause value and the timeout value

    :param kind: The kind of timing to do

    :type kind: str

    :returns: A tuple of two floats: ``(pause, timeout)``
    :rtype: tuple
    """
    # PII_TOOL_TESTING=True selects the shorter 'testing' timings from TIMINGS
    is_test = environ.get('PII_TOOL_TESTING', 'False') == 'True'
    testkey = 'testing' if is_test else 'default'
    # The previous hard-coded pause/timeout defaults were dead code: they were
    # unconditionally overwritten by the TIMINGS lookups below, so they (and
    # the commented-out debug logging) have been removed.
    pause = TIMINGS[kind]['pause'][testkey]
    timeout = TIMINGS[kind]['timeout'][testkey]
    return pause, timeout