Coverage for /Users/buh/.pyenv/versions/3.12.2/envs/pii/lib/python3.12/site-packages/es_pii_tool/helpers/utils.py: 71%

189 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2025-01-29 19:29 -0700

1"""Helper Functions""" 

2 

3import typing as t 

4import logging 

5import json 

6from inspect import stack 

7from datetime import datetime, timezone 

8import re 

9from elasticsearch8.exceptions import NotFoundError 

10from es_client.exceptions import ConfigurationError as esc_ConfigError 

11from es_client.helpers.schemacheck import SchemaCheck 

12from es_client.helpers.utils import get_yaml 

13from es_wait.exceptions import IlmWaitError 

14import es_pii_tool.exceptions as e 

15from es_pii_tool.defaults import PHASES, redaction_schema 

16 

17if t.TYPE_CHECKING: 

18 from dotmap import DotMap # type: ignore 

19 from voluptuous import Schema 

20 from elasticsearch8 import Elasticsearch 

21 from es_pii_tool.job import Job 

22 from es_pii_tool.trackables import Task 

23 

24logger = logging.getLogger(__name__) 

25 

26 

def build_script(message: str, fields: t.Sequence[str]) -> t.Dict[str, str]:
    """
    Build a painless script for redacting fields by way of an update_by_query operation

    :param message: The text to put in place of whatever is in a field
    :param fields: The list of field names to act on

    :type message: str
    :type fields: list

    :rtype: dict
    :returns: A dictionary of ``{"source": (assembled message), "lang": "painless"}``
    """
    # One assignment statement per field, each terminated by '; ' (trailing
    # space included, matching what update_by_query receives)
    statements = ''.join(f"ctx._source.{field} = '{message}'; " for field in fields)
    script = {"source": statements, "lang": "painless"}
    logger.debug('script = %s', script)
    return script

46 

47 

def check_dotted_fields(result: t.Dict, field: str, message: str) -> bool:
    """Walk a dotted field path in the first hit to verify redaction succeeded

    :param result: The search result object
    :param field: The field with dotted notation
    :param message: The expected redaction message

    :type result: dict
    :type field: str
    :type message: str

    :returns: Success (``True``) or Failure (``False``)
    :rtype: bool
    """
    logger.debug('Dotted field detected: (%s) ...', field)
    node = result['hits']['hits'][0]['_source']
    keys = field.split('.')
    depth = len(keys)
    for position, key in enumerate(keys, start=1):
        if key not in node:
            # Path is broken part-way down; the field cannot match
            return False
        node = node[key]
        if position == depth and node == message:
            return True
    return False

76 

77 

def check_fields(result: t.Dict, job_config: t.Dict) -> bool:
    """Check document fields in result to ensure success

    :param result: The search result object
    :param job_config: The configuration settings for this job

    :type result: dict
    :type job_config: dict

    :returns: Success (``True``) or Failure (``False``)
    :rtype: bool
    """
    complete = True
    hit = result['hits']['hits'][0]['_source']
    for field in job_config['fields']:
        success = False
        if len(field.split('.')) > 1:
            success = check_dotted_fields(result, field, job_config['message'])

        elif field in hit:
            if hit[field] == job_config['message']:
                success = True

        else:
            logger.warning("Field %s not present in document", field)
            # BUGFIX: a missing field is a redaction failure. Previously the
            # loop broke out here without marking the run incomplete, so a
            # missing field reported overall success (returned True).
            complete = False
            # Don't need to report the expected fail 2x, so we break the loop here
            break

        if success:
            logger.info("Field %s is redacted correctly", field)
        else:
            # A single failure is enough to make it a complete failure.
            complete = False
            logger.error("Field %s is not redacted correctly", field)
    return complete

113 

114 

def chunk_index_list(indices: t.Sequence[str]) -> t.Sequence[t.Sequence[str]]:
    """
    This utility chunks very large index lists into 3KB chunks.
    It measures the size as a csv string (comma separators included), and
    returns a list of lists.

    :param indices: The list of indices

    :type indices: list

    :returns: A list of lists (each a piece of the original ``indices``)
    :rtype: list
    """
    # BUGFIX: an empty input previously returned [['']] because ''.split(',')
    # yields ['']; it now returns [].
    # BUGFIX: the size check previously ran before appending, so a chunk could
    # grow past 3072 bytes; the limit is now enforced before adding each name.
    chunks: t.List[t.List[str]] = []
    chunk: t.List[str] = []
    size = 0
    for index in indices:
        # +1 accounts for the comma separator in the CSV representation
        added = len(index) if not chunk else len(index) + 1
        if chunk and size + added > 3072:
            chunks.append(chunk)
            chunk = [index]
            size = len(index)
        else:
            chunk.append(index)
            size += added
    if chunk:
        chunks.append(chunk)
    return chunks

141 

142 

def configure_ilm_policy(task: 'Task', data: 'DotMap') -> None:
    """
    Prune phases we've already passed.

    If only_expunge_deletes is True in the job config, set any force_merge_index
    actions to False.

    :param task: The task whose job config supplies the ``forcemerge`` settings
    :param data: DotMap of ILM state; modified in place — the adjusted policy is
        written to ``data.new.ilmpolicy``

    :type data: DotMap

    :rtype: None
    """
    # Copy the existing policy to a new spot
    data.new.ilmpolicy = data.ilm.lifecycle.policy

    # Prune phases from existing ILM policy we've already surpassed
    # list(...toDict().keys()) snapshots the phase names so deleting entries
    # while looping is safe. NOTE(review): assumes PHASES is ordered from
    # earliest to latest lifecycle phase — confirm against defaults.PHASES
    for phase in list(data.new.ilmpolicy.phases.toDict().keys()):
        if PHASES.index(data.ilm.explain.phase) > PHASES.index(phase):
            del data.new.ilmpolicy.phases[phase]

    # Figure out if we're doing force merge
    # fmerge ends up True only when a 'forcemerge' block exists AND it does
    # not request only_expunge_deletes
    fmerge = True
    if 'forcemerge' in task.job.config:
        fmkwargs = task.job.config['forcemerge']
        if 'only_expunge_deletes' in fmkwargs and fmkwargs['only_expunge_deletes']:
            fmerge = False
    else:
        fmerge = False

    # Loop through the remaining phases and set 'force_merge_index': False
    # to the cold or frozen actions.

    for phase in data.new.ilmpolicy.phases:
        if phase not in ['cold', 'frozen']:
            continue
        if 'searchable_snapshot' in data.new.ilmpolicy.phases[phase].actions:
            data.new.ilmpolicy.phases[
                phase
            ].actions.searchable_snapshot.force_merge_index = fmerge

177 

178 

def end_it(obj: t.Union['Job', 'Task'], success: bool) -> None:
    """Close out the object here to avoid code repetition

    Record task success or fail for the tracked object; each index in
    per_index has its own status tracker.
    """
    # errors is simply the inverse of success; the log message mirrors it
    logmsg = 'DONE' if success else 'Check application logs for detailed report'
    obj.end(completed=success, errors=not success, logmsg=logmsg)

190 

191 

def exception_msgmaker(exc: t.Union[e.MissingIndex, e.BadClientResult]) -> str:
    """Most of the messages here are similar enough to warrant a single function

    Builds a human-readable message from the tool exception and its wrapped
    upstream exception. Returns an empty string for unrecognized types.
    """
    upstream = (
        f'The upstream exception type was {type(exc.upstream).__name__}, '
        f'with error message: {exc.upstream.args[0]}'
    )
    if isinstance(exc, e.MissingIndex):
        return (
            f'Exception raised because index {exc.missing} was not found. '
            f'{upstream}'
        )
    if isinstance(exc, e.BadClientResult):
        return (
            f'Exception raised because of a bad or unexpected response or result '
            f'from the Elasticsearch cluster. {upstream}'
        )
    return ''

210 

211 

def get_alias_actions(oldidx: str, newidx: str, aliases: t.Dict) -> t.Sequence:
    """
    :param oldidx: The old index name
    :param newidx: The new index name
    :param aliases: The aliases

    :type oldidx: str
    :type newidx: str
    :type aliases: dict

    :returns: A list of actions suitable for
        :py:meth:`~.elasticsearch.client.IndicesClient.update_aliases` ``actions``
        kwarg.
    :rtype: list
    """
    actions: t.List[t.Dict] = []
    # For each alias: remove it from the old index, then add it to the new one
    for alias in aliases:
        actions.extend(
            (
                {'remove': {'index': oldidx, 'alias': alias}},
                {'add': {'index': newidx, 'alias': alias}},
            )
        )
    return actions

232 

233 

def _has_subfield(source: t.Dict, field: str) -> bool:
    """Return True if ``field`` (dotted notation supported) resolves in source"""
    node = source
    for key in field.split('.'):
        if not isinstance(node, dict) or key not in node:
            return False
        node = node[key]
    return True


def get_field_matches(config: t.Dict, result: t.Dict) -> int:
    """Count docs which have the expected fields

    :param config: The config from the YAML file
    :param result: The query result dict

    :type config: dict
    :type result: dict

    :returns: The count of docs in ``result`` which have all identified fields
    :rtype: int
    """
    logger.debug('Extracting doc hit count from result')
    doc_count = result['hits']['total']['value']
    # NOTE(review): assumes every doc counted by total is present in hits —
    # an IndexError results otherwise (same as the prior implementation)
    for element in range(0, result['hits']['total']['value']):
        source = result['hits']['hits'][element]['_source']
        # BUGFIX: previously doc_count was decremented once per MISSING FIELD,
        # so a doc lacking two fields was subtracted twice. Each doc now
        # decrements at most once, when any expected field is absent.
        if all(_has_subfield(source, field) for field in config['fields']):
            logger.debug('All expected fields detected in doc %s...', element)
        else:
            logger.debug('Doc %s is missing one or more fields...', element)
            doc_count -= 1
    return doc_count

267 

268 

def get_fname() -> str:
    """Return the name of the calling function"""
    # stack()[0] is this frame; stack()[1] is whoever called get_fname()
    caller_frame = stack()[1]
    return caller_frame.function

272 

273 

def get_inc_version(name: str) -> int:
    """Extract the incrementing version value from the end of name

    :param name: The name

    :type name: str

    :returns: The integer value of the current index revision, or 0 if no version
    :rtype: int
    """
    # Anchor the end as 3 dashes, a v, and 3 digits, e.g. ---v001
    versioned = re.search(r'^.*---v(\d{3})$', name)
    if versioned is None:
        return 0
    return int(versioned.group(1))

289 

290 

def get_redactions(file: str = '', data: t.Union[t.Dict, None] = None) -> 'Schema':
    """
    Return valid dictionary of redactions from either ``file`` or from ``data``
    after checking Schema

    :param file: YAML file with redactions to check
    :param data: Configuration data in dictionary format

    :type file: str
    :type data: dict

    :rtype: dict
    :returns: Redactions configuration data
    """
    if data is None:
        data = {}
    logger.debug('Getting redactions data...')
    # Guard clause: at least one source of configuration is required
    if not file and not data:
        raise e.FatalError('No configuration file or dictionary provided.', Exception())
    if file:
        try:
            config = get_yaml(file)
        except esc_ConfigError as exc:
            msg = f'Unable to read and/or parse YAML REDACTIONS_FILE: {file} Exiting.'
            logger.critical(msg)
            raise e.ConfigError(msg, exc)
    else:
        config = data
    checker = SchemaCheck(
        config, redaction_schema(), 'Redaction Configuration', 'redactions'
    )
    return checker.result()

322 

323 

def now_iso8601() -> str:
    """
    :returns: An ISO8601 timestamp based on datetime.now

    Python 3.12 requires timezone-aware datetimes, so we explicitly use
    ``timezone.utc``, which makes ``isoformat()`` append ``+00:00``
    (e.g. ``2024-04-16T16:00:00+00:00``). Elasticsearch prefers the shorter
    Zulu notation, so a ``+00:00`` suffix is replaced with ``Z``. Since we
    forcefully declare timezone.utc the offset should always be +00:00, but a
    bare ``Z`` is also tolerated, and any other offset is passed through.
    """
    stamp = datetime.now(timezone.utc).isoformat()
    base, _, offset = stamp.partition('+')
    if not offset:
        # No '+' present: either already Zulu-suffixed, or naive — append Z
        return base if base.endswith('Z') else f'{base}Z'
    if offset == '00:00':
        return f'{base}Z'
    # Fallback publishes the +TZ, whatever that was
    return f'{base}+{offset}'

353 

354 

def config_fieldmap(
    rw_val: t.Literal['read', 'write'],
    key: t.Literal[
        'pattern',
        'query',
        'fields',
        'message',
        'expected_docs',
        'restore_settings',
        'delete',
    ],
) -> t.Union[str, int, object]:
    """
    Return the (de)serializer function for ``key`` in the given direction.

    Each entry maps a field name to a ``(read, write)`` pair: JSON-encoded
    fields use ``json.loads``/``json.dumps``; plain fields use ``str``/``int``.
    """
    pairs = {
        'pattern': (json.loads, json.dumps),
        'query': (json.loads, json.dumps),
        'fields': (json.loads, json.dumps),
        'message': (str, str),
        'expected_docs': (int, int),
        'restore_settings': (json.loads, json.dumps),
        'delete': (str, str),
    }
    # Raises KeyError for an invalid rw_val, matching the prior behavior
    position = {'read': 0, 'write': 1}[rw_val]
    return pairs[key][position]

391 

392 

def parse_job_config(config: t.Dict, behavior: t.Literal['read', 'write']) -> t.Dict:
    """Parse raw config from the index.

    Several fields are JSON escaped, so we need to fix it to put it in a dict.

    :param config: The raw config data
    :param behavior: ``read`` or ``write``

    :type config: dict
    :type behavior: str

    :rtype: dict

    :returns: JSON-(de)sanitized configuration dict
    """
    known_fields = (
        'pattern',
        'query',
        'fields',
        'message',
        'expected_docs',
        'restore_settings',
        'delete',
    )
    # Apply the per-field (de)serializer to every recognized key present
    return {
        field: config_fieldmap(behavior, field)(config[field])  # type: ignore
        for field in known_fields
        if field in config
    }

423 

424 

def strip_ilm_name(name: str) -> str:
    """
    Strip leading ``pii-tool-``, and trailing ``---v000`` from ``name``

    :param name: The ILM lifecycle name

    :type name: str

    :returns: The "cleaned up" and stripped ILM name
    :rtype: str
    """
    stripped = name.replace('pii-tool-', '')
    # Anchor the end as 3 dashes, a v, and 3 digits, e.g. ---v001
    versioned = re.search(r'^(.*)---v\d{3}$', stripped)
    return versioned.group(1) if versioned else stripped

442 

443 

def strip_index_name(name: str) -> str:
    """
    Strip ``partial-``, ``restored-``, ``redacted-``, and trailing ``---v000``
    from ``name``

    :param name: The index name

    :type name: str

    :returns: The "cleaned up" and stripped index name
    :rtype: str
    """
    stripped = name
    for token in ('partial-', 'restored-', 'redacted-'):
        stripped = stripped.replace(token, '')
    # Anchor the end as 3 dashes, a v, and 3 digits, e.g. ---v001
    versioned = re.search(r'^(.*)---v\d{3}$', stripped)
    return versioned.group(1) if versioned else stripped

464 

465 

def es_waiter(client: 'Elasticsearch', cls, **kwargs) -> None:
    """Wait for ILM Phase & Step to be reached

    Instantiates ``cls`` with the client and kwargs, then blocks on its
    ``wait()``; any known wait failure is re-raised as BadClientResult.
    """
    try:
        cls(client, **kwargs).wait()
    except (
        KeyError,
        ValueError,
        TimeoutError,
        IlmWaitError,
        NotFoundError,
    ) as err:
        raise e.BadClientResult(
            f'{cls.__name__}: wait for completion failed: {kwargs}', err
        )