Coverage for /Users/buh/.pyenv/versions/3.12.2/envs/pii/lib/python3.12/site-packages/es_pii_tool/base.py: 69%

118 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2025-03-18 12:25 -0600

1"""Main app definition""" 

2 

3# pylint: disable=broad-exception-caught,R0913 

4import typing as t 

5import logging 

6from es_pii_tool.exceptions import FatalError, MissingIndex 

7from es_pii_tool.job import Job 

8from es_pii_tool.redacters.index import RedactIndex 

9from es_pii_tool.trackables import Task 

10from es_pii_tool.helpers.elastic_api import get_hits 

11from es_pii_tool.helpers.utils import end_it, get_redactions 

12 

13if t.TYPE_CHECKING: 

14 from elasticsearch8 import Elasticsearch 

15 

16# pylint: disable=R0917 

17 

18logger = logging.getLogger(__name__) 

19 

20 

class PiiTool:
    """Elasticsearch PII Tool

    Top-level driver for a redaction run: loads the redaction configuration,
    then iterates over each configuration block, verifying expected document
    counts and redacting each matching index.
    """

    def __init__(
        self,
        client: 'Elasticsearch',
        tracking_index: str,
        redaction_file: str = '',
        redaction_dict: t.Union[t.Dict, None] = None,
        dry_run: bool = False,
    ):
        """
        :param client: Connected Elasticsearch client
        :param tracking_index: Name of the index used to track job/task state
        :param redaction_file: Path to a redactions file (may be empty when
            ``redaction_dict`` is supplied)
        :param redaction_dict: Redaction configuration as a dict (takes the
            place of ``redaction_file`` when provided)
        :param dry_run: When ``True``, no data will be changed
        """
        if redaction_dict is None:
            # Avoid a mutable default argument; "not provided" becomes empty
            redaction_dict = {}
        logger.debug('Redactions file: %s', redaction_file)
        # Running tally of redacted docs, shared across per-index tasks
        self.counter = 0
        self.client = client
        try:
            self.redactions = get_redactions(redaction_file, redaction_dict)
        except Exception as err:
            logger.critical('Unable to load redactions: %s', err)
            # FIX: bare raise re-raises with the original traceback intact
            raise
        # FIX: lazy %-style args (was an eagerly-formatted f-string),
        # consistent with every other log call in this module
        logger.debug('Loaded redactions: %s', self.redactions)
        self.tracking_index = tracking_index
        self.dry_run = dry_run

    def verify_doc_count(self, job: Job) -> bool:
        """Verify that expected_docs and the hits from the query have the same value

        :param job: The job object for the present redaction run

        :type job: :py:class:`~.app.tracking.Job`

        :rtype: bool
        :returns: ``True`` when the query hit count matches
            ``expected_docs`` and is non-zero (or the task already finished),
            ``False`` otherwise
        """
        # FIX (docs): the original docstring claimed ':rtype: None / No return
        # value', but the method returns a bool consumed by
        # iterate_configuration.
        try:
            task = Task(job, task_id=f'PRE---{job.name}---DOC-COUNT-VERIFICATION')
        except Exception as err:
            logger.critical('Unable to create task: %s', err)
            raise FatalError('Unable to create task', err) from err
        success = False
        errors = False
        if task.finished():
            return True  # We're done already
        # Log task start
        task.begin()
        hits = 0
        try:
            hits = get_hits(self.client, job.config['pattern'], job.config['query'])
        except Exception as err:
            logger.critical('Unable to count query result hits: %s', err)
            raise
        msg = f'{hits} hit(s)'
        logger.debug(msg)
        task.add_log(msg)
        logger.info("Checking expected document count...")
        zeromsg = (
            f"For index pattern {job.config['pattern']}, with query "
            f"{job.config['query']} 'expected_docs' is {job.config['expected_docs']} "
            f"but query results is {hits} matches."
        )
        if job.config['expected_docs'] == hits:
            msg = (
                f'Query result hits: {hits} matches expected_docs: '
                f'{job.config["expected_docs"]}'
            )
            logger.debug(msg)
            task.add_log(msg)
            success = True
            if hits == 0:
                # A zero/zero match is still a failure: nothing to redact
                logger.critical(zeromsg)
                logger.info('Continuing to next configuration block (if any)')
                success = False
        else:
            logger.critical(zeromsg)
            logger.info('Continuing to next configuration block (if any)')
        if not success:
            errors = True
            task.add_log(zeromsg)
        task.end(success, errors=errors)
        return success

    def iterate_indices(self, job: Job) -> bool:
        """Iterate over every index in job.indices

        :param job: The job object for the present redaction run

        :rtype: bool
        :returns: ``True`` only when every per-index task completed
        """
        all_succeeded = True
        for idx in job.indices:
            try:
                task = Task(job, index=idx, id_suffix='PARENT-TASK')
                # First check to see if idx has been touched as part of a previous run
                if task.finished():
                    continue  # This index has already been verified
                task.begin()
            except Exception as err:
                logger.critical('Unable to create task: %s', err)
                raise FatalError('Unable to create task', err) from err
            task_success = False
            try:
                msg = f'Iterating per index: Index {idx} of {job.indices}'
                logger.debug(msg)
                task.add_log(msg)
                redact = RedactIndex(idx, job, self.counter)
                redact.run()
                task_success = redact.success
                # Carry the running document count forward to the next index
                self.counter = redact.counter
                logger.debug('RESULT: %s', task_success)
            except MissingIndex as err:
                logger.critical(err)
                raise FatalError(f'Index {err.missing} not found.', err) from err
            except FatalError as err:
                logger.critical('Fatal upstream error encountered: %s', err.message)
                raise FatalError('We suffered a fatal upstream error', err) from err
            end_it(task, task_success)
            if not task.completed:
                all_succeeded = False
                job.add_log(f'Unable to complete task {task.task_id}')
        return all_succeeded

    def iterate_configuration(self) -> None:
        """Iterate over every configuration block in self.redactions

        Each block becomes a :py:class:`~.es_pii_tool.job.Job`; blocks that
        already finished, or whose document counts don't verify, are skipped.

        :rtype: None
        """
        logger.debug('Full redactions object from config: %s', self.redactions)
        for config_block in self.redactions['redactions']:  # type: ignore
            job_success = True
            # Reset counter to zero for each full iteration
            self.counter = 0
            if self.dry_run:
                logger.info("DRY-RUN MODE ENABLED. No data will be changed.")

            # There's really only 1 root-level key for each configuration block,
            # and that's job_id
            # FIX: next(iter(...)) fetches the sole key without building a list
            job_name = next(iter(config_block))
            args = (self.client, self.tracking_index, job_name, config_block[job_name])
            job = Job(*args, dry_run=self.dry_run)
            if job.finished():
                continue
            job.begin()
            if not self.verify_doc_count(job):
                # This configuration block can't go further because of the mismatch
                job_success = False
                end_it(job, job_success)
                continue

            job_success = self.iterate_indices(job)
            # At this point, self.counter should be equal to total, indicating that we
            # matched expected_docs. We should therefore register that the job was
            # successful, if we have reached this point with no other errors having
            # interrupted the process.

            end_it(job, job_success)

    def run(self) -> None:
        """Do the thing: kick off the full redaction run."""
        logger.info('PII scrub initiated')
        self.iterate_configuration()