Coverage for /Users/buh/.pyenv/versions/3.12.2/envs/pii/lib/python3.12/site-packages/es_pii_tool/redacters/index.py: 67%

126 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2025-03-17 23:33 -0600

1"""Redact data from an Elasticsearch index""" 

2 

3import typing as t 

4import logging 

5from dotmap import DotMap # type: ignore 

6from es_pii_tool.exceptions import ( 

7 BadClientResult, 

8 FatalError, 

9 MissingIndex, 

10) 

11from es_pii_tool.trackables import Task 

12from es_pii_tool.helpers.utils import ( 

13 exception_msgmaker, 

14 get_field_matches, 

15) 

16from es_pii_tool.helpers import elastic_api as api 

17from es_pii_tool.redacters.snapshot import RedactSnapshot 

18 

19if t.TYPE_CHECKING: 

20 from es_pii_tool.job import Job 

21 

22logger = logging.getLogger(__name__) 

23 

24 

class RedactIndex:
    """Redact index per settings

    One instance handles one index: it creates a ``Task`` for the index,
    verifies the index, runs the job's query against it, checks the expected
    fields, looks up the ILM phase and then redacts either through
    ``api.redact_from_index`` or through ``RedactSnapshot``.
    """

    def __init__(self, index: str, job: 'Job', counter: int):
        """Create the tracking task and verify the index.

        :param index: Name of the index to redact.
        :param job: The job this index belongs to; passed to ``Task``.
        :param counter: Running count of indices processed so far. It is
            incremented on this instance as the index is processed.

        :raises FatalError: If the ``Task`` cannot be created.
        :raises ValueError: If :py:meth:`verify_index` does not find the index.
        """
        # Give ``success`` a defined value from the start so reading the
        # property before ``run()`` does not raise AttributeError.
        self._success = False
        try:
            self.task = Task(job, index=index, id_suffix='REDACT-INDEX')
        except Exception as exc:
            logger.critical('Unable to create task: %s', exc)
            raise FatalError('Unable to create task', exc) from exc
        self.index = index
        self.counter = counter
        self.data = DotMap()
        self.verify_index()

    @property
    def success(self) -> bool:
        """Was the redaction a success?"""
        return self._success

    @success.setter
    def success(self, value: bool) -> None:
        self._success = value

    def end_in_failure(
        self,
        exception: t.Union[BadClientResult, MissingIndex],
        reraise: bool = False,
        func: t.Union[t.Callable, None] = None,
        kwargs: t.Union[t.Dict[str, t.Union[bool, str]], None] = None,
    ) -> None:
        """For steps and checks that end in failure, we lump you into this method

        :param exception: The exception that caused the failure. Its message is
            built with ``exception_msgmaker`` and logged at CRITICAL level.
        :param reraise: If ``True``, raise :py:exc:`FatalError` after ``func``
            has been called.
        :param func: Optional callable invoked as ``func(**kwargs)``.
        :param kwargs: Keyword arguments for ``func``. If it contains a
            ``logmsg`` key, that value is replaced by the failure message.

        :raises FatalError: Only when ``reraise`` is ``True``.
        """
        msg = exception_msgmaker(exception)
        logger.critical(msg)
        if func:
            if kwargs is None:
                logger.error('Empty kwargs passed')
                call_kwargs: t.Dict[str, t.Union[bool, str]] = {}
            else:
                # Work on a copy so the caller's dictionary is not modified
                call_kwargs = dict(kwargs)
            if 'logmsg' in call_kwargs:  # For the task ender
                call_kwargs['logmsg'] = msg
            func(**call_kwargs)
        if reraise:
            # Chain explicitly so the original cause stays in the traceback
            raise FatalError(msg, exception) from exception

    def verify_index(self):
        """Verify the index exists

        :raises ValueError: If ``api.verify_index`` returns a falsey value for
            this index. ``success`` is set to ``False`` before raising.
        """
        # If the index name changed because of an ILM phase shift from hot to cold
        # or cold to frozen, then we should verify the name change here. We should
        # raise an exception if the name of the index changed or it disappeared.
        if not api.verify_index(self.task.job.client, self.index):
            msg = f'Halting execution: Index {self.index} changed or is missing.'
            logger.critical(msg)
            self.success = False
            raise ValueError(msg, 'index not found as expected', self.index)

    def run_query(self):
        """Run the query

        Stores the search response in ``self.data.result`` and the total hit
        count in ``self.data.hits``. When there are no hits, the counter is
        incremented and the task is ended as completed.
        """
        response = api.do_search(
            self.task.job.client,
            self.index,
            self.task.job.config['query'],
            size=10000,
        )
        self.data.result = DotMap(dict(response))
        self.data.hits = self.data.result.hits.total.value
        logger.debug('Checking document fields on index: %s...', self.index)
        if self.data.hits == 0:
            self.counter += 1
            msg = f'Documents matching redaction query not found on index: {self.index}'
            logger.debug(msg)
            msg = f'Index {self.counter} of {self.task.job.total} processed...'
            logger.info(msg)
            # Record success for this task but send msg to the log field
            # An index could be in the pattern but have no matches.
            self.task.end(True, logmsg=msg)
        # NOTE(review): in the zero-hit case this entry is added after the task
        # has been ended above -- confirm Task.add_log still records it.
        self.task.add_log(f"Hits: {self.data.hits}")

    def verify_fields(self):
        """Verify the fields in the query results match what we expect

        If ``get_field_matches`` does not report more than zero matches, the
        task is ended as completed with an explanatory log message.
        """
        matches = get_field_matches(self.task.job.config, self.data.result.toDict())
        if matches > 0:
            return
        msg = f'Fields required for redaction not found on index: {self.index}'
        logger.warning(msg)
        self.task.end(completed=True, logmsg=msg)
        logger.warning(
            'Not a fatal error. Index in pattern does not have the specified fields'
        )

    def get_phase(self):
        """Get the ILM phase (if any) for the index

        Stores the phase in ``self.data.phase``, or a placeholder string when
        ``api.get_phase`` returns a falsey value.

        :raises FatalError: If ``api.get_phase`` raises ``MissingIndex``; the
            task is ended with errors first.
        """
        nope = 'Not assigned an ILM Phase'
        try:
            self.data.phase = api.get_phase(self.task.job.client, self.index) or nope
        except MissingIndex as exc:
            kwargs = {'completed': False, 'errors': True, 'logmsg': 'replaceme'}
            self.end_in_failure(exc, reraise=True, func=self.task.end, kwargs=kwargs)
        logger.debug('Index in phase: %s', self.data.phase.upper())
        self.task.add_log(f'ILM Phase: {self.data.phase}')

    def normal_redact(self) -> bool:
        """Redact data from a normal (not searchable-snapshot) index

        :returns: ``True`` if the redaction ran without error or was skipped
            because this is a dry-run. ``False`` if ``api.redact_from_index``
            raised ``MissingIndex`` or ``BadClientResult``, in which case the
            task has already been ended with ``completed=False, errors=True``.
        """
        msg = 'Initiating redaction of data from writeable index...'
        logger.info(msg)
        self.task.add_log(msg)
        # As the redact_from_index function doesn't track dry-run, we have to do it
        if self.task.job.dry_run:
            msg = f'DRY-RUN: Will not redact data from {self.index}'
            logger.info(msg)
            self.task.add_log(msg)
            return True
        msg = f'Redacting data from {self.index}'
        logger.info(msg)
        self.task.add_log(msg)
        try:
            api.redact_from_index(
                self.task.job.client, self.index, self.task.job.config
            )
        except (MissingIndex, BadClientResult) as exc:
            kwargs = {'completed': False, 'errors': True, 'logmsg': 'replaceme'}
            self.end_in_failure(
                exc, reraise=False, func=self.task.end, kwargs=kwargs
            )
            # Tell run() about the failure so it is not reported as a success
            return False
        return True

    def snapshot_redact(self):
        """Redact data from searchable snapshot-backed index

        Builds a ``RedactSnapshot`` for this index and runs it. Any exception
        from either step is logged at CRITICAL level and re-raised unchanged.
        """
        msg = 'Initiating redaction of data from mounted searchable snapshot...'
        logger.info(msg)
        self.task.add_log(msg)
        try:
            snp = RedactSnapshot(self.index, self.task.job, self.data.phase)
        except Exception as exc:
            logger.critical('Unable to build RedactSnapshot object. Exception: %s', exc)
            raise
        try:
            snp.run()
        except Exception as exc:
            logger.critical('Unable to run RedactSnapshot object. Exception: %s', exc)
            raise

    def run(self):
        """Do the actual run

        Sets ``success`` to ``True`` when the task was already finished, when
        the query or field checks completed the task early, or when redaction
        finished without error. Sets it to ``False`` when
        :py:meth:`normal_redact` reports a failure.
        """
        if self.task.finished():
            self.success = True
            return
        # Log task start time
        self.task.begin()
        self.run_query()
        if self.task.completed:
            self.success = True
            return
        self.verify_fields()
        if self.task.completed:
            self.success = True
            return
        self.get_phase()
        if self.data.phase in ('cold', 'frozen'):
            self.snapshot_redact()
            redacted = True
        else:
            # ``is not False`` keeps an override that returns None (the
            # previous contract) on the success path.
            redacted = self.normal_redact() is not False
        self.counter += 1
        msg = f'Index {self.counter} of {self.task.job.total} processed...'
        logger.info(msg)
        if not redacted:
            # normal_redact() already ended the task with errors; do not
            # overwrite that state with completed=True.
            self.success = False
            return
        # If we have reached this point, we've succeeded.
        self.task.add_log(msg)
        self.task.end(completed=True, logmsg='DONE')
        self.success = True