Coverage for /Users/buh/.pyenv/versions/3.12.9/envs/es-testbed/lib/python3.12/site-packages/es_testbed/ilm.py: 100%

157 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-03-17 19:30 -0600

1"""ILM Defining Class""" 

2 

3import typing as t 

4import logging 

5from os import getenv 

6import time 

7from dotmap import DotMap 

8from es_wait import IlmPhase, IlmStep 

9from es_wait.exceptions import EsWaitFatal, EsWaitTimeout 

10from es_testbed.defaults import ( 

11 PAUSE_ENVVAR, 

12 PAUSE_DEFAULT, 

13 TIMEOUT_DEFAULT, 

14 TIMEOUT_ENVVAR, 

15) 

16from es_testbed.exceptions import ( 

17 NameChanged, 

18 ResultNotExpected, 

19 TestbedMisconfig, 

20 TestbedFailure, 

21) 

22from es_testbed.helpers.es_api import get_ilm_phases, ilm_explain, ilm_move, resolver 

23from es_testbed.helpers.utils import prettystr 

24 

25if t.TYPE_CHECKING: 

26 from elasticsearch8 import Elasticsearch 

27 

# Default polling pause handed to the es_wait waiters used by IlmTracker.
# Read once at import time from the environment, falling back to the
# packaged default.
PAUSE_VALUE = float(getenv(PAUSE_ENVVAR, PAUSE_DEFAULT))

# Default timeout handed to the es_wait waiters used by IlmTracker.
# Read once at import time from the environment, falling back to the
# packaged default.
TIMEOUT_VALUE = float(getenv(TIMEOUT_ENVVAR, TIMEOUT_DEFAULT))

# Module-level logger shared by everything in this file
logger = logging.getLogger('es_testbed.IlmTracker')

32 

33# ## Example ILM explain output 

34# { 

35# 'action': 'complete', 

36# 'action_time_millis': 0, 

37# 'age': '5.65m', 

38# 'index': 'INDEX_NAME', 

39# 'index_creation_date_millis': 0, 

40# 'lifecycle_date_millis': 0, 

41# 'managed': True, 

42# 'phase': 'hot', 

43# 'phase_execution': { 

44# 'modified_date_in_millis': 0, 

45# 'phase_definition': { 

46# 'actions': { 

47# 'rollover': { 

48# 'max_age': 'MAX_AGE', 

49# 'max_primary_shard_docs': 1000, 

50# 'max_primary_shard_size': 'MAX_SIZE', 

51# 'min_docs': 1 

52# } 

53# }, 

54# 'min_age': '0ms' 

55# }, 

56# 'policy': 'POLICY_NAME', 

57# 'version': 1 

58# }, 

59# 'phase_time_millis': 0, 

60# 'policy': 'POLICY_NAME', 

61# 'step': 'complete', 

62# 'step_time_millis': 0, 

63# 'time_since_index_creation': '5.65m' 

64# } 

65 

66 

class IlmTracker:
    """ILM Phase Tracking Class

    Tracks exactly one index (never an alias or a data stream) through the
    phases of its ILM policy. The most recent ILM Explain output is cached in
    ``self._explain`` and refreshed with :py:meth:`update`.
    """

    def __init__(self, client: 'Elasticsearch', name: str):
        """Resolve ``name`` to a single index and load its ILM state

        :param client: An Elasticsearch client
        :param name: Name of the index to track. It must resolve to exactly
            one index, or :py:meth:`resolve` raises ``ResultNotExpected``.
        """
        self.client = client
        self.name = self.resolve(name)  # A single index name
        self._explain = DotMap(self.get_explain_data())
        self._phases = get_ilm_phases(self.client, self._explain.policy)

    @property
    def explain(self) -> DotMap:
        """Return the current stored value of ILM Explain"""
        return self._explain

    @property
    def policy_phases(self) -> t.Sequence[str]:
        """Return a list of phases in the ILM policy"""
        return list(self._phases.keys())

    def _log_phase(self, phase: str) -> None:
        """Log that the tracked index has reached ``phase``"""
        logger.debug(f'ILM Explain Index: {self._explain.index}')
        logger.info(f'Index "{self.name}" now on phase "{phase}"')

    def _wait_or_fail(self, waiter: t.Any) -> None:
        """Run ``waiter.wait()``, converting es_wait errors to TestbedFailure

        :param waiter: An es_wait object exposing ``wait()`` (``IlmPhase`` or
            ``IlmStep`` in this class).
        :raises TestbedFailure: If the wait ends with ``EsWaitFatal`` or
            ``EsWaitTimeout``. The original exception is chained as the cause.
        """
        try:
            waiter.wait()
        except EsWaitFatal as wait:
            msg = (
                f'{wait.message}. Total elapsed time: {wait.elapsed}. '
                f'Errors: {prettystr(wait.errors)}'
            )
            logger.error(msg)
            raise TestbedFailure(msg) from wait
        except EsWaitTimeout as wait:
            msg = f'{wait.message}. Total elapsed time: {wait.elapsed}.'
            logger.error(msg)
            raise TestbedFailure(msg) from wait

    def _phase_wait(
        self, phase: str, pause: float = PAUSE_VALUE, timeout: float = TIMEOUT_VALUE
    ) -> None:
        """Wait until the new phase shows up in ILM Explain

        :param phase: The phase name to wait for
        :param pause: Passed to ``IlmPhase`` as ``pause``
        :param timeout: Passed to ``IlmPhase`` as ``timeout``
        :raises TestbedFailure: If the wait fails or times out
        """
        kw = {'name': self.name, 'phase': phase, 'pause': pause, 'timeout': timeout}
        self._wait_or_fail(IlmPhase(self.client, **kw))

    def _ssphz(self, phase: str) -> bool:
        """Return True if the phase is for searchable snapshots (> 'warm')"""
        return self.pnum(phase) > self.pnum('warm')

    def advance(
        self,
        phase: t.Optional[str] = None,
        action: t.Optional[str] = None,
        name: t.Optional[str] = None,
    ) -> None:
        """Advance index to next ILM phase

        :param phase: The target phase. If not provided, the next phase in the
            policy (per :py:meth:`next_phase`) is used.
        :param action: Optional target action. Requires ``name``.
        :param name: Optional target step name. Requires ``action``.
        :raises TestbedMisconfig: If only one of ``action``/``name`` is given
        :raises TestbedFailure: If waiting on a phase or step fails or times out
        """
        if self._explain.phase == 'delete':
            logger.warning('Already on "delete" phase. No more phases to advance')
            return

        logger.debug(f'current_step: {prettystr(self.current_step())}')
        next_step = self.next_step(phase, action=action, name=name)
        logger.debug(f'next_step: {prettystr(next_step)}')
        # next_step() fills in the phase when the caller did not supply one.
        # Use that resolved value from here on; otherwise a None phase would
        # reach pnum() below and raise KeyError.
        phase = next_step['phase']
        if phase is None:
            logger.warning('No later phase in the ILM policy. Nothing to advance to')
            return

        if self._explain.phase == 'new' and phase == 'hot':
            # It won't be for very long.
            self._phase_wait('hot')
            time.sleep(1)  # Just to make sure the new index is ready

        # Regardless of the remaining phases, the current phase steps must be
        # complete before proceeding with ilm_move
        self.update()
        self.wait4complete()
        self.update()

        # We could have arrived with it hot, but incomplete
        if phase == 'hot':
            self._log_phase(phase)
            # we've advanced to our target phase, and all steps are completed
            return

        # Remaining phases could be warm through frozen
        if self._explain.phase != phase:
            logger.debug(f'Current phase: {self.explain.phase}')
            logger.debug(f'Advancing to "{phase}" phase...')
            # We will only wait for steps to complete for the hot and warm tiers
            wait4steps = not self._ssphz(phase)
            ilm_move(self.client, self.name, self.current_step(), next_step)
            # Let the cluster catch up before proceeding.
            time.sleep(1)  # Just to make sure the cluster state has gelled
            self._phase_wait(phase)
            # If cold or frozen, we can return now. We let the calling function
            # worry about the weird name changing behavior of searchable mounts
            if wait4steps:
                self.update()
                logger.debug(f'Waiting for "{phase}" phase steps to complete...')
                self.wait4complete()
                self.update()
                self._log_phase(phase)
        else:
            logger.debug(f'Already on "{phase}" phase. No need to advance')
            logger.debug(f'current_step: {prettystr(self.current_step())}')

    def current_step(self) -> t.Dict[str, str]:
        """Return the current ILM step information

        There is a disconnect between what the ILM Explain API returns and what
        the ILM Move API expects. The former returns a 'step' key, while the
        latter expects a 'name' key. This method returns a dictionary for
        use with the ILM Move API, so that the 'step' key is renamed to 'name'.
        """
        return {
            'phase': self._explain.phase,
            'action': self._explain.action,
            'name': self._explain.step,
        }

    def get_explain_data(self) -> t.Dict:
        """Get the ILM explain data and return it

        :raises NameChanged: Passed along unchanged from ``ilm_explain``
        :raises ResultNotExpected: Passed along from ``ilm_explain`` after
            logging a critical message
        """
        try:
            return ilm_explain(self.client, self.name)
        except NameChanged:
            logger.debug('Passing along upstream exception...')
            raise
        except ResultNotExpected as err:
            msg = f'Unable to get ilm_explain results. Error: {prettystr(err)}'
            logger.critical(msg)
            raise

    def next_phase(self) -> t.Optional[str]:
        """Return the next phase in the index's ILM journey

        :returns: The name of the lowest-numbered policy phase that comes after
            the current phase, or ``None`` if the index is already on 'delete'
            or the policy has no later phase.
        """
        if self._explain.phase == 'delete':
            logger.warning('Already on "delete" phase. No more phases to advance')
            return None
        curr = self.pnum(self._explain.phase)  # A numeric representation
        # Numbers of any phases in the policy that come after the current one
        remaining = [
            num for num in map(self.pnum, self.policy_phases) if num > curr
        ]
        if not remaining:
            return None
        # Take the smallest number rather than the first element, so the result
        # does not depend on the key order of the policy's phases mapping
        return self.pname(min(remaining))

    def next_step(
        self,
        phase: t.Optional[str] = None,
        action: t.Optional[str] = None,
        name: t.Optional[str] = None,
    ) -> t.Dict[str, str]:
        """Determine the next ILM step based on the current phase, action, and name

        There is a disconnect between what the ILM Explain API returns and what
        the ILM Move API expects. The former returns a 'step' key, while the
        latter expects a 'name' key. This method returns a dictionary for
        use with the ILM Move API, so that the 'step' key is renamed to 'name'.

        :param phase: The target phase. Falls back to :py:meth:`next_phase`
            when not provided.
        :param action: Optional target action. Requires ``name``.
        :param name: Optional target step name. Requires ``action``.
        :raises TestbedMisconfig: If only one of ``action``/``name`` is given
        """
        # action and name must be supplied together or not at all
        if (action is None) != (name is None):
            msg = 'If either action or name is specified, both must be'
            logger.critical(msg)
            raise TestbedMisconfig(msg)
        if not phase:
            logger.debug('No phase specified. Using next_phase')
            phase = self.next_phase()
            logger.debug(f'next_phase: {phase}')
        retval = {'phase': phase}
        if action:
            retval['action'] = action
            retval['name'] = name
        return retval

    def pnum(self, phase: str) -> int:
        """Map a phase name to a phase number

        :raises KeyError: If ``phase`` is not a known phase name
        """
        numbers = {'new': 0, 'hot': 1, 'warm': 2, 'cold': 3, 'frozen': 4, 'delete': 5}
        return numbers[phase]

    def pname(self, num: int) -> str:
        """Map a phase number to a phase name

        :raises KeyError: If ``num`` is not a known phase number
        """
        names = {0: 'new', 1: 'hot', 2: 'warm', 3: 'cold', 4: 'frozen', 5: 'delete'}
        return names[num]

    def resolve(self, name: str) -> str:
        """Resolve that we have an index and NOT an alias or a datastream

        :param name: The name to resolve
        :returns: The name of the single index that ``name`` resolved to
        :raises ResultNotExpected: If ``name`` resolves to any alias or data
            stream, to more than one index, or to no index at all
        """
        res = resolver(self.client, name)
        if len(res['aliases']) > 0 or len(res['data_streams']) > 0:
            msg = f'{name} is not an index: {res}'
            logger.critical(msg)
            raise ResultNotExpected(msg)
        if len(res['indices']) > 1:
            msg = f'{name} resolved to multiple indices: {prettystr(res["indices"])}'
            logger.critical(msg)
            raise ResultNotExpected(msg)
        if not res['indices']:
            # Report the empty result explicitly rather than failing with a
            # bare IndexError on the lookup below
            msg = f'{name} did not resolve to any index: {res}'
            logger.critical(msg)
            raise ResultNotExpected(msg)
        return res['indices'][0]['name']

    def update(self) -> None:
        """Update self._explain with the latest from :py:meth:`get_explain_data`

        :raises NameChanged: Passed along unchanged from
            :py:meth:`get_explain_data`
        """
        try:
            self._explain = DotMap(self.get_explain_data())
        except NameChanged:
            logger.debug('Passing along upstream exception...')
            raise

    def wait4complete(self) -> None:
        """Subroutine for waiting for an ILM step to complete

        Returns immediately if the stored explain data already shows both the
        action and the step as 'complete'. Otherwise waits on ``IlmStep``.

        :raises TestbedFailure: If the wait fails or times out
        """
        if self._explain.action == 'complete' and self._explain.step == 'complete':
            logger.debug(
                f'{self.name}: Current step complete: {prettystr(self.current_step())}'
            )
            return
        logger.debug(
            f'{self.name}: Current step not complete. {prettystr(self.current_step())}'
        )
        kw = {'name': self.name, 'pause': PAUSE_VALUE, 'timeout': TIMEOUT_VALUE}
        self._wait_or_fail(IlmStep(self.client, **kw))
        logger.debug('ILM Step successful. The wait is over')
        time.sleep(1)  # Just to make sure the cluster state has gelled