Coverage for /Users/buh/.pyenv/versions/3.12.9/envs/es-testbed/lib/python3.12/site-packages/es_testbed/ilm.py: 100%

217 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-04-17 22:23 -0600

1"""ILM Defining Class""" 

2 

3import typing as t 

4import logging 

5from os import getenv 

6import time 

7from dotmap import DotMap 

8from es_wait import IlmPhase, IlmStep 

9from es_wait.exceptions import EsWaitFatal, EsWaitTimeout 

10from .debug import debug, begin_end 

11from .defaults import ( 

12 PAUSE_ENVVAR, 

13 PAUSE_DEFAULT, 

14 TIMEOUT_DEFAULT, 

15 TIMEOUT_ENVVAR, 

16) 

17from .exceptions import ( 

18 NameChanged, 

19 ResultNotExpected, 

20 TestbedMisconfig, 

21 TestbedFailure, 

22) 

23from .es_api import get_ilm_phases, ilm_explain, ilm_move, resolver 

24from .utils import prettystr 

25 

26if t.TYPE_CHECKING: 

27 from elasticsearch8 import Elasticsearch 

28 

# Polling interval and overall timeout used when waiting on ILM phases and
# steps. Both are read from the environment once, at import time, and fall
# back to the package defaults. float() raises ValueError if the environment
# holds a non-numeric value.
PAUSE_VALUE = float(getenv(PAUSE_ENVVAR, PAUSE_DEFAULT))
TIMEOUT_VALUE = float(getenv(TIMEOUT_ENVVAR, TIMEOUT_DEFAULT))

# Logger for error/critical messages raised by the tracker
logger = logging.getLogger('es_testbed.IlmTracker')

33 

34# ## Example ILM explain output 

35# { 

36# 'action': 'complete', 

37# 'action_time_millis': 0, 

38# 'age': '5.65m', 

39# 'index': 'INDEX_NAME', 

40# 'index_creation_date_millis': 0, 

41# 'lifecycle_date_millis': 0, 

42# 'managed': True, 

43# 'phase': 'hot', 

44# 'phase_execution': { 

45# 'modified_date_in_millis': 0, 

46# 'phase_definition': { 

47# 'actions': { 

48# 'rollover': { 

49# 'max_age': 'MAX_AGE', 

50# 'max_primary_shard_docs': 1000, 

51# 'max_primary_shard_size': 'MAX_SIZE', 

52# 'min_docs': 1 

53# } 

54# }, 

55# 'min_age': '0ms' 

56# }, 

57# 'policy': 'POLICY_NAME', 

58# 'version': 1 

59# }, 

60# 'phase_time_millis': 0, 

61# 'policy': 'POLICY_NAME', 

62# 'step': 'complete', 

63# 'step_time_millis': 0, 

64# 'time_since_index_creation': '5.65m' 

65# } 

66 

67 

68class IlmTracker: 

69 """ILM Phase Tracking Class""" 

70 

def __init__(self, client: 'Elasticsearch', name: str) -> None:
    """Build a tracker for a single index

    :param client: The Elasticsearch client used for every API call
    :param name: The name to track. It is passed through :py:meth:`resolve`,
        and the resolved index name is what gets stored in ``self.name``
    """
    debug.lv2('Initializing IlmTracker object...')
    # The client must be stored first: resolve() and get_explain_data()
    # both read self.client
    self.client = client
    resolved_name = self.resolve(name)  # A single index name
    self.name = resolved_name
    # get_explain_data() reads self.name, so it has to follow the resolve
    explain_data = self.get_explain_data()
    self._explain = DotMap(explain_data)
    policy_name = self._explain.policy
    self._phases = get_ilm_phases(client, policy_name)
    debug.lv3('IlmTracker object initialized')

78 

@property
def explain(self) -> DotMap:
    """The most recently stored ILM Explain data

    No API call is made here. The value is whatever ``self._explain``
    currently holds.
    """
    stored_explain = self._explain
    return stored_explain

83 

@property
def policy_phases(self) -> t.Sequence[str]:
    """The phase names of the ILM policy, as a new list

    The names are the keys of ``self._phases``, in that mapping's own
    iteration order.
    """
    return [phase_name for phase_name in self._phases.keys()]

88 

def _log_phase(self, phase: str) -> None:
    """Write debug entries announcing that the index is now on ``phase``

    :param phase: The phase name to report. It is only interpolated into
        the message and is not checked against the stored explain data.
    """
    explain_index = self._explain.index
    debug.lv3(f'ILM Explain Index: {explain_index}')
    tracked_index = self.name
    debug.lv2(f'Index "{tracked_index}" now on phase "{phase}"')

92 

@begin_end()
def _phase_wait(
    self, phase: str, pause: float = PAUSE_VALUE, timeout: float = TIMEOUT_VALUE
) -> None:
    """Block until ``IlmPhase.wait()`` returns for the requested phase

    :param phase: The phase name handed to ``IlmPhase``
    :param pause: Passed through to ``IlmPhase`` as ``pause``
    :param timeout: Passed through to ``IlmPhase`` as ``timeout``
    :raises TestbedFailure: If the wait raises ``EsWaitFatal`` or
        ``EsWaitTimeout``. The original exception is chained as the cause.
    """
    wait_args = dict(name=self.name, phase=phase, pause=pause, timeout=timeout)
    debug.lv5(f'Waiting for phase args = {prettystr(wait_args)}')
    phase_waiter = IlmPhase(self.client, **wait_args)
    try:
        debug.lv4('TRY: Waiting for ILM phase to complete')
        phase_waiter.wait()
    except EsWaitFatal as fatal:
        # Fatal errors carry an ``errors`` attribute, which is reported too
        summary = f'{fatal.message}. Total elapsed time: {fatal.elapsed}. '
        details = f'Errors: {prettystr(fatal.errors)}'
        failure = summary + details
        logger.error(failure)
        raise TestbedFailure(failure) from fatal
    except EsWaitTimeout as timed_out:
        failure = f'{timed_out.message}. Total elapsed time: {timed_out.elapsed}.'
        logger.error(failure)
        raise TestbedFailure(failure) from timed_out

115 

@begin_end()
def _ssphz(self, phase: str) -> bool:
    """Tell whether ``phase`` ranks after 'warm' (a searchable snapshot phase)

    :param phase: A phase name known to :py:meth:`pnum`
    :returns: True when the phase number is greater than that of 'warm'
    """
    phase_rank = self.pnum(phase)
    warm_rank = self.pnum('warm')
    past_warm = phase_rank > warm_rank
    debug.lv5(f'Return value = {past_warm}')
    return past_warm

122 

@begin_end()
def advance(
    self,
    phase: t.Optional[str] = None,
    action: t.Optional[str] = None,
    name: t.Optional[str] = None,
) -> None:
    """Advance the index to ``phase``, or to the policy's next phase

    :param phase: The target phase. When omitted (or empty), the target is
        whatever :py:meth:`next_step` resolves via :py:meth:`next_phase`.
    :param action: Optional target action. Must be paired with ``name``.
    :param name: Optional target step name. Must be paired with ``action``.
    :raises TestbedMisconfig: From :py:meth:`next_step` if only one of
        ``action`` and ``name`` is given
    :raises TestbedFailure: From the wait helpers on a fatal error or
        timeout

    Returns without doing anything if the index is already on the 'delete'
    phase, or if no target phase was given and the policy has no later
    phase to move to.
    """
    if self._explain.phase == 'delete':
        debug.lv1('Already on "delete" phase. No more phases to advance')
        return

    debug.lv3(f'current_step: {prettystr(self.current_step())}')
    next_step = self.next_step(phase, action=action, name=name)
    debug.lv3(f'next_step: {prettystr(next_step)}')

    # next_step() fills in the phase from next_phase() when the caller gave
    # none. Use that resolved value from here on; otherwise every comparison
    # below runs against None and _ssphz(None) fails with a KeyError.
    phase = next_step['phase']
    if phase is None:
        debug.lv1('No later phase found in the ILM policy. Nothing to advance to')
        return

    if self._explain.phase == 'new' and phase == 'hot':
        debug.lv3('Phase is "new" and is still advancing to "hot"')
        # It won't be for very long.
        debug.lv5('Waiting for phase to fully reach "hot"...')
        self._phase_wait('hot')
        time.sleep(1)  # Just to make sure the new index is ready

    # Regardless of the remaining phases, the current phase steps must be
    # complete before proceeding with ilm_move
    debug.lv5('Running self.update()...')
    self.update()
    debug.lv5('Running self.wait4complete()...')
    self.wait4complete()
    debug.lv5('Running self.update()...')
    self.update()

    # We could have arrived with it hot, but incomplete
    if phase == 'hot':
        self._log_phase(phase)
        debug.lv5('Phase "hot" reached, and all steps are completed')
        return

    # Remaining phases could be warm through frozen
    if self._explain.phase != phase:
        debug.lv5(f'"{self._explain.phase}" != "{phase}"')
        debug.lv3(f'Current phase: {self.explain.phase}')
        debug.lv3(f'Advancing to "{phase}" phase...')
        # We will only wait for steps to complete for the hot and warm tiers
        debug.lv5('Waiting for steps for non-cold/frozen phases to complete')
        wait4steps = not self._ssphz(phase)
        debug.lv5('Running ilm_move()...')
        ilm_move(self.client, self.name, self.current_step(), next_step)
        # Let the cluster catch up before proceeding.
        time.sleep(1)  # Just to make sure the cluster state has gelled
        debug.lv5('Running self._phase_wait()...')
        self._phase_wait(phase)
        # If cold or frozen, we can return now. We let the calling function
        # worry about the weird name changing behavior of searchable mounts
        debug.lv5('Checking if wait4steps is True')
        if wait4steps:
            debug.lv5('Running self.update()...')
            self.update()
            debug.lv3(f'Waiting for "{phase}" phase steps to complete...')
            debug.lv5('Running self.wait4complete()...')
            self.wait4complete()
            debug.lv5('Running self.update()...')
            self.update()
            self._log_phase(phase)
    else:
        debug.lv3(f'Already on "{phase}" phase. No need to advance')
        debug.lv5(f'current_step: {prettystr(self.current_step())}')

191 

@begin_end()
def current_step(self) -> t.Dict[str, str]:
    """Describe the current ILM step in the shape used for an ILM move

    The stored explain data keeps the step under a 'step' key, while the
    dictionary built here exposes it under 'name'. The result is what
    :py:meth:`advance` hands to ``ilm_move`` as the current step.

    :returns: A dictionary with the keys 'phase', 'action' and 'name'
    """
    explain_data = self._explain
    step_info = dict(
        phase=explain_data.phase,
        action=explain_data.action,
        name=explain_data.step,  # 'step' in explain becomes 'name' here
    )
    debug.lv5(f'Return value = {prettystr(step_info)}')
    return step_info

208 

@begin_end()
def get_explain_data(self) -> t.Dict:
    """Fetch ILM explain data for ``self.name`` via ``ilm_explain``

    :returns: Whatever ``ilm_explain`` returns for the tracked index
    :raises NameChanged: Re-raised unchanged after debug logging
    :raises ResultNotExpected: Re-raised unchanged after a critical log
        entry
    """
    try:
        debug.lv4('TRY: Getting ILM explain data')
        explain_data = ilm_explain(self.client, self.name)
        debug.lv5(f'Return value = {prettystr(explain_data)}')
        return explain_data
    except (NameChanged, ResultNotExpected) as err:
        debug.lv3('Exiting method, raising exception')
        # NameChanged is tested first so it takes precedence, exactly as
        # it would with two separate handlers in this order
        if isinstance(err, NameChanged):
            debug.lv5(f'Exception = {prettystr(err)}')
            debug.lv3('Apparent name change. Passing along upstream exception...')
        else:
            msg = f'Unable to get ilm_explain results. Error: {prettystr(err)}'
            logger.critical(msg)
        raise err

228 

@begin_end()
def next_phase(self) -> t.Optional[str]:
    """Return the next phase in the index's ILM journey

    The next phase is the policy phase with the lowest phase number that
    is still higher than the current phase's number.

    :returns: The next phase name, or None when the index is already on
        'delete' or the policy defines no later phase
    :raises KeyError: From :py:meth:`pnum` if the current phase or a policy
        phase is not a known phase name
    """
    retval = None
    if self._explain.phase == 'delete':
        debug.lv3('Already on "delete" phase. No more phases to advance')
    else:
        curr = self.pnum(self._explain.phase)  # A numeric representation
        # Phase numbers of every policy phase that comes after the current
        later = [num for num in map(self.pnum, self.policy_phases) if num > curr]
        if later:
            # Take the lowest number rather than the first element, so the
            # answer does not depend on the order policy_phases lists them
            retval = self.pname(min(later))
    debug.lv5(f'Return value = {retval}')
    return retval

247 

@begin_end()
def next_step(
    self,
    phase: t.Optional[str] = None,
    action: t.Optional[str] = None,
    name: t.Optional[str] = None,
) -> t.Dict[str, str]:
    """Build the target-step dictionary for an ILM move

    The result uses a 'name' key for the step (not 'step', as the stored
    explain data does). :py:meth:`advance` passes it to ``ilm_move`` as the
    step to move to.

    :param phase: The target phase. If falsy, :py:meth:`next_phase` is
        used instead.
    :param action: Optional target action. Only valid together with
        ``name``.
    :param name: Optional target step name. Only valid together with
        ``action``.
    :returns: ``{'phase': ...}``, plus 'action' and 'name' when a truthy
        ``action`` was given
    :raises TestbedMisconfig: If exactly one of ``action`` and ``name`` is
        None
    """
    # action and name must be given as a pair: both None, or neither
    if (action is None) != (name is None):
        debug.lv3('Exiting method, raising exception')
        msg = 'If either action or name is specified, both must be'
        logger.critical(msg)
        raise TestbedMisconfig(msg)
    target_phase = phase
    if not target_phase:
        debug.lv3('No phase specified. Using next_phase')
        target_phase = self.next_phase()
        debug.lv5(f'next_phase: {target_phase}')
    step_info = {'phase': target_phase}
    if action:
        step_info.update(action=action, name=name)
    debug.lv5(f'Return value = {prettystr(step_info)}')
    return step_info

279 

@begin_end()
def pnum(self, phase: str) -> int:
    """Translate a phase name into its position number

    :param phase: One of 'new', 'hot', 'warm', 'cold', 'frozen', 'delete'
    :returns: The matching number, 0 ('new') through 5 ('delete')
    :raises KeyError: If ``phase`` is not one of the known names
    """
    phase_numbers = dict(new=0, hot=1, warm=2, cold=3, frozen=4, delete=5)
    number = phase_numbers[phase]
    debug.lv5(f'Return value = {number}')
    return number

286 

@begin_end()
def pname(self, num: int) -> str:
    """Translate a phase position number back into its name

    :param num: A number from 0 ('new') through 5 ('delete')
    :returns: The matching phase name
    :raises KeyError: If ``num`` is not one of the known numbers
    """
    ordered_names = ('new', 'hot', 'warm', 'cold', 'frozen', 'delete')
    phase_names = dict(enumerate(ordered_names))
    phase_name = phase_names[num]
    debug.lv5(f'Return value = {phase_name}')
    return phase_name

293 

@begin_end()
def resolve(self, name: str) -> str:
    """Resolve that we have an index and NOT an alias or a datastream

    :param name: The name to look up with ``resolver``
    :returns: The 'name' value of the single index in the resolver result
    :raises ResultNotExpected: If the result contains any aliases or data
        streams, contains no index at all, or contains more than one index
    """
    res = resolver(self.client, name)
    debug.lv5(f'resolver: result = {res}')
    if res['aliases'] or res['data_streams']:
        debug.lv3('Exiting method, raising exception')
        msg = f'{name} is not an index: {res}'
        logger.critical(msg)
        raise ResultNotExpected(msg)
    indices = res['indices']
    if not indices:
        # Without this guard an empty result would surface as a bare
        # IndexError from the [0] lookup below
        debug.lv3('Exiting method, raising exception')
        msg = f'{name} did not resolve to any index: {res}'
        logger.critical(msg)
        raise ResultNotExpected(msg)
    if len(indices) > 1:
        debug.lv3('Exiting method, raising exception')
        msg = f'{name} resolved to multiple indices: {prettystr(indices)}'
        logger.critical(msg)
        raise ResultNotExpected(msg)
    index_name = indices[0]['name']
    debug.lv5(f'Return value = {index_name}')
    return index_name

311 

@begin_end()
def update(self) -> None:
    """Refresh ``self._explain`` from :py:meth:`get_explain_data`

    :raises NameChanged: Re-raised unchanged after debug logging. In that
        case ``self._explain`` keeps its previous value.
    """
    try:
        debug.lv4('TRY: self._explain = DotMap(self.get_explain_data())')
        refreshed = DotMap(self.get_explain_data())
        self._explain = refreshed
        debug.lv5(f'Updated explain: {prettystr(self._explain)}')
    except NameChanged as name_changed:
        debug.lv3('Exiting method, raising exception')
        debug.lv3('Passing along upstream exception...')
        debug.lv5(f'Exception = {prettystr(name_changed)}')
        raise name_changed

324 

@begin_end()
def wait4complete(self) -> None:
    """Wait, if needed, until the current ILM step has completed

    Returns immediately when the stored explain data already shows both
    the action and the step as 'complete'. Otherwise it blocks on
    ``IlmStep.wait()`` using the module-level pause and timeout values.

    :raises TestbedFailure: If the wait raises ``EsWaitFatal`` or
        ``EsWaitTimeout``. The original exception is chained as the cause.
    """
    # Both values are read up front, before either is tested
    action_done = self._explain.action == 'complete'
    step_done = self._explain.step == 'complete'
    if action_done and step_done:
        debug.lv3(
            f'{self.name}: Current step complete: {prettystr(self.current_step())}'
        )
        return

    debug.lv3(
        f'{self.name}: Current step not complete. {prettystr(self.current_step())}'
    )
    step_args = dict(name=self.name, pause=PAUSE_VALUE, timeout=TIMEOUT_VALUE)
    debug.lv5(f'IlmStep args = {prettystr(step_args)}')
    step_waiter = IlmStep(self.client, **step_args)
    try:
        debug.lv4('TRY: Waiting for ILM step to complete')
        step_waiter.wait()
        debug.lv3('ILM Step successful. The wait is over')
        time.sleep(1)  # Just to make sure the cluster state has gelled
    except (EsWaitFatal, EsWaitTimeout) as exc:
        debug.lv3('Exiting method, raising exception')
        debug.lv5(f'Exception = {prettystr(exc)}')
        # EsWaitFatal is tested first so it takes precedence, exactly as
        # it would with two separate handlers in this order
        if isinstance(exc, EsWaitFatal):
            failure = (
                f'{exc.message}. Total elapsed time: {exc.elapsed}. '
                f'Errors: {prettystr(exc.errors)}'
            )
        else:
            failure = f'{exc.message}. Total elapsed time: {exc.elapsed}.'
        logger.error(failure)
        raise TestbedFailure(failure) from exc