Coverage for /Users/buh/.pyenv/versions/3.12.9/envs/es-testbed/lib/python3.12/site-packages/es_testbed/ilm.py: 100%

234 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-03-31 13:12 -0600

1"""ILM Defining Class""" 

2 

3import typing as t 

4import logging 

5from os import getenv 

6import time 

7from dotmap import DotMap 

8import tiered_debug as debug 

9from es_wait import IlmPhase, IlmStep 

10from es_wait.exceptions import EsWaitFatal, EsWaitTimeout 

11from es_testbed.defaults import ( 

12 PAUSE_ENVVAR, 

13 PAUSE_DEFAULT, 

14 TIMEOUT_DEFAULT, 

15 TIMEOUT_ENVVAR, 

16) 

17from es_testbed.exceptions import ( 

18 NameChanged, 

19 ResultNotExpected, 

20 TestbedMisconfig, 

21 TestbedFailure, 

22) 

23from es_testbed.helpers.es_api import get_ilm_phases, ilm_explain, ilm_move, resolver 

24from es_testbed.helpers.utils import prettystr 

25 

26if t.TYPE_CHECKING: 

27 from elasticsearch8 import Elasticsearch 

28 

def _float_from_env(envvar: str, fallback: t.Any) -> float:
    """Return environment variable ``envvar`` as a float, or ``fallback`` if unset."""
    return float(getenv(envvar, default=fallback))


# Defaults for the ``pause`` and ``timeout`` arguments handed to the es_wait
# waiters used in this module. Each may be overridden through its environment
# variable.
PAUSE_VALUE = _float_from_env(PAUSE_ENVVAR, PAUSE_DEFAULT)
TIMEOUT_VALUE = _float_from_env(TIMEOUT_ENVVAR, TIMEOUT_DEFAULT)

logger = logging.getLogger('es_testbed.IlmTracker')

33 

34# ## Example ILM explain output 

35# { 

36# 'action': 'complete', 

37# 'action_time_millis': 0, 

38# 'age': '5.65m', 

39# 'index': 'INDEX_NAME', 

40# 'index_creation_date_millis': 0, 

41# 'lifecycle_date_millis': 0, 

42# 'managed': True, 

43# 'phase': 'hot', 

44# 'phase_execution': { 

45# 'modified_date_in_millis': 0, 

46# 'phase_definition': { 

47# 'actions': { 

48# 'rollover': { 

49# 'max_age': 'MAX_AGE', 

50# 'max_primary_shard_docs': 1000, 

51# 'max_primary_shard_size': 'MAX_SIZE', 

52# 'min_docs': 1 

53# } 

54# }, 

55# 'min_age': '0ms' 

56# }, 

57# 'policy': 'POLICY_NAME', 

58# 'version': 1 

59# }, 

60# 'phase_time_millis': 0, 

61# 'policy': 'POLICY_NAME', 

62# 'step': 'complete', 

63# 'step_time_millis': 0, 

64# 'time_since_index_creation': '5.65m' 

65# } 

66 

67 

class IlmTracker:
    """Track a single index through its ILM policy and advance it on demand.

    The most recent ILM Explain output for the index is kept in
    ``self._explain`` (a :py:class:`DotMap`) and the phases of the index's
    policy, as returned by ``get_ilm_phases``, in ``self._phases``. Use
    :py:meth:`update` to refresh the stored explain data and
    :py:meth:`advance` to move the index to another phase.
    """

    # Numeric ordering of the ILM phase names, used by pnum() and pname().
    _PHASE_NUMBERS: t.Dict[str, int] = {
        'new': 0,
        'hot': 1,
        'warm': 2,
        'cold': 3,
        'frozen': 4,
        'delete': 5,
    }
    _PHASE_NAMES: t.Dict[int, str] = {
        0: 'new',
        1: 'hot',
        2: 'warm',
        3: 'cold',
        4: 'frozen',
        5: 'delete',
    }

    def __init__(self, client: 'Elasticsearch', name: str):
        """Resolve ``name`` to a single index and load its ILM state.

        :param client: An Elasticsearch client connection object
        :param name: The name of the index to track. It must resolve to
            exactly one index (see :py:meth:`resolve`).
        """
        debug.lv2('Initializing IlmTracker object...')
        self.client = client
        self.name = self.resolve(name)  # A single index name
        self._explain = DotMap(self.get_explain_data())
        self._phases = get_ilm_phases(self.client, self._explain.policy)
        debug.lv3('IlmTracker object initialized')

    @property
    def explain(self) -> DotMap:
        """Return the current stored value of ILM Explain"""
        return self._explain

    @property
    def policy_phases(self) -> t.Sequence[str]:
        """Return a list of phases in the ILM policy"""
        return list(self._phases.keys())

    @staticmethod
    def _raise_failure(wait: Exception, include_errors: bool) -> t.NoReturn:
        """Log an es_wait failure and re-raise it as :py:exc:`TestbedFailure`.

        :param wait: The caught es_wait exception. Its ``message`` and
            ``elapsed`` attributes are read, and ``errors`` as well when
            ``include_errors`` is True.
        :param include_errors: Whether to append ``wait.errors`` to the message

        :raises TestbedFailure: Always, chained from ``wait``
        """
        msg = f'{wait.message}. Total elapsed time: {wait.elapsed}.'
        if include_errors:
            msg += f' Errors: {prettystr(wait.errors)}'
        logger.error(msg)
        raise TestbedFailure(msg) from wait

    def _log_phase(self, phase: str) -> None:
        """Emit debug messages recording that the index is now on ``phase``"""
        debug.lv3(f'ILM Explain Index: {self._explain.index}')
        debug.lv2(f'Index "{self.name}" now on phase "{phase}"')

    def _phase_wait(
        self, phase: str, pause: float = PAUSE_VALUE, timeout: float = TIMEOUT_VALUE
    ) -> None:
        """Wait until the new phase shows up in ILM Explain

        :param phase: The phase to wait for
        :param pause: Passed to :py:class:`es_wait.IlmPhase` as ``pause``
        :param timeout: Passed to :py:class:`es_wait.IlmPhase` as ``timeout``

        :raises TestbedFailure: If the wait ends in ``EsWaitFatal`` or
            ``EsWaitTimeout``
        """
        debug.lv2('Starting method...')
        kw = {'name': self.name, 'phase': phase, 'pause': pause, 'timeout': timeout}
        debug.lv5(f'Waiting for phase args = {prettystr(kw)}')
        phasechk = IlmPhase(self.client, **kw)
        try:
            debug.lv4('TRY: Waiting for ILM phase to complete')
            phasechk.wait()
        except EsWaitFatal as wait:
            self._raise_failure(wait, include_errors=True)
        except EsWaitTimeout as wait:
            self._raise_failure(wait, include_errors=False)
        debug.lv3('Exiting method')

    def _ssphz(self, phase: str) -> bool:
        """Return True if the phase is for searchable snapshots (> 'warm')"""
        debug.lv2('Starting method...')
        retval = self.pnum(phase) > self.pnum('warm')
        debug.lv3('Exiting method, returning value')
        debug.lv5(f'Value = {retval}')
        return retval

    def advance(
        self,
        phase: t.Optional[str] = None,
        action: t.Optional[str] = None,
        name: t.Optional[str] = None,
    ) -> None:
        """Advance index to next ILM phase

        :param phase: The phase to move to. When omitted, the next phase of
            the policy (per :py:meth:`next_phase`) is used.
        :param action: Optional target action. Requires ``name`` as well.
        :param name: Optional target step name. Requires ``action`` as well.

        Returns without doing anything if the index is already on the
        ``delete`` phase, or if no phase was given and the policy has no
        later phase to advance to.

        :raises TestbedMisconfig: If only one of ``action`` / ``name`` is given
        :raises TestbedFailure: If waiting for a phase or step fails or times
            out
        """
        debug.lv2('Starting method...')
        if self._explain.phase == 'delete':
            debug.lv1('Already on "delete" phase. No more phases to advance')
            debug.lv3('Exiting method')
            return

        debug.lv3(f'current_step: {prettystr(self.current_step())}')
        next_step = self.next_step(phase, action=action, name=name)
        debug.lv3(f'next_step: {prettystr(next_step)}')
        # next_step() substitutes the policy's next phase when none was
        # requested. Use that resolved value from here on; comparing against
        # and looking up an unset phase would fail in pnum().
        phase = next_step['phase']
        if phase is None:
            debug.lv1('No further phases in the policy. Nothing to advance to')
            debug.lv3('Exiting method')
            return

        if self._explain.phase == 'new' and phase == 'hot':
            debug.lv3('Phase is "new" and is still advancing to "hot"')
            # It won't be for very long.
            debug.lv5('Waiting for phase to fully reach "hot"...')
            self._phase_wait('hot')
            time.sleep(1)  # Just to make sure the new index is ready

        # Regardless of the remaining phases, the current phase steps must be
        # complete before proceeding with ilm_move
        debug.lv5('Running self.update()...')
        self.update()
        debug.lv5('Running self.wait4complete()...')
        self.wait4complete()
        debug.lv5('Running self.update()...')
        self.update()

        # We could have arrived with it hot, but incomplete
        if phase == 'hot':
            self._log_phase(phase)
            debug.lv5('Phase "hot" reached, and all steps are completed')
            debug.lv3('Exiting method')
            return

        # Remaining phases could be warm through frozen
        if self._explain.phase != phase:
            debug.lv5(f'"{self._explain.phase}" != "{phase}"')
            debug.lv3(f'Current phase: {self.explain.phase}')
            debug.lv3(f'Advancing to "{phase}" phase...')
            # We will only wait for steps to complete for the hot and warm tiers
            debug.lv5('Waiting for steps for non-cold/frozen phases to complete')
            wait4steps = not self._ssphz(phase)
            debug.lv5('Running ilm_move()...')
            ilm_move(self.client, self.name, self.current_step(), next_step)
            # Let the cluster catch up before proceeding.
            time.sleep(1)  # Just to make sure the cluster state has gelled
            debug.lv5('Running self._phase_wait()...')
            self._phase_wait(phase)
            # If cold or frozen, we can return now. We let the calling function
            # worry about the weird name changing behavior of searchable mounts
            debug.lv5('Checking if wait4steps is True')
            if wait4steps:
                debug.lv5('Running self.update()...')
                self.update()
                debug.lv3(f'Waiting for "{phase}" phase steps to complete...')
                debug.lv5('Running self.wait4complete()...')
                self.wait4complete()
                debug.lv5('Running self.update()...')
                self.update()
                self._log_phase(phase)
        else:
            debug.lv3(f'Already on "{phase}" phase. No need to advance')
            debug.lv5(f'current_step: {prettystr(self.current_step())}')
        debug.lv3('Exiting method')

    def current_step(self) -> t.Dict[str, str]:
        """Return the current ILM step information

        There is a disconnect between what the ILM Explain API returns and what
        the ILM Move API expects. The former returns a 'step' key, while the
        latter expects a 'name' key. This method returns a dictionary for
        use with the ILM Move API, so that the 'step' key is renamed to 'name'.

        The values are read from the stored explain data, so call
        :py:meth:`update` first if fresh values are needed.
        """
        debug.lv2('Starting method...')
        retval = {
            'phase': self._explain.phase,
            'action': self._explain.action,
            'name': self._explain.step,
        }
        debug.lv3('Exiting method, returning value')
        debug.lv5(f'Value = {prettystr(retval)}')
        return retval

    def get_explain_data(self) -> t.Dict:
        """Get the ILM explain data and return it

        :raises NameChanged: Passed along unchanged from ``ilm_explain``
        :raises ResultNotExpected: Passed along from ``ilm_explain`` after
            being logged at critical level
        """
        debug.lv2('Starting method...')
        try:
            debug.lv4('TRY: Getting ILM explain data')
            retval = ilm_explain(self.client, self.name)
            debug.lv3('Exiting method, returning value')
            debug.lv5(f'Value = {prettystr(retval)}')
            return retval
        except NameChanged as err:
            debug.lv3('Exiting method, raising exception')
            debug.lv5(f'Exception = {prettystr(err)}')
            debug.lv3('Apparent name change. Passing along upstream exception...')
            raise
        except ResultNotExpected as err:
            debug.lv3('Exiting method, raising exception')
            msg = f'Unable to get ilm_explain results. Error: {prettystr(err)}'
            logger.critical(msg)
            raise

    def next_phase(self) -> t.Optional[str]:
        """Return the next phase in the index's ILM journey

        :returns: The name of the nearest later phase present in the policy,
            or ``None`` if the index is on ``delete`` or the policy has no
            later phase.
        """
        debug.lv2('Starting method...')
        retval = None
        if self._explain.phase == 'delete':
            debug.lv3('Already on "delete" phase. No more phases to advance')
        else:
            curr = self.pnum(self._explain.phase)  # A numeric representation
            # A list of any remaining phases in the policy with a higher number than
            # the current
            remaining = [
                self.pnum(x) for x in self.policy_phases if self.pnum(x) > curr
            ]
            if remaining:  # If any:
                # Take the lowest remaining number rather than the first one
                # listed, so the result does not depend on the order in which
                # the policy's phases happen to be stored.
                retval = self.pname(min(remaining))
        debug.lv3('Exiting method, returning value')
        debug.lv5(f'Value = {retval}')
        return retval

    def next_step(
        self,
        phase: t.Optional[str] = None,
        action: t.Optional[str] = None,
        name: t.Optional[str] = None,
    ) -> t.Dict[str, t.Optional[str]]:
        """Determine the next ILM step based on the current phase, action, and name

        There is a disconnect between what the ILM Explain API returns and what
        the ILM Move API expects. The former returns a 'step' key, while the
        latter expects a 'name' key. This method returns a dictionary for
        use with the ILM Move API, so that the 'step' key is renamed to 'name'.

        :param phase: The target phase. When not given, the value of
            :py:meth:`next_phase` is used (which may be ``None``).
        :param action: Optional target action. Requires ``name`` as well.
        :param name: Optional target step name. Requires ``action`` as well.

        :returns: A dictionary with a ``phase`` key, plus ``action`` and
            ``name`` keys when an action was supplied.

        :raises TestbedMisconfig: If only one of ``action`` / ``name`` is given
        """
        debug.lv2('Starting method...')
        # Exactly one of the pair being set is a misconfiguration.
        if (action is None) != (name is None):
            debug.lv3('Exiting method, raising exception')
            msg = 'If either action or name is specified, both must be'
            logger.critical(msg)
            raise TestbedMisconfig(msg)
        if not phase:
            debug.lv3('No phase specified. Using next_phase')
            phase = self.next_phase()
            debug.lv5(f'next_phase: {phase}')
        retval = {'phase': phase}
        if action:
            retval['action'] = action
            retval['name'] = name
        debug.lv3('Exiting method, returning value')
        debug.lv5(f'Value = {prettystr(retval)}')
        return retval

    def pnum(self, phase: str) -> int:
        """Map a phase name to a phase number

        :raises KeyError: If ``phase`` is not a known phase name
        """
        debug.lv2('Starting method...')
        retval = self._PHASE_NUMBERS[phase]
        debug.lv3('Exiting method, returning value')
        debug.lv5(f'Value = {retval}')
        return retval

    def pname(self, num: int) -> str:
        """Map a phase number to a phase name

        :raises KeyError: If ``num`` is not a known phase number
        """
        debug.lv2('Starting method...')
        retval = self._PHASE_NAMES[num]
        debug.lv3('Exiting method, returning value')
        debug.lv5(f'Value = {retval}')
        return retval

    def resolve(self, name: str) -> str:
        """Resolve that we have an index and NOT an alias or a datastream

        :param name: The name to resolve

        :returns: The name of the single index that ``name`` resolved to

        :raises ResultNotExpected: If ``name`` resolves to any alias or data
            stream, to more than one index, or to no index at all
        """
        debug.lv2('Starting method...')
        res = resolver(self.client, name)
        debug.lv5(f'resolver: result = {res}')
        if len(res['aliases']) > 0 or len(res['data_streams']) > 0:
            debug.lv3('Exiting method, raising exception')
            msg = f'{name} is not an index: {res}'
            logger.critical(msg)
            raise ResultNotExpected(msg)
        if len(res['indices']) > 1:
            debug.lv3('Exiting method, raising exception')
            msg = f'{name} resolved to multiple indices: {prettystr(res["indices"])}'
            logger.critical(msg)
            raise ResultNotExpected(msg)
        if not res['indices']:
            # Without this guard the lookup below would fail with a bare
            # IndexError that says nothing about the name being resolved.
            debug.lv3('Exiting method, raising exception')
            msg = f'{name} did not resolve to any index: {res}'
            logger.critical(msg)
            raise ResultNotExpected(msg)
        retval = res['indices'][0]['name']
        debug.lv3('Exiting method, returning value')
        debug.lv5(f'Value = {retval}')
        return retval

    def set_debug_tier(self, tier: int) -> None:
        """
        Set the debug tier globally for this module
        """
        debug.set_level(tier)

    def update(self) -> None:
        """Update self._explain with the latest from :py:meth:`get_explain_data`

        :raises NameChanged: Passed along unchanged from
            :py:meth:`get_explain_data`
        """
        debug.lv2('Starting method...')
        try:
            debug.lv4('TRY: self._explain = DotMap(self.get_explain_data())')
            self._explain = DotMap(self.get_explain_data())
            debug.lv5(f'Updated explain: {prettystr(self._explain)}')
        except NameChanged as err:
            debug.lv3('Exiting method, raising exception')
            debug.lv3('Passing along upstream exception...')
            debug.lv5(f'Exception = {prettystr(err)}')
            raise
        debug.lv3('Exiting method')

    def wait4complete(self) -> None:
        """Subroutine for waiting for an ILM step to complete

        Returns immediately when the stored explain data already shows both
        the action and the step as ``complete``. Otherwise blocks on
        :py:class:`es_wait.IlmStep` using the module's pause and timeout
        values.

        :raises TestbedFailure: If the wait ends in ``EsWaitFatal`` or
            ``EsWaitTimeout``
        """
        debug.lv2('Starting method...')
        action_done = self._explain.action == 'complete'
        step_done = self._explain.step == 'complete'
        if action_done and step_done:
            debug.lv3(
                f'{self.name}: Current step complete: {prettystr(self.current_step())}'
            )
            debug.lv3('Exiting method')
            return
        debug.lv3(
            f'{self.name}: Current step not complete. {prettystr(self.current_step())}'
        )
        kw = {'name': self.name, 'pause': PAUSE_VALUE, 'timeout': TIMEOUT_VALUE}
        debug.lv5(f'IlmStep args = {prettystr(kw)}')
        step = IlmStep(self.client, **kw)
        try:
            debug.lv4('TRY: Waiting for ILM step to complete')
            step.wait()
            debug.lv3('ILM Step successful. The wait is over')
            time.sleep(1)  # Just to make sure the cluster state has gelled
        except EsWaitFatal as wait:
            debug.lv3('Exiting method, raising exception')
            debug.lv5(f'Exception = {prettystr(wait)}')
            self._raise_failure(wait, include_errors=True)
        except EsWaitTimeout as wait:
            debug.lv3('Exiting method, raising exception')
            debug.lv5(f'Exception = {prettystr(wait)}')
            self._raise_failure(wait, include_errors=False)
        debug.lv3('Exiting method')