Coverage for /Users/buh/.pyenv/versions/3.12.9/envs/es-testbed/lib/python3.12/site-packages/es_testbed/ilm.py: 100%
234 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-31 13:12 -0600
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-31 13:12 -0600
1"""ILM Defining Class"""
3import typing as t
4import logging
5from os import getenv
6import time
7from dotmap import DotMap
8import tiered_debug as debug
9from es_wait import IlmPhase, IlmStep
10from es_wait.exceptions import EsWaitFatal, EsWaitTimeout
11from es_testbed.defaults import (
12 PAUSE_ENVVAR,
13 PAUSE_DEFAULT,
14 TIMEOUT_DEFAULT,
15 TIMEOUT_ENVVAR,
16)
17from es_testbed.exceptions import (
18 NameChanged,
19 ResultNotExpected,
20 TestbedMisconfig,
21 TestbedFailure,
22)
23from es_testbed.helpers.es_api import get_ilm_phases, ilm_explain, ilm_move, resolver
24from es_testbed.helpers.utils import prettystr
26if t.TYPE_CHECKING:
27 from elasticsearch8 import Elasticsearch
# Module-wide wait settings. Each value is read once, at import time, from its
# environment variable (falling back to the packaged default) and coerced to
# float. They are handed to the es_wait waiters as ``pause`` and ``timeout``.
PAUSE_VALUE = float(getenv(PAUSE_ENVVAR, PAUSE_DEFAULT))
TIMEOUT_VALUE = float(getenv(TIMEOUT_ENVVAR, TIMEOUT_DEFAULT))

# Named logger for error/critical messages emitted by this module
logger = logging.getLogger('es_testbed.IlmTracker')
34# ## Example ILM explain output
35# {
36# 'action': 'complete',
37# 'action_time_millis': 0,
38# 'age': '5.65m',
39# 'index': 'INDEX_NAME',
40# 'index_creation_date_millis': 0,
41# 'lifecycle_date_millis': 0,
42# 'managed': True,
43# 'phase': 'hot',
44# 'phase_execution': {
45# 'modified_date_in_millis': 0,
46# 'phase_definition': {
47# 'actions': {
48# 'rollover': {
49# 'max_age': 'MAX_AGE',
50# 'max_primary_shard_docs': 1000,
51# 'max_primary_shard_size': 'MAX_SIZE',
52# 'min_docs': 1
53# }
54# },
55# 'min_age': '0ms'
56# },
57# 'policy': 'POLICY_NAME',
58# 'version': 1
59# },
60# 'phase_time_millis': 0,
61# 'policy': 'POLICY_NAME',
62# 'step': 'complete',
63# 'step_time_millis': 0,
64# 'time_since_index_creation': '5.65m'
65# }
class IlmTracker:
    """ILM Phase Tracking Class

    Resolves ``name`` to exactly one concrete index, keeps the most recent ILM
    Explain output for that index in ``self._explain``, and provides helpers to
    move the index to a later ILM phase and wait for its steps to complete.
    """

    # Lifecycle order of the ILM phases. A higher number means a later phase.
    _PHASE_NUMBERS: t.Dict[str, int] = {
        'new': 0,
        'hot': 1,
        'warm': 2,
        'cold': 3,
        'frozen': 4,
        'delete': 5,
    }

    def __init__(self, client: 'Elasticsearch', name: str):
        """Resolve the index name and load its ILM explain data and policy phases

        :param client: An Elasticsearch client connection object
        :param name: Name expected to resolve to a single index

        :raises ResultNotExpected: if ``name`` is not exactly one plain index
        """
        debug.lv2('Initializing IlmTracker object...')
        self.client = client
        self.name = self.resolve(name)  # A single index name
        self._explain = DotMap(self.get_explain_data())
        self._phases = get_ilm_phases(self.client, self._explain.policy)
        debug.lv3('IlmTracker object initialized')

    @property
    def explain(self) -> DotMap:
        """Return the current stored value of ILM Explain"""
        return self._explain

    @property
    def policy_phases(self) -> t.Sequence[str]:
        """Return a list of phases in the ILM policy"""
        return list(self._phases.keys())

    def _log_phase(self, phase: str) -> None:
        """Emit debug messages stating that the index is now on ``phase``"""
        debug.lv3(f'ILM Explain Index: {self._explain.index}')
        debug.lv2(f'Index "{self.name}" now on phase "{phase}"')

    def _phase_wait(
        self, phase: str, pause: float = PAUSE_VALUE, timeout: float = TIMEOUT_VALUE
    ) -> None:
        """Wait until the new phase shows up in ILM Explain

        :param phase: The ILM phase to wait for
        :param pause: Passed to :py:class:`es_wait.IlmPhase` as ``pause``
        :param timeout: Passed to :py:class:`es_wait.IlmPhase` as ``timeout``

        :raises TestbedFailure: if the wait raises ``EsWaitFatal`` or
            ``EsWaitTimeout``
        """
        debug.lv2('Starting method...')
        kw = {'name': self.name, 'phase': phase, 'pause': pause, 'timeout': timeout}
        debug.lv5(f'Waiting for phase args = {prettystr(kw)}')
        phasechk = IlmPhase(self.client, **kw)
        try:
            debug.lv4('TRY: Waiting for ILM phase to complete')
            phasechk.wait()
        except EsWaitFatal as wait:
            msg = (
                f'{wait.message}. Total elapsed time: {wait.elapsed}. '
                f'Errors: {prettystr(wait.errors)}'
            )
            logger.error(msg)
            raise TestbedFailure(msg) from wait
        except EsWaitTimeout as wait:
            msg = f'{wait.message}. Total elapsed time: {wait.elapsed}.'
            logger.error(msg)
            raise TestbedFailure(msg) from wait
        debug.lv3('Exiting method')

    def _ssphz(self, phase: str) -> bool:
        """Return True if the phase is for searchable snapshots (> 'warm')"""
        debug.lv2('Starting method...')
        retval = self.pnum(phase) > self.pnum('warm')
        debug.lv3('Exiting method, returning value')
        debug.lv5(f'Value = {retval}')
        return retval

    def advance(
        self,
        phase: t.Optional[str] = None,
        action: t.Optional[str] = None,
        name: t.Optional[str] = None,
    ) -> None:
        """Advance index to next ILM phase

        :param phase: The target phase. When omitted, the next phase of the
            policy (as returned by :py:meth:`next_phase`) is used.
        :param action: Optional target action, forwarded to :py:meth:`next_step`
        :param name: Optional target step name, forwarded to :py:meth:`next_step`

        :raises TestbedMisconfig: if only one of ``action``/``name`` is given
        :raises TestbedFailure: if waiting for a phase or step fails or times out
        """
        debug.lv2('Starting method...')
        if self._explain.phase == 'delete':
            debug.lv1('Already on "delete" phase. No more phases to advance')
            debug.lv3('Exiting method')
            return

        debug.lv3(f'current_step: {prettystr(self.current_step())}')
        next_step = self.next_step(phase, action=action, name=name)
        debug.lv3(f'next_step: {prettystr(next_step)}')
        # next_step() fills in the phase from the policy when the caller did
        # not provide one. Everything below must use that resolved value, or a
        # default call would compare against (and try to move to) None.
        phase = next_step['phase']
        if not phase:
            debug.lv1('No later phase in the ILM policy. Nothing to advance to')
            debug.lv3('Exiting method')
            return

        if self._explain.phase == 'new' and phase == 'hot':
            debug.lv3('Phase is "new" and is still advancing to "hot"')
            # It won't be for very long.
            debug.lv5('Waiting for phase to fully reach "hot"...')
            self._phase_wait('hot')
            time.sleep(1)  # Just to make sure the new index is ready

        # Regardless of the remaining phases, the current phase steps must be
        # complete before proceeding with ilm_move
        debug.lv5('Running self.update()...')
        self.update()
        debug.lv5('Running self.wait4complete()...')
        self.wait4complete()
        debug.lv5('Running self.update()...')
        self.update()

        # We could have arrived with it hot, but incomplete
        if phase == 'hot':
            self._log_phase(phase)
            debug.lv5('Phase "hot" reached, and all steps are completed')
            debug.lv3('Exiting method')
            return

        # Remaining phases could be warm through frozen
        if self._explain.phase != phase:
            debug.lv5(f'"{self._explain.phase}" != "{phase}"')
            debug.lv3(f'Current phase: {self.explain.phase}')
            debug.lv3(f'Advancing to "{phase}" phase...')
            # We will only wait for steps to complete for the hot and warm tiers
            debug.lv5('Waiting for steps for non-cold/frozen phases to complete')
            wait4steps = not self._ssphz(phase)
            debug.lv5('Running ilm_move()...')
            ilm_move(self.client, self.name, self.current_step(), next_step)
            # Let the cluster catch up before proceeding.
            time.sleep(1)  # Just to make sure the cluster state has gelled
            debug.lv5('Running self._phase_wait()...')
            self._phase_wait(phase)
            # If cold or frozen, we can return now. We let the calling function
            # worry about the weird name changing behavior of searchable mounts
            debug.lv5('Checking if wait4steps is True')
            if wait4steps:
                debug.lv5('Running self.update()...')
                self.update()
                debug.lv3(f'Waiting for "{phase}" phase steps to complete...')
                debug.lv5('Running self.wait4complete()...')
                self.wait4complete()
                debug.lv5('Running self.update()...')
                self.update()
                self._log_phase(phase)
        else:
            debug.lv3(f'Already on "{phase}" phase. No need to advance')
            debug.lv5(f'current_step: {prettystr(self.current_step())}')
        debug.lv3('Exiting method')

    def current_step(self) -> t.Dict[str, str]:
        """Return the current ILM step information

        There is a disconnect between what the ILM Explain API returns and what
        the ILM Move API expects. The former returns a 'step' key, while the
        latter expects a 'name' key. This method returns a dictionary for
        use with the ILM Move API, so that the 'step' key is renamed to 'name'.

        :returns: A dictionary with ``phase``, ``action`` and ``name`` keys taken
            from the stored ILM explain data
        """
        debug.lv2('Starting method...')
        retval = {
            'phase': self._explain.phase,
            'action': self._explain.action,
            'name': self._explain.step,
        }
        debug.lv3('Exiting method, returning value')
        debug.lv5(f'Value = {prettystr(retval)}')
        return retval

    def get_explain_data(self) -> t.Dict:
        """Get the ILM explain data and return it

        :raises NameChanged: passed along unchanged from ``ilm_explain``
        :raises ResultNotExpected: passed along from ``ilm_explain`` after
            logging a critical message
        """
        debug.lv2('Starting method...')
        try:
            debug.lv4('TRY: Getting ILM explain data')
            retval = ilm_explain(self.client, self.name)
        except NameChanged as err:
            debug.lv3('Exiting method, raising exception')
            debug.lv5(f'Exception = {prettystr(err)}')
            debug.lv3('Apparent name change. Passing along upstream exception...')
            raise  # Bare raise keeps the original traceback intact
        except ResultNotExpected as err:
            debug.lv3('Exiting method, raising exception')
            msg = f'Unable to get ilm_explain results. Error: {prettystr(err)}'
            logger.critical(msg)
            raise
        debug.lv3('Exiting method, returning value')
        debug.lv5(f'Value = {prettystr(retval)}')
        return retval

    def next_phase(self) -> t.Optional[str]:
        """Return the next phase in the index's ILM journey

        :returns: The name of the nearest phase in the policy that comes after
            the current one, or ``None`` if the index is on ``delete`` or the
            policy has no later phase
        """
        debug.lv2('Starting method...')
        retval = None
        if self._explain.phase == 'delete':
            debug.lv3('Already on "delete" phase. No more phases to advance')
        else:
            curr = self.pnum(self._explain.phase)  # A numeric representation
            # Numbers of any phases in the policy that come after the current one
            numbers = [self.pnum(x) for x in self.policy_phases]
            remaining = [num for num in numbers if num > curr]
            if remaining:  # If any:
                # Take the lowest number rather than the first element, so the
                # result does not depend on the key order of the policy phases
                retval = self.pname(min(remaining))
        debug.lv3('Exiting method, returning value')
        debug.lv5(f'Value = {retval}')
        return retval

    def next_step(
        self,
        phase: t.Optional[str] = None,
        action: t.Optional[str] = None,
        name: t.Optional[str] = None,
    ) -> t.Dict[str, str]:
        """Determine the next ILM step based on the current phase, action, and name

        There is a disconnect between what the ILM Explain API returns and what
        the ILM Move API expects. The former returns a 'step' key, while the
        latter expects a 'name' key. This method returns a dictionary for
        use with the ILM Move API, so that the 'step' key is renamed to 'name'.

        :param phase: The target phase. Falls back to :py:meth:`next_phase` when
            not provided.
        :param action: The target action. Requires ``name`` as well.
        :param name: The target step name. Requires ``action`` as well.

        :returns: A dictionary with a ``phase`` key and, when ``action`` is
            provided, ``action`` and ``name`` keys

        :raises TestbedMisconfig: if only one of ``action``/``name`` is given
        """
        debug.lv2('Starting method...')
        # Exactly one of the pair being None means the other was given alone
        if (action is None) != (name is None):
            debug.lv3('Exiting method, raising exception')
            msg = 'If either action or name is specified, both must be'
            logger.critical(msg)
            raise TestbedMisconfig(msg)
        if not phase:
            debug.lv3('No phase specified. Using next_phase')
            phase = self.next_phase()
            debug.lv5(f'next_phase: {phase}')
        retval = {'phase': phase}
        if action:
            retval['action'] = action
            retval['name'] = name
        debug.lv3('Exiting method, returning value')
        debug.lv5(f'Value = {prettystr(retval)}')
        return retval

    def pnum(self, phase: str) -> int:
        """Map a phase name to a phase number

        :raises KeyError: if ``phase`` is not a known phase name
        """
        debug.lv2('Starting method...')
        retval = self._PHASE_NUMBERS[phase]
        debug.lv3('Exiting method, returning value')
        debug.lv5(f'Value = {retval}')
        return retval

    def pname(self, num: int) -> str:
        """Map a phase number to a phase name

        :raises KeyError: if ``num`` is not a known phase number
        """
        debug.lv2('Starting method...')
        names = {number: phase for phase, number in self._PHASE_NUMBERS.items()}
        retval = names[num]
        debug.lv3('Exiting method, returning value')
        debug.lv5(f'Value = {retval}')
        return retval

    def resolve(self, name: str) -> str:
        """Resolve that we have an index and NOT an alias or a datastream

        :param name: The name to resolve

        :returns: The name of the single index that ``name`` resolved to

        :raises ResultNotExpected: if ``name`` matches an alias or data stream,
            matches no index at all, or matches more than one index
        """
        debug.lv2('Starting method...')
        res = resolver(self.client, name)
        debug.lv5(f'resolver: result = {res}')
        if res['aliases'] or res['data_streams']:
            debug.lv3('Exiting method, raising exception')
            msg = f'{name} is not an index: {res}'
            logger.critical(msg)
            raise ResultNotExpected(msg)
        indices = res['indices']
        if not indices:
            # Without this guard the lookup below fails with a bare IndexError
            debug.lv3('Exiting method, raising exception')
            msg = f'{name} did not resolve to any index: {res}'
            logger.critical(msg)
            raise ResultNotExpected(msg)
        if len(indices) > 1:
            debug.lv3('Exiting method, raising exception')
            msg = f'{name} resolved to multiple indices: {prettystr(indices)}'
            logger.critical(msg)
            raise ResultNotExpected(msg)
        retval = indices[0]['name']
        debug.lv3('Exiting method, returning value')
        debug.lv5(f'Value = {retval}')
        return retval

    def set_debug_tier(self, tier: int) -> None:
        """
        Set the debug tier globally for this module
        """
        debug.set_level(tier)

    def update(self) -> None:
        """Update self._explain with the latest from :py:meth:`get_explain_data`

        :raises NameChanged: passed along unchanged from
            :py:meth:`get_explain_data`
        """
        debug.lv2('Starting method...')
        try:
            debug.lv4('TRY: self._explain = DotMap(self.get_explain_data())')
            self._explain = DotMap(self.get_explain_data())
            debug.lv5(f'Updated explain: {prettystr(self._explain)}')
        except NameChanged as err:
            debug.lv3('Exiting method, raising exception')
            debug.lv3('Passing along upstream exception...')
            debug.lv5(f'Exception = {prettystr(err)}')
            raise  # Bare raise keeps the original traceback intact
        debug.lv3('Exiting method')

    def wait4complete(
        self, pause: float = PAUSE_VALUE, timeout: float = TIMEOUT_VALUE
    ) -> None:
        """Subroutine for waiting for an ILM step to complete

        Returns immediately when the stored ILM explain data already shows both
        ``action`` and ``step`` as ``complete``.

        :param pause: Passed to :py:class:`es_wait.IlmStep` as ``pause``
        :param timeout: Passed to :py:class:`es_wait.IlmStep` as ``timeout``

        :raises TestbedFailure: if the wait raises ``EsWaitFatal`` or
            ``EsWaitTimeout``
        """
        debug.lv2('Starting method...')
        if self._explain.action == 'complete' and self._explain.step == 'complete':
            debug.lv3(
                f'{self.name}: Current step complete: {prettystr(self.current_step())}'
            )
            debug.lv3('Exiting method')
            return
        debug.lv3(
            f'{self.name}: Current step not complete. {prettystr(self.current_step())}'
        )
        kw = {'name': self.name, 'pause': pause, 'timeout': timeout}
        debug.lv5(f'IlmStep args = {prettystr(kw)}')
        step = IlmStep(self.client, **kw)
        try:
            debug.lv4('TRY: Waiting for ILM step to complete')
            step.wait()
            debug.lv3('ILM Step successful. The wait is over')
            time.sleep(1)  # Just to make sure the cluster state has gelled
        except EsWaitFatal as wait:
            debug.lv3('Exiting method, raising exception')
            debug.lv5(f'Exception = {prettystr(wait)}')
            msg = (
                f'{wait.message}. Total elapsed time: {wait.elapsed}. '
                f'Errors: {prettystr(wait.errors)}'
            )
            logger.error(msg)
            raise TestbedFailure(msg) from wait
        except EsWaitTimeout as wait:
            debug.lv3('Exiting method, raising exception')
            debug.lv5(f'Exception = {prettystr(wait)}')
            msg = f'{wait.message}. Total elapsed time: {wait.elapsed}.'
            logger.error(msg)
            raise TestbedFailure(msg) from wait
        debug.lv3('Exiting method')