Coverage for /Users/buh/.pyenv/versions/3.12.9/envs/es-testbed/lib/python3.12/site-packages/es_testbed/ilm.py: 100%
157 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-17 19:30 -0600
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-17 19:30 -0600
1"""ILM Defining Class"""
3import typing as t
4import logging
5from os import getenv
6import time
7from dotmap import DotMap
8from es_wait import IlmPhase, IlmStep
9from es_wait.exceptions import EsWaitFatal, EsWaitTimeout
10from es_testbed.defaults import (
11 PAUSE_ENVVAR,
12 PAUSE_DEFAULT,
13 TIMEOUT_DEFAULT,
14 TIMEOUT_ENVVAR,
15)
16from es_testbed.exceptions import (
17 NameChanged,
18 ResultNotExpected,
19 TestbedMisconfig,
20 TestbedFailure,
21)
22from es_testbed.helpers.es_api import get_ilm_phases, ilm_explain, ilm_move, resolver
23from es_testbed.helpers.utils import prettystr
25if t.TYPE_CHECKING:
26 from elasticsearch8 import Elasticsearch
# Module-wide wait settings. Each is read once at import time from the
# environment variable named by the *_ENVVAR constant, falling back to the
# matching *_DEFAULT, and coerced to float. They are used below as the
# ``pause`` and ``timeout`` arguments handed to the es_wait waiters.
PAUSE_VALUE = float(getenv(PAUSE_ENVVAR, PAUSE_DEFAULT))
TIMEOUT_VALUE = float(getenv(TIMEOUT_ENVVAR, TIMEOUT_DEFAULT))

# All messages from this module are emitted under a fixed logger name
logger = logging.getLogger('es_testbed.IlmTracker')
33# ## Example ILM explain output
34# {
35# 'action': 'complete',
36# 'action_time_millis': 0,
37# 'age': '5.65m',
38# 'index': 'INDEX_NAME',
39# 'index_creation_date_millis': 0,
40# 'lifecycle_date_millis': 0,
41# 'managed': True,
42# 'phase': 'hot',
43# 'phase_execution': {
44# 'modified_date_in_millis': 0,
45# 'phase_definition': {
46# 'actions': {
47# 'rollover': {
48# 'max_age': 'MAX_AGE',
49# 'max_primary_shard_docs': 1000,
50# 'max_primary_shard_size': 'MAX_SIZE',
51# 'min_docs': 1
52# }
53# },
54# 'min_age': '0ms'
55# },
56# 'policy': 'POLICY_NAME',
57# 'version': 1
58# },
59# 'phase_time_millis': 0,
60# 'policy': 'POLICY_NAME',
61# 'step': 'complete',
62# 'step_time_millis': 0,
63# 'time_since_index_creation': '5.65m'
64# }
class IlmTracker:
    """ILM Phase Tracking Class

    Tracks a single index (never an alias or data stream) through the phases
    of its ILM policy. The most recent ILM Explain output for the index is
    cached in ``self._explain`` and refreshed by :py:meth:`update`.
    """

    # Ordinal position of each ILM phase, used to compare phases by order
    _PHASE_NUMBERS: t.Dict[str, int] = {
        'new': 0,
        'hot': 1,
        'warm': 2,
        'cold': 3,
        'frozen': 4,
        'delete': 5,
    }
    # Reverse lookup of _PHASE_NUMBERS
    _PHASE_NAMES: t.Dict[int, str] = {
        0: 'new',
        1: 'hot',
        2: 'warm',
        3: 'cold',
        4: 'frozen',
        5: 'delete',
    }

    def __init__(self, client: 'Elasticsearch', name: str):
        """Resolve ``name`` to a single index and load its ILM state

        :param client: The Elasticsearch client used for every API call
        :param name: The name to track. It must resolve to exactly one index.
        :raises ResultNotExpected: if ``name`` is not a single index (see
            :py:meth:`resolve`)
        """
        self.client = client
        self.name = self.resolve(name)  # A single index name
        self._explain = DotMap(self.get_explain_data())
        self._phases = get_ilm_phases(self.client, self._explain.policy)

    @property
    def explain(self) -> DotMap:
        """Return the current stored value of ILM Explain"""
        return self._explain

    @property
    def policy_phases(self) -> t.Sequence[str]:
        """Return a list of phases in the ILM policy"""
        return list(self._phases.keys())

    def _log_phase(self, phase: str) -> None:
        """Log that the tracked index has reached ``phase``"""
        logger.debug(f'ILM Explain Index: {self._explain.index}')
        logger.info(f'Index "{self.name}" now on phase "{phase}"')

    def _wait_or_fail(self, waiter: t.Any) -> None:
        """Call ``waiter.wait()`` and re-raise es_wait errors as TestbedFailure

        :param waiter: An es_wait object exposing ``wait()`` (an ``IlmPhase``
            or ``IlmStep`` instance in this class)
        :raises TestbedFailure: if the wait raises ``EsWaitFatal`` or
            ``EsWaitTimeout``. The original exception is chained as the cause.
        """
        try:
            waiter.wait()
        except EsWaitFatal as wait:
            msg = (
                f'{wait.message}. Total elapsed time: {wait.elapsed}. '
                f'Errors: {prettystr(wait.errors)}'
            )
            logger.error(msg)
            raise TestbedFailure(msg) from wait
        except EsWaitTimeout as wait:
            msg = f'{wait.message}. Total elapsed time: {wait.elapsed}.'
            logger.error(msg)
            raise TestbedFailure(msg) from wait

    def _phase_wait(
        self, phase: str, pause: float = PAUSE_VALUE, timeout: float = TIMEOUT_VALUE
    ) -> None:
        """Wait until the new phase shows up in ILM Explain

        :param phase: The phase to wait for
        :param pause: Passed to ``IlmPhase`` as its ``pause`` argument
        :param timeout: Passed to ``IlmPhase`` as its ``timeout`` argument
        :raises TestbedFailure: if the wait fails or times out
        """
        phasechk = IlmPhase(
            self.client, name=self.name, phase=phase, pause=pause, timeout=timeout
        )
        self._wait_or_fail(phasechk)

    def _ssphz(self, phase: str) -> bool:
        """Return True if the phase is for searchable snapshots (> 'warm')"""
        return self.pnum(phase) > self.pnum('warm')

    def advance(
        self,
        phase: t.Optional[str] = None,
        action: t.Optional[str] = None,
        name: t.Optional[str] = None,
    ) -> None:
        """Advance index to next ILM phase

        :param phase: The target phase. If omitted, the next phase in the
            policy after the current one is used.
        :param action: Optional target action for the ILM move. Must be given
            together with ``name``.
        :param name: Optional target step name for the ILM move. Must be given
            together with ``action``.
        :raises TestbedMisconfig: if only one of ``action``/``name`` is given
        :raises TestbedFailure: if waiting for a phase or step fails
        """
        if self._explain.phase == 'delete':
            logger.warning('Already on "delete" phase. No more phases to advance')
            return

        logger.debug(f'current_step: {prettystr(self.current_step())}')
        next_step = self.next_step(phase, action=action, name=name)
        logger.debug(f'next_step: {prettystr(next_step)}')
        # next_step() substitutes next_phase() when no phase was supplied. Use
        # the resolved value from here on so the comparisons and waits below
        # never operate on None.
        phase = next_step['phase']
        if phase is None:
            logger.warning('No later phase found in the ILM policy. Nothing to do')
            return

        if self._explain.phase == 'new' and phase == 'hot':
            # It won't be for very long.
            self._phase_wait('hot')
            time.sleep(1)  # Just to make sure the new index is ready

        # Regardless of the remaining phases, the current phase steps must be
        # complete before proceeding with ilm_move
        self.update()
        self.wait4complete()
        self.update()

        # We could have arrived with it hot, but incomplete
        if phase == 'hot':
            self._log_phase(phase)
            # we've advanced to our target phase, and all steps are completed
            return

        # Remaining phases could be warm through frozen
        if self._explain.phase != phase:
            logger.debug(f'Current phase: {self.explain.phase}')
            logger.debug(f'Advancing to "{phase}" phase...')
            # We will only wait for steps to complete for the hot and warm tiers
            wait4steps = not self._ssphz(phase)
            ilm_move(self.client, self.name, self.current_step(), next_step)
            # Let the cluster catch up before proceeding.
            time.sleep(1)  # Just to make sure the cluster state has gelled
            self._phase_wait(phase)
            # If cold or frozen, we can return now. We let the calling function
            # worry about the weird name changing behavior of searchable mounts
            if wait4steps:
                self.update()
                logger.debug(f'Waiting for "{phase}" phase steps to complete...')
                self.wait4complete()
                self.update()
            self._log_phase(phase)
        else:
            logger.debug(f'Already on "{phase}" phase. No need to advance')
            logger.debug(f'current_step: {prettystr(self.current_step())}')

    def current_step(self) -> t.Dict[str, str]:
        """Return the current ILM step information

        There is a disconnect between what the ILM Explain API returns and what
        the ILM Move API expects. The former returns a 'step' key, while the
        latter expects a 'name' key. This method returns a dictionary for
        use with the ILM Move API, so that the 'step' key is renamed to 'name'.
        """
        return {
            'phase': self._explain.phase,
            'action': self._explain.action,
            'name': self._explain.step,
        }

    def get_explain_data(self) -> t.Dict:
        """Get the ILM explain data and return it

        :raises NameChanged: passed along unchanged from ``ilm_explain``
        :raises ResultNotExpected: passed along from ``ilm_explain`` after
            being logged at critical level
        """
        try:
            return ilm_explain(self.client, self.name)
        except NameChanged:
            logger.debug('Passing along upstream exception...')
            raise
        except ResultNotExpected as err:
            msg = f'Unable to get ilm_explain results. Error: {prettystr(err)}'
            logger.critical(msg)
            raise

    def next_phase(self) -> t.Optional[str]:
        """Return the next phase in the index's ILM journey

        :returns: The name of the earliest policy phase that comes after the
            current one, or ``None`` if the index is on 'delete' or the policy
            has no later phase.
        """
        if self._explain.phase == 'delete':
            logger.warning('Already on "delete" phase. No more phases to advance')
            return None
        curr = self.pnum(self._explain.phase)  # A numeric representation
        # Numbers of any remaining phases in the policy that are higher than
        # the current one
        remaining = [self.pnum(x) for x in self.policy_phases if self.pnum(x) > curr]
        if not remaining:
            return None
        # Take the lowest number rather than the first element so the result
        # does not depend on the key order of the policy's phases mapping
        return self.pname(min(remaining))

    def next_step(
        self,
        phase: t.Optional[str] = None,
        action: t.Optional[str] = None,
        name: t.Optional[str] = None,
    ) -> t.Dict[str, str]:
        """Determine the next ILM step based on the current phase, action, and name

        There is a disconnect between what the ILM Explain API returns and what
        the ILM Move API expects. The former returns a 'step' key, while the
        latter expects a 'name' key. This method returns a dictionary for
        use with the ILM Move API, so that the 'step' key is renamed to 'name'.

        :param phase: The target phase. If falsy, :py:meth:`next_phase` is used.
        :param action: The target action. Requires ``name``.
        :param name: The target step name. Requires ``action``.
        :returns: A dict with a 'phase' key, plus 'action' and 'name' keys when
            an action was given
        :raises TestbedMisconfig: if only one of ``action``/``name`` is given
        """
        # action and name are only meaningful as a pair
        if (action is None) != (name is None):
            msg = 'If either action or name is specified, both must be'
            logger.critical(msg)
            raise TestbedMisconfig(msg)
        if not phase:
            logger.debug('No phase specified. Using next_phase')
            phase = self.next_phase()
            logger.debug(f'next_phase: {phase}')
        retval = {'phase': phase}
        if action:
            retval['action'] = action
            retval['name'] = name
        return retval

    def pnum(self, phase: str) -> int:
        """Map a phase name to a phase number

        :raises KeyError: if ``phase`` is not a known phase name
        """
        return self._PHASE_NUMBERS[phase]

    def pname(self, num: int) -> str:
        """Map a phase number to a phase name

        :raises KeyError: if ``num`` is not a known phase number
        """
        return self._PHASE_NAMES[num]

    def resolve(self, name: str) -> str:
        """Resolve that we have an index and NOT an alias or a datastream

        :param name: The name to resolve
        :returns: The name of the single index that ``name`` resolved to
        :raises ResultNotExpected: if ``name`` resolves to any alias or data
            stream, to more than one index, or to no index at all
        """
        res = resolver(self.client, name)
        if len(res['aliases']) > 0 or len(res['data_streams']) > 0:
            msg = f'{name} is not an index: {res}'
            logger.critical(msg)
            raise ResultNotExpected(msg)
        if len(res['indices']) > 1:
            msg = f'{name} resolved to multiple indices: {prettystr(res["indices"])}'
            logger.critical(msg)
            raise ResultNotExpected(msg)
        if not res['indices']:
            # Without this guard the lookup below would raise a bare IndexError
            msg = f'{name} did not resolve to any index: {res}'
            logger.critical(msg)
            raise ResultNotExpected(msg)
        return res['indices'][0]['name']

    def update(self) -> None:
        """Update self._explain with the latest from :py:meth:`get_explain_data`

        :raises NameChanged: passed along unchanged from
            :py:meth:`get_explain_data`
        """
        try:
            self._explain = DotMap(self.get_explain_data())
        except NameChanged:
            logger.debug('Passing along upstream exception...')
            raise

    def wait4complete(self) -> None:
        """Subroutine for waiting for an ILM step to complete

        Returns immediately if the cached explain data already shows both the
        action and the step as 'complete'. Otherwise waits on an ``IlmStep``.

        :raises TestbedFailure: if the wait fails or times out
        """
        action_done = self._explain.action == 'complete'
        step_done = self._explain.step == 'complete'
        if action_done and step_done:
            logger.debug(
                f'{self.name}: Current step complete: {prettystr(self.current_step())}'
            )
            return
        logger.debug(
            f'{self.name}: Current step not complete. {prettystr(self.current_step())}'
        )
        step = IlmStep(
            self.client, name=self.name, pause=PAUSE_VALUE, timeout=TIMEOUT_VALUE
        )
        self._wait_or_fail(step)
        logger.debug('ILM Step successful. The wait is over')
        time.sleep(1)  # Just to make sure the cluster state has gelled