Coverage for /Users/buh/.pyenv/versions/3.12.9/envs/es-testbed/lib/python3.12/site-packages/es_testbed/ilm.py: 100%
217 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-04-21 21:08 -0600
« prev ^ index » next coverage.py v7.6.12, created at 2025-04-21 21:08 -0600
1"""ILM Defining Class"""
3import typing as t
4import logging
5from os import getenv
6import time
7from dotmap import DotMap
8from es_wait import IlmPhase, IlmStep
9from es_wait.exceptions import EsWaitFatal, EsWaitTimeout
10from .debug import debug, begin_end
11from .defaults import (
12 PAUSE_ENVVAR,
13 PAUSE_DEFAULT,
14 TIMEOUT_DEFAULT,
15 TIMEOUT_ENVVAR,
16)
17from .exceptions import (
18 NameChanged,
19 ResultNotExpected,
20 TestbedMisconfig,
21 TestbedFailure,
22)
23from .es_api import get_ilm_phases, ilm_explain, ilm_move, resolver
24from .utils import prettystr
26if t.TYPE_CHECKING:
27 from elasticsearch8 import Elasticsearch
# Module logger. The logger name is a fixed string rather than ``__name__``.
logger = logging.getLogger("es_testbed.IlmTracker")

# Default "pause" and "timeout" values handed to the es_wait checkers used in
# this module. Each one is read from its environment variable, falls back to the
# packaged default when the variable is unset, and is coerced to ``float``.
PAUSE_VALUE = float(getenv(PAUSE_ENVVAR, PAUSE_DEFAULT))
TIMEOUT_VALUE = float(getenv(TIMEOUT_ENVVAR, TIMEOUT_DEFAULT))
# ## Example ILM explain output
# {
#     'action': 'complete',
#     'action_time_millis': 0,
#     'age': '5.65m',
#     'index': 'INDEX_NAME',
#     'index_creation_date_millis': 0,
#     'lifecycle_date_millis': 0,
#     'managed': True,
#     'phase': 'hot',
#     'phase_execution': {
#         'modified_date_in_millis': 0,
#         'phase_definition': {
#             'actions': {
#                 'rollover': {
#                     'max_age': 'MAX_AGE',
#                     'max_primary_shard_docs': 1000,
#                     'max_primary_shard_size': 'MAX_SIZE',
#                     'min_docs': 1
#                 }
#             },
#             'min_age': '0ms'
#         },
#         'policy': 'POLICY_NAME',
#         'version': 1
#     },
#     'phase_time_millis': 0,
#     'policy': 'POLICY_NAME',
#     'step': 'complete',
#     'step_time_millis': 0,
#     'time_since_index_creation': '5.65m'
# }
class IlmTracker:
    """ILM Phase Tracking Class

    Resolves ``name`` to a single concrete index, keeps the most recently
    fetched ILM Explain output for that index in ``self._explain``, and provides
    helpers to move the index from one ILM phase to another while waiting for
    the phase and step changes to show up.
    """

    # Ordinal position of every ILM phase. A larger number is a later phase.
    _PHASE_NUMBERS: t.Dict[str, int] = {
        "new": 0,
        "hot": 1,
        "warm": 2,
        "cold": 3,
        "frozen": 4,
        "delete": 5,
    }
    # Reverse lookup, derived so the two tables cannot drift apart.
    _PHASE_NAMES: t.Dict[int, str] = {
        num: phase for phase, num in _PHASE_NUMBERS.items()
    }

    def __init__(self, client: "Elasticsearch", name: str):
        """Resolve the index name and load its ILM explain data and policy phases

        :param client: The Elasticsearch client used for every API call
        :param name: A name which must resolve to exactly one index

        :raises ResultNotExpected: if ``name`` does not resolve to exactly one
            index (see :py:meth:`resolve`)
        """
        debug.lv2("Initializing IlmTracker object...")
        self.client = client
        self.name = self.resolve(name)  # A single index name
        self._explain = DotMap(self.get_explain_data())
        self._phases = get_ilm_phases(self.client, self._explain.policy)
        debug.lv3("IlmTracker object initialized")

    @property
    def explain(self) -> DotMap:
        """Return the current stored value of ILM Explain"""
        return self._explain

    @property
    def policy_phases(self) -> t.Sequence[str]:
        """Return a list of phases in the ILM policy"""
        return list(self._phases.keys())

    def _log_phase(self, phase: str) -> None:
        """Emit debug messages announcing that the index is now on ``phase``"""
        debug.lv3(f"ILM Explain Index: {self._explain.index}")
        debug.lv2(f'Index "{self.name}" now on phase "{phase}"')

    @begin_end()
    def _phase_wait(
        self, phase: str, pause: float = PAUSE_VALUE, timeout: float = TIMEOUT_VALUE
    ) -> None:
        """Wait until the new phase shows up in ILM Explain

        :param phase: The phase to wait for
        :param pause: Passed to :py:class:`IlmPhase` as ``pause``
        :param timeout: Passed to :py:class:`IlmPhase` as ``timeout``

        :raises TestbedFailure: if the wait fails fatally or times out
        """
        kw = {"name": self.name, "phase": phase, "pause": pause, "timeout": timeout}
        debug.lv5(f"Waiting for phase args = {prettystr(kw)}")
        phasechk = IlmPhase(self.client, **kw)
        try:
            debug.lv4("TRY: Waiting for ILM phase to complete")
            phasechk.wait()
        except EsWaitFatal as wait:
            msg = (
                f"{wait.message}. Total elapsed time: {wait.elapsed}. "
                f"Errors: {prettystr(wait.errors)}"
            )
            logger.error(msg)
            raise TestbedFailure(msg) from wait
        except EsWaitTimeout as wait:
            msg = f"{wait.message}. Total elapsed time: {wait.elapsed}."
            logger.error(msg)
            raise TestbedFailure(msg) from wait

    @begin_end()
    def _ssphz(self, phase: str) -> bool:
        """Return True if the phase is for searchable snapshots (> 'warm')"""
        retval = self.pnum(phase) > self.pnum("warm")
        debug.lv5(f"Return value = {retval}")
        return retval

    @begin_end()
    def advance(
        self,
        phase: t.Optional[str] = None,
        action: t.Optional[str] = None,
        name: t.Optional[str] = None,
    ) -> None:
        """Advance index to next ILM phase

        :param phase: The phase to advance to. When omitted, the phase chosen by
            :py:meth:`next_step` (i.e. :py:meth:`next_phase`) is used.
        :param action: Optional ILM action for the target step. Must be given
            together with ``name``.
        :param name: Optional ILM step name for the target step. Must be given
            together with ``action``.

        :raises TestbedMisconfig: if only one of ``action``/``name`` is given
        :raises TestbedFailure: if waiting for a phase or step fails or times out
        """
        if self._explain.phase == "delete":
            debug.lv1('Already on "delete" phase. No more phases to advance')
            return

        debug.lv3(f"current_step: {prettystr(self.current_step())}")
        next_step = self.next_step(phase, action=action, name=name)
        debug.lv3(f"next_step: {prettystr(next_step)}")
        # next_step() fills in the target phase when the caller did not supply
        # one. Adopt that value here, otherwise every comparison below would be
        # made against None and pnum(None) would raise KeyError.
        phase = next_step["phase"]
        if phase is None:
            debug.lv1("No later phase found in the ILM policy. Nothing to advance to")
            return

        if self._explain.phase == "new" and phase == "hot":
            debug.lv3('Phase is "new" and is still advancing to "hot"')
            # It won't be for very long.
            debug.lv5('Waiting for phase to fully reach "hot"...')
            self._phase_wait("hot")
            time.sleep(1)  # Just to make sure the new index is ready

        # Regardless of the remaining phases, the current phase steps must be
        # complete before proceeding with ilm_move
        debug.lv5("Running self.update()...")
        self.update()
        debug.lv5("Running self.wait4complete()...")
        self.wait4complete()
        debug.lv5("Running self.update()...")
        self.update()

        # We could have arrived with it hot, but incomplete
        if phase == "hot":
            self._log_phase(phase)
            debug.lv5('Phase "hot" reached, and all steps are completed')
            return

        # Remaining phases could be warm through frozen
        if self._explain.phase != phase:
            debug.lv5(f'"{self._explain.phase}" != "{phase}"')
            debug.lv3(f"Current phase: {self.explain.phase}")
            debug.lv3(f'Advancing to "{phase}" phase...')
            # We will only wait for steps to complete for the hot and warm tiers
            debug.lv5("Waiting for steps for non-cold/frozen phases to complete")
            wait4steps = not self._ssphz(phase)
            debug.lv5("Running ilm_move()...")
            ilm_move(self.client, self.name, self.current_step(), next_step)
            # Let the cluster catch up before proceeding.
            time.sleep(1)  # Just to make sure the cluster state has gelled
            debug.lv5("Running self._phase_wait()...")
            self._phase_wait(phase)
            # If cold or frozen, we can return now. We let the calling function
            # worry about the weird name changing behavior of searchable mounts
            debug.lv5("Checking if wait4steps is True")
            if wait4steps:
                debug.lv5("Running self.update()...")
                self.update()
                debug.lv3(f'Waiting for "{phase}" phase steps to complete...')
                debug.lv5("Running self.wait4complete()...")
                self.wait4complete()
                debug.lv5("Running self.update()...")
                self.update()
                self._log_phase(phase)
        else:
            debug.lv3(f'Already on "{phase}" phase. No need to advance')
            debug.lv5(f"current_step: {prettystr(self.current_step())}")

    @begin_end()
    def current_step(self) -> t.Dict[str, str]:
        """Return the current ILM step information

        There is a disconnect between what the ILM Explain API returns and what
        the ILM Move API expects. The former returns a 'step' key, while the
        latter expects a 'name' key. This method returns a dictionary for
        use with the ILM Move API, so that the 'step' key is renamed to 'name'.
        """
        retval = {
            "phase": self._explain.phase,
            "action": self._explain.action,
            "name": self._explain.step,
        }
        debug.lv5(f"Return value = {prettystr(retval)}")
        return retval

    @begin_end()
    def get_explain_data(self) -> t.Dict:
        """Get the ILM explain data and return it

        :raises NameChanged: passed along unchanged from :py:func:`ilm_explain`
        :raises ResultNotExpected: passed along from :py:func:`ilm_explain`
            after being logged at critical level
        """
        try:
            debug.lv4("TRY: Getting ILM explain data")
            retval = ilm_explain(self.client, self.name)
        except NameChanged as err:
            debug.lv3("Exiting method, raising exception")
            debug.lv5(f"Exception = {prettystr(err)}")
            debug.lv3("Apparent name change. Passing along upstream exception...")
            raise
        except ResultNotExpected as err:
            debug.lv3("Exiting method, raising exception")
            msg = f"Unable to get ilm_explain results. Error: {prettystr(err)}"
            logger.critical(msg)
            raise
        debug.lv5(f"Return value = {prettystr(retval)}")
        return retval

    @begin_end()
    def next_phase(self) -> t.Optional[str]:
        """Return the next phase in the index's ILM journey

        :returns: The name of the earliest policy phase that comes after the
            current phase, or ``None`` when the index is already on "delete" or
            the policy has no later phase.
        """
        retval = None
        if self._explain.phase == "delete":
            debug.lv3('Already on "delete" phase. No more phases to advance')
        else:
            curr = self.pnum(self._explain.phase)  # A numeric representation
            # Numbers of any phases in the policy that come after the current one
            remaining = [
                num for num in map(self.pnum, self.policy_phases) if num > curr
            ]
            if remaining:
                # Take the lowest number rather than the first element, so the
                # result does not depend on the order of the policy's keys
                retval = self.pname(min(remaining))
        debug.lv5(f"Return value = {retval}")
        return retval

    @begin_end()
    def next_step(
        self,
        phase: t.Optional[str] = None,
        action: t.Optional[str] = None,
        name: t.Optional[str] = None,
    ) -> t.Dict[str, str]:
        """Determine the next ILM step based on the current phase, action, and name

        There is a disconnect between what the ILM Explain API returns and what
        the ILM Move API expects. The former returns a 'step' key, while the
        latter expects a 'name' key. This method returns a dictionary for
        use with the ILM Move API, so that the 'step' key is renamed to 'name'.

        :param phase: Target phase. Falls back to :py:meth:`next_phase` when
            not provided.
        :param action: Target action. Only valid together with ``name``.
        :param name: Target step name. Only valid together with ``action``.

        :raises TestbedMisconfig: if only one of ``action``/``name`` is given
        """
        # action and name are all-or-nothing: exactly one of them being set
        # is a misconfiguration
        if (action is None) != (name is None):
            debug.lv3("Exiting method, raising exception")
            msg = "If either action or name is specified, both must be"
            logger.critical(msg)
            raise TestbedMisconfig(msg)
        if not phase:
            debug.lv3("No phase specified. Using next_phase")
            phase = self.next_phase()
            debug.lv5(f"next_phase: {phase}")
        retval = {"phase": phase}
        if action:
            retval["action"] = action
            retval["name"] = name
        debug.lv5(f"Return value = {prettystr(retval)}")
        return retval

    @begin_end()
    def pnum(self, phase: str) -> int:
        """Map a phase name to a phase number

        :raises KeyError: if ``phase`` is not a known ILM phase name
        """
        retval = self._PHASE_NUMBERS[phase]
        debug.lv5(f"Return value = {retval}")
        return retval

    @begin_end()
    def pname(self, num: int) -> str:
        """Map a phase number to a phase name

        :raises KeyError: if ``num`` is not a known ILM phase number
        """
        retval = self._PHASE_NAMES[num]
        debug.lv5(f"Return value = {retval}")
        return retval

    @begin_end()
    def resolve(self, name: str) -> str:
        """Resolve that we have an index and NOT an alias or a datastream

        :param name: The name to resolve
        :returns: The name of the single index that ``name`` resolved to

        :raises ResultNotExpected: if ``name`` resolves to an alias, a data
            stream, more than one index, or no index at all
        """
        res = resolver(self.client, name)
        debug.lv5(f"resolver: result = {res}")
        if res["aliases"] or res["data_streams"]:
            debug.lv3("Exiting method, raising exception")
            msg = f"{name} is not an index: {res}"
            logger.critical(msg)
            raise ResultNotExpected(msg)
        indices = res["indices"]
        if len(indices) > 1:
            debug.lv3("Exiting method, raising exception")
            msg = f'{name} resolved to multiple indices: {prettystr(res["indices"])}'
            logger.critical(msg)
            raise ResultNotExpected(msg)
        if not indices:
            # Without this guard the lookup below fails with a bare IndexError
            debug.lv3("Exiting method, raising exception")
            msg = f"{name} did not resolve to any index: {res}"
            logger.critical(msg)
            raise ResultNotExpected(msg)
        retval = indices[0]["name"]
        debug.lv5(f"Return value = {retval}")
        return retval

    @begin_end()
    def update(self) -> None:
        """Update self._explain with the latest from :py:meth:`get_explain_data`

        :raises NameChanged: passed along unchanged from
            :py:meth:`get_explain_data`
        """
        try:
            debug.lv4("TRY: self._explain = DotMap(self.get_explain_data())")
            self._explain = DotMap(self.get_explain_data())
            debug.lv5(f"Updated explain: {prettystr(self._explain)}")
        except NameChanged as err:
            debug.lv3("Exiting method, raising exception")
            debug.lv3("Passing along upstream exception...")
            debug.lv5(f"Exception = {prettystr(err)}")
            raise

    @begin_end()
    def wait4complete(
        self, pause: float = PAUSE_VALUE, timeout: float = TIMEOUT_VALUE
    ) -> None:
        """Subroutine for waiting for an ILM step to complete

        Returns immediately when the stored explain data already shows both the
        action and the step as "complete".

        :param pause: Passed to :py:class:`IlmStep` as ``pause``
        :param timeout: Passed to :py:class:`IlmStep` as ``timeout``

        :raises TestbedFailure: if the wait fails fatally or times out
        """
        if self._explain.action == "complete" and self._explain.step == "complete":
            debug.lv3(
                f"{self.name}: Current step complete: {prettystr(self.current_step())}"
            )
            return
        debug.lv3(
            f"{self.name}: Current step not complete. {prettystr(self.current_step())}"
        )
        kw = {"name": self.name, "pause": pause, "timeout": timeout}
        debug.lv5(f"IlmStep args = {prettystr(kw)}")
        step = IlmStep(self.client, **kw)
        try:
            debug.lv4("TRY: Waiting for ILM step to complete")
            step.wait()
            debug.lv3("ILM Step successful. The wait is over")
            time.sleep(1)  # Just to make sure the cluster state has gelled
        except EsWaitFatal as wait:
            debug.lv3("Exiting method, raising exception")
            debug.lv5(f"Exception = {prettystr(wait)}")
            msg = (
                f"{wait.message}. Total elapsed time: {wait.elapsed}. "
                f"Errors: {prettystr(wait.errors)}"
            )
            logger.error(msg)
            raise TestbedFailure(msg) from wait
        except EsWaitTimeout as wait:
            debug.lv3("Exiting method, raising exception")
            debug.lv5(f"Exception = {prettystr(wait)}")
            msg = f"{wait.message}. Total elapsed time: {wait.elapsed}."
            logger.error(msg)
            raise TestbedFailure(msg) from wait