Coverage for /Users/eugene/Development/legion-utils/legion_utils/instrumentation.py: 66%
121 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-01 20:15 -0400
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-01 20:15 -0400
1from abc import abstractmethod, ABC
2from functools import wraps
3from itertools import count
4from random import randint
5from threading import Event
6from time import sleep
7from traceback import format_exc
8from typing import Callable, Union, Optional, Dict, Any, List
10from robotnikmq import RobotnikConfig
12from legion_utils import WarningMsg, ErrorMsg, CriticalMsg, broadcast_alert_msg
13from legion_utils.core import HOSTNAME
class Alerter(ABC):
    """Base class for objects that broadcast alerts about one task.

    An instance remembers where alerts go (``exchange`` / ``route``), the
    optional connection ``config`` handed to ``broadcast_alert_msg``, and the
    TTL to fall back on when a broadcast call does not supply its own.
    Subclasses provide :meth:`key`, which builds the alert key for a task.
    """

    def __init__(self,
                 task_id: str,
                 exchange: str,
                 route: str,
                 default_ttl: int,
                 config: Optional[RobotnikConfig] = None):
        """Store the alert destination and defaults.

        Args:
            task_id: Identifier of the task the alerts are about; it is passed
                to :meth:`key` on every broadcast.
            exchange: Exchange name forwarded to ``broadcast_alert_msg``.
            route: Route forwarded to ``broadcast_alert_msg``.
            default_ttl: TTL used whenever a broadcast is made without a
                (truthy) ``ttl`` argument.
            config: Optional configuration forwarded to
                ``broadcast_alert_msg``; ``None`` is passed through as-is.
        """
        self.task_id = task_id
        self.exchange = exchange
        self.route = route
        self.config = config
        self.default_ttl = default_ttl

    @abstractmethod
    def key(self, task_id: str) -> List[str]:
        """Return the alert key (a list of strings) for ``task_id``."""
        pass  # pragma: no cover

    def _broadcast(self,
                   msg_type: Callable[..., Any],
                   contents: Dict[str, Any],
                   desc: str,
                   ttl: Optional[int]) -> None:
        """Build an alert with ``msg_type`` and broadcast it.

        Shared implementation behind the three public ``broadcast_*`` methods,
        which differ only in the message class they use.

        Args:
            msg_type: Alert class (or factory) called with ``contents``,
                ``key``, ``desc`` and ``ttl`` keyword arguments.
            contents: Payload of the alert.
            desc: Human-readable description of the alert.
            ttl: TTL for this alert; a falsy value (``None`` or ``0``) selects
                ``self.default_ttl``.
        """
        broadcast_alert_msg(exchange=self.exchange,
                            route=self.route,
                            config=self.config,
                            alert=msg_type(contents=contents,
                                           key=self.key(self.task_id),
                                           desc=desc,
                                           ttl=(ttl or self.default_ttl)))

    def broadcast_warning(self, contents: Dict[str, Any],
                          desc: str,
                          ttl: Optional[int] = None) -> None:
        """Broadcast ``contents`` as a ``WarningMsg`` alert for this task."""
        self._broadcast(WarningMsg, contents, desc, ttl)

    def broadcast_error(self, contents: Dict[str, Any],
                        desc: str,
                        ttl: Optional[int] = None) -> None:
        """Broadcast ``contents`` as an ``ErrorMsg`` alert for this task."""
        self._broadcast(ErrorMsg, contents, desc, ttl)

    def broadcast_critical(self, contents: Dict[str, Any],
                           desc: str,
                           ttl: Optional[int] = None) -> None:
        """Broadcast ``contents`` as a ``CriticalMsg`` alert for this task."""
        self._broadcast(CriticalMsg, contents, desc, ttl)
class Runner(Alerter):
    """Abstract alerter that wraps a callable and can be told to stop.

    Adds a ``halt_flag`` event to :class:`Alerter` and declares the
    decorator-style ``__call__`` that concrete runners implement.
    """

    def __init__(self, task_id: str, exchange: str, route: str, default_ttl: int,
                 halt_flag: Optional[Event] = None, config: Optional[RobotnikConfig] = None):
        """Initialise the alerter state and the halt flag.

        Args:
            task_id: Identifier of the task being run.
            exchange: Exchange name used for alerts.
            route: Route used for alerts.
            default_ttl: TTL used for alerts that do not specify one.
            halt_flag: Event that implementations check to know when to stop;
                a fresh, unset ``Event`` is created when omitted.
            config: Optional configuration forwarded to the alert broadcaster.
        """
        super().__init__(task_id=task_id, exchange=exchange, route=route, default_ttl=default_ttl, config=config)
        self.halt_flag = halt_flag or Event()

    @abstractmethod
    def __call__(self, func: Callable[[], None]) -> Callable[[], None]:
        """Wrap ``func`` and return the wrapper.

        The return annotation matches the concrete runners in this module,
        which all hand back a zero-argument callable rather than ``None``.
        """
        pass  # pragma: no cover
class Service(Runner):
    """Decorator that keeps relaunching a callable and alerts every time it stops.

    ``Service(...)(func)`` returns a zero-argument wrapper. When invoked, the
    wrapper sleeps for the start delay and then calls ``func`` over and over
    until ``halt_flag`` is set. Each time ``func`` stops (by returning or by
    raising) an alert may be broadcast, with a severity chosen from how many
    launches have happened so far, followed by a jittered relaunch delay.
    """

    def __init__(self,
                 task_id: str,
                 exchange: str,
                 route: str,
                 ttl: Optional[int] = None,
                 start_delay: Union[int, Callable[[], int], None] = None,
                 relaunch_delay: Optional[int] = None,
                 jitter: Optional[int] = None,
                 warn_after_attempts: Union[int, float, None] = None,
                 error_after_attempts: Union[int, float, None] = None,
                 critical_after_attempts: Union[int, float, None] = None,
                 halt_flag: Optional[Event] = None,
                 config: Optional[RobotnikConfig] = None):
        """Configure delays and alert thresholds.

        Args:
            task_id: Identifier of the service, used in alert keys and text.
            exchange: Exchange name used for alerts.
            route: Route used for alerts.
            ttl: Default alert TTL; a falsy value selects 120.
            start_delay: Seconds to sleep before the first launch, or a
                callable returning that number. Defaults to a random integer
                between 0 and ``relaunch_delay``.
            relaunch_delay: Base number of seconds between launches
                (default 30).
            jitter: Maximum random offset, in either direction, added to
                ``relaunch_delay`` (default 3).
            warn_after_attempts: Launch count from which warnings are sent;
                a falsy value means never.
            error_after_attempts: Launch count from which errors are sent.
                When falsy it becomes 1 if ``warn_after_attempts`` is ``None``
                and "never" otherwise.
            critical_after_attempts: Launch count from which criticals are
                sent; a falsy value means never.
            halt_flag: Event that stops the relaunch loop once set.
            config: Optional configuration forwarded to the alert broadcaster.
        """
        super().__init__(task_id=task_id,
                         exchange=exchange,
                         route=route,
                         default_ttl=(ttl or 120),
                         halt_flag=halt_flag,
                         config=config)
        self.relaunch_delay = 30 if relaunch_delay is None else relaunch_delay
        if start_delay is None:
            # Read relaunch_delay lazily so a later change to it is honoured.
            self.start_delay = (lambda: randint(0, self.relaunch_delay))
        else:
            self.start_delay = start_delay
        self.jitter = 3 if jitter is None else jitter

        never = float('inf')
        self.warn_after_attempts = warn_after_attempts or never
        if error_after_attempts:
            self.error_after_attempts = error_after_attempts
        elif warn_after_attempts is None:
            # Nothing configured at all: report an error from the first stop.
            self.error_after_attempts = 1
        else:
            self.error_after_attempts = never
        self.critical_after_attempts = critical_after_attempts or never

    @property
    def _start_delay(self) -> int:
        """Resolve ``start_delay``, calling it when it is a callable."""
        if callable(self.start_delay):
            return self.start_delay()
        return self.start_delay

    def delay_start(self) -> None:
        """Sleep for the absolute value of the resolved start delay."""
        sleep(abs(self._start_delay))

    @property
    def _relaunch_delay(self) -> int:
        """Return ``relaunch_delay`` shifted by a random offset within ``jitter``."""
        offset = randint(0 - self.jitter, self.jitter)
        return self.relaunch_delay + offset

    def delay_relaunch(self):
        """Sleep for the absolute value of the jittered relaunch delay."""
        sleep(abs(self._relaunch_delay))

    def key(self, task_id: str) -> List[str]:
        """Return the alert key for a service failure of ``task_id`` on this host."""
        return [HOSTNAME, 'legion', 'service_failure', task_id]

    def __call__(self, func: Callable[[], None]) -> Callable[[], None]:
        """Wrap ``func`` in the relaunch-and-alert loop and return the wrapper."""
        @wraps(func)
        def supervised() -> None:
            latest_trace: Optional[str] = None
            attempt = 0
            self.delay_start()
            while not self.halt_flag.is_set():
                attempt += 1
                try:
                    func()
                except Exception:
                    # Remember the traceback; it is reported below and kept
                    # for later stops that do not raise.
                    latest_trace = format_exc()
                finally:
                    # Runs on every exit from func(): normal return, a caught
                    # Exception, or any other exception still propagating.
                    contents = {"task_id": self.task_id,
                                "last_stack_trace": latest_trace,
                                "num_failures": attempt}
                    desc = (f"Service '{self.task_id}' stopped running"
                            if attempt == 1 else
                            f"Service '{self.task_id}' stopped running {attempt} times in a row")
                    # Highest severity first; only the first match is sent.
                    severities = (
                        (self.critical_after_attempts, self.broadcast_critical),
                        (self.error_after_attempts, self.broadcast_error),
                        (self.warn_after_attempts, self.broadcast_warning),
                    )
                    for threshold, notify in severities:
                        if attempt >= threshold:
                            notify(contents=contents, desc=desc)
                            break
                    self.delay_relaunch()

        return supervised
class Periodic(Runner):
    """Decorator that runs a callable repeatedly and alerts on consecutive failures.

    ``Periodic(...)(func)`` returns a zero-argument wrapper. When invoked, the
    wrapper sleeps for the start delay and then, until ``halt_flag`` is set,
    calls ``func`` and sleeps for a jittered ``delay``. A run that raises
    ``Exception`` increments a failure streak and may broadcast an alert; a
    run that returns normally resets the streak.
    """

    def __init__(self,
                 task_id: str,
                 exchange: str,
                 route: str,
                 delay: int,
                 ttl: Optional[int] = None,
                 start_delay: Union[int, Callable[[], int], None] = None,
                 jitter: Optional[int] = None,
                 warn_after_failures: Union[int, float, None] = None,
                 error_after_failures: Union[int, float, None] = None,
                 critical_after_failures: Union[int, float, None] = None,
                 halt_flag: Optional[Event] = None,
                 config: Optional[RobotnikConfig] = None):
        """Configure delays and alert thresholds.

        Args:
            task_id: Identifier of the task, used in alert keys and text.
            exchange: Exchange name used for alerts.
            route: Route used for alerts.
            delay: Base number of seconds between runs.
            ttl: Default alert TTL; a falsy value selects 120.
            start_delay: Seconds to sleep before the first run, or a callable
                returning that number. Defaults to a random integer between 0
                and ``delay``.
            jitter: Maximum random offset, in either direction, added to
                ``delay`` (default 3).
            warn_after_failures: Streak length from which warnings are sent;
                a falsy value means never.
            error_after_failures: Streak length from which errors are sent.
                When falsy it becomes 1 if ``warn_after_failures`` is ``None``
                and "never" otherwise.
            critical_after_failures: Streak length from which criticals are
                sent; a falsy value means never.
            halt_flag: Event that stops the run loop once set.
            config: Optional configuration forwarded to the alert broadcaster.
        """
        super().__init__(task_id=task_id,
                         exchange=exchange,
                         route=route,
                         default_ttl=(ttl or 120),
                         halt_flag=halt_flag,
                         config=config)
        self.delay = delay
        if start_delay is None:
            # Read delay lazily so a later change to it is honoured.
            self.start_delay = (lambda: randint(0, self.delay))
        else:
            self.start_delay = start_delay
        self.jitter = 3 if jitter is None else jitter

        never = float('inf')
        self.warn_after_failures = warn_after_failures or never
        if error_after_failures:
            self.error_after_failures = error_after_failures
        elif warn_after_failures is None:
            # Nothing configured at all: report an error on the first failure.
            self.error_after_failures = 1
        else:
            self.error_after_failures = never
        self.critical_after_failures = critical_after_failures or never

    @property
    def _start_delay(self) -> int:
        """Resolve ``start_delay``, calling it when it is a callable."""
        if callable(self.start_delay):
            return self.start_delay()
        return self.start_delay

    def delay_start(self) -> None:
        """Sleep for the absolute value of the resolved start delay."""
        sleep(abs(self._start_delay))

    @property
    def _jittery_delay(self) -> int:
        """Return ``delay`` shifted by a random offset within ``jitter``."""
        offset = randint(0 - self.jitter, self.jitter)
        return self.delay + offset

    def jittery_delay(self):
        """Sleep for the absolute value of the jittered delay."""
        sleep(abs(self._jittery_delay))

    def key(self, task_id: str) -> List[str]:
        """Return the alert key for a periodic-task failure of ``task_id`` on this host."""
        return [HOSTNAME, 'legion', 'periodic_task_failure', task_id]

    def __call__(self, func: Callable[[], None]) -> Callable[[], None]:
        """Wrap ``func`` in the run-sleep-alert loop and return the wrapper."""
        @wraps(func)
        def scheduled() -> None:
            num_failures = 0
            self.delay_start()
            while not self.halt_flag.is_set():
                try:
                    func()
                except Exception:
                    num_failures += 1
                    contents = {"task_id": self.task_id,
                                "last_stack_trace": format_exc(),
                                "num_failures": num_failures}
                    desc = f"Periodic task '{self.task_id}' failed {num_failures} times in a row"
                    # Highest severity first; only the first match is sent.
                    severities = (
                        (self.critical_after_failures, self.broadcast_critical),
                        (self.error_after_failures, self.broadcast_error),
                        (self.warn_after_failures, self.broadcast_warning),
                    )
                    for threshold, notify in severities:
                        if num_failures >= threshold:
                            notify(contents=contents, desc=desc)
                            break
                else:
                    # A clean run ends the failure streak.
                    num_failures = 0
                self.jittery_delay()

        return scheduled