bettermdptools.envs.acrobot_model

  1from math import cos, sin
  2
  3import numpy as np
  4
  5
  6class DiscretizedAcrobot:
  7    LINK_LENGTH_1 = 1.0  # [m]
  8    LINK_LENGTH_2 = 1.0  # [m]
  9    LINK_MASS_1 = 1.0  #: [kg] mass of link 1
 10    LINK_MASS_2 = 1.0  #: [kg] mass of link 2
 11    LINK_COM_POS_1 = 0.5  #: [m] position of the center of mass of link 1
 12    LINK_COM_POS_2 = 0.5  #: [m] position of the center of mass of link 2
 13    LINK_MOI = 1.0  #: moments of inertia for both links
 14    AVAIL_TORQUE = [-1, 0, 1]
 15    MAX_VEL_1 = 4 * np.pi
 16    MAX_VEL_2 = 9 * np.pi
 17
 18    def __init__(
 19        self,
 20        angular_resolution_rad=0.01,
 21        angular_vel_resolution_rad_per_sec=0.05,
 22        angle_bins=None,
 23        velocity_bins=None,
 24        precomputed_P=None,
 25        verbose=False,
 26    ):
 27        self.verbose = verbose
 28
 29        if angle_bins is None:
 30            self.angle_1_bins = int(np.pi // angular_resolution_rad) + 1
 31            self.angle_2_bins = int(np.pi // angular_resolution_rad) + 1
 32        else:
 33            self.angle_1_bins = angle_bins
 34            self.angle_2_bins = angle_bins
 35
 36        if velocity_bins is None:
 37            self.angular_vel_1_bins = (
 38                int((2 * self.MAX_VEL_1) // angular_vel_resolution_rad_per_sec) + 1
 39            )
 40            self.angular_vel_2_bins = (
 41                int((2 * self.MAX_VEL_2) // angular_vel_resolution_rad_per_sec) + 1
 42            )
 43        else:
 44            self.angular_vel_1_bins = velocity_bins
 45            self.angular_vel_2_bins = velocity_bins
 46
 47        self.n_states = (
 48            self.angle_1_bins
 49            * self.angle_2_bins
 50            * self.angular_vel_1_bins
 51            * self.angular_vel_2_bins
 52        )
 53
 54        self.angle_range = (-np.pi, np.pi)
 55        self.velocity_1_range = (-self.MAX_VEL_1, self.MAX_VEL_1)
 56        self.velocity_2_range = (-self.MAX_VEL_2, self.MAX_VEL_2)
 57        self.action_space = len(self.AVAIL_TORQUE)  # -1, 0, 1
 58        self.dt = 0.2
 59
 60        if precomputed_P is None:
 61            self.P = {
 62                state: {action: [] for action in range(self.action_space)}
 63                for state in range(self.n_states)
 64            }
 65            self.setup_transition_probabilities()
 66
 67        else:
 68            self.P = precomputed_P
 69
 70        # add transform_obs
 71        self.transform_obs = lambda obs: (
 72            np.ravel_multi_index(
 73                (
 74                    np.clip(
 75                        np.digitize(
 76                            obs[0], np.linspace(*self.angle_range, self.angle_1_bins)
 77                        )
 78                        - 1,
 79                        0,
 80                        self.angle_1_bins - 1,
 81                    ),
 82                    np.clip(
 83                        np.digitize(
 84                            obs[1], np.linspace(*self.angle_range, self.angle_2_bins)
 85                        )
 86                        - 1,
 87                        0,
 88                        self.angle_2_bins - 1,
 89                    ),
 90                    np.clip(
 91                        np.digitize(
 92                            obs[2],
 93                            np.linspace(
 94                                *self.velocity_1_range, self.angular_vel_1_bins
 95                            ),
 96                        )
 97                        - 1,
 98                        0,
 99                        self.angular_vel_1_bins - 1,
100                    ),
101                    np.clip(
102                        np.digitize(
103                            obs[3],
104                            np.linspace(
105                                *self.velocity_2_range, self.angular_vel_2_bins
106                            ),
107                        )
108                        - 1,
109                        0,
110                        self.angular_vel_2_bins - 1,
111                    ),
112                ),
113                (
114                    self.angle_1_bins,
115                    self.angle_2_bins,
116                    self.angular_vel_1_bins,
117                    self.angular_vel_2_bins,
118                ),
119            )
120        )
121
122    def setup_transition_probabilities(self):
123        """
124        Sets up the transition probabilities for the environment. This method iterates through all possible
125        states and actions, simulates the next state, and records the transition probability
126        (deterministic in this setup), reward, and termination status.
127        """
128        percent = 0
129
130        for state in range(self.n_states):
131            angle_1, angle_2, vel_1, vel_2 = self.index_to_state(state)
132            for action in range(self.action_space):
133                next_state, reward, done = self.compute_next_state(
134                    angle_1, angle_2, vel_1, vel_2, action
135                )
136                self.P[state][action].append((1, next_state, reward, done))
137
138            if state % int(self.n_states / 10) == 0 and state > 0 and self.verbose:
139                percent += 10
140                print(f"{percent}% of probabilities calculated!")
141
142    def index_to_state(self, index):
143        """
144        Converts a single index into a multidimensional state representation.
145
146        Parameters:
147        - index (int): The flat index representing the state.
148
149        Returns:
150        - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
151        """
152
153        totals = [
154            self.angle_1_bins,
155            self.angle_2_bins,
156            self.angular_vel_1_bins,
157            self.angular_vel_2_bins,
158        ]
159        multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1]
160        components = [int((index // multipliers[i]) % totals[i]) for i in range(4)]
161        return components
162
163    # Modified from original implementation of Gym Acrobot
164    def dsdt(self, state):
165        m1 = self.LINK_MASS_1
166        m2 = self.LINK_MASS_2
167        l1 = self.LINK_LENGTH_1
168        lc1 = self.LINK_COM_POS_1
169        lc2 = self.LINK_COM_POS_2
170        I1 = self.LINK_MOI
171        I2 = self.LINK_MOI
172        g = 9.8
173        a = state[-1]
174        s = state[:-1]
175        theta1 = s[0]
176        theta2 = s[1]
177        dtheta1 = s[2]
178        dtheta2 = s[3]
179        d1 = m1 * lc1**2 + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2)) + I1 + I2
180        d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2
181        phi2 = m2 * lc2 * g * cos(theta1 + theta2 - np.pi / 2.0)
182        phi1 = (
183            -m2 * l1 * lc2 * dtheta2**2 * sin(theta2)
184            - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2)
185            + (m1 * lc1 + m2 * l1) * g * cos(theta1 - np.pi / 2)
186            + phi2
187        )
188
189        # the following line is consistent with the java implementation and the
190        # book
191        ddtheta2 = (
192            a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2
193        ) / (m2 * lc2**2 + I2 - d2**2 / d1)
194
195        ddtheta1 = -(d2 * ddtheta2 + phi1) / d1
196        return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0
197
198    # Modified from original implementation of Gym Acrobot
199    def rk4(self, derivs, y0, t):
200        """
201        Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta.
202
203        Example for 2D system:
204
205            >>> def derivs(x):
206            ...     d1 =  x[0] + 2*x[1]
207            ...     d2 =  -3*x[0] + 4*x[1]
208            ...     return d1, d2
209
210            >>> dt = 0.0005
211            >>> t = np.arange(0.0, 2.0, dt)
212            >>> y0 = (1,2)
213            >>> yout = rk4(derivs, y0, t)
214
215        Args:
216            derivs: the derivative of the system and has the signature `dy = derivs(yi)`
217            y0: initial state vector
218            t: sample times
219
220        Returns:
221            yout: Runge-Kutta approximation of the ODE
222        """
223
224        try:
225            Ny = len(y0)
226        except TypeError:
227            yout = np.zeros((len(t),), np.float64)
228        else:
229            yout = np.zeros((len(t), Ny), np.float64)
230
231        yout[0] = y0
232
233        for i in np.arange(len(t) - 1):
234            this = t[i]
235            dt = t[i + 1] - this
236            dt2 = dt / 2.0
237            y0 = yout[i]
238
239            k1 = np.asarray(derivs(y0))
240            k2 = np.asarray(derivs(y0 + dt2 * k1))
241            k3 = np.asarray(derivs(y0 + dt2 * k2))
242            k4 = np.asarray(derivs(y0 + dt * k3))
243            yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4)
244        # We only care about the final timestep and we cleave off action value which will be zero
245        return yout[-1][:4]
246
247    def compute_next_state(
248        self, angle_1_idx, angle_2_idx, vel_1_idx, vel_2_idx, action
249    ):
250        angle_space = np.linspace(*self.angle_range, self.angle_1_bins)
251        velocity_1_space = np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)
252        velocity_2_space = np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)
253
254        angle_1 = angle_space[angle_1_idx]
255        angle_2 = angle_space[angle_2_idx]
256
257        velocity_1 = velocity_1_space[vel_1_idx]
258        velocity_2 = velocity_2_space[vel_2_idx]
259
260        torque = self.AVAIL_TORQUE[action]
261
262        state = [angle_1, angle_2, velocity_1, velocity_2, torque]
263
264        new_state = self.rk4(self.dsdt, state, [0, self.dt])
265
266        new_state[0] = self.wrap(new_state[0], -np.pi, np.pi)
267        new_state[1] = self.wrap(new_state[1], -np.pi, np.pi)
268        new_state[2] = self.bound(new_state[2], -self.MAX_VEL_1, self.MAX_VEL_1)
269        new_state[3] = self.bound(new_state[3], -self.MAX_VEL_2, self.MAX_VEL_2)
270
271        # index new state
272
273        new_angle_1_idx = np.clip(
274            np.digitize(new_state[0], np.linspace(*self.angle_range, self.angle_1_bins))
275            - 1,
276            0,
277            self.angle_1_bins - 1,
278        )
279        new_angle_2_idx = np.clip(
280            np.digitize(new_state[1], np.linspace(*self.angle_range, self.angle_2_bins))
281            - 1,
282            0,
283            self.angle_2_bins - 1,
284        )
285        new_vel_1_idx = np.clip(
286            np.digitize(
287                new_state[2],
288                np.linspace(*self.velocity_1_range, self.angular_vel_1_bins),
289            )
290            - 1,
291            0,
292            self.angular_vel_1_bins - 1,
293        )
294        new_vel_2_idx = np.clip(
295            np.digitize(
296                new_state[3],
297                np.linspace(*self.velocity_2_range, self.angular_vel_2_bins),
298            )
299            - 1,
300            0,
301            self.angular_vel_2_bins - 1,
302        )
303
304        new_state_idx = np.ravel_multi_index(
305            (new_angle_1_idx, new_angle_2_idx, new_vel_1_idx, new_vel_2_idx),
306            (
307                self.angle_1_bins,
308                self.angle_2_bins,
309                self.angular_vel_1_bins,
310                self.angular_vel_2_bins,
311            ),
312        )
313
314        # self.state = new_state
315        terminated = bool(-cos(new_state[0]) - cos(new_state[1] + new_state[0]) > 1.0)
316        reward = -1.0 if not terminated else 0.0
317
318        return new_state_idx, reward, terminated
319
320    def wrap(self, x, m, M):
321        """Wraps `x` so m <= x <= M; but unlike `bound()` which
322        truncates, `wrap()` wraps x around the coordinate system defined by m,M.\n
323        For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
324
325        Args:
326            x: a scalar
327            m: minimum possible value in range
328            M: maximum possible value in range
329
330        Returns:
331            x: a scalar, wrapped
332        """
333        diff = M - m
334        while x > M:
335            x = x - diff
336        while x < m:
337            x = x + diff
338        return x
339
340    def bound(self, x, m, M=None):
341        """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR*
342        have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1].
343
344        Args:
345            x: scalar
346            m: The lower bound
347            M: The upper bound
348
349        Returns:
350            x: scalar, bound between min (m) and Max (M)
351        """
352        if M is None:
353            M = m[1]
354            m = m[0]
355        # bound x between min (m) and Max (M)
356        return min(max(x, m), M)
class DiscretizedAcrobot:
  7class DiscretizedAcrobot:
  8    LINK_LENGTH_1 = 1.0  # [m]
  9    LINK_LENGTH_2 = 1.0  # [m]
 10    LINK_MASS_1 = 1.0  #: [kg] mass of link 1
 11    LINK_MASS_2 = 1.0  #: [kg] mass of link 2
 12    LINK_COM_POS_1 = 0.5  #: [m] position of the center of mass of link 1
 13    LINK_COM_POS_2 = 0.5  #: [m] position of the center of mass of link 2
 14    LINK_MOI = 1.0  #: moments of inertia for both links
 15    AVAIL_TORQUE = [-1, 0, 1]
 16    MAX_VEL_1 = 4 * np.pi
 17    MAX_VEL_2 = 9 * np.pi
 18
 19    def __init__(
 20        self,
 21        angular_resolution_rad=0.01,
 22        angular_vel_resolution_rad_per_sec=0.05,
 23        angle_bins=None,
 24        velocity_bins=None,
 25        precomputed_P=None,
 26        verbose=False,
 27    ):
 28        self.verbose = verbose
 29
 30        if angle_bins is None:
 31            self.angle_1_bins = int(np.pi // angular_resolution_rad) + 1
 32            self.angle_2_bins = int(np.pi // angular_resolution_rad) + 1
 33        else:
 34            self.angle_1_bins = angle_bins
 35            self.angle_2_bins = angle_bins
 36
 37        if velocity_bins is None:
 38            self.angular_vel_1_bins = (
 39                int((2 * self.MAX_VEL_1) // angular_vel_resolution_rad_per_sec) + 1
 40            )
 41            self.angular_vel_2_bins = (
 42                int((2 * self.MAX_VEL_2) // angular_vel_resolution_rad_per_sec) + 1
 43            )
 44        else:
 45            self.angular_vel_1_bins = velocity_bins
 46            self.angular_vel_2_bins = velocity_bins
 47
 48        self.n_states = (
 49            self.angle_1_bins
 50            * self.angle_2_bins
 51            * self.angular_vel_1_bins
 52            * self.angular_vel_2_bins
 53        )
 54
 55        self.angle_range = (-np.pi, np.pi)
 56        self.velocity_1_range = (-self.MAX_VEL_1, self.MAX_VEL_1)
 57        self.velocity_2_range = (-self.MAX_VEL_2, self.MAX_VEL_2)
 58        self.action_space = len(self.AVAIL_TORQUE)  # -1, 0, 1
 59        self.dt = 0.2
 60
 61        if precomputed_P is None:
 62            self.P = {
 63                state: {action: [] for action in range(self.action_space)}
 64                for state in range(self.n_states)
 65            }
 66            self.setup_transition_probabilities()
 67
 68        else:
 69            self.P = precomputed_P
 70
 71        # add transform_obs
 72        self.transform_obs = lambda obs: (
 73            np.ravel_multi_index(
 74                (
 75                    np.clip(
 76                        np.digitize(
 77                            obs[0], np.linspace(*self.angle_range, self.angle_1_bins)
 78                        )
 79                        - 1,
 80                        0,
 81                        self.angle_1_bins - 1,
 82                    ),
 83                    np.clip(
 84                        np.digitize(
 85                            obs[1], np.linspace(*self.angle_range, self.angle_2_bins)
 86                        )
 87                        - 1,
 88                        0,
 89                        self.angle_2_bins - 1,
 90                    ),
 91                    np.clip(
 92                        np.digitize(
 93                            obs[2],
 94                            np.linspace(
 95                                *self.velocity_1_range, self.angular_vel_1_bins
 96                            ),
 97                        )
 98                        - 1,
 99                        0,
100                        self.angular_vel_1_bins - 1,
101                    ),
102                    np.clip(
103                        np.digitize(
104                            obs[3],
105                            np.linspace(
106                                *self.velocity_2_range, self.angular_vel_2_bins
107                            ),
108                        )
109                        - 1,
110                        0,
111                        self.angular_vel_2_bins - 1,
112                    ),
113                ),
114                (
115                    self.angle_1_bins,
116                    self.angle_2_bins,
117                    self.angular_vel_1_bins,
118                    self.angular_vel_2_bins,
119                ),
120            )
121        )
122
123    def setup_transition_probabilities(self):
124        """
125        Sets up the transition probabilities for the environment. This method iterates through all possible
126        states and actions, simulates the next state, and records the transition probability
127        (deterministic in this setup), reward, and termination status.
128        """
129        percent = 0
130
131        for state in range(self.n_states):
132            angle_1, angle_2, vel_1, vel_2 = self.index_to_state(state)
133            for action in range(self.action_space):
134                next_state, reward, done = self.compute_next_state(
135                    angle_1, angle_2, vel_1, vel_2, action
136                )
137                self.P[state][action].append((1, next_state, reward, done))
138
139            if state % int(self.n_states / 10) == 0 and state > 0 and self.verbose:
140                percent += 10
141                print(f"{percent}% of probabilities calculated!")
142
143    def index_to_state(self, index):
144        """
145        Converts a single index into a multidimensional state representation.
146
147        Parameters:
148        - index (int): The flat index representing the state.
149
150        Returns:
151        - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
152        """
153
154        totals = [
155            self.angle_1_bins,
156            self.angle_2_bins,
157            self.angular_vel_1_bins,
158            self.angular_vel_2_bins,
159        ]
160        multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1]
161        components = [int((index // multipliers[i]) % totals[i]) for i in range(4)]
162        return components
163
164    # Modified from original implementation of Gym Acrobot
165    def dsdt(self, state):
166        m1 = self.LINK_MASS_1
167        m2 = self.LINK_MASS_2
168        l1 = self.LINK_LENGTH_1
169        lc1 = self.LINK_COM_POS_1
170        lc2 = self.LINK_COM_POS_2
171        I1 = self.LINK_MOI
172        I2 = self.LINK_MOI
173        g = 9.8
174        a = state[-1]
175        s = state[:-1]
176        theta1 = s[0]
177        theta2 = s[1]
178        dtheta1 = s[2]
179        dtheta2 = s[3]
180        d1 = m1 * lc1**2 + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2)) + I1 + I2
181        d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2
182        phi2 = m2 * lc2 * g * cos(theta1 + theta2 - np.pi / 2.0)
183        phi1 = (
184            -m2 * l1 * lc2 * dtheta2**2 * sin(theta2)
185            - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2)
186            + (m1 * lc1 + m2 * l1) * g * cos(theta1 - np.pi / 2)
187            + phi2
188        )
189
190        # the following line is consistent with the java implementation and the
191        # book
192        ddtheta2 = (
193            a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2
194        ) / (m2 * lc2**2 + I2 - d2**2 / d1)
195
196        ddtheta1 = -(d2 * ddtheta2 + phi1) / d1
197        return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0
198
199    # Modified from original implementation of Gym Acrobot
200    def rk4(self, derivs, y0, t):
201        """
202        Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta.
203
204        Example for 2D system:
205
206            >>> def derivs(x):
207            ...     d1 =  x[0] + 2*x[1]
208            ...     d2 =  -3*x[0] + 4*x[1]
209            ...     return d1, d2
210
211            >>> dt = 0.0005
212            >>> t = np.arange(0.0, 2.0, dt)
213            >>> y0 = (1,2)
214            >>> yout = rk4(derivs, y0, t)
215
216        Args:
217            derivs: the derivative of the system and has the signature `dy = derivs(yi)`
218            y0: initial state vector
219            t: sample times
220
221        Returns:
222            yout: Runge-Kutta approximation of the ODE
223        """
224
225        try:
226            Ny = len(y0)
227        except TypeError:
228            yout = np.zeros((len(t),), np.float64)
229        else:
230            yout = np.zeros((len(t), Ny), np.float64)
231
232        yout[0] = y0
233
234        for i in np.arange(len(t) - 1):
235            this = t[i]
236            dt = t[i + 1] - this
237            dt2 = dt / 2.0
238            y0 = yout[i]
239
240            k1 = np.asarray(derivs(y0))
241            k2 = np.asarray(derivs(y0 + dt2 * k1))
242            k3 = np.asarray(derivs(y0 + dt2 * k2))
243            k4 = np.asarray(derivs(y0 + dt * k3))
244            yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4)
245        # We only care about the final timestep and we cleave off action value which will be zero
246        return yout[-1][:4]
247
248    def compute_next_state(
249        self, angle_1_idx, angle_2_idx, vel_1_idx, vel_2_idx, action
250    ):
251        angle_space = np.linspace(*self.angle_range, self.angle_1_bins)
252        velocity_1_space = np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)
253        velocity_2_space = np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)
254
255        angle_1 = angle_space[angle_1_idx]
256        angle_2 = angle_space[angle_2_idx]
257
258        velocity_1 = velocity_1_space[vel_1_idx]
259        velocity_2 = velocity_2_space[vel_2_idx]
260
261        torque = self.AVAIL_TORQUE[action]
262
263        state = [angle_1, angle_2, velocity_1, velocity_2, torque]
264
265        new_state = self.rk4(self.dsdt, state, [0, self.dt])
266
267        new_state[0] = self.wrap(new_state[0], -np.pi, np.pi)
268        new_state[1] = self.wrap(new_state[1], -np.pi, np.pi)
269        new_state[2] = self.bound(new_state[2], -self.MAX_VEL_1, self.MAX_VEL_1)
270        new_state[3] = self.bound(new_state[3], -self.MAX_VEL_2, self.MAX_VEL_2)
271
272        # index new state
273
274        new_angle_1_idx = np.clip(
275            np.digitize(new_state[0], np.linspace(*self.angle_range, self.angle_1_bins))
276            - 1,
277            0,
278            self.angle_1_bins - 1,
279        )
280        new_angle_2_idx = np.clip(
281            np.digitize(new_state[1], np.linspace(*self.angle_range, self.angle_2_bins))
282            - 1,
283            0,
284            self.angle_2_bins - 1,
285        )
286        new_vel_1_idx = np.clip(
287            np.digitize(
288                new_state[2],
289                np.linspace(*self.velocity_1_range, self.angular_vel_1_bins),
290            )
291            - 1,
292            0,
293            self.angular_vel_1_bins - 1,
294        )
295        new_vel_2_idx = np.clip(
296            np.digitize(
297                new_state[3],
298                np.linspace(*self.velocity_2_range, self.angular_vel_2_bins),
299            )
300            - 1,
301            0,
302            self.angular_vel_2_bins - 1,
303        )
304
305        new_state_idx = np.ravel_multi_index(
306            (new_angle_1_idx, new_angle_2_idx, new_vel_1_idx, new_vel_2_idx),
307            (
308                self.angle_1_bins,
309                self.angle_2_bins,
310                self.angular_vel_1_bins,
311                self.angular_vel_2_bins,
312            ),
313        )
314
315        # self.state = new_state
316        terminated = bool(-cos(new_state[0]) - cos(new_state[1] + new_state[0]) > 1.0)
317        reward = -1.0 if not terminated else 0.0
318
319        return new_state_idx, reward, terminated
320
321    def wrap(self, x, m, M):
322        """Wraps `x` so m <= x <= M; but unlike `bound()` which
323        truncates, `wrap()` wraps x around the coordinate system defined by m,M.\n
324        For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
325
326        Args:
327            x: a scalar
328            m: minimum possible value in range
329            M: maximum possible value in range
330
331        Returns:
332            x: a scalar, wrapped
333        """
334        diff = M - m
335        while x > M:
336            x = x - diff
337        while x < m:
338            x = x + diff
339        return x
340
341    def bound(self, x, m, M=None):
342        """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR*
343        have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1].
344
345        Args:
346            x: scalar
347            m: The lower bound
348            M: The upper bound
349
350        Returns:
351            x: scalar, bound between min (m) and Max (M)
352        """
353        if M is None:
354            M = m[1]
355            m = m[0]
356        # bound x between min (m) and Max (M)
357        return min(max(x, m), M)
DiscretizedAcrobot( angular_resolution_rad=0.01, angular_vel_resolution_rad_per_sec=0.05, angle_bins=None, velocity_bins=None, precomputed_P=None, verbose=False)
 19    def __init__(
 20        self,
 21        angular_resolution_rad=0.01,
 22        angular_vel_resolution_rad_per_sec=0.05,
 23        angle_bins=None,
 24        velocity_bins=None,
 25        precomputed_P=None,
 26        verbose=False,
 27    ):
 28        self.verbose = verbose
 29
 30        if angle_bins is None:
 31            self.angle_1_bins = int(np.pi // angular_resolution_rad) + 1
 32            self.angle_2_bins = int(np.pi // angular_resolution_rad) + 1
 33        else:
 34            self.angle_1_bins = angle_bins
 35            self.angle_2_bins = angle_bins
 36
 37        if velocity_bins is None:
 38            self.angular_vel_1_bins = (
 39                int((2 * self.MAX_VEL_1) // angular_vel_resolution_rad_per_sec) + 1
 40            )
 41            self.angular_vel_2_bins = (
 42                int((2 * self.MAX_VEL_2) // angular_vel_resolution_rad_per_sec) + 1
 43            )
 44        else:
 45            self.angular_vel_1_bins = velocity_bins
 46            self.angular_vel_2_bins = velocity_bins
 47
 48        self.n_states = (
 49            self.angle_1_bins
 50            * self.angle_2_bins
 51            * self.angular_vel_1_bins
 52            * self.angular_vel_2_bins
 53        )
 54
 55        self.angle_range = (-np.pi, np.pi)
 56        self.velocity_1_range = (-self.MAX_VEL_1, self.MAX_VEL_1)
 57        self.velocity_2_range = (-self.MAX_VEL_2, self.MAX_VEL_2)
 58        self.action_space = len(self.AVAIL_TORQUE)  # -1, 0, 1
 59        self.dt = 0.2
 60
 61        if precomputed_P is None:
 62            self.P = {
 63                state: {action: [] for action in range(self.action_space)}
 64                for state in range(self.n_states)
 65            }
 66            self.setup_transition_probabilities()
 67
 68        else:
 69            self.P = precomputed_P
 70
 71        # add transform_obs
 72        self.transform_obs = lambda obs: (
 73            np.ravel_multi_index(
 74                (
 75                    np.clip(
 76                        np.digitize(
 77                            obs[0], np.linspace(*self.angle_range, self.angle_1_bins)
 78                        )
 79                        - 1,
 80                        0,
 81                        self.angle_1_bins - 1,
 82                    ),
 83                    np.clip(
 84                        np.digitize(
 85                            obs[1], np.linspace(*self.angle_range, self.angle_2_bins)
 86                        )
 87                        - 1,
 88                        0,
 89                        self.angle_2_bins - 1,
 90                    ),
 91                    np.clip(
 92                        np.digitize(
 93                            obs[2],
 94                            np.linspace(
 95                                *self.velocity_1_range, self.angular_vel_1_bins
 96                            ),
 97                        )
 98                        - 1,
 99                        0,
100                        self.angular_vel_1_bins - 1,
101                    ),
102                    np.clip(
103                        np.digitize(
104                            obs[3],
105                            np.linspace(
106                                *self.velocity_2_range, self.angular_vel_2_bins
107                            ),
108                        )
109                        - 1,
110                        0,
111                        self.angular_vel_2_bins - 1,
112                    ),
113                ),
114                (
115                    self.angle_1_bins,
116                    self.angle_2_bins,
117                    self.angular_vel_1_bins,
118                    self.angular_vel_2_bins,
119                ),
120            )
121        )
AVAIL_TORQUE = [-1, 0, 1]
MAX_VEL_1 = 12.566370614359172
MAX_VEL_2 = 28.274333882308138
verbose
n_states
angle_range
velocity_1_range
velocity_2_range
action_space
dt
transform_obs
def setup_transition_probabilities(self):
123    def setup_transition_probabilities(self):
124        """
125        Sets up the transition probabilities for the environment. This method iterates through all possible
126        states and actions, simulates the next state, and records the transition probability
127        (deterministic in this setup), reward, and termination status.
128        """
129        percent = 0
130
131        for state in range(self.n_states):
132            angle_1, angle_2, vel_1, vel_2 = self.index_to_state(state)
133            for action in range(self.action_space):
134                next_state, reward, done = self.compute_next_state(
135                    angle_1, angle_2, vel_1, vel_2, action
136                )
137                self.P[state][action].append((1, next_state, reward, done))
138
139            if state % int(self.n_states / 10) == 0 and state > 0 and self.verbose:
140                percent += 10
141                print(f"{percent}% of probabilities calculated!")

Sets up the transition probabilities for the environment. This method iterates through all possible states and actions, simulates the next state, and records the transition probability (deterministic in this setup), reward, and termination status.

def index_to_state(self, index):
143    def index_to_state(self, index):
144        """
145        Converts a single index into a multidimensional state representation.
146
147        Parameters:
148        - index (int): The flat index representing the state.
149
150        Returns:
151        - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
152        """
153
154        totals = [
155            self.angle_1_bins,
156            self.angle_2_bins,
157            self.angular_vel_1_bins,
158            self.angular_vel_2_bins,
159        ]
160        multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1]
161        components = [int((index // multipliers[i]) % totals[i]) for i in range(4)]
162        return components

Converts a single index into a multidimensional state representation.

Parameters:

  • index (int): The flat index representing the state.

Returns:

  • list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
def dsdt(self, state):
165    def dsdt(self, state):
166        m1 = self.LINK_MASS_1
167        m2 = self.LINK_MASS_2
168        l1 = self.LINK_LENGTH_1
169        lc1 = self.LINK_COM_POS_1
170        lc2 = self.LINK_COM_POS_2
171        I1 = self.LINK_MOI
172        I2 = self.LINK_MOI
173        g = 9.8
174        a = state[-1]
175        s = state[:-1]
176        theta1 = s[0]
177        theta2 = s[1]
178        dtheta1 = s[2]
179        dtheta2 = s[3]
180        d1 = m1 * lc1**2 + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2)) + I1 + I2
181        d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2
182        phi2 = m2 * lc2 * g * cos(theta1 + theta2 - np.pi / 2.0)
183        phi1 = (
184            -m2 * l1 * lc2 * dtheta2**2 * sin(theta2)
185            - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2)
186            + (m1 * lc1 + m2 * l1) * g * cos(theta1 - np.pi / 2)
187            + phi2
188        )
189
190        # the following line is consistent with the java implementation and the
191        # book
192        ddtheta2 = (
193            a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2
194        ) / (m2 * lc2**2 + I2 - d2**2 / d1)
195
196        ddtheta1 = -(d2 * ddtheta2 + phi1) / d1
197        return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0
def rk4(self, derivs, y0, t):
200    def rk4(self, derivs, y0, t):
201        """
202        Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta.
203
204        Example for 2D system:
205
206            >>> def derivs(x):
207            ...     d1 =  x[0] + 2*x[1]
208            ...     d2 =  -3*x[0] + 4*x[1]
209            ...     return d1, d2
210
211            >>> dt = 0.0005
212            >>> t = np.arange(0.0, 2.0, dt)
213            >>> y0 = (1,2)
214            >>> yout = rk4(derivs, y0, t)
215
216        Args:
217            derivs: the derivative of the system and has the signature `dy = derivs(yi)`
218            y0: initial state vector
219            t: sample times
220
221        Returns:
222            yout: Runge-Kutta approximation of the ODE
223        """
224
225        try:
226            Ny = len(y0)
227        except TypeError:
228            yout = np.zeros((len(t),), np.float64)
229        else:
230            yout = np.zeros((len(t), Ny), np.float64)
231
232        yout[0] = y0
233
234        for i in np.arange(len(t) - 1):
235            this = t[i]
236            dt = t[i + 1] - this
237            dt2 = dt / 2.0
238            y0 = yout[i]
239
240            k1 = np.asarray(derivs(y0))
241            k2 = np.asarray(derivs(y0 + dt2 * k1))
242            k3 = np.asarray(derivs(y0 + dt2 * k2))
243            k4 = np.asarray(derivs(y0 + dt * k3))
244            yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4)
245        # We only care about the final timestep and we cleave off action value which will be zero
246        return yout[-1][:4]

Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta.

Example for 2D system:

>>> def derivs(x):
...     d1 =  x[0] + 2*x[1]
...     d2 =  -3*x[0] + 4*x[1]
...     return d1, d2

>>> dt = 0.0005
>>> t = np.arange(0.0, 2.0, dt)
>>> y0 = (1,2)
>>> yout = rk4(derivs, y0, t)

Args: derivs: the derivative of the system and has the signature dy = derivs(yi) y0: initial state vector t: sample times

Returns: yout: Runge-Kutta approximation of the ODE

def compute_next_state(self, angle_1_idx, angle_2_idx, vel_1_idx, vel_2_idx, action):
248    def compute_next_state(
249        self, angle_1_idx, angle_2_idx, vel_1_idx, vel_2_idx, action
250    ):
251        angle_space = np.linspace(*self.angle_range, self.angle_1_bins)
252        velocity_1_space = np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)
253        velocity_2_space = np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)
254
255        angle_1 = angle_space[angle_1_idx]
256        angle_2 = angle_space[angle_2_idx]
257
258        velocity_1 = velocity_1_space[vel_1_idx]
259        velocity_2 = velocity_2_space[vel_2_idx]
260
261        torque = self.AVAIL_TORQUE[action]
262
263        state = [angle_1, angle_2, velocity_1, velocity_2, torque]
264
265        new_state = self.rk4(self.dsdt, state, [0, self.dt])
266
267        new_state[0] = self.wrap(new_state[0], -np.pi, np.pi)
268        new_state[1] = self.wrap(new_state[1], -np.pi, np.pi)
269        new_state[2] = self.bound(new_state[2], -self.MAX_VEL_1, self.MAX_VEL_1)
270        new_state[3] = self.bound(new_state[3], -self.MAX_VEL_2, self.MAX_VEL_2)
271
272        # index new state
273
274        new_angle_1_idx = np.clip(
275            np.digitize(new_state[0], np.linspace(*self.angle_range, self.angle_1_bins))
276            - 1,
277            0,
278            self.angle_1_bins - 1,
279        )
280        new_angle_2_idx = np.clip(
281            np.digitize(new_state[1], np.linspace(*self.angle_range, self.angle_2_bins))
282            - 1,
283            0,
284            self.angle_2_bins - 1,
285        )
286        new_vel_1_idx = np.clip(
287            np.digitize(
288                new_state[2],
289                np.linspace(*self.velocity_1_range, self.angular_vel_1_bins),
290            )
291            - 1,
292            0,
293            self.angular_vel_1_bins - 1,
294        )
295        new_vel_2_idx = np.clip(
296            np.digitize(
297                new_state[3],
298                np.linspace(*self.velocity_2_range, self.angular_vel_2_bins),
299            )
300            - 1,
301            0,
302            self.angular_vel_2_bins - 1,
303        )
304
305        new_state_idx = np.ravel_multi_index(
306            (new_angle_1_idx, new_angle_2_idx, new_vel_1_idx, new_vel_2_idx),
307            (
308                self.angle_1_bins,
309                self.angle_2_bins,
310                self.angular_vel_1_bins,
311                self.angular_vel_2_bins,
312            ),
313        )
314
315        # self.state = new_state
316        terminated = bool(-cos(new_state[0]) - cos(new_state[1] + new_state[0]) > 1.0)
317        reward = -1.0 if not terminated else 0.0
318
319        return new_state_idx, reward, terminated
def wrap(self, x, m, M):
321    def wrap(self, x, m, M):
322        """Wraps `x` so m <= x <= M; but unlike `bound()` which
323        truncates, `wrap()` wraps x around the coordinate system defined by m,M.\n
324        For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
325
326        Args:
327            x: a scalar
328            m: minimum possible value in range
329            M: maximum possible value in range
330
331        Returns:
332            x: a scalar, wrapped
333        """
334        diff = M - m
335        while x > M:
336            x = x - diff
337        while x < m:
338            x = x + diff
339        return x

Wraps x so m <= x <= M; but unlike bound() which truncates, wrap() wraps x around the coordinate system defined by m,M.

For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.

Args: x: a scalar m: minimum possible value in range M: maximum possible value in range

Returns: x: a scalar, wrapped

def bound(self, x, m, M=None):
341    def bound(self, x, m, M=None):
342        """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR*
343        have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1].
344
345        Args:
346            x: scalar
347            m: The lower bound
348            M: The upper bound
349
350        Returns:
351            x: scalar, bound between min (m) and Max (M)
352        """
353        if M is None:
354            M = m[1]
355            m = m[0]
356        # bound x between min (m) and Max (M)
357        return min(max(x, m), M)

Either have m as scalar, so bound(x,m,M) which returns m <= x <= M OR have m as length 2 vector, bound(x,m, ) returns m[0] <= x <= m[1].

Args: x: scalar m: The lower bound M: The upper bound

Returns: x: scalar, bound between min (m) and Max (M)