envs.acrobot_model

  1import numpy as np
  2from math import cos, sin
  3
  4class DiscretizedAcrobot:
  5 
  6  LINK_LENGTH_1 = 1.0  # [m]
  7  LINK_LENGTH_2 = 1.0  # [m]
  8  LINK_MASS_1 = 1.0  #: [kg] mass of link 1
  9  LINK_MASS_2 = 1.0  #: [kg] mass of link 2
 10  LINK_COM_POS_1 = 0.5  #: [m] position of the center of mass of link 1
 11  LINK_COM_POS_2 = 0.5  #: [m] position of the center of mass of link 2
 12  LINK_MOI = 1.0  #: moments of inertia for both links
 13  AVAIL_TORQUE = [-1, 0, 1]
 14  MAX_VEL_1 = 4 * np.pi
 15  MAX_VEL_2 = 9 * np.pi
 16
 17  def __init__(self,
 18               angular_resolution_rad = 0.01,
 19               angular_vel_resolution_rad_per_sec = 0.05,
 20               angle_bins = None,
 21               velocity_bins = None,
 22               precomputed_P = None,
 23               verbose=False):
 24
 25    self.verbose = verbose
 26
 27    if angle_bins is None:
 28      self.angle_1_bins = int(np.pi // angular_resolution_rad) + 1
 29      self.angle_2_bins = int(np.pi // angular_resolution_rad) + 1
 30    else:
 31       self.angle_1_bins = angle_bins
 32       self.angle_2_bins = angle_bins
 33    
 34    if velocity_bins is None:
 35      self.angular_vel_1_bins = int((2 * self.MAX_VEL_1) // angular_vel_resolution_rad_per_sec) + 1
 36      self.angular_vel_2_bins = int((2 * self.MAX_VEL_2) // angular_vel_resolution_rad_per_sec) + 1
 37    else:
 38      self.angular_vel_1_bins = velocity_bins
 39      self.angular_vel_2_bins = velocity_bins
 40
 41    self.n_states = self.angle_1_bins * self.angle_2_bins * self.angular_vel_1_bins * self.angular_vel_2_bins
 42
 43    self.angle_range = (-np.pi, np.pi)
 44    self.velocity_1_range = (-self.MAX_VEL_1, self.MAX_VEL_1)
 45    self.velocity_2_range = (-self.MAX_VEL_2, self.MAX_VEL_2)
 46    self.action_space = len(self.AVAIL_TORQUE) # -1, 0, 1
 47    self.dt = 0.2
 48
 49    if precomputed_P is None:
 50      self.P = {state: {action: [] for action in range(self.action_space)} for state in range(self.n_states)}
 51      self.setup_transition_probabilities()
 52
 53    else:
 54      self.P = precomputed_P
 55
 56    # add transform_obs
 57    self.transform_obs = lambda obs: (
 58        np.ravel_multi_index((
 59            np.clip(np.digitize(obs[0], np.linspace(*self.angle_range, self.angle_1_bins)) - 1, 0,
 60                    self.angle_1_bins - 1),
 61            np.clip(np.digitize(obs[1], np.linspace(*self.angle_range, self.angle_2_bins)) - 1, 0,
 62                    self.angle_2_bins - 1),
 63            np.clip(np.digitize(obs[2], np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)) - 1, 0,
 64                    self.angular_vel_1_bins - 1),
 65            np.clip(np.digitize(obs[3], np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)) - 1, 0,
 66                    self.angular_vel_2_bins - 1),
 67        ), (self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins))
 68      )
 69
 70  def setup_transition_probabilities(self):
 71      """
 72      Sets up the transition probabilities for the environment. This method iterates through all possible
 73      states and actions, simulates the next state, and records the transition probability
 74      (deterministic in this setup), reward, and termination status.
 75      """
 76      percent = 0
 77
 78      for state in range(self.n_states):
 79          angle_1, angle_2, vel_1, vel_2 = self.index_to_state(state)
 80          for action in range(self.action_space):
 81              next_state, reward, done = self.compute_next_state(angle_1, angle_2, vel_1, vel_2, action)
 82              self.P[state][action].append((1, next_state, reward, done))
 83
 84          if state % int(self.n_states / 10) == 0 and state > 0 and self.verbose:
 85            percent += 10
 86            print(f'{percent}% of probabilities calculated!')
 87
 88  def index_to_state(self, index):
 89        """
 90        Converts a single index into a multidimensional state representation.
 91
 92        Parameters:
 93        - index (int): The flat index representing the state.
 94
 95        Returns:
 96        - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
 97        """
 98
 99        totals = [self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins]
100        multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1]
101        components = [int((index // multipliers[i]) % totals[i]) for i in range(4)]
102        return components
103
104  # Modified from original implementation of Gym Acrobot
105  def dsdt(self, state):
106    m1 = self.LINK_MASS_1
107    m2 = self.LINK_MASS_2
108    l1 = self.LINK_LENGTH_1
109    lc1 = self.LINK_COM_POS_1
110    lc2 = self.LINK_COM_POS_2
111    I1 = self.LINK_MOI
112    I2 = self.LINK_MOI
113    g = 9.8
114    a = state[-1]
115    s = state[:-1]
116    theta1 = s[0]
117    theta2 = s[1]
118    dtheta1 = s[2]
119    dtheta2 = s[3]
120    d1 = m1 * lc1**2 + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2)) + I1 + I2
121    d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2
122    phi2 = m2 * lc2 * g * cos(theta1 + theta2 - np.pi / 2.0)
123    phi1 = (
124        -m2 * l1 * lc2 * dtheta2**2 * sin(theta2)
125        - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2)
126        + (m1 * lc1 + m2 * l1) * g * cos(theta1 - np.pi / 2)
127        + phi2
128    )
129
130    # the following line is consistent with the java implementation and the
131    # book
132    ddtheta2 = (
133        a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2
134    ) / (m2 * lc2**2 + I2 - d2**2 / d1)
135
136    ddtheta1 = -(d2 * ddtheta2 + phi1) / d1
137    return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0
138
139  # Modified from original implementation of Gym Acrobot
140  def rk4(self, derivs, y0, t):
141
142    """
143    Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta.
144
145    Example for 2D system:
146
147        >>> def derivs(x):
148        ...     d1 =  x[0] + 2*x[1]
149        ...     d2 =  -3*x[0] + 4*x[1]
150        ...     return d1, d2
151
152        >>> dt = 0.0005
153        >>> t = np.arange(0.0, 2.0, dt)
154        >>> y0 = (1,2)
155        >>> yout = rk4(derivs, y0, t)
156
157    Args:
158        derivs: the derivative of the system and has the signature `dy = derivs(yi)`
159        y0: initial state vector
160        t: sample times
161
162    Returns:
163        yout: Runge-Kutta approximation of the ODE
164    """
165
166    try:
167        Ny = len(y0)
168    except TypeError:
169        yout = np.zeros((len(t),), np.float64)
170    else:
171        yout = np.zeros((len(t), Ny), np.float64)
172
173    yout[0] = y0
174
175    for i in np.arange(len(t) - 1):
176        this = t[i]
177        dt = t[i + 1] - this
178        dt2 = dt / 2.0
179        y0 = yout[i]
180
181        k1 = np.asarray(derivs(y0))
182        k2 = np.asarray(derivs(y0 + dt2 * k1))
183        k3 = np.asarray(derivs(y0 + dt2 * k2))
184        k4 = np.asarray(derivs(y0 + dt * k3))
185        yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4)
186    # We only care about the final timestep and we cleave off action value which will be zero
187    return yout[-1][:4]
188  
189  def compute_next_state(self, angle_1_idx, angle_2_idx, vel_1_idx, vel_2_idx, action):
190
191    angle_space = np.linspace(*self.angle_range,self.angle_1_bins)
192    velocity_1_space = np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)
193    velocity_2_space = np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)
194
195    angle_1 = angle_space[angle_1_idx]
196    angle_2 = angle_space[angle_2_idx]
197
198    velocity_1 = velocity_1_space[vel_1_idx]
199    velocity_2 = velocity_2_space[vel_2_idx]
200
201    torque = self.AVAIL_TORQUE[action]
202
203    state = [angle_1, angle_2, velocity_1, velocity_2, torque]
204
205    new_state = self.rk4(self.dsdt, state, [0, self.dt])
206
207    new_state[0] = self.wrap(new_state[0], -np.pi, np.pi)
208    new_state[1] = self.wrap(new_state[1], -np.pi, np.pi)
209    new_state[2] = self.bound(new_state[2], -self.MAX_VEL_1, self.MAX_VEL_1)
210    new_state[3] = self.bound(new_state[3], -self.MAX_VEL_2, self.MAX_VEL_2)
211
212    # index new state
213
214    new_angle_1_idx = np.clip(np.digitize(new_state[0], np.linspace(*self.angle_range, self.angle_1_bins)) - 1, 0,
215                    self.angle_1_bins - 1)
216    new_angle_2_idx = np.clip(np.digitize(new_state[1], np.linspace(*self.angle_range, self.angle_2_bins)) - 1, 0,
217                    self.angle_2_bins - 1)
218    new_vel_1_idx = np.clip(np.digitize(new_state[2], np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)) - 1, 0,
219                    self.angular_vel_1_bins - 1)
220    new_vel_2_idx = np.clip(np.digitize(new_state[3], np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)) - 1, 0,
221                    self.angular_vel_2_bins - 1)
222
223    new_state_idx = np.ravel_multi_index((new_angle_1_idx, new_angle_2_idx, new_vel_1_idx, new_vel_2_idx),
224                                         (self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins))
225
226    # self.state = new_state
227    terminated = bool(-cos(new_state[0]) - cos(new_state[1] + new_state[0]) > 1.0)
228    reward = -1.0 if not terminated else 0.0
229
230    return new_state_idx, reward, terminated
231
232  def wrap(self, x, m, M):
233      """Wraps `x` so m <= x <= M; but unlike `bound()` which
234      truncates, `wrap()` wraps x around the coordinate system defined by m,M.\n
235      For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
236
237      Args:
238          x: a scalar
239          m: minimum possible value in range
240          M: maximum possible value in range
241
242      Returns:
243          x: a scalar, wrapped
244      """
245      diff = M - m
246      while x > M:
247          x = x - diff
248      while x < m:
249          x = x + diff
250      return x
251
252  def bound(self, x, m, M=None):
253      """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR*
254      have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1].
255
256      Args:
257          x: scalar
258          m: The lower bound
259          M: The upper bound
260
261      Returns:
262          x: scalar, bound between min (m) and Max (M)
263      """
264      if M is None:
265          M = m[1]
266          m = m[0]
267      # bound x between min (m) and Max (M)
268      return min(max(x, m), M)
class DiscretizedAcrobot:
  5class DiscretizedAcrobot:
  6 
  7  LINK_LENGTH_1 = 1.0  # [m]
  8  LINK_LENGTH_2 = 1.0  # [m]
  9  LINK_MASS_1 = 1.0  #: [kg] mass of link 1
 10  LINK_MASS_2 = 1.0  #: [kg] mass of link 2
 11  LINK_COM_POS_1 = 0.5  #: [m] position of the center of mass of link 1
 12  LINK_COM_POS_2 = 0.5  #: [m] position of the center of mass of link 2
 13  LINK_MOI = 1.0  #: moments of inertia for both links
 14  AVAIL_TORQUE = [-1, 0, 1]
 15  MAX_VEL_1 = 4 * np.pi
 16  MAX_VEL_2 = 9 * np.pi
 17
 18  def __init__(self,
 19               angular_resolution_rad = 0.01,
 20               angular_vel_resolution_rad_per_sec = 0.05,
 21               angle_bins = None,
 22               velocity_bins = None,
 23               precomputed_P = None,
 24               verbose=False):
 25
 26    self.verbose = verbose
 27
 28    if angle_bins is None:
 29      self.angle_1_bins = int(np.pi // angular_resolution_rad) + 1
 30      self.angle_2_bins = int(np.pi // angular_resolution_rad) + 1
 31    else:
 32       self.angle_1_bins = angle_bins
 33       self.angle_2_bins = angle_bins
 34    
 35    if velocity_bins is None:
 36      self.angular_vel_1_bins = int((2 * self.MAX_VEL_1) // angular_vel_resolution_rad_per_sec) + 1
 37      self.angular_vel_2_bins = int((2 * self.MAX_VEL_2) // angular_vel_resolution_rad_per_sec) + 1
 38    else:
 39      self.angular_vel_1_bins = velocity_bins
 40      self.angular_vel_2_bins = velocity_bins
 41
 42    self.n_states = self.angle_1_bins * self.angle_2_bins * self.angular_vel_1_bins * self.angular_vel_2_bins
 43
 44    self.angle_range = (-np.pi, np.pi)
 45    self.velocity_1_range = (-self.MAX_VEL_1, self.MAX_VEL_1)
 46    self.velocity_2_range = (-self.MAX_VEL_2, self.MAX_VEL_2)
 47    self.action_space = len(self.AVAIL_TORQUE) # -1, 0, 1
 48    self.dt = 0.2
 49
 50    if precomputed_P is None:
 51      self.P = {state: {action: [] for action in range(self.action_space)} for state in range(self.n_states)}
 52      self.setup_transition_probabilities()
 53
 54    else:
 55      self.P = precomputed_P
 56
 57    # add transform_obs
 58    self.transform_obs = lambda obs: (
 59        np.ravel_multi_index((
 60            np.clip(np.digitize(obs[0], np.linspace(*self.angle_range, self.angle_1_bins)) - 1, 0,
 61                    self.angle_1_bins - 1),
 62            np.clip(np.digitize(obs[1], np.linspace(*self.angle_range, self.angle_2_bins)) - 1, 0,
 63                    self.angle_2_bins - 1),
 64            np.clip(np.digitize(obs[2], np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)) - 1, 0,
 65                    self.angular_vel_1_bins - 1),
 66            np.clip(np.digitize(obs[3], np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)) - 1, 0,
 67                    self.angular_vel_2_bins - 1),
 68        ), (self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins))
 69      )
 70
 71  def setup_transition_probabilities(self):
 72      """
 73      Sets up the transition probabilities for the environment. This method iterates through all possible
 74      states and actions, simulates the next state, and records the transition probability
 75      (deterministic in this setup), reward, and termination status.
 76      """
 77      percent = 0
 78
 79      for state in range(self.n_states):
 80          angle_1, angle_2, vel_1, vel_2 = self.index_to_state(state)
 81          for action in range(self.action_space):
 82              next_state, reward, done = self.compute_next_state(angle_1, angle_2, vel_1, vel_2, action)
 83              self.P[state][action].append((1, next_state, reward, done))
 84
 85          if state % int(self.n_states / 10) == 0 and state > 0 and self.verbose:
 86            percent += 10
 87            print(f'{percent}% of probabilities calculated!')
 88
 89  def index_to_state(self, index):
 90        """
 91        Converts a single index into a multidimensional state representation.
 92
 93        Parameters:
 94        - index (int): The flat index representing the state.
 95
 96        Returns:
 97        - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
 98        """
 99
100        totals = [self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins]
101        multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1]
102        components = [int((index // multipliers[i]) % totals[i]) for i in range(4)]
103        return components
104
105  # Modified from original implementation of Gym Acrobot
106  def dsdt(self, state):
107    m1 = self.LINK_MASS_1
108    m2 = self.LINK_MASS_2
109    l1 = self.LINK_LENGTH_1
110    lc1 = self.LINK_COM_POS_1
111    lc2 = self.LINK_COM_POS_2
112    I1 = self.LINK_MOI
113    I2 = self.LINK_MOI
114    g = 9.8
115    a = state[-1]
116    s = state[:-1]
117    theta1 = s[0]
118    theta2 = s[1]
119    dtheta1 = s[2]
120    dtheta2 = s[3]
121    d1 = m1 * lc1**2 + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2)) + I1 + I2
122    d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2
123    phi2 = m2 * lc2 * g * cos(theta1 + theta2 - np.pi / 2.0)
124    phi1 = (
125        -m2 * l1 * lc2 * dtheta2**2 * sin(theta2)
126        - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2)
127        + (m1 * lc1 + m2 * l1) * g * cos(theta1 - np.pi / 2)
128        + phi2
129    )
130
131    # the following line is consistent with the java implementation and the
132    # book
133    ddtheta2 = (
134        a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2
135    ) / (m2 * lc2**2 + I2 - d2**2 / d1)
136
137    ddtheta1 = -(d2 * ddtheta2 + phi1) / d1
138    return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0
139
140  # Modified from original implementation of Gym Acrobot
141  def rk4(self, derivs, y0, t):
142
143    """
144    Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta.
145
146    Example for 2D system:
147
148        >>> def derivs(x):
149        ...     d1 =  x[0] + 2*x[1]
150        ...     d2 =  -3*x[0] + 4*x[1]
151        ...     return d1, d2
152
153        >>> dt = 0.0005
154        >>> t = np.arange(0.0, 2.0, dt)
155        >>> y0 = (1,2)
156        >>> yout = rk4(derivs, y0, t)
157
158    Args:
159        derivs: the derivative of the system and has the signature `dy = derivs(yi)`
160        y0: initial state vector
161        t: sample times
162
163    Returns:
164        yout: Runge-Kutta approximation of the ODE
165    """
166
167    try:
168        Ny = len(y0)
169    except TypeError:
170        yout = np.zeros((len(t),), np.float64)
171    else:
172        yout = np.zeros((len(t), Ny), np.float64)
173
174    yout[0] = y0
175
176    for i in np.arange(len(t) - 1):
177        this = t[i]
178        dt = t[i + 1] - this
179        dt2 = dt / 2.0
180        y0 = yout[i]
181
182        k1 = np.asarray(derivs(y0))
183        k2 = np.asarray(derivs(y0 + dt2 * k1))
184        k3 = np.asarray(derivs(y0 + dt2 * k2))
185        k4 = np.asarray(derivs(y0 + dt * k3))
186        yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4)
187    # We only care about the final timestep and we cleave off action value which will be zero
188    return yout[-1][:4]
189  
190  def compute_next_state(self, angle_1_idx, angle_2_idx, vel_1_idx, vel_2_idx, action):
191
192    angle_space = np.linspace(*self.angle_range,self.angle_1_bins)
193    velocity_1_space = np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)
194    velocity_2_space = np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)
195
196    angle_1 = angle_space[angle_1_idx]
197    angle_2 = angle_space[angle_2_idx]
198
199    velocity_1 = velocity_1_space[vel_1_idx]
200    velocity_2 = velocity_2_space[vel_2_idx]
201
202    torque = self.AVAIL_TORQUE[action]
203
204    state = [angle_1, angle_2, velocity_1, velocity_2, torque]
205
206    new_state = self.rk4(self.dsdt, state, [0, self.dt])
207
208    new_state[0] = self.wrap(new_state[0], -np.pi, np.pi)
209    new_state[1] = self.wrap(new_state[1], -np.pi, np.pi)
210    new_state[2] = self.bound(new_state[2], -self.MAX_VEL_1, self.MAX_VEL_1)
211    new_state[3] = self.bound(new_state[3], -self.MAX_VEL_2, self.MAX_VEL_2)
212
213    # index new state
214
215    new_angle_1_idx = np.clip(np.digitize(new_state[0], np.linspace(*self.angle_range, self.angle_1_bins)) - 1, 0,
216                    self.angle_1_bins - 1)
217    new_angle_2_idx = np.clip(np.digitize(new_state[1], np.linspace(*self.angle_range, self.angle_2_bins)) - 1, 0,
218                    self.angle_2_bins - 1)
219    new_vel_1_idx = np.clip(np.digitize(new_state[2], np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)) - 1, 0,
220                    self.angular_vel_1_bins - 1)
221    new_vel_2_idx = np.clip(np.digitize(new_state[3], np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)) - 1, 0,
222                    self.angular_vel_2_bins - 1)
223
224    new_state_idx = np.ravel_multi_index((new_angle_1_idx, new_angle_2_idx, new_vel_1_idx, new_vel_2_idx),
225                                         (self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins))
226
227    # self.state = new_state
228    terminated = bool(-cos(new_state[0]) - cos(new_state[1] + new_state[0]) > 1.0)
229    reward = -1.0 if not terminated else 0.0
230
231    return new_state_idx, reward, terminated
232
233  def wrap(self, x, m, M):
234      """Wraps `x` so m <= x <= M; but unlike `bound()` which
235      truncates, `wrap()` wraps x around the coordinate system defined by m,M.\n
236      For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
237
238      Args:
239          x: a scalar
240          m: minimum possible value in range
241          M: maximum possible value in range
242
243      Returns:
244          x: a scalar, wrapped
245      """
246      diff = M - m
247      while x > M:
248          x = x - diff
249      while x < m:
250          x = x + diff
251      return x
252
253  def bound(self, x, m, M=None):
254      """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR*
255      have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1].
256
257      Args:
258          x: scalar
259          m: The lower bound
260          M: The upper bound
261
262      Returns:
263          x: scalar, bound between min (m) and Max (M)
264      """
265      if M is None:
266          M = m[1]
267          m = m[0]
268      # bound x between min (m) and Max (M)
269      return min(max(x, m), M)
DiscretizedAcrobot( angular_resolution_rad=0.01, angular_vel_resolution_rad_per_sec=0.05, angle_bins=None, velocity_bins=None, precomputed_P=None, verbose=False)
18  def __init__(self,
19               angular_resolution_rad = 0.01,
20               angular_vel_resolution_rad_per_sec = 0.05,
21               angle_bins = None,
22               velocity_bins = None,
23               precomputed_P = None,
24               verbose=False):
25
26    self.verbose = verbose
27
28    if angle_bins is None:
29      self.angle_1_bins = int(np.pi // angular_resolution_rad) + 1
30      self.angle_2_bins = int(np.pi // angular_resolution_rad) + 1
31    else:
32       self.angle_1_bins = angle_bins
33       self.angle_2_bins = angle_bins
34    
35    if velocity_bins is None:
36      self.angular_vel_1_bins = int((2 * self.MAX_VEL_1) // angular_vel_resolution_rad_per_sec) + 1
37      self.angular_vel_2_bins = int((2 * self.MAX_VEL_2) // angular_vel_resolution_rad_per_sec) + 1
38    else:
39      self.angular_vel_1_bins = velocity_bins
40      self.angular_vel_2_bins = velocity_bins
41
42    self.n_states = self.angle_1_bins * self.angle_2_bins * self.angular_vel_1_bins * self.angular_vel_2_bins
43
44    self.angle_range = (-np.pi, np.pi)
45    self.velocity_1_range = (-self.MAX_VEL_1, self.MAX_VEL_1)
46    self.velocity_2_range = (-self.MAX_VEL_2, self.MAX_VEL_2)
47    self.action_space = len(self.AVAIL_TORQUE) # -1, 0, 1
48    self.dt = 0.2
49
50    if precomputed_P is None:
51      self.P = {state: {action: [] for action in range(self.action_space)} for state in range(self.n_states)}
52      self.setup_transition_probabilities()
53
54    else:
55      self.P = precomputed_P
56
57    # add transform_obs
58    self.transform_obs = lambda obs: (
59        np.ravel_multi_index((
60            np.clip(np.digitize(obs[0], np.linspace(*self.angle_range, self.angle_1_bins)) - 1, 0,
61                    self.angle_1_bins - 1),
62            np.clip(np.digitize(obs[1], np.linspace(*self.angle_range, self.angle_2_bins)) - 1, 0,
63                    self.angle_2_bins - 1),
64            np.clip(np.digitize(obs[2], np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)) - 1, 0,
65                    self.angular_vel_1_bins - 1),
66            np.clip(np.digitize(obs[3], np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)) - 1, 0,
67                    self.angular_vel_2_bins - 1),
68        ), (self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins))
69      )
AVAIL_TORQUE = [-1, 0, 1]
MAX_VEL_1 = 12.566370614359172
MAX_VEL_2 = 28.274333882308138
verbose
n_states
angle_range
velocity_1_range
velocity_2_range
action_space
dt
transform_obs
def setup_transition_probabilities(self):
71  def setup_transition_probabilities(self):
72      """
73      Sets up the transition probabilities for the environment. This method iterates through all possible
74      states and actions, simulates the next state, and records the transition probability
75      (deterministic in this setup), reward, and termination status.
76      """
77      percent = 0
78
79      for state in range(self.n_states):
80          angle_1, angle_2, vel_1, vel_2 = self.index_to_state(state)
81          for action in range(self.action_space):
82              next_state, reward, done = self.compute_next_state(angle_1, angle_2, vel_1, vel_2, action)
83              self.P[state][action].append((1, next_state, reward, done))
84
85          if state % int(self.n_states / 10) == 0 and state > 0 and self.verbose:
86            percent += 10
87            print(f'{percent}% of probabilities calculated!')

Sets up the transition probabilities for the environment. This method iterates through all possible states and actions, simulates the next state, and records the transition probability (deterministic in this setup), reward, and termination status.

def index_to_state(self, index):
 89  def index_to_state(self, index):
 90        """
 91        Converts a single index into a multidimensional state representation.
 92
 93        Parameters:
 94        - index (int): The flat index representing the state.
 95
 96        Returns:
 97        - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
 98        """
 99
100        totals = [self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins]
101        multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1]
102        components = [int((index // multipliers[i]) % totals[i]) for i in range(4)]
103        return components

Converts a single index into a multidimensional state representation.

Parameters:

  • index (int): The flat index representing the state.

Returns:

  • list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
def dsdt(self, state):
106  def dsdt(self, state):
107    m1 = self.LINK_MASS_1
108    m2 = self.LINK_MASS_2
109    l1 = self.LINK_LENGTH_1
110    lc1 = self.LINK_COM_POS_1
111    lc2 = self.LINK_COM_POS_2
112    I1 = self.LINK_MOI
113    I2 = self.LINK_MOI
114    g = 9.8
115    a = state[-1]
116    s = state[:-1]
117    theta1 = s[0]
118    theta2 = s[1]
119    dtheta1 = s[2]
120    dtheta2 = s[3]
121    d1 = m1 * lc1**2 + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2)) + I1 + I2
122    d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2
123    phi2 = m2 * lc2 * g * cos(theta1 + theta2 - np.pi / 2.0)
124    phi1 = (
125        -m2 * l1 * lc2 * dtheta2**2 * sin(theta2)
126        - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2)
127        + (m1 * lc1 + m2 * l1) * g * cos(theta1 - np.pi / 2)
128        + phi2
129    )
130
131    # the following line is consistent with the java implementation and the
132    # book
133    ddtheta2 = (
134        a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2
135    ) / (m2 * lc2**2 + I2 - d2**2 / d1)
136
137    ddtheta1 = -(d2 * ddtheta2 + phi1) / d1
138    return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0
def rk4(self, derivs, y0, t):
141  def rk4(self, derivs, y0, t):
142
143    """
144    Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta.
145
146    Example for 2D system:
147
148        >>> def derivs(x):
149        ...     d1 =  x[0] + 2*x[1]
150        ...     d2 =  -3*x[0] + 4*x[1]
151        ...     return d1, d2
152
153        >>> dt = 0.0005
154        >>> t = np.arange(0.0, 2.0, dt)
155        >>> y0 = (1,2)
156        >>> yout = rk4(derivs, y0, t)
157
158    Args:
159        derivs: the derivative of the system and has the signature `dy = derivs(yi)`
160        y0: initial state vector
161        t: sample times
162
163    Returns:
164        yout: Runge-Kutta approximation of the ODE
165    """
166
167    try:
168        Ny = len(y0)
169    except TypeError:
170        yout = np.zeros((len(t),), np.float64)
171    else:
172        yout = np.zeros((len(t), Ny), np.float64)
173
174    yout[0] = y0
175
176    for i in np.arange(len(t) - 1):
177        this = t[i]
178        dt = t[i + 1] - this
179        dt2 = dt / 2.0
180        y0 = yout[i]
181
182        k1 = np.asarray(derivs(y0))
183        k2 = np.asarray(derivs(y0 + dt2 * k1))
184        k3 = np.asarray(derivs(y0 + dt2 * k2))
185        k4 = np.asarray(derivs(y0 + dt * k3))
186        yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4)
187    # We only care about the final timestep and we cleave off action value which will be zero
188    return yout[-1][:4]

Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta.

Example for 2D system:

>>> def derivs(x):
...     d1 =  x[0] + 2*x[1]
...     d2 =  -3*x[0] + 4*x[1]
...     return d1, d2

>>> dt = 0.0005
>>> t = np.arange(0.0, 2.0, dt)
>>> y0 = (1,2)
>>> yout = rk4(derivs, y0, t)

Args: derivs: the derivative of the system and has the signature dy = derivs(yi) y0: initial state vector t: sample times

Returns: yout: Runge-Kutta approximation of the ODE

def compute_next_state(self, angle_1_idx, angle_2_idx, vel_1_idx, vel_2_idx, action):
190  def compute_next_state(self, angle_1_idx, angle_2_idx, vel_1_idx, vel_2_idx, action):
191
192    angle_space = np.linspace(*self.angle_range,self.angle_1_bins)
193    velocity_1_space = np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)
194    velocity_2_space = np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)
195
196    angle_1 = angle_space[angle_1_idx]
197    angle_2 = angle_space[angle_2_idx]
198
199    velocity_1 = velocity_1_space[vel_1_idx]
200    velocity_2 = velocity_2_space[vel_2_idx]
201
202    torque = self.AVAIL_TORQUE[action]
203
204    state = [angle_1, angle_2, velocity_1, velocity_2, torque]
205
206    new_state = self.rk4(self.dsdt, state, [0, self.dt])
207
208    new_state[0] = self.wrap(new_state[0], -np.pi, np.pi)
209    new_state[1] = self.wrap(new_state[1], -np.pi, np.pi)
210    new_state[2] = self.bound(new_state[2], -self.MAX_VEL_1, self.MAX_VEL_1)
211    new_state[3] = self.bound(new_state[3], -self.MAX_VEL_2, self.MAX_VEL_2)
212
213    # index new state
214
215    new_angle_1_idx = np.clip(np.digitize(new_state[0], np.linspace(*self.angle_range, self.angle_1_bins)) - 1, 0,
216                    self.angle_1_bins - 1)
217    new_angle_2_idx = np.clip(np.digitize(new_state[1], np.linspace(*self.angle_range, self.angle_2_bins)) - 1, 0,
218                    self.angle_2_bins - 1)
219    new_vel_1_idx = np.clip(np.digitize(new_state[2], np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)) - 1, 0,
220                    self.angular_vel_1_bins - 1)
221    new_vel_2_idx = np.clip(np.digitize(new_state[3], np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)) - 1, 0,
222                    self.angular_vel_2_bins - 1)
223
224    new_state_idx = np.ravel_multi_index((new_angle_1_idx, new_angle_2_idx, new_vel_1_idx, new_vel_2_idx),
225                                         (self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins))
226
227    # self.state = new_state
228    terminated = bool(-cos(new_state[0]) - cos(new_state[1] + new_state[0]) > 1.0)
229    reward = -1.0 if not terminated else 0.0
230
231    return new_state_idx, reward, terminated
def wrap(self, x, m, M):
233  def wrap(self, x, m, M):
234      """Wraps `x` so m <= x <= M; but unlike `bound()` which
235      truncates, `wrap()` wraps x around the coordinate system defined by m,M.\n
236      For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
237
238      Args:
239          x: a scalar
240          m: minimum possible value in range
241          M: maximum possible value in range
242
243      Returns:
244          x: a scalar, wrapped
245      """
246      diff = M - m
247      while x > M:
248          x = x - diff
249      while x < m:
250          x = x + diff
251      return x

Wraps x so m <= x <= M; but unlike bound() which truncates, wrap() wraps x around the coordinate system defined by m,M.

For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.

Args: x: a scalar m: minimum possible value in range M: maximum possible value in range

Returns: x: a scalar, wrapped

def bound(self, x, m, M=None):
253  def bound(self, x, m, M=None):
254      """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR*
255      have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1].
256
257      Args:
258          x: scalar
259          m: The lower bound
260          M: The upper bound
261
262      Returns:
263          x: scalar, bound between min (m) and Max (M)
264      """
265      if M is None:
266          M = m[1]
267          m = m[0]
268      # bound x between min (m) and Max (M)
269      return min(max(x, m), M)

Either have m as scalar, so bound(x,m,M) which returns m <= x <= M OR have m as length 2 vector, bound(x,m, ) returns m[0] <= x <= m[1].

Args: x: scalar m: The lower bound M: The upper bound

Returns: x: scalar, bound between min (m) and Max (M)