bettermdptools.envs.acrobot_model
1from math import cos, sin 2 3import numpy as np 4 5 6class DiscretizedAcrobot: 7 LINK_LENGTH_1 = 1.0 # [m] 8 LINK_LENGTH_2 = 1.0 # [m] 9 LINK_MASS_1 = 1.0 #: [kg] mass of link 1 10 LINK_MASS_2 = 1.0 #: [kg] mass of link 2 11 LINK_COM_POS_1 = 0.5 #: [m] position of the center of mass of link 1 12 LINK_COM_POS_2 = 0.5 #: [m] position of the center of mass of link 2 13 LINK_MOI = 1.0 #: moments of inertia for both links 14 AVAIL_TORQUE = [-1, 0, 1] 15 MAX_VEL_1 = 4 * np.pi 16 MAX_VEL_2 = 9 * np.pi 17 18 def __init__( 19 self, 20 angular_resolution_rad=0.01, 21 angular_vel_resolution_rad_per_sec=0.05, 22 angle_bins=None, 23 velocity_bins=None, 24 precomputed_P=None, 25 verbose=False, 26 ): 27 self.verbose = verbose 28 29 if angle_bins is None: 30 self.angle_1_bins = int(np.pi // angular_resolution_rad) + 1 31 self.angle_2_bins = int(np.pi // angular_resolution_rad) + 1 32 else: 33 self.angle_1_bins = angle_bins 34 self.angle_2_bins = angle_bins 35 36 if velocity_bins is None: 37 self.angular_vel_1_bins = ( 38 int((2 * self.MAX_VEL_1) // angular_vel_resolution_rad_per_sec) + 1 39 ) 40 self.angular_vel_2_bins = ( 41 int((2 * self.MAX_VEL_2) // angular_vel_resolution_rad_per_sec) + 1 42 ) 43 else: 44 self.angular_vel_1_bins = velocity_bins 45 self.angular_vel_2_bins = velocity_bins 46 47 self.n_states = ( 48 self.angle_1_bins 49 * self.angle_2_bins 50 * self.angular_vel_1_bins 51 * self.angular_vel_2_bins 52 ) 53 54 self.angle_range = (-np.pi, np.pi) 55 self.velocity_1_range = (-self.MAX_VEL_1, self.MAX_VEL_1) 56 self.velocity_2_range = (-self.MAX_VEL_2, self.MAX_VEL_2) 57 self.action_space = len(self.AVAIL_TORQUE) # -1, 0, 1 58 self.dt = 0.2 59 60 if precomputed_P is None: 61 self.P = { 62 state: {action: [] for action in range(self.action_space)} 63 for state in range(self.n_states) 64 } 65 self.setup_transition_probabilities() 66 67 else: 68 self.P = precomputed_P 69 70 # add transform_obs 71 self.transform_obs = lambda obs: ( 72 np.ravel_multi_index( 73 ( 74 np.clip( 75 np.digitize( 76 obs[0], np.linspace(*self.angle_range, self.angle_1_bins) 77 ) 78 - 1, 79 0, 80 self.angle_1_bins - 1, 81 ), 82 np.clip( 83 np.digitize( 84 obs[1], np.linspace(*self.angle_range, self.angle_2_bins) 85 ) 86 - 1, 87 0, 88 self.angle_2_bins - 1, 89 ), 90 np.clip( 91 np.digitize( 92 obs[2], 93 np.linspace( 94 *self.velocity_1_range, self.angular_vel_1_bins 95 ), 96 ) 97 - 1, 98 0, 99 self.angular_vel_1_bins - 1, 100 ), 101 np.clip( 102 np.digitize( 103 obs[3], 104 np.linspace( 105 *self.velocity_2_range, self.angular_vel_2_bins 106 ), 107 ) 108 - 1, 109 0, 110 self.angular_vel_2_bins - 1, 111 ), 112 ), 113 ( 114 self.angle_1_bins, 115 self.angle_2_bins, 116 self.angular_vel_1_bins, 117 self.angular_vel_2_bins, 118 ), 119 ) 120 ) 121 122 def setup_transition_probabilities(self): 123 """ 124 Sets up the transition probabilities for the environment. This method iterates through all possible 125 states and actions, simulates the next state, and records the transition probability 126 (deterministic in this setup), reward, and termination status. 127 """ 128 percent = 0 129 130 for state in range(self.n_states): 131 angle_1, angle_2, vel_1, vel_2 = self.index_to_state(state) 132 for action in range(self.action_space): 133 next_state, reward, done = self.compute_next_state( 134 angle_1, angle_2, vel_1, vel_2, action 135 ) 136 self.P[state][action].append((1, next_state, reward, done)) 137 138 if state % int(self.n_states / 10) == 0 and state > 0 and self.verbose: 139 percent += 10 140 print(f"{percent}% of probabilities calculated!") 141 142 def index_to_state(self, index): 143 """ 144 Converts a single index into a multidimensional state representation. 145 146 Parameters: 147 - index (int): The flat index representing the state. 148 149 Returns: 150 - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins. 151 """ 152 153 totals = [ 154 self.angle_1_bins, 155 self.angle_2_bins, 156 self.angular_vel_1_bins, 157 self.angular_vel_2_bins, 158 ] 159 multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1] 160 components = [int((index // multipliers[i]) % totals[i]) for i in range(4)] 161 return components 162 163 # Modified from original implementation of Gym Acrobot 164 def dsdt(self, state): 165 m1 = self.LINK_MASS_1 166 m2 = self.LINK_MASS_2 167 l1 = self.LINK_LENGTH_1 168 lc1 = self.LINK_COM_POS_1 169 lc2 = self.LINK_COM_POS_2 170 I1 = self.LINK_MOI 171 I2 = self.LINK_MOI 172 g = 9.8 173 a = state[-1] 174 s = state[:-1] 175 theta1 = s[0] 176 theta2 = s[1] 177 dtheta1 = s[2] 178 dtheta2 = s[3] 179 d1 = m1 * lc1**2 + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2)) + I1 + I2 180 d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2 181 phi2 = m2 * lc2 * g * cos(theta1 + theta2 - np.pi / 2.0) 182 phi1 = ( 183 -m2 * l1 * lc2 * dtheta2**2 * sin(theta2) 184 - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2) 185 + (m1 * lc1 + m2 * l1) * g * cos(theta1 - np.pi / 2) 186 + phi2 187 ) 188 189 # the following line is consistent with the java implementation and the 190 # book 191 ddtheta2 = ( 192 a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2 193 ) / (m2 * lc2**2 + I2 - d2**2 / d1) 194 195 ddtheta1 = -(d2 * ddtheta2 + phi1) / d1 196 return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0 197 198 # Modified from original implementation of Gym Acrobot 199 def rk4(self, derivs, y0, t): 200 """ 201 Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta. 202 203 Example for 2D system: 204 205 >>> def derivs(x): 206 ... d1 = x[0] + 2*x[1] 207 ... d2 = -3*x[0] + 4*x[1] 208 ... return d1, d2 209 210 >>> dt = 0.0005 211 >>> t = np.arange(0.0, 2.0, dt) 212 >>> y0 = (1,2) 213 >>> yout = rk4(derivs, y0, t) 214 215 Args: 216 derivs: the derivative of the system and has the signature `dy = derivs(yi)` 217 y0: initial state vector 218 t: sample times 219 220 Returns: 221 yout: Runge-Kutta approximation of the ODE 222 """ 223 224 try: 225 Ny = len(y0) 226 except TypeError: 227 yout = np.zeros((len(t),), np.float64) 228 else: 229 yout = np.zeros((len(t), Ny), np.float64) 230 231 yout[0] = y0 232 233 for i in np.arange(len(t) - 1): 234 this = t[i] 235 dt = t[i + 1] - this 236 dt2 = dt / 2.0 237 y0 = yout[i] 238 239 k1 = np.asarray(derivs(y0)) 240 k2 = np.asarray(derivs(y0 + dt2 * k1)) 241 k3 = np.asarray(derivs(y0 + dt2 * k2)) 242 k4 = np.asarray(derivs(y0 + dt * k3)) 243 yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4) 244 # We only care about the final timestep and we cleave off action value which will be zero 245 return yout[-1][:4] 246 247 def compute_next_state( 248 self, angle_1_idx, angle_2_idx, vel_1_idx, vel_2_idx, action 249 ): 250 angle_space = np.linspace(*self.angle_range, self.angle_1_bins) 251 velocity_1_space = np.linspace(*self.velocity_1_range, self.angular_vel_1_bins) 252 velocity_2_space = np.linspace(*self.velocity_2_range, self.angular_vel_2_bins) 253 254 angle_1 = angle_space[angle_1_idx] 255 angle_2 = angle_space[angle_2_idx] 256 257 velocity_1 = velocity_1_space[vel_1_idx] 258 velocity_2 = velocity_2_space[vel_2_idx] 259 260 torque = self.AVAIL_TORQUE[action] 261 262 state = [angle_1, angle_2, velocity_1, velocity_2, torque] 263 264 new_state = self.rk4(self.dsdt, state, [0, self.dt]) 265 266 new_state[0] = self.wrap(new_state[0], -np.pi, np.pi) 267 new_state[1] = self.wrap(new_state[1], -np.pi, np.pi) 268 new_state[2] = self.bound(new_state[2], -self.MAX_VEL_1, self.MAX_VEL_1) 269 new_state[3] = self.bound(new_state[3], -self.MAX_VEL_2, self.MAX_VEL_2) 270 271 # index new state 272 273 new_angle_1_idx = np.clip( 274 np.digitize(new_state[0], np.linspace(*self.angle_range, self.angle_1_bins)) 275 - 1, 276 0, 277 self.angle_1_bins - 1, 278 ) 279 new_angle_2_idx = np.clip( 280 np.digitize(new_state[1], np.linspace(*self.angle_range, self.angle_2_bins)) 281 - 1, 282 0, 283 self.angle_2_bins - 1, 284 ) 285 new_vel_1_idx = np.clip( 286 np.digitize( 287 new_state[2], 288 np.linspace(*self.velocity_1_range, self.angular_vel_1_bins), 289 ) 290 - 1, 291 0, 292 self.angular_vel_1_bins - 1, 293 ) 294 new_vel_2_idx = np.clip( 295 np.digitize( 296 new_state[3], 297 np.linspace(*self.velocity_2_range, self.angular_vel_2_bins), 298 ) 299 - 1, 300 0, 301 self.angular_vel_2_bins - 1, 302 ) 303 304 new_state_idx = np.ravel_multi_index( 305 (new_angle_1_idx, new_angle_2_idx, new_vel_1_idx, new_vel_2_idx), 306 ( 307 self.angle_1_bins, 308 self.angle_2_bins, 309 self.angular_vel_1_bins, 310 self.angular_vel_2_bins, 311 ), 312 ) 313 314 # self.state = new_state 315 terminated = bool(-cos(new_state[0]) - cos(new_state[1] + new_state[0]) > 1.0) 316 reward = -1.0 if not terminated else 0.0 317 318 return new_state_idx, reward, terminated 319 320 def wrap(self, x, m, M): 321 """Wraps `x` so m <= x <= M; but unlike `bound()` which 322 truncates, `wrap()` wraps x around the coordinate system defined by m,M.\n 323 For example, m = -180, M = 180 (degrees), x = 360 --> returns 0. 324 325 Args: 326 x: a scalar 327 m: minimum possible value in range 328 M: maximum possible value in range 329 330 Returns: 331 x: a scalar, wrapped 332 """ 333 diff = M - m 334 while x > M: 335 x = x - diff 336 while x < m: 337 x = x + diff 338 return x 339 340 def bound(self, x, m, M=None): 341 """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR* 342 have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1]. 343 344 Args: 345 x: scalar 346 m: The lower bound 347 M: The upper bound 348 349 Returns: 350 x: scalar, bound between min (m) and Max (M) 351 """ 352 if M is None: 353 M = m[1] 354 m = m[0] 355 # bound x between min (m) and Max (M) 356 return min(max(x, m), M)
7class DiscretizedAcrobot: 8 LINK_LENGTH_1 = 1.0 # [m] 9 LINK_LENGTH_2 = 1.0 # [m] 10 LINK_MASS_1 = 1.0 #: [kg] mass of link 1 11 LINK_MASS_2 = 1.0 #: [kg] mass of link 2 12 LINK_COM_POS_1 = 0.5 #: [m] position of the center of mass of link 1 13 LINK_COM_POS_2 = 0.5 #: [m] position of the center of mass of link 2 14 LINK_MOI = 1.0 #: moments of inertia for both links 15 AVAIL_TORQUE = [-1, 0, 1] 16 MAX_VEL_1 = 4 * np.pi 17 MAX_VEL_2 = 9 * np.pi 18 19 def __init__( 20 self, 21 angular_resolution_rad=0.01, 22 angular_vel_resolution_rad_per_sec=0.05, 23 angle_bins=None, 24 velocity_bins=None, 25 precomputed_P=None, 26 verbose=False, 27 ): 28 self.verbose = verbose 29 30 if angle_bins is None: 31 self.angle_1_bins = int(np.pi // angular_resolution_rad) + 1 32 self.angle_2_bins = int(np.pi // angular_resolution_rad) + 1 33 else: 34 self.angle_1_bins = angle_bins 35 self.angle_2_bins = angle_bins 36 37 if velocity_bins is None: 38 self.angular_vel_1_bins = ( 39 int((2 * self.MAX_VEL_1) // angular_vel_resolution_rad_per_sec) + 1 40 ) 41 self.angular_vel_2_bins = ( 42 int((2 * self.MAX_VEL_2) // angular_vel_resolution_rad_per_sec) + 1 43 ) 44 else: 45 self.angular_vel_1_bins = velocity_bins 46 self.angular_vel_2_bins = velocity_bins 47 48 self.n_states = ( 49 self.angle_1_bins 50 * self.angle_2_bins 51 * self.angular_vel_1_bins 52 * self.angular_vel_2_bins 53 ) 54 55 self.angle_range = (-np.pi, np.pi) 56 self.velocity_1_range = (-self.MAX_VEL_1, self.MAX_VEL_1) 57 self.velocity_2_range = (-self.MAX_VEL_2, self.MAX_VEL_2) 58 self.action_space = len(self.AVAIL_TORQUE) # -1, 0, 1 59 self.dt = 0.2 60 61 if precomputed_P is None: 62 self.P = { 63 state: {action: [] for action in range(self.action_space)} 64 for state in range(self.n_states) 65 } 66 self.setup_transition_probabilities() 67 68 else: 69 self.P = precomputed_P 70 71 # add transform_obs 72 self.transform_obs = lambda obs: ( 73 np.ravel_multi_index( 74 ( 75 np.clip( 76 np.digitize( 77 obs[0], np.linspace(*self.angle_range, self.angle_1_bins) 78 ) 79 - 1, 80 0, 81 self.angle_1_bins - 1, 82 ), 83 np.clip( 84 np.digitize( 85 obs[1], np.linspace(*self.angle_range, self.angle_2_bins) 86 ) 87 - 1, 88 0, 89 self.angle_2_bins - 1, 90 ), 91 np.clip( 92 np.digitize( 93 obs[2], 94 np.linspace( 95 *self.velocity_1_range, self.angular_vel_1_bins 96 ), 97 ) 98 - 1, 99 0, 100 self.angular_vel_1_bins - 1, 101 ), 102 np.clip( 103 np.digitize( 104 obs[3], 105 np.linspace( 106 *self.velocity_2_range, self.angular_vel_2_bins 107 ), 108 ) 109 - 1, 110 0, 111 self.angular_vel_2_bins - 1, 112 ), 113 ), 114 ( 115 self.angle_1_bins, 116 self.angle_2_bins, 117 self.angular_vel_1_bins, 118 self.angular_vel_2_bins, 119 ), 120 ) 121 ) 122 123 def setup_transition_probabilities(self): 124 """ 125 Sets up the transition probabilities for the environment. This method iterates through all possible 126 states and actions, simulates the next state, and records the transition probability 127 (deterministic in this setup), reward, and termination status. 128 """ 129 percent = 0 130 131 for state in range(self.n_states): 132 angle_1, angle_2, vel_1, vel_2 = self.index_to_state(state) 133 for action in range(self.action_space): 134 next_state, reward, done = self.compute_next_state( 135 angle_1, angle_2, vel_1, vel_2, action 136 ) 137 self.P[state][action].append((1, next_state, reward, done)) 138 139 if state % int(self.n_states / 10) == 0 and state > 0 and self.verbose: 140 percent += 10 141 print(f"{percent}% of probabilities calculated!") 142 143 def index_to_state(self, index): 144 """ 145 Converts a single index into a multidimensional state representation. 146 147 Parameters: 148 - index (int): The flat index representing the state. 149 150 Returns: 151 - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins. 152 """ 153 154 totals = [ 155 self.angle_1_bins, 156 self.angle_2_bins, 157 self.angular_vel_1_bins, 158 self.angular_vel_2_bins, 159 ] 160 multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1] 161 components = [int((index // multipliers[i]) % totals[i]) for i in range(4)] 162 return components 163 164 # Modified from original implementation of Gym Acrobot 165 def dsdt(self, state): 166 m1 = self.LINK_MASS_1 167 m2 = self.LINK_MASS_2 168 l1 = self.LINK_LENGTH_1 169 lc1 = self.LINK_COM_POS_1 170 lc2 = self.LINK_COM_POS_2 171 I1 = self.LINK_MOI 172 I2 = self.LINK_MOI 173 g = 9.8 174 a = state[-1] 175 s = state[:-1] 176 theta1 = s[0] 177 theta2 = s[1] 178 dtheta1 = s[2] 179 dtheta2 = s[3] 180 d1 = m1 * lc1**2 + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2)) + I1 + I2 181 d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2 182 phi2 = m2 * lc2 * g * cos(theta1 + theta2 - np.pi / 2.0) 183 phi1 = ( 184 -m2 * l1 * lc2 * dtheta2**2 * sin(theta2) 185 - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2) 186 + (m1 * lc1 + m2 * l1) * g * cos(theta1 - np.pi / 2) 187 + phi2 188 ) 189 190 # the following line is consistent with the java implementation and the 191 # book 192 ddtheta2 = ( 193 a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2 194 ) / (m2 * lc2**2 + I2 - d2**2 / d1) 195 196 ddtheta1 = -(d2 * ddtheta2 + phi1) / d1 197 return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0 198 199 # Modified from original implementation of Gym Acrobot 200 def rk4(self, derivs, y0, t): 201 """ 202 Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta. 203 204 Example for 2D system: 205 206 >>> def derivs(x): 207 ... d1 = x[0] + 2*x[1] 208 ... d2 = -3*x[0] + 4*x[1] 209 ... return d1, d2 210 211 >>> dt = 0.0005 212 >>> t = np.arange(0.0, 2.0, dt) 213 >>> y0 = (1,2) 214 >>> yout = rk4(derivs, y0, t) 215 216 Args: 217 derivs: the derivative of the system and has the signature `dy = derivs(yi)` 218 y0: initial state vector 219 t: sample times 220 221 Returns: 222 yout: Runge-Kutta approximation of the ODE 223 """ 224 225 try: 226 Ny = len(y0) 227 except TypeError: 228 yout = np.zeros((len(t),), np.float64) 229 else: 230 yout = np.zeros((len(t), Ny), np.float64) 231 232 yout[0] = y0 233 234 for i in np.arange(len(t) - 1): 235 this = t[i] 236 dt = t[i + 1] - this 237 dt2 = dt / 2.0 238 y0 = yout[i] 239 240 k1 = np.asarray(derivs(y0)) 241 k2 = np.asarray(derivs(y0 + dt2 * k1)) 242 k3 = np.asarray(derivs(y0 + dt2 * k2)) 243 k4 = np.asarray(derivs(y0 + dt * k3)) 244 yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4) 245 # We only care about the final timestep and we cleave off action value which will be zero 246 return yout[-1][:4] 247 248 def compute_next_state( 249 self, angle_1_idx, angle_2_idx, vel_1_idx, vel_2_idx, action 250 ): 251 angle_space = np.linspace(*self.angle_range, self.angle_1_bins) 252 velocity_1_space = np.linspace(*self.velocity_1_range, self.angular_vel_1_bins) 253 velocity_2_space = np.linspace(*self.velocity_2_range, self.angular_vel_2_bins) 254 255 angle_1 = angle_space[angle_1_idx] 256 angle_2 = angle_space[angle_2_idx] 257 258 velocity_1 = velocity_1_space[vel_1_idx] 259 velocity_2 = velocity_2_space[vel_2_idx] 260 261 torque = self.AVAIL_TORQUE[action] 262 263 state = [angle_1, angle_2, velocity_1, velocity_2, torque] 264 265 new_state = self.rk4(self.dsdt, state, [0, self.dt]) 266 267 new_state[0] = self.wrap(new_state[0], -np.pi, np.pi) 268 new_state[1] = self.wrap(new_state[1], -np.pi, np.pi) 269 new_state[2] = self.bound(new_state[2], -self.MAX_VEL_1, self.MAX_VEL_1) 270 new_state[3] = self.bound(new_state[3], -self.MAX_VEL_2, self.MAX_VEL_2) 271 272 # index new state 273 274 new_angle_1_idx = np.clip( 275 np.digitize(new_state[0], np.linspace(*self.angle_range, self.angle_1_bins)) 276 - 1, 277 0, 278 self.angle_1_bins - 1, 279 ) 280 new_angle_2_idx = np.clip( 281 np.digitize(new_state[1], np.linspace(*self.angle_range, self.angle_2_bins)) 282 - 1, 283 0, 284 self.angle_2_bins - 1, 285 ) 286 new_vel_1_idx = np.clip( 287 np.digitize( 288 new_state[2], 289 np.linspace(*self.velocity_1_range, self.angular_vel_1_bins), 290 ) 291 - 1, 292 0, 293 self.angular_vel_1_bins - 1, 294 ) 295 new_vel_2_idx = np.clip( 296 np.digitize( 297 new_state[3], 298 np.linspace(*self.velocity_2_range, self.angular_vel_2_bins), 299 ) 300 - 1, 301 0, 302 self.angular_vel_2_bins - 1, 303 ) 304 305 new_state_idx = np.ravel_multi_index( 306 (new_angle_1_idx, new_angle_2_idx, new_vel_1_idx, new_vel_2_idx), 307 ( 308 self.angle_1_bins, 309 self.angle_2_bins, 310 self.angular_vel_1_bins, 311 self.angular_vel_2_bins, 312 ), 313 ) 314 315 # self.state = new_state 316 terminated = bool(-cos(new_state[0]) - cos(new_state[1] + new_state[0]) > 1.0) 317 reward = -1.0 if not terminated else 0.0 318 319 return new_state_idx, reward, terminated 320 321 def wrap(self, x, m, M): 322 """Wraps `x` so m <= x <= M; but unlike `bound()` which 323 truncates, `wrap()` wraps x around the coordinate system defined by m,M.\n 324 For example, m = -180, M = 180 (degrees), x = 360 --> returns 0. 325 326 Args: 327 x: a scalar 328 m: minimum possible value in range 329 M: maximum possible value in range 330 331 Returns: 332 x: a scalar, wrapped 333 """ 334 diff = M - m 335 while x > M: 336 x = x - diff 337 while x < m: 338 x = x + diff 339 return x 340 341 def bound(self, x, m, M=None): 342 """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR* 343 have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1]. 344 345 Args: 346 x: scalar 347 m: The lower bound 348 M: The upper bound 349 350 Returns: 351 x: scalar, bound between min (m) and Max (M) 352 """ 353 if M is None: 354 M = m[1] 355 m = m[0] 356 # bound x between min (m) and Max (M) 357 return min(max(x, m), M)
19 def __init__( 20 self, 21 angular_resolution_rad=0.01, 22 angular_vel_resolution_rad_per_sec=0.05, 23 angle_bins=None, 24 velocity_bins=None, 25 precomputed_P=None, 26 verbose=False, 27 ): 28 self.verbose = verbose 29 30 if angle_bins is None: 31 self.angle_1_bins = int(np.pi // angular_resolution_rad) + 1 32 self.angle_2_bins = int(np.pi // angular_resolution_rad) + 1 33 else: 34 self.angle_1_bins = angle_bins 35 self.angle_2_bins = angle_bins 36 37 if velocity_bins is None: 38 self.angular_vel_1_bins = ( 39 int((2 * self.MAX_VEL_1) // angular_vel_resolution_rad_per_sec) + 1 40 ) 41 self.angular_vel_2_bins = ( 42 int((2 * self.MAX_VEL_2) // angular_vel_resolution_rad_per_sec) + 1 43 ) 44 else: 45 self.angular_vel_1_bins = velocity_bins 46 self.angular_vel_2_bins = velocity_bins 47 48 self.n_states = ( 49 self.angle_1_bins 50 * self.angle_2_bins 51 * self.angular_vel_1_bins 52 * self.angular_vel_2_bins 53 ) 54 55 self.angle_range = (-np.pi, np.pi) 56 self.velocity_1_range = (-self.MAX_VEL_1, self.MAX_VEL_1) 57 self.velocity_2_range = (-self.MAX_VEL_2, self.MAX_VEL_2) 58 self.action_space = len(self.AVAIL_TORQUE) # -1, 0, 1 59 self.dt = 0.2 60 61 if precomputed_P is None: 62 self.P = { 63 state: {action: [] for action in range(self.action_space)} 64 for state in range(self.n_states) 65 } 66 self.setup_transition_probabilities() 67 68 else: 69 self.P = precomputed_P 70 71 # add transform_obs 72 self.transform_obs = lambda obs: ( 73 np.ravel_multi_index( 74 ( 75 np.clip( 76 np.digitize( 77 obs[0], np.linspace(*self.angle_range, self.angle_1_bins) 78 ) 79 - 1, 80 0, 81 self.angle_1_bins - 1, 82 ), 83 np.clip( 84 np.digitize( 85 obs[1], np.linspace(*self.angle_range, self.angle_2_bins) 86 ) 87 - 1, 88 0, 89 self.angle_2_bins - 1, 90 ), 91 np.clip( 92 np.digitize( 93 obs[2], 94 np.linspace( 95 *self.velocity_1_range, self.angular_vel_1_bins 96 ), 97 ) 98 - 1, 99 0, 100 self.angular_vel_1_bins - 1, 101 ), 102 np.clip( 103 np.digitize( 104 obs[3], 105 np.linspace( 106 *self.velocity_2_range, self.angular_vel_2_bins 107 ), 108 ) 109 - 1, 110 0, 111 self.angular_vel_2_bins - 1, 112 ), 113 ), 114 ( 115 self.angle_1_bins, 116 self.angle_2_bins, 117 self.angular_vel_1_bins, 118 self.angular_vel_2_bins, 119 ), 120 ) 121 )
123 def setup_transition_probabilities(self): 124 """ 125 Sets up the transition probabilities for the environment. This method iterates through all possible 126 states and actions, simulates the next state, and records the transition probability 127 (deterministic in this setup), reward, and termination status. 128 """ 129 percent = 0 130 131 for state in range(self.n_states): 132 angle_1, angle_2, vel_1, vel_2 = self.index_to_state(state) 133 for action in range(self.action_space): 134 next_state, reward, done = self.compute_next_state( 135 angle_1, angle_2, vel_1, vel_2, action 136 ) 137 self.P[state][action].append((1, next_state, reward, done)) 138 139 if state % int(self.n_states / 10) == 0 and state > 0 and self.verbose: 140 percent += 10 141 print(f"{percent}% of probabilities calculated!")
Sets up the transition probabilities for the environment. This method iterates through all possible states and actions, simulates the next state, and records the transition probability (deterministic in this setup), reward, and termination status.
143 def index_to_state(self, index): 144 """ 145 Converts a single index into a multidimensional state representation. 146 147 Parameters: 148 - index (int): The flat index representing the state. 149 150 Returns: 151 - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins. 152 """ 153 154 totals = [ 155 self.angle_1_bins, 156 self.angle_2_bins, 157 self.angular_vel_1_bins, 158 self.angular_vel_2_bins, 159 ] 160 multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1] 161 components = [int((index // multipliers[i]) % totals[i]) for i in range(4)] 162 return components
Converts a single index into a multidimensional state representation.
Parameters:
- index (int): The flat index representing the state.
Returns:
- list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
165 def dsdt(self, state): 166 m1 = self.LINK_MASS_1 167 m2 = self.LINK_MASS_2 168 l1 = self.LINK_LENGTH_1 169 lc1 = self.LINK_COM_POS_1 170 lc2 = self.LINK_COM_POS_2 171 I1 = self.LINK_MOI 172 I2 = self.LINK_MOI 173 g = 9.8 174 a = state[-1] 175 s = state[:-1] 176 theta1 = s[0] 177 theta2 = s[1] 178 dtheta1 = s[2] 179 dtheta2 = s[3] 180 d1 = m1 * lc1**2 + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2)) + I1 + I2 181 d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2 182 phi2 = m2 * lc2 * g * cos(theta1 + theta2 - np.pi / 2.0) 183 phi1 = ( 184 -m2 * l1 * lc2 * dtheta2**2 * sin(theta2) 185 - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2) 186 + (m1 * lc1 + m2 * l1) * g * cos(theta1 - np.pi / 2) 187 + phi2 188 ) 189 190 # the following line is consistent with the java implementation and the 191 # book 192 ddtheta2 = ( 193 a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2 194 ) / (m2 * lc2**2 + I2 - d2**2 / d1) 195 196 ddtheta1 = -(d2 * ddtheta2 + phi1) / d1 197 return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0
200 def rk4(self, derivs, y0, t): 201 """ 202 Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta. 203 204 Example for 2D system: 205 206 >>> def derivs(x): 207 ... d1 = x[0] + 2*x[1] 208 ... d2 = -3*x[0] + 4*x[1] 209 ... return d1, d2 210 211 >>> dt = 0.0005 212 >>> t = np.arange(0.0, 2.0, dt) 213 >>> y0 = (1,2) 214 >>> yout = rk4(derivs, y0, t) 215 216 Args: 217 derivs: the derivative of the system and has the signature `dy = derivs(yi)` 218 y0: initial state vector 219 t: sample times 220 221 Returns: 222 yout: Runge-Kutta approximation of the ODE 223 """ 224 225 try: 226 Ny = len(y0) 227 except TypeError: 228 yout = np.zeros((len(t),), np.float64) 229 else: 230 yout = np.zeros((len(t), Ny), np.float64) 231 232 yout[0] = y0 233 234 for i in np.arange(len(t) - 1): 235 this = t[i] 236 dt = t[i + 1] - this 237 dt2 = dt / 2.0 238 y0 = yout[i] 239 240 k1 = np.asarray(derivs(y0)) 241 k2 = np.asarray(derivs(y0 + dt2 * k1)) 242 k3 = np.asarray(derivs(y0 + dt2 * k2)) 243 k4 = np.asarray(derivs(y0 + dt * k3)) 244 yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4) 245 # We only care about the final timestep and we cleave off action value which will be zero 246 return yout[-1][:4]
Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta.
Example for 2D system:
>>> def derivs(x):
... d1 = x[0] + 2*x[1]
... d2 = -3*x[0] + 4*x[1]
... return d1, d2
>>> dt = 0.0005
>>> t = np.arange(0.0, 2.0, dt)
>>> y0 = (1,2)
>>> yout = rk4(derivs, y0, t)
Args:
derivs: the derivative of the system and has the signature dy = derivs(yi)
y0: initial state vector
t: sample times
Returns: yout: Runge-Kutta approximation of the ODE
248 def compute_next_state( 249 self, angle_1_idx, angle_2_idx, vel_1_idx, vel_2_idx, action 250 ): 251 angle_space = np.linspace(*self.angle_range, self.angle_1_bins) 252 velocity_1_space = np.linspace(*self.velocity_1_range, self.angular_vel_1_bins) 253 velocity_2_space = np.linspace(*self.velocity_2_range, self.angular_vel_2_bins) 254 255 angle_1 = angle_space[angle_1_idx] 256 angle_2 = angle_space[angle_2_idx] 257 258 velocity_1 = velocity_1_space[vel_1_idx] 259 velocity_2 = velocity_2_space[vel_2_idx] 260 261 torque = self.AVAIL_TORQUE[action] 262 263 state = [angle_1, angle_2, velocity_1, velocity_2, torque] 264 265 new_state = self.rk4(self.dsdt, state, [0, self.dt]) 266 267 new_state[0] = self.wrap(new_state[0], -np.pi, np.pi) 268 new_state[1] = self.wrap(new_state[1], -np.pi, np.pi) 269 new_state[2] = self.bound(new_state[2], -self.MAX_VEL_1, self.MAX_VEL_1) 270 new_state[3] = self.bound(new_state[3], -self.MAX_VEL_2, self.MAX_VEL_2) 271 272 # index new state 273 274 new_angle_1_idx = np.clip( 275 np.digitize(new_state[0], np.linspace(*self.angle_range, self.angle_1_bins)) 276 - 1, 277 0, 278 self.angle_1_bins - 1, 279 ) 280 new_angle_2_idx = np.clip( 281 np.digitize(new_state[1], np.linspace(*self.angle_range, self.angle_2_bins)) 282 - 1, 283 0, 284 self.angle_2_bins - 1, 285 ) 286 new_vel_1_idx = np.clip( 287 np.digitize( 288 new_state[2], 289 np.linspace(*self.velocity_1_range, self.angular_vel_1_bins), 290 ) 291 - 1, 292 0, 293 self.angular_vel_1_bins - 1, 294 ) 295 new_vel_2_idx = np.clip( 296 np.digitize( 297 new_state[3], 298 np.linspace(*self.velocity_2_range, self.angular_vel_2_bins), 299 ) 300 - 1, 301 0, 302 self.angular_vel_2_bins - 1, 303 ) 304 305 new_state_idx = np.ravel_multi_index( 306 (new_angle_1_idx, new_angle_2_idx, new_vel_1_idx, new_vel_2_idx), 307 ( 308 self.angle_1_bins, 309 self.angle_2_bins, 310 self.angular_vel_1_bins, 311 self.angular_vel_2_bins, 312 ), 313 ) 314 315 # self.state = new_state 316 terminated = bool(-cos(new_state[0]) - cos(new_state[1] + new_state[0]) > 1.0) 317 reward = -1.0 if not terminated else 0.0 318 319 return new_state_idx, reward, terminated
321 def wrap(self, x, m, M): 322 """Wraps `x` so m <= x <= M; but unlike `bound()` which 323 truncates, `wrap()` wraps x around the coordinate system defined by m,M.\n 324 For example, m = -180, M = 180 (degrees), x = 360 --> returns 0. 325 326 Args: 327 x: a scalar 328 m: minimum possible value in range 329 M: maximum possible value in range 330 331 Returns: 332 x: a scalar, wrapped 333 """ 334 diff = M - m 335 while x > M: 336 x = x - diff 337 while x < m: 338 x = x + diff 339 return x
Wraps x
so m <= x <= M; but unlike bound()
which
truncates, wrap()
wraps x around the coordinate system defined by m,M.
For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
Args: x: a scalar m: minimum possible value in range M: maximum possible value in range
Returns: x: a scalar, wrapped
341 def bound(self, x, m, M=None): 342 """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR* 343 have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1]. 344 345 Args: 346 x: scalar 347 m: The lower bound 348 M: The upper bound 349 350 Returns: 351 x: scalar, bound between min (m) and Max (M) 352 """ 353 if M is None: 354 M = m[1] 355 m = m[0] 356 # bound x between min (m) and Max (M) 357 return min(max(x, m), M)
Either have m as scalar, so bound(x,m,M) which returns m <= x <= M OR
have m as length 2 vector, bound(x,m,
Args: x: scalar m: The lower bound M: The upper bound
Returns: x: scalar, bound between min (m) and Max (M)