envs.acrobot_model
1import numpy as np 2from math import cos, sin 3 4class DiscretizedAcrobot: 5 6 LINK_LENGTH_1 = 1.0 # [m] 7 LINK_LENGTH_2 = 1.0 # [m] 8 LINK_MASS_1 = 1.0 #: [kg] mass of link 1 9 LINK_MASS_2 = 1.0 #: [kg] mass of link 2 10 LINK_COM_POS_1 = 0.5 #: [m] position of the center of mass of link 1 11 LINK_COM_POS_2 = 0.5 #: [m] position of the center of mass of link 2 12 LINK_MOI = 1.0 #: moments of inertia for both links 13 AVAIL_TORQUE = [-1, 0, 1] 14 MAX_VEL_1 = 4 * np.pi 15 MAX_VEL_2 = 9 * np.pi 16 17 def __init__(self, 18 angular_resolution_rad = 0.01, 19 angular_vel_resolution_rad_per_sec = 0.05, 20 angle_bins = None, 21 velocity_bins = None, 22 precomputed_P = None, 23 verbose=False): 24 25 self.verbose = verbose 26 27 if angle_bins is None: 28 self.angle_1_bins = int(np.pi // angular_resolution_rad) + 1 29 self.angle_2_bins = int(np.pi // angular_resolution_rad) + 1 30 else: 31 self.angle_1_bins = angle_bins 32 self.angle_2_bins = angle_bins 33 34 if velocity_bins is None: 35 self.angular_vel_1_bins = int((2 * self.MAX_VEL_1) // angular_vel_resolution_rad_per_sec) + 1 36 self.angular_vel_2_bins = int((2 * self.MAX_VEL_2) // angular_vel_resolution_rad_per_sec) + 1 37 else: 38 self.angular_vel_1_bins = velocity_bins 39 self.angular_vel_2_bins = velocity_bins 40 41 self.n_states = self.angle_1_bins * self.angle_2_bins * self.angular_vel_1_bins * self.angular_vel_2_bins 42 43 self.angle_range = (-np.pi, np.pi) 44 self.velocity_1_range = (-self.MAX_VEL_1, self.MAX_VEL_1) 45 self.velocity_2_range = (-self.MAX_VEL_2, self.MAX_VEL_2) 46 self.action_space = len(self.AVAIL_TORQUE) # -1, 0, 1 47 self.dt = 0.2 48 49 if precomputed_P is None: 50 self.P = {state: {action: [] for action in range(self.action_space)} for state in range(self.n_states)} 51 self.setup_transition_probabilities() 52 53 else: 54 self.P = precomputed_P 55 56 # add transform_obs 57 self.transform_obs = lambda obs: ( 58 np.ravel_multi_index(( 59 np.clip(np.digitize(obs[0], np.linspace(*self.angle_range, self.angle_1_bins)) - 1, 0, 60 self.angle_1_bins - 1), 61 np.clip(np.digitize(obs[1], np.linspace(*self.angle_range, self.angle_2_bins)) - 1, 0, 62 self.angle_2_bins - 1), 63 np.clip(np.digitize(obs[2], np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)) - 1, 0, 64 self.angular_vel_1_bins - 1), 65 np.clip(np.digitize(obs[3], np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)) - 1, 0, 66 self.angular_vel_2_bins - 1), 67 ), (self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins)) 68 ) 69 70 def setup_transition_probabilities(self): 71 """ 72 Sets up the transition probabilities for the environment. This method iterates through all possible 73 states and actions, simulates the next state, and records the transition probability 74 (deterministic in this setup), reward, and termination status. 75 """ 76 percent = 0 77 78 for state in range(self.n_states): 79 angle_1, angle_2, vel_1, vel_2 = self.index_to_state(state) 80 for action in range(self.action_space): 81 next_state, reward, done = self.compute_next_state(angle_1, angle_2, vel_1, vel_2, action) 82 self.P[state][action].append((1, next_state, reward, done)) 83 84 if state % int(self.n_states / 10) == 0 and state > 0 and self.verbose: 85 percent += 10 86 print(f'{percent}% of probabilities calculated!') 87 88 def index_to_state(self, index): 89 """ 90 Converts a single index into a multidimensional state representation. 91 92 Parameters: 93 - index (int): The flat index representing the state. 94 95 Returns: 96 - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins. 97 """ 98 99 totals = [self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins] 100 multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1] 101 components = [int((index // multipliers[i]) % totals[i]) for i in range(4)] 102 return components 103 104 # Modified from original implementation of Gym Acrobot 105 def dsdt(self, state): 106 m1 = self.LINK_MASS_1 107 m2 = self.LINK_MASS_2 108 l1 = self.LINK_LENGTH_1 109 lc1 = self.LINK_COM_POS_1 110 lc2 = self.LINK_COM_POS_2 111 I1 = self.LINK_MOI 112 I2 = self.LINK_MOI 113 g = 9.8 114 a = state[-1] 115 s = state[:-1] 116 theta1 = s[0] 117 theta2 = s[1] 118 dtheta1 = s[2] 119 dtheta2 = s[3] 120 d1 = m1 * lc1**2 + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2)) + I1 + I2 121 d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2 122 phi2 = m2 * lc2 * g * cos(theta1 + theta2 - np.pi / 2.0) 123 phi1 = ( 124 -m2 * l1 * lc2 * dtheta2**2 * sin(theta2) 125 - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2) 126 + (m1 * lc1 + m2 * l1) * g * cos(theta1 - np.pi / 2) 127 + phi2 128 ) 129 130 # the following line is consistent with the java implementation and the 131 # book 132 ddtheta2 = ( 133 a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2 134 ) / (m2 * lc2**2 + I2 - d2**2 / d1) 135 136 ddtheta1 = -(d2 * ddtheta2 + phi1) / d1 137 return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0 138 139 # Modified from original implementation of Gym Acrobot 140 def rk4(self, derivs, y0, t): 141 142 """ 143 Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta. 144 145 Example for 2D system: 146 147 >>> def derivs(x): 148 ... d1 = x[0] + 2*x[1] 149 ... d2 = -3*x[0] + 4*x[1] 150 ... return d1, d2 151 152 >>> dt = 0.0005 153 >>> t = np.arange(0.0, 2.0, dt) 154 >>> y0 = (1,2) 155 >>> yout = rk4(derivs, y0, t) 156 157 Args: 158 derivs: the derivative of the system and has the signature `dy = derivs(yi)` 159 y0: initial state vector 160 t: sample times 161 162 Returns: 163 yout: Runge-Kutta approximation of the ODE 164 """ 165 166 try: 167 Ny = len(y0) 168 except TypeError: 169 yout = np.zeros((len(t),), np.float64) 170 else: 171 yout = np.zeros((len(t), Ny), np.float64) 172 173 yout[0] = y0 174 175 for i in np.arange(len(t) - 1): 176 this = t[i] 177 dt = t[i + 1] - this 178 dt2 = dt / 2.0 179 y0 = yout[i] 180 181 k1 = np.asarray(derivs(y0)) 182 k2 = np.asarray(derivs(y0 + dt2 * k1)) 183 k3 = np.asarray(derivs(y0 + dt2 * k2)) 184 k4 = np.asarray(derivs(y0 + dt * k3)) 185 yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4) 186 # We only care about the final timestep and we cleave off action value which will be zero 187 return yout[-1][:4] 188 189 def compute_next_state(self, angle_1_idx, angle_2_idx, vel_1_idx, vel_2_idx, action): 190 191 angle_space = np.linspace(*self.angle_range,self.angle_1_bins) 192 velocity_1_space = np.linspace(*self.velocity_1_range, self.angular_vel_1_bins) 193 velocity_2_space = np.linspace(*self.velocity_2_range, self.angular_vel_2_bins) 194 195 angle_1 = angle_space[angle_1_idx] 196 angle_2 = angle_space[angle_2_idx] 197 198 velocity_1 = velocity_1_space[vel_1_idx] 199 velocity_2 = velocity_2_space[vel_2_idx] 200 201 torque = self.AVAIL_TORQUE[action] 202 203 state = [angle_1, angle_2, velocity_1, velocity_2, torque] 204 205 new_state = self.rk4(self.dsdt, state, [0, self.dt]) 206 207 new_state[0] = self.wrap(new_state[0], -np.pi, np.pi) 208 new_state[1] = self.wrap(new_state[1], -np.pi, np.pi) 209 new_state[2] = self.bound(new_state[2], -self.MAX_VEL_1, self.MAX_VEL_1) 210 new_state[3] = self.bound(new_state[3], -self.MAX_VEL_2, self.MAX_VEL_2) 211 212 # index new state 213 214 new_angle_1_idx = np.clip(np.digitize(new_state[0], np.linspace(*self.angle_range, self.angle_1_bins)) - 1, 0, 215 self.angle_1_bins - 1) 216 new_angle_2_idx = np.clip(np.digitize(new_state[1], np.linspace(*self.angle_range, self.angle_2_bins)) - 1, 0, 217 self.angle_2_bins - 1) 218 new_vel_1_idx = np.clip(np.digitize(new_state[2], np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)) - 1, 0, 219 self.angular_vel_1_bins - 1) 220 new_vel_2_idx = np.clip(np.digitize(new_state[3], np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)) - 1, 0, 221 self.angular_vel_2_bins - 1) 222 223 new_state_idx = np.ravel_multi_index((new_angle_1_idx, new_angle_2_idx, new_vel_1_idx, new_vel_2_idx), 224 (self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins)) 225 226 # self.state = new_state 227 terminated = bool(-cos(new_state[0]) - cos(new_state[1] + new_state[0]) > 1.0) 228 reward = -1.0 if not terminated else 0.0 229 230 return new_state_idx, reward, terminated 231 232 def wrap(self, x, m, M): 233 """Wraps `x` so m <= x <= M; but unlike `bound()` which 234 truncates, `wrap()` wraps x around the coordinate system defined by m,M.\n 235 For example, m = -180, M = 180 (degrees), x = 360 --> returns 0. 236 237 Args: 238 x: a scalar 239 m: minimum possible value in range 240 M: maximum possible value in range 241 242 Returns: 243 x: a scalar, wrapped 244 """ 245 diff = M - m 246 while x > M: 247 x = x - diff 248 while x < m: 249 x = x + diff 250 return x 251 252 def bound(self, x, m, M=None): 253 """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR* 254 have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1]. 255 256 Args: 257 x: scalar 258 m: The lower bound 259 M: The upper bound 260 261 Returns: 262 x: scalar, bound between min (m) and Max (M) 263 """ 264 if M is None: 265 M = m[1] 266 m = m[0] 267 # bound x between min (m) and Max (M) 268 return min(max(x, m), M)
5class DiscretizedAcrobot: 6 7 LINK_LENGTH_1 = 1.0 # [m] 8 LINK_LENGTH_2 = 1.0 # [m] 9 LINK_MASS_1 = 1.0 #: [kg] mass of link 1 10 LINK_MASS_2 = 1.0 #: [kg] mass of link 2 11 LINK_COM_POS_1 = 0.5 #: [m] position of the center of mass of link 1 12 LINK_COM_POS_2 = 0.5 #: [m] position of the center of mass of link 2 13 LINK_MOI = 1.0 #: moments of inertia for both links 14 AVAIL_TORQUE = [-1, 0, 1] 15 MAX_VEL_1 = 4 * np.pi 16 MAX_VEL_2 = 9 * np.pi 17 18 def __init__(self, 19 angular_resolution_rad = 0.01, 20 angular_vel_resolution_rad_per_sec = 0.05, 21 angle_bins = None, 22 velocity_bins = None, 23 precomputed_P = None, 24 verbose=False): 25 26 self.verbose = verbose 27 28 if angle_bins is None: 29 self.angle_1_bins = int(np.pi // angular_resolution_rad) + 1 30 self.angle_2_bins = int(np.pi // angular_resolution_rad) + 1 31 else: 32 self.angle_1_bins = angle_bins 33 self.angle_2_bins = angle_bins 34 35 if velocity_bins is None: 36 self.angular_vel_1_bins = int((2 * self.MAX_VEL_1) // angular_vel_resolution_rad_per_sec) + 1 37 self.angular_vel_2_bins = int((2 * self.MAX_VEL_2) // angular_vel_resolution_rad_per_sec) + 1 38 else: 39 self.angular_vel_1_bins = velocity_bins 40 self.angular_vel_2_bins = velocity_bins 41 42 self.n_states = self.angle_1_bins * self.angle_2_bins * self.angular_vel_1_bins * self.angular_vel_2_bins 43 44 self.angle_range = (-np.pi, np.pi) 45 self.velocity_1_range = (-self.MAX_VEL_1, self.MAX_VEL_1) 46 self.velocity_2_range = (-self.MAX_VEL_2, self.MAX_VEL_2) 47 self.action_space = len(self.AVAIL_TORQUE) # -1, 0, 1 48 self.dt = 0.2 49 50 if precomputed_P is None: 51 self.P = {state: {action: [] for action in range(self.action_space)} for state in range(self.n_states)} 52 self.setup_transition_probabilities() 53 54 else: 55 self.P = precomputed_P 56 57 # add transform_obs 58 self.transform_obs = lambda obs: ( 59 np.ravel_multi_index(( 60 np.clip(np.digitize(obs[0], np.linspace(*self.angle_range, self.angle_1_bins)) - 1, 0, 61 self.angle_1_bins - 1), 62 np.clip(np.digitize(obs[1], np.linspace(*self.angle_range, self.angle_2_bins)) - 1, 0, 63 self.angle_2_bins - 1), 64 np.clip(np.digitize(obs[2], np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)) - 1, 0, 65 self.angular_vel_1_bins - 1), 66 np.clip(np.digitize(obs[3], np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)) - 1, 0, 67 self.angular_vel_2_bins - 1), 68 ), (self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins)) 69 ) 70 71 def setup_transition_probabilities(self): 72 """ 73 Sets up the transition probabilities for the environment. This method iterates through all possible 74 states and actions, simulates the next state, and records the transition probability 75 (deterministic in this setup), reward, and termination status. 76 """ 77 percent = 0 78 79 for state in range(self.n_states): 80 angle_1, angle_2, vel_1, vel_2 = self.index_to_state(state) 81 for action in range(self.action_space): 82 next_state, reward, done = self.compute_next_state(angle_1, angle_2, vel_1, vel_2, action) 83 self.P[state][action].append((1, next_state, reward, done)) 84 85 if state % int(self.n_states / 10) == 0 and state > 0 and self.verbose: 86 percent += 10 87 print(f'{percent}% of probabilities calculated!') 88 89 def index_to_state(self, index): 90 """ 91 Converts a single index into a multidimensional state representation. 92 93 Parameters: 94 - index (int): The flat index representing the state. 95 96 Returns: 97 - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins. 98 """ 99 100 totals = [self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins] 101 multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1] 102 components = [int((index // multipliers[i]) % totals[i]) for i in range(4)] 103 return components 104 105 # Modified from original implementation of Gym Acrobot 106 def dsdt(self, state): 107 m1 = self.LINK_MASS_1 108 m2 = self.LINK_MASS_2 109 l1 = self.LINK_LENGTH_1 110 lc1 = self.LINK_COM_POS_1 111 lc2 = self.LINK_COM_POS_2 112 I1 = self.LINK_MOI 113 I2 = self.LINK_MOI 114 g = 9.8 115 a = state[-1] 116 s = state[:-1] 117 theta1 = s[0] 118 theta2 = s[1] 119 dtheta1 = s[2] 120 dtheta2 = s[3] 121 d1 = m1 * lc1**2 + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2)) + I1 + I2 122 d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2 123 phi2 = m2 * lc2 * g * cos(theta1 + theta2 - np.pi / 2.0) 124 phi1 = ( 125 -m2 * l1 * lc2 * dtheta2**2 * sin(theta2) 126 - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2) 127 + (m1 * lc1 + m2 * l1) * g * cos(theta1 - np.pi / 2) 128 + phi2 129 ) 130 131 # the following line is consistent with the java implementation and the 132 # book 133 ddtheta2 = ( 134 a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2 135 ) / (m2 * lc2**2 + I2 - d2**2 / d1) 136 137 ddtheta1 = -(d2 * ddtheta2 + phi1) / d1 138 return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0 139 140 # Modified from original implementation of Gym Acrobot 141 def rk4(self, derivs, y0, t): 142 143 """ 144 Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta. 145 146 Example for 2D system: 147 148 >>> def derivs(x): 149 ... d1 = x[0] + 2*x[1] 150 ... d2 = -3*x[0] + 4*x[1] 151 ... return d1, d2 152 153 >>> dt = 0.0005 154 >>> t = np.arange(0.0, 2.0, dt) 155 >>> y0 = (1,2) 156 >>> yout = rk4(derivs, y0, t) 157 158 Args: 159 derivs: the derivative of the system and has the signature `dy = derivs(yi)` 160 y0: initial state vector 161 t: sample times 162 163 Returns: 164 yout: Runge-Kutta approximation of the ODE 165 """ 166 167 try: 168 Ny = len(y0) 169 except TypeError: 170 yout = np.zeros((len(t),), np.float64) 171 else: 172 yout = np.zeros((len(t), Ny), np.float64) 173 174 yout[0] = y0 175 176 for i in np.arange(len(t) - 1): 177 this = t[i] 178 dt = t[i + 1] - this 179 dt2 = dt / 2.0 180 y0 = yout[i] 181 182 k1 = np.asarray(derivs(y0)) 183 k2 = np.asarray(derivs(y0 + dt2 * k1)) 184 k3 = np.asarray(derivs(y0 + dt2 * k2)) 185 k4 = np.asarray(derivs(y0 + dt * k3)) 186 yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4) 187 # We only care about the final timestep and we cleave off action value which will be zero 188 return yout[-1][:4] 189 190 def compute_next_state(self, angle_1_idx, angle_2_idx, vel_1_idx, vel_2_idx, action): 191 192 angle_space = np.linspace(*self.angle_range,self.angle_1_bins) 193 velocity_1_space = np.linspace(*self.velocity_1_range, self.angular_vel_1_bins) 194 velocity_2_space = np.linspace(*self.velocity_2_range, self.angular_vel_2_bins) 195 196 angle_1 = angle_space[angle_1_idx] 197 angle_2 = angle_space[angle_2_idx] 198 199 velocity_1 = velocity_1_space[vel_1_idx] 200 velocity_2 = velocity_2_space[vel_2_idx] 201 202 torque = self.AVAIL_TORQUE[action] 203 204 state = [angle_1, angle_2, velocity_1, velocity_2, torque] 205 206 new_state = self.rk4(self.dsdt, state, [0, self.dt]) 207 208 new_state[0] = self.wrap(new_state[0], -np.pi, np.pi) 209 new_state[1] = self.wrap(new_state[1], -np.pi, np.pi) 210 new_state[2] = self.bound(new_state[2], -self.MAX_VEL_1, self.MAX_VEL_1) 211 new_state[3] = self.bound(new_state[3], -self.MAX_VEL_2, self.MAX_VEL_2) 212 213 # index new state 214 215 new_angle_1_idx = np.clip(np.digitize(new_state[0], np.linspace(*self.angle_range, self.angle_1_bins)) - 1, 0, 216 self.angle_1_bins - 1) 217 new_angle_2_idx = np.clip(np.digitize(new_state[1], np.linspace(*self.angle_range, self.angle_2_bins)) - 1, 0, 218 self.angle_2_bins - 1) 219 new_vel_1_idx = np.clip(np.digitize(new_state[2], np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)) - 1, 0, 220 self.angular_vel_1_bins - 1) 221 new_vel_2_idx = np.clip(np.digitize(new_state[3], np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)) - 1, 0, 222 self.angular_vel_2_bins - 1) 223 224 new_state_idx = np.ravel_multi_index((new_angle_1_idx, new_angle_2_idx, new_vel_1_idx, new_vel_2_idx), 225 (self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins)) 226 227 # self.state = new_state 228 terminated = bool(-cos(new_state[0]) - cos(new_state[1] + new_state[0]) > 1.0) 229 reward = -1.0 if not terminated else 0.0 230 231 return new_state_idx, reward, terminated 232 233 def wrap(self, x, m, M): 234 """Wraps `x` so m <= x <= M; but unlike `bound()` which 235 truncates, `wrap()` wraps x around the coordinate system defined by m,M.\n 236 For example, m = -180, M = 180 (degrees), x = 360 --> returns 0. 237 238 Args: 239 x: a scalar 240 m: minimum possible value in range 241 M: maximum possible value in range 242 243 Returns: 244 x: a scalar, wrapped 245 """ 246 diff = M - m 247 while x > M: 248 x = x - diff 249 while x < m: 250 x = x + diff 251 return x 252 253 def bound(self, x, m, M=None): 254 """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR* 255 have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1]. 256 257 Args: 258 x: scalar 259 m: The lower bound 260 M: The upper bound 261 262 Returns: 263 x: scalar, bound between min (m) and Max (M) 264 """ 265 if M is None: 266 M = m[1] 267 m = m[0] 268 # bound x between min (m) and Max (M) 269 return min(max(x, m), M)
18 def __init__(self, 19 angular_resolution_rad = 0.01, 20 angular_vel_resolution_rad_per_sec = 0.05, 21 angle_bins = None, 22 velocity_bins = None, 23 precomputed_P = None, 24 verbose=False): 25 26 self.verbose = verbose 27 28 if angle_bins is None: 29 self.angle_1_bins = int(np.pi // angular_resolution_rad) + 1 30 self.angle_2_bins = int(np.pi // angular_resolution_rad) + 1 31 else: 32 self.angle_1_bins = angle_bins 33 self.angle_2_bins = angle_bins 34 35 if velocity_bins is None: 36 self.angular_vel_1_bins = int((2 * self.MAX_VEL_1) // angular_vel_resolution_rad_per_sec) + 1 37 self.angular_vel_2_bins = int((2 * self.MAX_VEL_2) // angular_vel_resolution_rad_per_sec) + 1 38 else: 39 self.angular_vel_1_bins = velocity_bins 40 self.angular_vel_2_bins = velocity_bins 41 42 self.n_states = self.angle_1_bins * self.angle_2_bins * self.angular_vel_1_bins * self.angular_vel_2_bins 43 44 self.angle_range = (-np.pi, np.pi) 45 self.velocity_1_range = (-self.MAX_VEL_1, self.MAX_VEL_1) 46 self.velocity_2_range = (-self.MAX_VEL_2, self.MAX_VEL_2) 47 self.action_space = len(self.AVAIL_TORQUE) # -1, 0, 1 48 self.dt = 0.2 49 50 if precomputed_P is None: 51 self.P = {state: {action: [] for action in range(self.action_space)} for state in range(self.n_states)} 52 self.setup_transition_probabilities() 53 54 else: 55 self.P = precomputed_P 56 57 # add transform_obs 58 self.transform_obs = lambda obs: ( 59 np.ravel_multi_index(( 60 np.clip(np.digitize(obs[0], np.linspace(*self.angle_range, self.angle_1_bins)) - 1, 0, 61 self.angle_1_bins - 1), 62 np.clip(np.digitize(obs[1], np.linspace(*self.angle_range, self.angle_2_bins)) - 1, 0, 63 self.angle_2_bins - 1), 64 np.clip(np.digitize(obs[2], np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)) - 1, 0, 65 self.angular_vel_1_bins - 1), 66 np.clip(np.digitize(obs[3], np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)) - 1, 0, 67 self.angular_vel_2_bins - 1), 68 ), (self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins)) 69 )
71 def setup_transition_probabilities(self): 72 """ 73 Sets up the transition probabilities for the environment. This method iterates through all possible 74 states and actions, simulates the next state, and records the transition probability 75 (deterministic in this setup), reward, and termination status. 76 """ 77 percent = 0 78 79 for state in range(self.n_states): 80 angle_1, angle_2, vel_1, vel_2 = self.index_to_state(state) 81 for action in range(self.action_space): 82 next_state, reward, done = self.compute_next_state(angle_1, angle_2, vel_1, vel_2, action) 83 self.P[state][action].append((1, next_state, reward, done)) 84 85 if state % int(self.n_states / 10) == 0 and state > 0 and self.verbose: 86 percent += 10 87 print(f'{percent}% of probabilities calculated!')
Sets up the transition probabilities for the environment. This method iterates through all possible states and actions, simulates the next state, and records the transition probability (deterministic in this setup), reward, and termination status.
89 def index_to_state(self, index): 90 """ 91 Converts a single index into a multidimensional state representation. 92 93 Parameters: 94 - index (int): The flat index representing the state. 95 96 Returns: 97 - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins. 98 """ 99 100 totals = [self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins] 101 multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1] 102 components = [int((index // multipliers[i]) % totals[i]) for i in range(4)] 103 return components
Converts a single index into a multidimensional state representation.
Parameters:
- index (int): The flat index representing the state.
Returns:
- list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
106 def dsdt(self, state): 107 m1 = self.LINK_MASS_1 108 m2 = self.LINK_MASS_2 109 l1 = self.LINK_LENGTH_1 110 lc1 = self.LINK_COM_POS_1 111 lc2 = self.LINK_COM_POS_2 112 I1 = self.LINK_MOI 113 I2 = self.LINK_MOI 114 g = 9.8 115 a = state[-1] 116 s = state[:-1] 117 theta1 = s[0] 118 theta2 = s[1] 119 dtheta1 = s[2] 120 dtheta2 = s[3] 121 d1 = m1 * lc1**2 + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2)) + I1 + I2 122 d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2 123 phi2 = m2 * lc2 * g * cos(theta1 + theta2 - np.pi / 2.0) 124 phi1 = ( 125 -m2 * l1 * lc2 * dtheta2**2 * sin(theta2) 126 - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2) 127 + (m1 * lc1 + m2 * l1) * g * cos(theta1 - np.pi / 2) 128 + phi2 129 ) 130 131 # the following line is consistent with the java implementation and the 132 # book 133 ddtheta2 = ( 134 a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2 135 ) / (m2 * lc2**2 + I2 - d2**2 / d1) 136 137 ddtheta1 = -(d2 * ddtheta2 + phi1) / d1 138 return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0
141 def rk4(self, derivs, y0, t): 142 143 """ 144 Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta. 145 146 Example for 2D system: 147 148 >>> def derivs(x): 149 ... d1 = x[0] + 2*x[1] 150 ... d2 = -3*x[0] + 4*x[1] 151 ... return d1, d2 152 153 >>> dt = 0.0005 154 >>> t = np.arange(0.0, 2.0, dt) 155 >>> y0 = (1,2) 156 >>> yout = rk4(derivs, y0, t) 157 158 Args: 159 derivs: the derivative of the system and has the signature `dy = derivs(yi)` 160 y0: initial state vector 161 t: sample times 162 163 Returns: 164 yout: Runge-Kutta approximation of the ODE 165 """ 166 167 try: 168 Ny = len(y0) 169 except TypeError: 170 yout = np.zeros((len(t),), np.float64) 171 else: 172 yout = np.zeros((len(t), Ny), np.float64) 173 174 yout[0] = y0 175 176 for i in np.arange(len(t) - 1): 177 this = t[i] 178 dt = t[i + 1] - this 179 dt2 = dt / 2.0 180 y0 = yout[i] 181 182 k1 = np.asarray(derivs(y0)) 183 k2 = np.asarray(derivs(y0 + dt2 * k1)) 184 k3 = np.asarray(derivs(y0 + dt2 * k2)) 185 k4 = np.asarray(derivs(y0 + dt * k3)) 186 yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4) 187 # We only care about the final timestep and we cleave off action value which will be zero 188 return yout[-1][:4]
Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta.
Example for 2D system:
>>> def derivs(x):
... d1 = x[0] + 2*x[1]
... d2 = -3*x[0] + 4*x[1]
... return d1, d2
>>> dt = 0.0005
>>> t = np.arange(0.0, 2.0, dt)
>>> y0 = (1,2)
>>> yout = rk4(derivs, y0, t)
Args:
derivs: the derivative of the system and has the signature dy = derivs(yi)
y0: initial state vector
t: sample times
Returns: yout: Runge-Kutta approximation of the ODE
190 def compute_next_state(self, angle_1_idx, angle_2_idx, vel_1_idx, vel_2_idx, action): 191 192 angle_space = np.linspace(*self.angle_range,self.angle_1_bins) 193 velocity_1_space = np.linspace(*self.velocity_1_range, self.angular_vel_1_bins) 194 velocity_2_space = np.linspace(*self.velocity_2_range, self.angular_vel_2_bins) 195 196 angle_1 = angle_space[angle_1_idx] 197 angle_2 = angle_space[angle_2_idx] 198 199 velocity_1 = velocity_1_space[vel_1_idx] 200 velocity_2 = velocity_2_space[vel_2_idx] 201 202 torque = self.AVAIL_TORQUE[action] 203 204 state = [angle_1, angle_2, velocity_1, velocity_2, torque] 205 206 new_state = self.rk4(self.dsdt, state, [0, self.dt]) 207 208 new_state[0] = self.wrap(new_state[0], -np.pi, np.pi) 209 new_state[1] = self.wrap(new_state[1], -np.pi, np.pi) 210 new_state[2] = self.bound(new_state[2], -self.MAX_VEL_1, self.MAX_VEL_1) 211 new_state[3] = self.bound(new_state[3], -self.MAX_VEL_2, self.MAX_VEL_2) 212 213 # index new state 214 215 new_angle_1_idx = np.clip(np.digitize(new_state[0], np.linspace(*self.angle_range, self.angle_1_bins)) - 1, 0, 216 self.angle_1_bins - 1) 217 new_angle_2_idx = np.clip(np.digitize(new_state[1], np.linspace(*self.angle_range, self.angle_2_bins)) - 1, 0, 218 self.angle_2_bins - 1) 219 new_vel_1_idx = np.clip(np.digitize(new_state[2], np.linspace(*self.velocity_1_range, self.angular_vel_1_bins)) - 1, 0, 220 self.angular_vel_1_bins - 1) 221 new_vel_2_idx = np.clip(np.digitize(new_state[3], np.linspace(*self.velocity_2_range, self.angular_vel_2_bins)) - 1, 0, 222 self.angular_vel_2_bins - 1) 223 224 new_state_idx = np.ravel_multi_index((new_angle_1_idx, new_angle_2_idx, new_vel_1_idx, new_vel_2_idx), 225 (self.angle_1_bins, self.angle_2_bins, self.angular_vel_1_bins, self.angular_vel_2_bins)) 226 227 # self.state = new_state 228 terminated = bool(-cos(new_state[0]) - cos(new_state[1] + new_state[0]) > 1.0) 229 reward = -1.0 if not terminated else 0.0 230 231 return new_state_idx, reward, terminated
233 def wrap(self, x, m, M): 234 """Wraps `x` so m <= x <= M; but unlike `bound()` which 235 truncates, `wrap()` wraps x around the coordinate system defined by m,M.\n 236 For example, m = -180, M = 180 (degrees), x = 360 --> returns 0. 237 238 Args: 239 x: a scalar 240 m: minimum possible value in range 241 M: maximum possible value in range 242 243 Returns: 244 x: a scalar, wrapped 245 """ 246 diff = M - m 247 while x > M: 248 x = x - diff 249 while x < m: 250 x = x + diff 251 return x
Wraps x
so m <= x <= M; but unlike bound()
which
truncates, wrap()
wraps x around the coordinate system defined by m,M.
For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
Args: x: a scalar m: minimum possible value in range M: maximum possible value in range
Returns: x: a scalar, wrapped
253 def bound(self, x, m, M=None): 254 """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR* 255 have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1]. 256 257 Args: 258 x: scalar 259 m: The lower bound 260 M: The upper bound 261 262 Returns: 263 x: scalar, bound between min (m) and Max (M) 264 """ 265 if M is None: 266 M = m[1] 267 m = m[0] 268 # bound x between min (m) and Max (M) 269 return min(max(x, m), M)
Either have m as scalar, so bound(x,m,M) which returns m <= x <= M OR
have m as length 2 vector, bound(x,m,
Args: x: scalar m: The lower bound M: The upper bound
Returns: x: scalar, bound between min (m) and Max (M)