bettermdptools.envs.cartpole_model
generative AI experiment - discretized cartpole transition and reward (P) matrix with adaptive angle binning created with chatGPT
Example usage: dpole = DiscretizedCartPole(10, 10, 10, .1, .5) # Example bin sizes for each variable and adaptive angle binning center/outer resolution
1""" 2generative AI experiment - discretized cartpole transition and reward (P) matrix with adaptive angle binning 3created with chatGPT 4 5Example usage: 6dpole = DiscretizedCartPole(10, 10, 10, .1, .5) # Example bin sizes for each variable and adaptive angle binning center/outer resolution 7 8""" 9 10import numpy as np 11 12 13class DiscretizedCartPole: 14 def __init__( 15 self, 16 position_bins, 17 velocity_bins, 18 angular_velocity_bins, 19 angular_center_resolution, 20 angular_outer_resolution, 21 ): 22 """ 23 Initializes the DiscretizedCartPole model. 24 25 Parameters: 26 - position_bins (int): Number of discrete bins for the cart's position. 27 - velocity_bins (int): Number of discrete bins for the cart's velocity. 28 - angular_velocity_bins (int): Number of discrete bins for the pole's angular velocity. 29 - angular_center_resolution (float): The resolution of angle bins near the center (around zero). 30 - angular_outer_resolution (float): The resolution of angle bins away from the center. 31 32 Attributes: 33 - state_space (int): Total number of discrete states in the environment. 34 - P (dict): Transition probability matrix where P[state][action] is a list of tuples (probability, next_state, 35 reward, done). 36 - transform_obs (lambda): Function to transform continuous observations into a discrete state index. 37 """ 38 self.position_bins = position_bins 39 self.velocity_bins = velocity_bins 40 self.angular_velocity_bins = angular_velocity_bins 41 self.action_space = 2 # Left or Right 42 43 # Define the range for each variable 44 self.position_range = (-2.4, 2.4) 45 self.velocity_range = (-3, 3) 46 self.angle_range = (-12 * np.pi / 180, 12 * np.pi / 180) 47 self.angular_velocity_range = (-1.5, 1.5) 48 self.angular_center_resolution = angular_center_resolution 49 self.angular_outer_resolution = angular_outer_resolution 50 51 # Use adaptive binning for the pole angle 52 self.angle_bins = self.adaptive_angle_bins( 53 self.angle_range, 54 self.angular_center_resolution, 55 self.angular_outer_resolution, 56 ) # Adjust these values as needed 57 58 self.state_space = np.prod( 59 [ 60 self.position_bins, 61 self.velocity_bins, 62 len(self.angle_bins), 63 self.angular_velocity_bins, 64 ] 65 ) 66 self.P = { 67 state: {action: [] for action in range(self.action_space)} 68 for state in range(self.state_space) 69 } 70 self.setup_transition_probabilities() 71 self.n_states = ( 72 len(self.angle_bins) 73 * self.velocity_bins 74 * self.position_bins 75 * self.angular_velocity_bins 76 ) 77 """ 78 Explanation of transform_obs lambda: 79 This lambda function will take cartpole observations, determine which bins they fall into, 80 and then convert bin coordinates into a single index. This makes it possible 81 to use traditional reinforcement learning and planning algorithms, designed for discrete spaces, with continuous 82 state space environments. 83 84 Parameters: 85 - obs (list): A list of continuous observations [position, velocity, angle, angular_velocity]. 86 87 Returns: 88 - int: A single integer representing the discretized state index. 89 """ 90 self.transform_obs = lambda obs: ( 91 np.ravel_multi_index( 92 ( 93 np.clip( 94 np.digitize( 95 obs[0], 96 np.linspace(*self.position_range, self.position_bins), 97 ) 98 - 1, 99 0, 100 self.position_bins - 1, 101 ), 102 np.clip( 103 np.digitize( 104 obs[1], 105 np.linspace(*self.velocity_range, self.velocity_bins), 106 ) 107 - 1, 108 0, 109 self.velocity_bins - 1, 110 ), 111 np.clip( 112 np.digitize(obs[2], self.angle_bins) - 1, 113 0, 114 len(self.angle_bins) - 1, 115 ), 116 # Use adaptive angle bins 117 np.clip( 118 np.digitize( 119 obs[3], 120 np.linspace( 121 *self.angular_velocity_range, self.angular_velocity_bins 122 ), 123 ) 124 - 1, 125 0, 126 self.angular_velocity_bins - 1, 127 ), 128 ), 129 ( 130 self.position_bins, 131 self.velocity_bins, 132 len(self.angle_bins), 133 self.angular_velocity_bins, 134 ), 135 ) 136 ) 137 138 def adaptive_angle_bins(self, angle_range, center_resolution, outer_resolution): 139 """ 140 Generates adaptive bins for the pole's angle to allow for finer resolution near the center and coarser 141 resolution farther away. 142 143 Parameters: 144 - angle_range (tuple): The minimum and maximum angles in radians. 145 - center_resolution (float): Bin width near zero angle for higher resolution. 146 - outer_resolution (float): Bin width away from zero for lower resolution. 147 148 Returns: 149 - np.array: An array of bin edges with adaptive spacing. 150 """ 151 min_angle, max_angle = angle_range 152 # Generate finer bins around zero 153 center_bins = np.arange( 154 -center_resolution, center_resolution + 1e-6, center_resolution / 10 155 ) 156 # Generate sparser bins outside the center region 157 left_bins = np.linspace( 158 min_angle, 159 -center_resolution, 160 num=int((center_resolution - min_angle) / outer_resolution) + 1, 161 endpoint=False, 162 ) 163 right_bins = np.linspace( 164 center_resolution, 165 max_angle, 166 num=int((max_angle - center_resolution) / outer_resolution) + 1, 167 endpoint=True, 168 ) 169 return np.unique(np.concatenate([left_bins, center_bins, right_bins])) 170 171 def setup_transition_probabilities(self): 172 """ 173 Sets up the transition probabilities for the environment. This method iterates through all possible 174 states and actions, simulates the next state, and records the transition probability 175 (deterministic in this setup), reward, and termination status. 176 """ 177 for state in range(self.state_space): 178 position, velocity, angle, angular_velocity = self.index_to_state(state) 179 for action in range(self.action_space): 180 next_state, reward, done = self.compute_next_state( 181 position, velocity, angle, angular_velocity, action 182 ) 183 self.P[state][action].append((1, next_state, reward, done)) 184 185 def index_to_state(self, index): 186 """ 187 Converts a single index into a multidimensional state representation. 188 189 Parameters: 190 - index (int): The flat index representing the state. 191 192 Returns: 193 - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins. 194 """ 195 totals = [ 196 self.position_bins, 197 self.velocity_bins, 198 len(self.angle_bins), 199 self.angular_velocity_bins, 200 ] 201 multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1] 202 components = [int((index // multipliers[i]) % totals[i]) for i in range(4)] 203 return components 204 205 def compute_next_state( 206 self, position_idx, velocity_idx, angle_idx, angular_velocity_idx, action 207 ): 208 """ 209 Computes the next state based on the current state indices and the action taken. Applies simplified physics calculations to determine the next state. 210 211 Parameters: 212 - position_idx (int): Current index of the cart's position. 213 - velocity_idx (int): Current index of the cart's velocity. 214 - angle_idx (int): Current index of the pole's angle. 215 - angular_velocity_idx (int): Current index of the pole's angular velocity. 216 - action (int): Action taken (0 for left, 1 for right). 217 218 Returns: 219 - tuple: Contains the next state index, the reward, and the done flag indicating if the episode has ended. 220 """ 221 position = np.linspace(*self.position_range, self.position_bins)[position_idx] 222 velocity = np.linspace(*self.velocity_range, self.velocity_bins)[velocity_idx] 223 angle = self.angle_bins[angle_idx] 224 angular_velocity = np.linspace( 225 *self.angular_velocity_range, self.angular_velocity_bins 226 )[angular_velocity_idx] 227 228 # Simulate physics here (simplified) 229 force = 10 if action == 1 else -10 230 new_velocity = velocity + (force + np.cos(angle) * -10.0) * 0.02 231 new_position = position + new_velocity * 0.02 232 new_angular_velocity = angular_velocity + (-3.0 * np.sin(angle)) * 0.02 233 new_angle = angle + new_angular_velocity * 0.02 234 235 new_position_idx = np.clip( 236 np.digitize( 237 new_position, np.linspace(*self.position_range, self.position_bins) 238 ) 239 - 1, 240 0, 241 self.position_bins - 1, 242 ) 243 new_velocity_idx = np.clip( 244 np.digitize( 245 new_velocity, np.linspace(*self.velocity_range, self.velocity_bins) 246 ) 247 - 1, 248 0, 249 self.velocity_bins - 1, 250 ) 251 new_angle_idx = np.clip( 252 np.digitize(new_angle, self.angle_bins) - 1, 0, len(self.angle_bins) - 1 253 ) 254 new_angular_velocity_idx = np.clip( 255 np.digitize( 256 new_angular_velocity, 257 np.linspace(*self.angular_velocity_range, self.angular_velocity_bins), 258 ) 259 - 1, 260 0, 261 self.angular_velocity_bins - 1, 262 ) 263 264 new_state_idx = np.ravel_multi_index( 265 ( 266 new_position_idx, 267 new_velocity_idx, 268 new_angle_idx, 269 new_angular_velocity_idx, 270 ), 271 ( 272 self.position_bins, 273 self.velocity_bins, 274 len(self.angle_bins), 275 self.angular_velocity_bins, 276 ), 277 ) 278 279 reward = 1 if np.abs(new_angle) < 12 * np.pi / 180 else -1 280 done = ( 281 True 282 if np.abs(new_angle) >= 12 * np.pi / 180 or np.abs(new_position) > 2.4 283 else False 284 ) 285 286 return new_state_idx, reward, done
14class DiscretizedCartPole: 15 def __init__( 16 self, 17 position_bins, 18 velocity_bins, 19 angular_velocity_bins, 20 angular_center_resolution, 21 angular_outer_resolution, 22 ): 23 """ 24 Initializes the DiscretizedCartPole model. 25 26 Parameters: 27 - position_bins (int): Number of discrete bins for the cart's position. 28 - velocity_bins (int): Number of discrete bins for the cart's velocity. 29 - angular_velocity_bins (int): Number of discrete bins for the pole's angular velocity. 30 - angular_center_resolution (float): The resolution of angle bins near the center (around zero). 31 - angular_outer_resolution (float): The resolution of angle bins away from the center. 32 33 Attributes: 34 - state_space (int): Total number of discrete states in the environment. 35 - P (dict): Transition probability matrix where P[state][action] is a list of tuples (probability, next_state, 36 reward, done). 37 - transform_obs (lambda): Function to transform continuous observations into a discrete state index. 38 """ 39 self.position_bins = position_bins 40 self.velocity_bins = velocity_bins 41 self.angular_velocity_bins = angular_velocity_bins 42 self.action_space = 2 # Left or Right 43 44 # Define the range for each variable 45 self.position_range = (-2.4, 2.4) 46 self.velocity_range = (-3, 3) 47 self.angle_range = (-12 * np.pi / 180, 12 * np.pi / 180) 48 self.angular_velocity_range = (-1.5, 1.5) 49 self.angular_center_resolution = angular_center_resolution 50 self.angular_outer_resolution = angular_outer_resolution 51 52 # Use adaptive binning for the pole angle 53 self.angle_bins = self.adaptive_angle_bins( 54 self.angle_range, 55 self.angular_center_resolution, 56 self.angular_outer_resolution, 57 ) # Adjust these values as needed 58 59 self.state_space = np.prod( 60 [ 61 self.position_bins, 62 self.velocity_bins, 63 len(self.angle_bins), 64 self.angular_velocity_bins, 65 ] 66 ) 67 self.P = { 68 state: {action: [] for action in range(self.action_space)} 69 for state in range(self.state_space) 70 } 71 self.setup_transition_probabilities() 72 self.n_states = ( 73 len(self.angle_bins) 74 * self.velocity_bins 75 * self.position_bins 76 * self.angular_velocity_bins 77 ) 78 """ 79 Explanation of transform_obs lambda: 80 This lambda function will take cartpole observations, determine which bins they fall into, 81 and then convert bin coordinates into a single index. This makes it possible 82 to use traditional reinforcement learning and planning algorithms, designed for discrete spaces, with continuous 83 state space environments. 84 85 Parameters: 86 - obs (list): A list of continuous observations [position, velocity, angle, angular_velocity]. 87 88 Returns: 89 - int: A single integer representing the discretized state index. 90 """ 91 self.transform_obs = lambda obs: ( 92 np.ravel_multi_index( 93 ( 94 np.clip( 95 np.digitize( 96 obs[0], 97 np.linspace(*self.position_range, self.position_bins), 98 ) 99 - 1, 100 0, 101 self.position_bins - 1, 102 ), 103 np.clip( 104 np.digitize( 105 obs[1], 106 np.linspace(*self.velocity_range, self.velocity_bins), 107 ) 108 - 1, 109 0, 110 self.velocity_bins - 1, 111 ), 112 np.clip( 113 np.digitize(obs[2], self.angle_bins) - 1, 114 0, 115 len(self.angle_bins) - 1, 116 ), 117 # Use adaptive angle bins 118 np.clip( 119 np.digitize( 120 obs[3], 121 np.linspace( 122 *self.angular_velocity_range, self.angular_velocity_bins 123 ), 124 ) 125 - 1, 126 0, 127 self.angular_velocity_bins - 1, 128 ), 129 ), 130 ( 131 self.position_bins, 132 self.velocity_bins, 133 len(self.angle_bins), 134 self.angular_velocity_bins, 135 ), 136 ) 137 ) 138 139 def adaptive_angle_bins(self, angle_range, center_resolution, outer_resolution): 140 """ 141 Generates adaptive bins for the pole's angle to allow for finer resolution near the center and coarser 142 resolution farther away. 143 144 Parameters: 145 - angle_range (tuple): The minimum and maximum angles in radians. 146 - center_resolution (float): Bin width near zero angle for higher resolution. 147 - outer_resolution (float): Bin width away from zero for lower resolution. 148 149 Returns: 150 - np.array: An array of bin edges with adaptive spacing. 151 """ 152 min_angle, max_angle = angle_range 153 # Generate finer bins around zero 154 center_bins = np.arange( 155 -center_resolution, center_resolution + 1e-6, center_resolution / 10 156 ) 157 # Generate sparser bins outside the center region 158 left_bins = np.linspace( 159 min_angle, 160 -center_resolution, 161 num=int((center_resolution - min_angle) / outer_resolution) + 1, 162 endpoint=False, 163 ) 164 right_bins = np.linspace( 165 center_resolution, 166 max_angle, 167 num=int((max_angle - center_resolution) / outer_resolution) + 1, 168 endpoint=True, 169 ) 170 return np.unique(np.concatenate([left_bins, center_bins, right_bins])) 171 172 def setup_transition_probabilities(self): 173 """ 174 Sets up the transition probabilities for the environment. This method iterates through all possible 175 states and actions, simulates the next state, and records the transition probability 176 (deterministic in this setup), reward, and termination status. 177 """ 178 for state in range(self.state_space): 179 position, velocity, angle, angular_velocity = self.index_to_state(state) 180 for action in range(self.action_space): 181 next_state, reward, done = self.compute_next_state( 182 position, velocity, angle, angular_velocity, action 183 ) 184 self.P[state][action].append((1, next_state, reward, done)) 185 186 def index_to_state(self, index): 187 """ 188 Converts a single index into a multidimensional state representation. 189 190 Parameters: 191 - index (int): The flat index representing the state. 192 193 Returns: 194 - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins. 195 """ 196 totals = [ 197 self.position_bins, 198 self.velocity_bins, 199 len(self.angle_bins), 200 self.angular_velocity_bins, 201 ] 202 multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1] 203 components = [int((index // multipliers[i]) % totals[i]) for i in range(4)] 204 return components 205 206 def compute_next_state( 207 self, position_idx, velocity_idx, angle_idx, angular_velocity_idx, action 208 ): 209 """ 210 Computes the next state based on the current state indices and the action taken. Applies simplified physics calculations to determine the next state. 211 212 Parameters: 213 - position_idx (int): Current index of the cart's position. 214 - velocity_idx (int): Current index of the cart's velocity. 215 - angle_idx (int): Current index of the pole's angle. 216 - angular_velocity_idx (int): Current index of the pole's angular velocity. 217 - action (int): Action taken (0 for left, 1 for right). 218 219 Returns: 220 - tuple: Contains the next state index, the reward, and the done flag indicating if the episode has ended. 221 """ 222 position = np.linspace(*self.position_range, self.position_bins)[position_idx] 223 velocity = np.linspace(*self.velocity_range, self.velocity_bins)[velocity_idx] 224 angle = self.angle_bins[angle_idx] 225 angular_velocity = np.linspace( 226 *self.angular_velocity_range, self.angular_velocity_bins 227 )[angular_velocity_idx] 228 229 # Simulate physics here (simplified) 230 force = 10 if action == 1 else -10 231 new_velocity = velocity + (force + np.cos(angle) * -10.0) * 0.02 232 new_position = position + new_velocity * 0.02 233 new_angular_velocity = angular_velocity + (-3.0 * np.sin(angle)) * 0.02 234 new_angle = angle + new_angular_velocity * 0.02 235 236 new_position_idx = np.clip( 237 np.digitize( 238 new_position, np.linspace(*self.position_range, self.position_bins) 239 ) 240 - 1, 241 0, 242 self.position_bins - 1, 243 ) 244 new_velocity_idx = np.clip( 245 np.digitize( 246 new_velocity, np.linspace(*self.velocity_range, self.velocity_bins) 247 ) 248 - 1, 249 0, 250 self.velocity_bins - 1, 251 ) 252 new_angle_idx = np.clip( 253 np.digitize(new_angle, self.angle_bins) - 1, 0, len(self.angle_bins) - 1 254 ) 255 new_angular_velocity_idx = np.clip( 256 np.digitize( 257 new_angular_velocity, 258 np.linspace(*self.angular_velocity_range, self.angular_velocity_bins), 259 ) 260 - 1, 261 0, 262 self.angular_velocity_bins - 1, 263 ) 264 265 new_state_idx = np.ravel_multi_index( 266 ( 267 new_position_idx, 268 new_velocity_idx, 269 new_angle_idx, 270 new_angular_velocity_idx, 271 ), 272 ( 273 self.position_bins, 274 self.velocity_bins, 275 len(self.angle_bins), 276 self.angular_velocity_bins, 277 ), 278 ) 279 280 reward = 1 if np.abs(new_angle) < 12 * np.pi / 180 else -1 281 done = ( 282 True 283 if np.abs(new_angle) >= 12 * np.pi / 180 or np.abs(new_position) > 2.4 284 else False 285 ) 286 287 return new_state_idx, reward, done
15 def __init__( 16 self, 17 position_bins, 18 velocity_bins, 19 angular_velocity_bins, 20 angular_center_resolution, 21 angular_outer_resolution, 22 ): 23 """ 24 Initializes the DiscretizedCartPole model. 25 26 Parameters: 27 - position_bins (int): Number of discrete bins for the cart's position. 28 - velocity_bins (int): Number of discrete bins for the cart's velocity. 29 - angular_velocity_bins (int): Number of discrete bins for the pole's angular velocity. 30 - angular_center_resolution (float): The resolution of angle bins near the center (around zero). 31 - angular_outer_resolution (float): The resolution of angle bins away from the center. 32 33 Attributes: 34 - state_space (int): Total number of discrete states in the environment. 35 - P (dict): Transition probability matrix where P[state][action] is a list of tuples (probability, next_state, 36 reward, done). 37 - transform_obs (lambda): Function to transform continuous observations into a discrete state index. 38 """ 39 self.position_bins = position_bins 40 self.velocity_bins = velocity_bins 41 self.angular_velocity_bins = angular_velocity_bins 42 self.action_space = 2 # Left or Right 43 44 # Define the range for each variable 45 self.position_range = (-2.4, 2.4) 46 self.velocity_range = (-3, 3) 47 self.angle_range = (-12 * np.pi / 180, 12 * np.pi / 180) 48 self.angular_velocity_range = (-1.5, 1.5) 49 self.angular_center_resolution = angular_center_resolution 50 self.angular_outer_resolution = angular_outer_resolution 51 52 # Use adaptive binning for the pole angle 53 self.angle_bins = self.adaptive_angle_bins( 54 self.angle_range, 55 self.angular_center_resolution, 56 self.angular_outer_resolution, 57 ) # Adjust these values as needed 58 59 self.state_space = np.prod( 60 [ 61 self.position_bins, 62 self.velocity_bins, 63 len(self.angle_bins), 64 self.angular_velocity_bins, 65 ] 66 ) 67 self.P = { 68 state: {action: [] for action in range(self.action_space)} 69 for state in range(self.state_space) 70 } 71 self.setup_transition_probabilities() 72 self.n_states = ( 73 len(self.angle_bins) 74 * self.velocity_bins 75 * self.position_bins 76 * self.angular_velocity_bins 77 ) 78 """ 79 Explanation of transform_obs lambda: 80 This lambda function will take cartpole observations, determine which bins they fall into, 81 and then convert bin coordinates into a single index. This makes it possible 82 to use traditional reinforcement learning and planning algorithms, designed for discrete spaces, with continuous 83 state space environments. 84 85 Parameters: 86 - obs (list): A list of continuous observations [position, velocity, angle, angular_velocity]. 87 88 Returns: 89 - int: A single integer representing the discretized state index. 90 """ 91 self.transform_obs = lambda obs: ( 92 np.ravel_multi_index( 93 ( 94 np.clip( 95 np.digitize( 96 obs[0], 97 np.linspace(*self.position_range, self.position_bins), 98 ) 99 - 1, 100 0, 101 self.position_bins - 1, 102 ), 103 np.clip( 104 np.digitize( 105 obs[1], 106 np.linspace(*self.velocity_range, self.velocity_bins), 107 ) 108 - 1, 109 0, 110 self.velocity_bins - 1, 111 ), 112 np.clip( 113 np.digitize(obs[2], self.angle_bins) - 1, 114 0, 115 len(self.angle_bins) - 1, 116 ), 117 # Use adaptive angle bins 118 np.clip( 119 np.digitize( 120 obs[3], 121 np.linspace( 122 *self.angular_velocity_range, self.angular_velocity_bins 123 ), 124 ) 125 - 1, 126 0, 127 self.angular_velocity_bins - 1, 128 ), 129 ), 130 ( 131 self.position_bins, 132 self.velocity_bins, 133 len(self.angle_bins), 134 self.angular_velocity_bins, 135 ), 136 ) 137 )
Initializes the DiscretizedCartPole model.
Parameters:
- position_bins (int): Number of discrete bins for the cart's position.
- velocity_bins (int): Number of discrete bins for the cart's velocity.
- angular_velocity_bins (int): Number of discrete bins for the pole's angular velocity.
- angular_center_resolution (float): The resolution of angle bins near the center (around zero).
- angular_outer_resolution (float): The resolution of angle bins away from the center.
Attributes:
- state_space (int): Total number of discrete states in the environment.
- P (dict): Transition probability matrix where P[state][action] is a list of tuples (probability, next_state, reward, done).
- transform_obs (lambda): Function to transform continuous observations into a discrete state index.
Explanation of transform_obs lambda: This lambda function will take cartpole observations, determine which bins they fall into, and then convert bin coordinates into a single index. This makes it possible to use traditional reinforcement learning and planning algorithms, designed for discrete spaces, with continuous state space environments.
Parameters:
- obs (list): A list of continuous observations [position, velocity, angle, angular_velocity].
Returns:
- int: A single integer representing the discretized state index.
139 def adaptive_angle_bins(self, angle_range, center_resolution, outer_resolution): 140 """ 141 Generates adaptive bins for the pole's angle to allow for finer resolution near the center and coarser 142 resolution farther away. 143 144 Parameters: 145 - angle_range (tuple): The minimum and maximum angles in radians. 146 - center_resolution (float): Bin width near zero angle for higher resolution. 147 - outer_resolution (float): Bin width away from zero for lower resolution. 148 149 Returns: 150 - np.array: An array of bin edges with adaptive spacing. 151 """ 152 min_angle, max_angle = angle_range 153 # Generate finer bins around zero 154 center_bins = np.arange( 155 -center_resolution, center_resolution + 1e-6, center_resolution / 10 156 ) 157 # Generate sparser bins outside the center region 158 left_bins = np.linspace( 159 min_angle, 160 -center_resolution, 161 num=int((center_resolution - min_angle) / outer_resolution) + 1, 162 endpoint=False, 163 ) 164 right_bins = np.linspace( 165 center_resolution, 166 max_angle, 167 num=int((max_angle - center_resolution) / outer_resolution) + 1, 168 endpoint=True, 169 ) 170 return np.unique(np.concatenate([left_bins, center_bins, right_bins]))
Generates adaptive bins for the pole's angle to allow for finer resolution near the center and coarser resolution farther away.
Parameters:
- angle_range (tuple): The minimum and maximum angles in radians.
- center_resolution (float): Bin width near zero angle for higher resolution.
- outer_resolution (float): Bin width away from zero for lower resolution.
Returns:
- np.array: An array of bin edges with adaptive spacing.
172 def setup_transition_probabilities(self): 173 """ 174 Sets up the transition probabilities for the environment. This method iterates through all possible 175 states and actions, simulates the next state, and records the transition probability 176 (deterministic in this setup), reward, and termination status. 177 """ 178 for state in range(self.state_space): 179 position, velocity, angle, angular_velocity = self.index_to_state(state) 180 for action in range(self.action_space): 181 next_state, reward, done = self.compute_next_state( 182 position, velocity, angle, angular_velocity, action 183 ) 184 self.P[state][action].append((1, next_state, reward, done))
Sets up the transition probabilities for the environment. This method iterates through all possible states and actions, simulates the next state, and records the transition probability (deterministic in this setup), reward, and termination status.
186 def index_to_state(self, index): 187 """ 188 Converts a single index into a multidimensional state representation. 189 190 Parameters: 191 - index (int): The flat index representing the state. 192 193 Returns: 194 - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins. 195 """ 196 totals = [ 197 self.position_bins, 198 self.velocity_bins, 199 len(self.angle_bins), 200 self.angular_velocity_bins, 201 ] 202 multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1] 203 components = [int((index // multipliers[i]) % totals[i]) for i in range(4)] 204 return components
Converts a single index into a multidimensional state representation.
Parameters:
- index (int): The flat index representing the state.
Returns:
- list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
206 def compute_next_state( 207 self, position_idx, velocity_idx, angle_idx, angular_velocity_idx, action 208 ): 209 """ 210 Computes the next state based on the current state indices and the action taken. Applies simplified physics calculations to determine the next state. 211 212 Parameters: 213 - position_idx (int): Current index of the cart's position. 214 - velocity_idx (int): Current index of the cart's velocity. 215 - angle_idx (int): Current index of the pole's angle. 216 - angular_velocity_idx (int): Current index of the pole's angular velocity. 217 - action (int): Action taken (0 for left, 1 for right). 218 219 Returns: 220 - tuple: Contains the next state index, the reward, and the done flag indicating if the episode has ended. 221 """ 222 position = np.linspace(*self.position_range, self.position_bins)[position_idx] 223 velocity = np.linspace(*self.velocity_range, self.velocity_bins)[velocity_idx] 224 angle = self.angle_bins[angle_idx] 225 angular_velocity = np.linspace( 226 *self.angular_velocity_range, self.angular_velocity_bins 227 )[angular_velocity_idx] 228 229 # Simulate physics here (simplified) 230 force = 10 if action == 1 else -10 231 new_velocity = velocity + (force + np.cos(angle) * -10.0) * 0.02 232 new_position = position + new_velocity * 0.02 233 new_angular_velocity = angular_velocity + (-3.0 * np.sin(angle)) * 0.02 234 new_angle = angle + new_angular_velocity * 0.02 235 236 new_position_idx = np.clip( 237 np.digitize( 238 new_position, np.linspace(*self.position_range, self.position_bins) 239 ) 240 - 1, 241 0, 242 self.position_bins - 1, 243 ) 244 new_velocity_idx = np.clip( 245 np.digitize( 246 new_velocity, np.linspace(*self.velocity_range, self.velocity_bins) 247 ) 248 - 1, 249 0, 250 self.velocity_bins - 1, 251 ) 252 new_angle_idx = np.clip( 253 np.digitize(new_angle, self.angle_bins) - 1, 0, len(self.angle_bins) - 1 254 ) 255 new_angular_velocity_idx = np.clip( 256 np.digitize( 257 new_angular_velocity, 258 np.linspace(*self.angular_velocity_range, self.angular_velocity_bins), 259 ) 260 - 1, 261 0, 262 self.angular_velocity_bins - 1, 263 ) 264 265 new_state_idx = np.ravel_multi_index( 266 ( 267 new_position_idx, 268 new_velocity_idx, 269 new_angle_idx, 270 new_angular_velocity_idx, 271 ), 272 ( 273 self.position_bins, 274 self.velocity_bins, 275 len(self.angle_bins), 276 self.angular_velocity_bins, 277 ), 278 ) 279 280 reward = 1 if np.abs(new_angle) < 12 * np.pi / 180 else -1 281 done = ( 282 True 283 if np.abs(new_angle) >= 12 * np.pi / 180 or np.abs(new_position) > 2.4 284 else False 285 ) 286 287 return new_state_idx, reward, done
Computes the next state based on the current state indices and the action taken. Applies simplified physics calculations to determine the next state.
Parameters:
- position_idx (int): Current index of the cart's position.
- velocity_idx (int): Current index of the cart's velocity.
- angle_idx (int): Current index of the pole's angle.
- angular_velocity_idx (int): Current index of the pole's angular velocity.
- action (int): Action taken (0 for left, 1 for right).
Returns:
- tuple: Contains the next state index, the reward, and the done flag indicating if the episode has ended.