bettermdptools.envs.cartpole_model

generative AI experiment - discretized cartpole transition and reward (P) matrix with adaptive angle binning created with chatGPT

Example usage: dpole = DiscretizedCartPole(10, 10, 10, .1, .5) # Example bin sizes for each variable and adaptive angle binning center/outer resolution

  1"""
  2generative AI experiment - discretized cartpole transition and reward (P) matrix with adaptive angle binning
  3created with chatGPT
  4
  5Example usage:
  6dpole = DiscretizedCartPole(10, 10, 10, .1, .5)  # Example bin sizes for each variable and adaptive angle binning center/outer resolution
  7
  8"""
  9
 10import numpy as np
 11
 12
 13class DiscretizedCartPole:
 14    def __init__(
 15        self,
 16        position_bins,
 17        velocity_bins,
 18        angular_velocity_bins,
 19        angular_center_resolution,
 20        angular_outer_resolution,
 21    ):
 22        """
 23        Initializes the DiscretizedCartPole model.
 24
 25        Parameters:
 26        - position_bins (int): Number of discrete bins for the cart's position.
 27        - velocity_bins (int): Number of discrete bins for the cart's velocity.
 28        - angular_velocity_bins (int): Number of discrete bins for the pole's angular velocity.
 29        - angular_center_resolution (float): The resolution of angle bins near the center (around zero).
 30        - angular_outer_resolution (float): The resolution of angle bins away from the center.
 31
 32        Attributes:
 33        - state_space (int): Total number of discrete states in the environment.
 34        - P (dict): Transition probability matrix where P[state][action] is a list of tuples (probability, next_state,
 35        reward, done).
 36        - transform_obs (lambda): Function to transform continuous observations into a discrete state index.
 37        """
 38        self.position_bins = position_bins
 39        self.velocity_bins = velocity_bins
 40        self.angular_velocity_bins = angular_velocity_bins
 41        self.action_space = 2  # Left or Right
 42
 43        # Define the range for each variable
 44        self.position_range = (-2.4, 2.4)
 45        self.velocity_range = (-3, 3)
 46        self.angle_range = (-12 * np.pi / 180, 12 * np.pi / 180)
 47        self.angular_velocity_range = (-1.5, 1.5)
 48        self.angular_center_resolution = angular_center_resolution
 49        self.angular_outer_resolution = angular_outer_resolution
 50
 51        # Use adaptive binning for the pole angle
 52        self.angle_bins = self.adaptive_angle_bins(
 53            self.angle_range,
 54            self.angular_center_resolution,
 55            self.angular_outer_resolution,
 56        )  # Adjust these values as needed
 57
 58        self.state_space = np.prod(
 59            [
 60                self.position_bins,
 61                self.velocity_bins,
 62                len(self.angle_bins),
 63                self.angular_velocity_bins,
 64            ]
 65        )
 66        self.P = {
 67            state: {action: [] for action in range(self.action_space)}
 68            for state in range(self.state_space)
 69        }
 70        self.setup_transition_probabilities()
 71        self.n_states = (
 72            len(self.angle_bins)
 73            * self.velocity_bins
 74            * self.position_bins
 75            * self.angular_velocity_bins
 76        )
 77        """
 78        Explanation of transform_obs lambda: 
 79        This lambda function will take cartpole observations, determine which bins they fall into, 
 80        and then convert bin coordinates into a single index.  This makes it possible 
 81        to use traditional reinforcement learning and planning algorithms, designed for discrete spaces, with continuous 
 82        state space environments. 
 83        
 84        Parameters:
 85        - obs (list): A list of continuous observations [position, velocity, angle, angular_velocity].
 86
 87        Returns:
 88        - int: A single integer representing the discretized state index.
 89        """
 90        self.transform_obs = lambda obs: (
 91            np.ravel_multi_index(
 92                (
 93                    np.clip(
 94                        np.digitize(
 95                            obs[0],
 96                            np.linspace(*self.position_range, self.position_bins),
 97                        )
 98                        - 1,
 99                        0,
100                        self.position_bins - 1,
101                    ),
102                    np.clip(
103                        np.digitize(
104                            obs[1],
105                            np.linspace(*self.velocity_range, self.velocity_bins),
106                        )
107                        - 1,
108                        0,
109                        self.velocity_bins - 1,
110                    ),
111                    np.clip(
112                        np.digitize(obs[2], self.angle_bins) - 1,
113                        0,
114                        len(self.angle_bins) - 1,
115                    ),
116                    # Use adaptive angle bins
117                    np.clip(
118                        np.digitize(
119                            obs[3],
120                            np.linspace(
121                                *self.angular_velocity_range, self.angular_velocity_bins
122                            ),
123                        )
124                        - 1,
125                        0,
126                        self.angular_velocity_bins - 1,
127                    ),
128                ),
129                (
130                    self.position_bins,
131                    self.velocity_bins,
132                    len(self.angle_bins),
133                    self.angular_velocity_bins,
134                ),
135            )
136        )
137
138    def adaptive_angle_bins(self, angle_range, center_resolution, outer_resolution):
139        """
140        Generates adaptive bins for the pole's angle to allow for finer resolution near the center and coarser
141        resolution farther away.
142
143        Parameters:
144        - angle_range (tuple): The minimum and maximum angles in radians.
145        - center_resolution (float): Bin width near zero angle for higher resolution.
146        - outer_resolution (float): Bin width away from zero for lower resolution.
147
148        Returns:
149        - np.array: An array of bin edges with adaptive spacing.
150        """
151        min_angle, max_angle = angle_range
152        # Generate finer bins around zero
153        center_bins = np.arange(
154            -center_resolution, center_resolution + 1e-6, center_resolution / 10
155        )
156        # Generate sparser bins outside the center region
157        left_bins = np.linspace(
158            min_angle,
159            -center_resolution,
160            num=int((center_resolution - min_angle) / outer_resolution) + 1,
161            endpoint=False,
162        )
163        right_bins = np.linspace(
164            center_resolution,
165            max_angle,
166            num=int((max_angle - center_resolution) / outer_resolution) + 1,
167            endpoint=True,
168        )
169        return np.unique(np.concatenate([left_bins, center_bins, right_bins]))
170
171    def setup_transition_probabilities(self):
172        """
173        Sets up the transition probabilities for the environment. This method iterates through all possible
174        states and actions, simulates the next state, and records the transition probability
175        (deterministic in this setup), reward, and termination status.
176        """
177        for state in range(self.state_space):
178            position, velocity, angle, angular_velocity = self.index_to_state(state)
179            for action in range(self.action_space):
180                next_state, reward, done = self.compute_next_state(
181                    position, velocity, angle, angular_velocity, action
182                )
183                self.P[state][action].append((1, next_state, reward, done))
184
185    def index_to_state(self, index):
186        """
187        Converts a single index into a multidimensional state representation.
188
189        Parameters:
190        - index (int): The flat index representing the state.
191
192        Returns:
193        - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
194        """
195        totals = [
196            self.position_bins,
197            self.velocity_bins,
198            len(self.angle_bins),
199            self.angular_velocity_bins,
200        ]
201        multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1]
202        components = [int((index // multipliers[i]) % totals[i]) for i in range(4)]
203        return components
204
205    def compute_next_state(
206        self, position_idx, velocity_idx, angle_idx, angular_velocity_idx, action
207    ):
208        """
209        Computes the next state based on the current state indices and the action taken. Applies simplified physics calculations to determine the next state.
210
211        Parameters:
212        - position_idx (int): Current index of the cart's position.
213        - velocity_idx (int): Current index of the cart's velocity.
214        - angle_idx (int): Current index of the pole's angle.
215        - angular_velocity_idx (int): Current index of the pole's angular velocity.
216        - action (int): Action taken (0 for left, 1 for right).
217
218        Returns:
219        - tuple: Contains the next state index, the reward, and the done flag indicating if the episode has ended.
220        """
221        position = np.linspace(*self.position_range, self.position_bins)[position_idx]
222        velocity = np.linspace(*self.velocity_range, self.velocity_bins)[velocity_idx]
223        angle = self.angle_bins[angle_idx]
224        angular_velocity = np.linspace(
225            *self.angular_velocity_range, self.angular_velocity_bins
226        )[angular_velocity_idx]
227
228        # Simulate physics here (simplified)
229        force = 10 if action == 1 else -10
230        new_velocity = velocity + (force + np.cos(angle) * -10.0) * 0.02
231        new_position = position + new_velocity * 0.02
232        new_angular_velocity = angular_velocity + (-3.0 * np.sin(angle)) * 0.02
233        new_angle = angle + new_angular_velocity * 0.02
234
235        new_position_idx = np.clip(
236            np.digitize(
237                new_position, np.linspace(*self.position_range, self.position_bins)
238            )
239            - 1,
240            0,
241            self.position_bins - 1,
242        )
243        new_velocity_idx = np.clip(
244            np.digitize(
245                new_velocity, np.linspace(*self.velocity_range, self.velocity_bins)
246            )
247            - 1,
248            0,
249            self.velocity_bins - 1,
250        )
251        new_angle_idx = np.clip(
252            np.digitize(new_angle, self.angle_bins) - 1, 0, len(self.angle_bins) - 1
253        )
254        new_angular_velocity_idx = np.clip(
255            np.digitize(
256                new_angular_velocity,
257                np.linspace(*self.angular_velocity_range, self.angular_velocity_bins),
258            )
259            - 1,
260            0,
261            self.angular_velocity_bins - 1,
262        )
263
264        new_state_idx = np.ravel_multi_index(
265            (
266                new_position_idx,
267                new_velocity_idx,
268                new_angle_idx,
269                new_angular_velocity_idx,
270            ),
271            (
272                self.position_bins,
273                self.velocity_bins,
274                len(self.angle_bins),
275                self.angular_velocity_bins,
276            ),
277        )
278
279        reward = 1 if np.abs(new_angle) < 12 * np.pi / 180 else -1
280        done = (
281            True
282            if np.abs(new_angle) >= 12 * np.pi / 180 or np.abs(new_position) > 2.4
283            else False
284        )
285
286        return new_state_idx, reward, done
class DiscretizedCartPole:
 14class DiscretizedCartPole:
 15    def __init__(
 16        self,
 17        position_bins,
 18        velocity_bins,
 19        angular_velocity_bins,
 20        angular_center_resolution,
 21        angular_outer_resolution,
 22    ):
 23        """
 24        Initializes the DiscretizedCartPole model.
 25
 26        Parameters:
 27        - position_bins (int): Number of discrete bins for the cart's position.
 28        - velocity_bins (int): Number of discrete bins for the cart's velocity.
 29        - angular_velocity_bins (int): Number of discrete bins for the pole's angular velocity.
 30        - angular_center_resolution (float): The resolution of angle bins near the center (around zero).
 31        - angular_outer_resolution (float): The resolution of angle bins away from the center.
 32
 33        Attributes:
 34        - state_space (int): Total number of discrete states in the environment.
 35        - P (dict): Transition probability matrix where P[state][action] is a list of tuples (probability, next_state,
 36        reward, done).
 37        - transform_obs (lambda): Function to transform continuous observations into a discrete state index.
 38        """
 39        self.position_bins = position_bins
 40        self.velocity_bins = velocity_bins
 41        self.angular_velocity_bins = angular_velocity_bins
 42        self.action_space = 2  # Left or Right
 43
 44        # Define the range for each variable
 45        self.position_range = (-2.4, 2.4)
 46        self.velocity_range = (-3, 3)
 47        self.angle_range = (-12 * np.pi / 180, 12 * np.pi / 180)
 48        self.angular_velocity_range = (-1.5, 1.5)
 49        self.angular_center_resolution = angular_center_resolution
 50        self.angular_outer_resolution = angular_outer_resolution
 51
 52        # Use adaptive binning for the pole angle
 53        self.angle_bins = self.adaptive_angle_bins(
 54            self.angle_range,
 55            self.angular_center_resolution,
 56            self.angular_outer_resolution,
 57        )  # Adjust these values as needed
 58
 59        self.state_space = np.prod(
 60            [
 61                self.position_bins,
 62                self.velocity_bins,
 63                len(self.angle_bins),
 64                self.angular_velocity_bins,
 65            ]
 66        )
 67        self.P = {
 68            state: {action: [] for action in range(self.action_space)}
 69            for state in range(self.state_space)
 70        }
 71        self.setup_transition_probabilities()
 72        self.n_states = (
 73            len(self.angle_bins)
 74            * self.velocity_bins
 75            * self.position_bins
 76            * self.angular_velocity_bins
 77        )
 78        """
 79        Explanation of transform_obs lambda: 
 80        This lambda function will take cartpole observations, determine which bins they fall into, 
 81        and then convert bin coordinates into a single index.  This makes it possible 
 82        to use traditional reinforcement learning and planning algorithms, designed for discrete spaces, with continuous 
 83        state space environments. 
 84        
 85        Parameters:
 86        - obs (list): A list of continuous observations [position, velocity, angle, angular_velocity].
 87
 88        Returns:
 89        - int: A single integer representing the discretized state index.
 90        """
 91        self.transform_obs = lambda obs: (
 92            np.ravel_multi_index(
 93                (
 94                    np.clip(
 95                        np.digitize(
 96                            obs[0],
 97                            np.linspace(*self.position_range, self.position_bins),
 98                        )
 99                        - 1,
100                        0,
101                        self.position_bins - 1,
102                    ),
103                    np.clip(
104                        np.digitize(
105                            obs[1],
106                            np.linspace(*self.velocity_range, self.velocity_bins),
107                        )
108                        - 1,
109                        0,
110                        self.velocity_bins - 1,
111                    ),
112                    np.clip(
113                        np.digitize(obs[2], self.angle_bins) - 1,
114                        0,
115                        len(self.angle_bins) - 1,
116                    ),
117                    # Use adaptive angle bins
118                    np.clip(
119                        np.digitize(
120                            obs[3],
121                            np.linspace(
122                                *self.angular_velocity_range, self.angular_velocity_bins
123                            ),
124                        )
125                        - 1,
126                        0,
127                        self.angular_velocity_bins - 1,
128                    ),
129                ),
130                (
131                    self.position_bins,
132                    self.velocity_bins,
133                    len(self.angle_bins),
134                    self.angular_velocity_bins,
135                ),
136            )
137        )
138
139    def adaptive_angle_bins(self, angle_range, center_resolution, outer_resolution):
140        """
141        Generates adaptive bins for the pole's angle to allow for finer resolution near the center and coarser
142        resolution farther away.
143
144        Parameters:
145        - angle_range (tuple): The minimum and maximum angles in radians.
146        - center_resolution (float): Bin width near zero angle for higher resolution.
147        - outer_resolution (float): Bin width away from zero for lower resolution.
148
149        Returns:
150        - np.array: An array of bin edges with adaptive spacing.
151        """
152        min_angle, max_angle = angle_range
153        # Generate finer bins around zero
154        center_bins = np.arange(
155            -center_resolution, center_resolution + 1e-6, center_resolution / 10
156        )
157        # Generate sparser bins outside the center region
158        left_bins = np.linspace(
159            min_angle,
160            -center_resolution,
161            num=int((center_resolution - min_angle) / outer_resolution) + 1,
162            endpoint=False,
163        )
164        right_bins = np.linspace(
165            center_resolution,
166            max_angle,
167            num=int((max_angle - center_resolution) / outer_resolution) + 1,
168            endpoint=True,
169        )
170        return np.unique(np.concatenate([left_bins, center_bins, right_bins]))
171
172    def setup_transition_probabilities(self):
173        """
174        Sets up the transition probabilities for the environment. This method iterates through all possible
175        states and actions, simulates the next state, and records the transition probability
176        (deterministic in this setup), reward, and termination status.
177        """
178        for state in range(self.state_space):
179            position, velocity, angle, angular_velocity = self.index_to_state(state)
180            for action in range(self.action_space):
181                next_state, reward, done = self.compute_next_state(
182                    position, velocity, angle, angular_velocity, action
183                )
184                self.P[state][action].append((1, next_state, reward, done))
185
186    def index_to_state(self, index):
187        """
188        Converts a single index into a multidimensional state representation.
189
190        Parameters:
191        - index (int): The flat index representing the state.
192
193        Returns:
194        - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
195        """
196        totals = [
197            self.position_bins,
198            self.velocity_bins,
199            len(self.angle_bins),
200            self.angular_velocity_bins,
201        ]
202        multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1]
203        components = [int((index // multipliers[i]) % totals[i]) for i in range(4)]
204        return components
205
206    def compute_next_state(
207        self, position_idx, velocity_idx, angle_idx, angular_velocity_idx, action
208    ):
209        """
210        Computes the next state based on the current state indices and the action taken. Applies simplified physics calculations to determine the next state.
211
212        Parameters:
213        - position_idx (int): Current index of the cart's position.
214        - velocity_idx (int): Current index of the cart's velocity.
215        - angle_idx (int): Current index of the pole's angle.
216        - angular_velocity_idx (int): Current index of the pole's angular velocity.
217        - action (int): Action taken (0 for left, 1 for right).
218
219        Returns:
220        - tuple: Contains the next state index, the reward, and the done flag indicating if the episode has ended.
221        """
222        position = np.linspace(*self.position_range, self.position_bins)[position_idx]
223        velocity = np.linspace(*self.velocity_range, self.velocity_bins)[velocity_idx]
224        angle = self.angle_bins[angle_idx]
225        angular_velocity = np.linspace(
226            *self.angular_velocity_range, self.angular_velocity_bins
227        )[angular_velocity_idx]
228
229        # Simulate physics here (simplified)
230        force = 10 if action == 1 else -10
231        new_velocity = velocity + (force + np.cos(angle) * -10.0) * 0.02
232        new_position = position + new_velocity * 0.02
233        new_angular_velocity = angular_velocity + (-3.0 * np.sin(angle)) * 0.02
234        new_angle = angle + new_angular_velocity * 0.02
235
236        new_position_idx = np.clip(
237            np.digitize(
238                new_position, np.linspace(*self.position_range, self.position_bins)
239            )
240            - 1,
241            0,
242            self.position_bins - 1,
243        )
244        new_velocity_idx = np.clip(
245            np.digitize(
246                new_velocity, np.linspace(*self.velocity_range, self.velocity_bins)
247            )
248            - 1,
249            0,
250            self.velocity_bins - 1,
251        )
252        new_angle_idx = np.clip(
253            np.digitize(new_angle, self.angle_bins) - 1, 0, len(self.angle_bins) - 1
254        )
255        new_angular_velocity_idx = np.clip(
256            np.digitize(
257                new_angular_velocity,
258                np.linspace(*self.angular_velocity_range, self.angular_velocity_bins),
259            )
260            - 1,
261            0,
262            self.angular_velocity_bins - 1,
263        )
264
265        new_state_idx = np.ravel_multi_index(
266            (
267                new_position_idx,
268                new_velocity_idx,
269                new_angle_idx,
270                new_angular_velocity_idx,
271            ),
272            (
273                self.position_bins,
274                self.velocity_bins,
275                len(self.angle_bins),
276                self.angular_velocity_bins,
277            ),
278        )
279
280        reward = 1 if np.abs(new_angle) < 12 * np.pi / 180 else -1
281        done = (
282            True
283            if np.abs(new_angle) >= 12 * np.pi / 180 or np.abs(new_position) > 2.4
284            else False
285        )
286
287        return new_state_idx, reward, done
DiscretizedCartPole( position_bins, velocity_bins, angular_velocity_bins, angular_center_resolution, angular_outer_resolution)
 15    def __init__(
 16        self,
 17        position_bins,
 18        velocity_bins,
 19        angular_velocity_bins,
 20        angular_center_resolution,
 21        angular_outer_resolution,
 22    ):
 23        """
 24        Initializes the DiscretizedCartPole model.
 25
 26        Parameters:
 27        - position_bins (int): Number of discrete bins for the cart's position.
 28        - velocity_bins (int): Number of discrete bins for the cart's velocity.
 29        - angular_velocity_bins (int): Number of discrete bins for the pole's angular velocity.
 30        - angular_center_resolution (float): The resolution of angle bins near the center (around zero).
 31        - angular_outer_resolution (float): The resolution of angle bins away from the center.
 32
 33        Attributes:
 34        - state_space (int): Total number of discrete states in the environment.
 35        - P (dict): Transition probability matrix where P[state][action] is a list of tuples (probability, next_state,
 36        reward, done).
 37        - transform_obs (lambda): Function to transform continuous observations into a discrete state index.
 38        """
 39        self.position_bins = position_bins
 40        self.velocity_bins = velocity_bins
 41        self.angular_velocity_bins = angular_velocity_bins
 42        self.action_space = 2  # Left or Right
 43
 44        # Define the range for each variable
 45        self.position_range = (-2.4, 2.4)
 46        self.velocity_range = (-3, 3)
 47        self.angle_range = (-12 * np.pi / 180, 12 * np.pi / 180)
 48        self.angular_velocity_range = (-1.5, 1.5)
 49        self.angular_center_resolution = angular_center_resolution
 50        self.angular_outer_resolution = angular_outer_resolution
 51
 52        # Use adaptive binning for the pole angle
 53        self.angle_bins = self.adaptive_angle_bins(
 54            self.angle_range,
 55            self.angular_center_resolution,
 56            self.angular_outer_resolution,
 57        )  # Adjust these values as needed
 58
 59        self.state_space = np.prod(
 60            [
 61                self.position_bins,
 62                self.velocity_bins,
 63                len(self.angle_bins),
 64                self.angular_velocity_bins,
 65            ]
 66        )
 67        self.P = {
 68            state: {action: [] for action in range(self.action_space)}
 69            for state in range(self.state_space)
 70        }
 71        self.setup_transition_probabilities()
 72        self.n_states = (
 73            len(self.angle_bins)
 74            * self.velocity_bins
 75            * self.position_bins
 76            * self.angular_velocity_bins
 77        )
 78        """
 79        Explanation of transform_obs lambda: 
 80        This lambda function will take cartpole observations, determine which bins they fall into, 
 81        and then convert bin coordinates into a single index.  This makes it possible 
 82        to use traditional reinforcement learning and planning algorithms, designed for discrete spaces, with continuous 
 83        state space environments. 
 84        
 85        Parameters:
 86        - obs (list): A list of continuous observations [position, velocity, angle, angular_velocity].
 87
 88        Returns:
 89        - int: A single integer representing the discretized state index.
 90        """
 91        self.transform_obs = lambda obs: (
 92            np.ravel_multi_index(
 93                (
 94                    np.clip(
 95                        np.digitize(
 96                            obs[0],
 97                            np.linspace(*self.position_range, self.position_bins),
 98                        )
 99                        - 1,
100                        0,
101                        self.position_bins - 1,
102                    ),
103                    np.clip(
104                        np.digitize(
105                            obs[1],
106                            np.linspace(*self.velocity_range, self.velocity_bins),
107                        )
108                        - 1,
109                        0,
110                        self.velocity_bins - 1,
111                    ),
112                    np.clip(
113                        np.digitize(obs[2], self.angle_bins) - 1,
114                        0,
115                        len(self.angle_bins) - 1,
116                    ),
117                    # Use adaptive angle bins
118                    np.clip(
119                        np.digitize(
120                            obs[3],
121                            np.linspace(
122                                *self.angular_velocity_range, self.angular_velocity_bins
123                            ),
124                        )
125                        - 1,
126                        0,
127                        self.angular_velocity_bins - 1,
128                    ),
129                ),
130                (
131                    self.position_bins,
132                    self.velocity_bins,
133                    len(self.angle_bins),
134                    self.angular_velocity_bins,
135                ),
136            )
137        )

Initializes the DiscretizedCartPole model.

Parameters:

  • position_bins (int): Number of discrete bins for the cart's position.
  • velocity_bins (int): Number of discrete bins for the cart's velocity.
  • angular_velocity_bins (int): Number of discrete bins for the pole's angular velocity.
  • angular_center_resolution (float): The resolution of angle bins near the center (around zero).
  • angular_outer_resolution (float): The resolution of angle bins away from the center.

Attributes:

  • state_space (int): Total number of discrete states in the environment.
  • P (dict): Transition probability matrix where P[state][action] is a list of tuples (probability, next_state, reward, done).
  • transform_obs (lambda): Function to transform continuous observations into a discrete state index.
position_bins
velocity_bins
angular_velocity_bins
action_space
position_range
velocity_range
angle_range
angular_velocity_range
angular_center_resolution
angular_outer_resolution
angle_bins
state_space
P
n_states

Explanation of transform_obs lambda: This lambda function will take cartpole observations, determine which bins they fall into, and then convert bin coordinates into a single index. This makes it possible to use traditional reinforcement learning and planning algorithms, designed for discrete spaces, with continuous state space environments.

Parameters:

  • obs (list): A list of continuous observations [position, velocity, angle, angular_velocity].

Returns:

  • int: A single integer representing the discretized state index.
transform_obs
def adaptive_angle_bins(self, angle_range, center_resolution, outer_resolution):
139    def adaptive_angle_bins(self, angle_range, center_resolution, outer_resolution):
140        """
141        Generates adaptive bins for the pole's angle to allow for finer resolution near the center and coarser
142        resolution farther away.
143
144        Parameters:
145        - angle_range (tuple): The minimum and maximum angles in radians.
146        - center_resolution (float): Bin width near zero angle for higher resolution.
147        - outer_resolution (float): Bin width away from zero for lower resolution.
148
149        Returns:
150        - np.array: An array of bin edges with adaptive spacing.
151        """
152        min_angle, max_angle = angle_range
153        # Generate finer bins around zero
154        center_bins = np.arange(
155            -center_resolution, center_resolution + 1e-6, center_resolution / 10
156        )
157        # Generate sparser bins outside the center region
158        left_bins = np.linspace(
159            min_angle,
160            -center_resolution,
161            num=int((center_resolution - min_angle) / outer_resolution) + 1,
162            endpoint=False,
163        )
164        right_bins = np.linspace(
165            center_resolution,
166            max_angle,
167            num=int((max_angle - center_resolution) / outer_resolution) + 1,
168            endpoint=True,
169        )
170        return np.unique(np.concatenate([left_bins, center_bins, right_bins]))

Generates adaptive bins for the pole's angle to allow for finer resolution near the center and coarser resolution farther away.

Parameters:

  • angle_range (tuple): The minimum and maximum angles in radians.
  • center_resolution (float): Bin width near zero angle for higher resolution.
  • outer_resolution (float): Bin width away from zero for lower resolution.

Returns:

  • np.array: An array of bin edges with adaptive spacing.
def setup_transition_probabilities(self):
172    def setup_transition_probabilities(self):
173        """
174        Sets up the transition probabilities for the environment. This method iterates through all possible
175        states and actions, simulates the next state, and records the transition probability
176        (deterministic in this setup), reward, and termination status.
177        """
178        for state in range(self.state_space):
179            position, velocity, angle, angular_velocity = self.index_to_state(state)
180            for action in range(self.action_space):
181                next_state, reward, done = self.compute_next_state(
182                    position, velocity, angle, angular_velocity, action
183                )
184                self.P[state][action].append((1, next_state, reward, done))

Sets up the transition probabilities for the environment. This method iterates through all possible states and actions, simulates the next state, and records the transition probability (deterministic in this setup), reward, and termination status.

def index_to_state(self, index):
186    def index_to_state(self, index):
187        """
188        Converts a single index into a multidimensional state representation.
189
190        Parameters:
191        - index (int): The flat index representing the state.
192
193        Returns:
194        - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
195        """
196        totals = [
197            self.position_bins,
198            self.velocity_bins,
199            len(self.angle_bins),
200            self.angular_velocity_bins,
201        ]
202        multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1]
203        components = [int((index // multipliers[i]) % totals[i]) for i in range(4)]
204        return components

Converts a single index into a multidimensional state representation.

Parameters:

  • index (int): The flat index representing the state.

Returns:

  • list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
def compute_next_state( self, position_idx, velocity_idx, angle_idx, angular_velocity_idx, action):
206    def compute_next_state(
207        self, position_idx, velocity_idx, angle_idx, angular_velocity_idx, action
208    ):
209        """
210        Computes the next state based on the current state indices and the action taken. Applies simplified physics calculations to determine the next state.
211
212        Parameters:
213        - position_idx (int): Current index of the cart's position.
214        - velocity_idx (int): Current index of the cart's velocity.
215        - angle_idx (int): Current index of the pole's angle.
216        - angular_velocity_idx (int): Current index of the pole's angular velocity.
217        - action (int): Action taken (0 for left, 1 for right).
218
219        Returns:
220        - tuple: Contains the next state index, the reward, and the done flag indicating if the episode has ended.
221        """
222        position = np.linspace(*self.position_range, self.position_bins)[position_idx]
223        velocity = np.linspace(*self.velocity_range, self.velocity_bins)[velocity_idx]
224        angle = self.angle_bins[angle_idx]
225        angular_velocity = np.linspace(
226            *self.angular_velocity_range, self.angular_velocity_bins
227        )[angular_velocity_idx]
228
229        # Simulate physics here (simplified)
230        force = 10 if action == 1 else -10
231        new_velocity = velocity + (force + np.cos(angle) * -10.0) * 0.02
232        new_position = position + new_velocity * 0.02
233        new_angular_velocity = angular_velocity + (-3.0 * np.sin(angle)) * 0.02
234        new_angle = angle + new_angular_velocity * 0.02
235
236        new_position_idx = np.clip(
237            np.digitize(
238                new_position, np.linspace(*self.position_range, self.position_bins)
239            )
240            - 1,
241            0,
242            self.position_bins - 1,
243        )
244        new_velocity_idx = np.clip(
245            np.digitize(
246                new_velocity, np.linspace(*self.velocity_range, self.velocity_bins)
247            )
248            - 1,
249            0,
250            self.velocity_bins - 1,
251        )
252        new_angle_idx = np.clip(
253            np.digitize(new_angle, self.angle_bins) - 1, 0, len(self.angle_bins) - 1
254        )
255        new_angular_velocity_idx = np.clip(
256            np.digitize(
257                new_angular_velocity,
258                np.linspace(*self.angular_velocity_range, self.angular_velocity_bins),
259            )
260            - 1,
261            0,
262            self.angular_velocity_bins - 1,
263        )
264
265        new_state_idx = np.ravel_multi_index(
266            (
267                new_position_idx,
268                new_velocity_idx,
269                new_angle_idx,
270                new_angular_velocity_idx,
271            ),
272            (
273                self.position_bins,
274                self.velocity_bins,
275                len(self.angle_bins),
276                self.angular_velocity_bins,
277            ),
278        )
279
280        reward = 1 if np.abs(new_angle) < 12 * np.pi / 180 else -1
281        done = (
282            True
283            if np.abs(new_angle) >= 12 * np.pi / 180 or np.abs(new_position) > 2.4
284            else False
285        )
286
287        return new_state_idx, reward, done

Computes the next state based on the current state indices and the action taken. Applies simplified physics calculations to determine the next state.

Parameters:

  • position_idx (int): Current index of the cart's position.
  • velocity_idx (int): Current index of the cart's velocity.
  • angle_idx (int): Current index of the pole's angle.
  • angular_velocity_idx (int): Current index of the pole's angular velocity.
  • action (int): Action taken (0 for left, 1 for right).

Returns:

  • tuple: Contains the next state index, the reward, and the done flag indicating if the episode has ended.