envs.cartpole_model

generative AI experiment - discretized cartpole transition and reward (P) matrix with adaptive angle binning created with chatGPT

Example usage: dpole = DiscretizedCartPole(10, 10, 10, .1, .5) # Example bin sizes for each variable and adaptive angle binning center/outer resolution

  1"""
  2generative AI experiment - discretized cartpole transition and reward (P) matrix with adaptive angle binning
  3created with chatGPT
  4
  5Example usage:
  6dpole = DiscretizedCartPole(10, 10, 10, .1, .5)  # Example bin sizes for each variable and adaptive angle binning center/outer resolution
  7
  8"""
  9
 10import numpy as np
 11
 12
 13class DiscretizedCartPole:
 14    def __init__(self,
 15                 position_bins,
 16                 velocity_bins,
 17                 angular_velocity_bins,
 18                 angular_center_resolution,
 19                 angular_outer_resolution):
 20
 21        """
 22         Initializes the DiscretizedCartPole model.
 23
 24         Parameters:
 25         - position_bins (int): Number of discrete bins for the cart's position.
 26         - velocity_bins (int): Number of discrete bins for the cart's velocity.
 27         - angular_velocity_bins (int): Number of discrete bins for the pole's angular velocity.
 28         - angular_center_resolution (float): The resolution of angle bins near the center (around zero).
 29         - angular_outer_resolution (float): The resolution of angle bins away from the center.
 30
 31         Attributes:
 32         - state_space (int): Total number of discrete states in the environment.
 33         - P (dict): Transition probability matrix where P[state][action] is a list of tuples (probability, next_state,
 34         reward, done).
 35         - transform_obs (lambda): Function to transform continuous observations into a discrete state index.
 36         """
 37        self.position_bins = position_bins
 38        self.velocity_bins = velocity_bins
 39        self.angular_velocity_bins = angular_velocity_bins
 40        self.action_space = 2  # Left or Right
 41
 42        # Define the range for each variable
 43        self.position_range = (-2.4, 2.4)
 44        self.velocity_range = (-3, 3)
 45        self.angle_range = (-12 * np.pi / 180, 12 * np.pi / 180)
 46        self.angular_velocity_range = (-1.5, 1.5)
 47        self.angular_center_resolution = angular_center_resolution
 48        self.angular_outer_resolution = angular_outer_resolution
 49
 50        # Use adaptive binning for the pole angle
 51        self.angle_bins = self.adaptive_angle_bins(self.angle_range, self.angular_center_resolution, self.angular_outer_resolution)  # Adjust these values as needed
 52
 53        self.state_space = np.prod([self.position_bins, self.velocity_bins, len(self.angle_bins), self.angular_velocity_bins])
 54        self.P = {state: {action: [] for action in range(self.action_space)} for state in range(self.state_space)}
 55        self.setup_transition_probabilities()
 56        self.n_states = len(self.angle_bins)*self.velocity_bins*self.position_bins*self.angular_velocity_bins
 57        """
 58        Explanation of transform_obs lambda: 
 59        This lambda function will take cartpole observations, determine which bins they fall into, 
 60        and then convert bin coordinates into a single index.  This makes it possible 
 61        to use traditional reinforcement learning and planning algorithms, designed for discrete spaces, with continuous 
 62        state space environments. 
 63        
 64        Parameters:
 65        - obs (list): A list of continuous observations [position, velocity, angle, angular_velocity].
 66
 67        Returns:
 68        - int: A single integer representing the discretized state index.
 69        """
 70        self.transform_obs = lambda obs: (
 71            np.ravel_multi_index((
 72                np.clip(np.digitize(obs[0], np.linspace(*self.position_range, self.position_bins)) - 1, 0,
 73                        self.position_bins - 1),
 74                np.clip(np.digitize(obs[1], np.linspace(*self.velocity_range, self.velocity_bins)) - 1, 0,
 75                        self.velocity_bins - 1),
 76                np.clip(np.digitize(obs[2], self.angle_bins) - 1, 0, len(self.angle_bins) - 1),
 77                # Use adaptive angle bins
 78                np.clip(np.digitize(obs[3], np.linspace(*self.angular_velocity_range, self.angular_velocity_bins)) - 1,
 79                        0, self.angular_velocity_bins - 1)
 80            ), (self.position_bins, self.velocity_bins, len(self.angle_bins), self.angular_velocity_bins))
 81        )
 82
 83    def adaptive_angle_bins(self, angle_range, center_resolution, outer_resolution):
 84        """
 85        Generates adaptive bins for the pole's angle to allow for finer resolution near the center and coarser
 86        resolution farther away.
 87
 88        Parameters:
 89        - angle_range (tuple): The minimum and maximum angles in radians.
 90        - center_resolution (float): Bin width near zero angle for higher resolution.
 91        - outer_resolution (float): Bin width away from zero for lower resolution.
 92
 93        Returns:
 94        - np.array: An array of bin edges with adaptive spacing.
 95        """
 96        min_angle, max_angle = angle_range
 97        # Generate finer bins around zero
 98        center_bins = np.arange(-center_resolution, center_resolution + 1e-6, center_resolution / 10)
 99        # Generate sparser bins outside the center region
100        left_bins = np.linspace(min_angle, -center_resolution,
101                                num=int((center_resolution - min_angle) / outer_resolution) + 1, endpoint=False)
102        right_bins = np.linspace(center_resolution, max_angle,
103                                 num=int((max_angle - center_resolution) / outer_resolution) + 1, endpoint=True)
104        return np.unique(np.concatenate([left_bins, center_bins, right_bins]))
105
106    def setup_transition_probabilities(self):
107        """
108        Sets up the transition probabilities for the environment. This method iterates through all possible
109        states and actions, simulates the next state, and records the transition probability
110        (deterministic in this setup), reward, and termination status.
111        """
112        for state in range(self.state_space):
113            position, velocity, angle, angular_velocity = self.index_to_state(state)
114            for action in range(self.action_space):
115                next_state, reward, done = self.compute_next_state(position, velocity, angle, angular_velocity, action)
116                self.P[state][action].append((1, next_state, reward, done))
117
118    def index_to_state(self, index):
119        """
120        Converts a single index into a multidimensional state representation.
121
122        Parameters:
123        - index (int): The flat index representing the state.
124
125        Returns:
126        - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
127        """
128        totals = [self.position_bins, self.velocity_bins, len(self.angle_bins), self.angular_velocity_bins]
129        multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1]
130        components = [int((index // multipliers[i]) % totals[i]) for i in range(4)]
131        return components
132
133    def compute_next_state(self, position_idx, velocity_idx, angle_idx, angular_velocity_idx, action):
134        """
135        Computes the next state based on the current state indices and the action taken. Applies simplified physics calculations to determine the next state.
136
137        Parameters:
138        - position_idx (int): Current index of the cart's position.
139        - velocity_idx (int): Current index of the cart's velocity.
140        - angle_idx (int): Current index of the pole's angle.
141        - angular_velocity_idx (int): Current index of the pole's angular velocity.
142        - action (int): Action taken (0 for left, 1 for right).
143
144        Returns:
145        - tuple: Contains the next state index, the reward, and the done flag indicating if the episode has ended.
146        """
147        position = np.linspace(*self.position_range, self.position_bins)[position_idx]
148        velocity = np.linspace(*self.velocity_range, self.velocity_bins)[velocity_idx]
149        angle = self.angle_bins[angle_idx]
150        angular_velocity = np.linspace(*self.angular_velocity_range, self.angular_velocity_bins)[angular_velocity_idx]
151
152        # Simulate physics here (simplified)
153        force = 10 if action == 1 else -10
154        new_velocity = velocity + (force + np.cos(angle) * -10.0) * 0.02
155        new_position = position + new_velocity * 0.02
156        new_angular_velocity = angular_velocity + (-3.0 * np.sin(angle)) * 0.02
157        new_angle = angle + new_angular_velocity * 0.02
158
159        new_position_idx = np.clip(np.digitize(new_position, np.linspace(*self.position_range, self.position_bins)) - 1, 0, self.position_bins-1)
160        new_velocity_idx = np.clip(np.digitize(new_velocity, np.linspace(*self.velocity_range, self.velocity_bins)) - 1, 0, self.velocity_bins-1)
161        new_angle_idx = np.clip(np.digitize(new_angle, self.angle_bins) - 1, 0, len(self.angle_bins)-1)
162        new_angular_velocity_idx = np.clip(np.digitize(new_angular_velocity, np.linspace(*self.angular_velocity_range, self.angular_velocity_bins)) - 1, 0, self.angular_velocity_bins-1)
163
164        new_state_idx = np.ravel_multi_index((new_position_idx, new_velocity_idx, new_angle_idx, new_angular_velocity_idx),
165                                             (self.position_bins, self.velocity_bins, len(self.angle_bins), self.angular_velocity_bins))
166
167        reward = 1 if np.abs(new_angle) < 12 * np.pi / 180 else -1
168        done = True if np.abs(new_angle) >= 12 * np.pi / 180 or np.abs(new_position) > 2.4 else False
169
170        return new_state_idx, reward, done
class DiscretizedCartPole:
 14class DiscretizedCartPole:
 15    def __init__(self,
 16                 position_bins,
 17                 velocity_bins,
 18                 angular_velocity_bins,
 19                 angular_center_resolution,
 20                 angular_outer_resolution):
 21
 22        """
 23         Initializes the DiscretizedCartPole model.
 24
 25         Parameters:
 26         - position_bins (int): Number of discrete bins for the cart's position.
 27         - velocity_bins (int): Number of discrete bins for the cart's velocity.
 28         - angular_velocity_bins (int): Number of discrete bins for the pole's angular velocity.
 29         - angular_center_resolution (float): The resolution of angle bins near the center (around zero).
 30         - angular_outer_resolution (float): The resolution of angle bins away from the center.
 31
 32         Attributes:
 33         - state_space (int): Total number of discrete states in the environment.
 34         - P (dict): Transition probability matrix where P[state][action] is a list of tuples (probability, next_state,
 35         reward, done).
 36         - transform_obs (lambda): Function to transform continuous observations into a discrete state index.
 37         """
 38        self.position_bins = position_bins
 39        self.velocity_bins = velocity_bins
 40        self.angular_velocity_bins = angular_velocity_bins
 41        self.action_space = 2  # Left or Right
 42
 43        # Define the range for each variable
 44        self.position_range = (-2.4, 2.4)
 45        self.velocity_range = (-3, 3)
 46        self.angle_range = (-12 * np.pi / 180, 12 * np.pi / 180)
 47        self.angular_velocity_range = (-1.5, 1.5)
 48        self.angular_center_resolution = angular_center_resolution
 49        self.angular_outer_resolution = angular_outer_resolution
 50
 51        # Use adaptive binning for the pole angle
 52        self.angle_bins = self.adaptive_angle_bins(self.angle_range, self.angular_center_resolution, self.angular_outer_resolution)  # Adjust these values as needed
 53
 54        self.state_space = np.prod([self.position_bins, self.velocity_bins, len(self.angle_bins), self.angular_velocity_bins])
 55        self.P = {state: {action: [] for action in range(self.action_space)} for state in range(self.state_space)}
 56        self.setup_transition_probabilities()
 57        self.n_states = len(self.angle_bins)*self.velocity_bins*self.position_bins*self.angular_velocity_bins
 58        """
 59        Explanation of transform_obs lambda: 
 60        This lambda function will take cartpole observations, determine which bins they fall into, 
 61        and then convert bin coordinates into a single index.  This makes it possible 
 62        to use traditional reinforcement learning and planning algorithms, designed for discrete spaces, with continuous 
 63        state space environments. 
 64        
 65        Parameters:
 66        - obs (list): A list of continuous observations [position, velocity, angle, angular_velocity].
 67
 68        Returns:
 69        - int: A single integer representing the discretized state index.
 70        """
 71        self.transform_obs = lambda obs: (
 72            np.ravel_multi_index((
 73                np.clip(np.digitize(obs[0], np.linspace(*self.position_range, self.position_bins)) - 1, 0,
 74                        self.position_bins - 1),
 75                np.clip(np.digitize(obs[1], np.linspace(*self.velocity_range, self.velocity_bins)) - 1, 0,
 76                        self.velocity_bins - 1),
 77                np.clip(np.digitize(obs[2], self.angle_bins) - 1, 0, len(self.angle_bins) - 1),
 78                # Use adaptive angle bins
 79                np.clip(np.digitize(obs[3], np.linspace(*self.angular_velocity_range, self.angular_velocity_bins)) - 1,
 80                        0, self.angular_velocity_bins - 1)
 81            ), (self.position_bins, self.velocity_bins, len(self.angle_bins), self.angular_velocity_bins))
 82        )
 83
 84    def adaptive_angle_bins(self, angle_range, center_resolution, outer_resolution):
 85        """
 86        Generates adaptive bins for the pole's angle to allow for finer resolution near the center and coarser
 87        resolution farther away.
 88
 89        Parameters:
 90        - angle_range (tuple): The minimum and maximum angles in radians.
 91        - center_resolution (float): Bin width near zero angle for higher resolution.
 92        - outer_resolution (float): Bin width away from zero for lower resolution.
 93
 94        Returns:
 95        - np.array: An array of bin edges with adaptive spacing.
 96        """
 97        min_angle, max_angle = angle_range
 98        # Generate finer bins around zero
 99        center_bins = np.arange(-center_resolution, center_resolution + 1e-6, center_resolution / 10)
100        # Generate sparser bins outside the center region
101        left_bins = np.linspace(min_angle, -center_resolution,
102                                num=int((center_resolution - min_angle) / outer_resolution) + 1, endpoint=False)
103        right_bins = np.linspace(center_resolution, max_angle,
104                                 num=int((max_angle - center_resolution) / outer_resolution) + 1, endpoint=True)
105        return np.unique(np.concatenate([left_bins, center_bins, right_bins]))
106
107    def setup_transition_probabilities(self):
108        """
109        Sets up the transition probabilities for the environment. This method iterates through all possible
110        states and actions, simulates the next state, and records the transition probability
111        (deterministic in this setup), reward, and termination status.
112        """
113        for state in range(self.state_space):
114            position, velocity, angle, angular_velocity = self.index_to_state(state)
115            for action in range(self.action_space):
116                next_state, reward, done = self.compute_next_state(position, velocity, angle, angular_velocity, action)
117                self.P[state][action].append((1, next_state, reward, done))
118
119    def index_to_state(self, index):
120        """
121        Converts a single index into a multidimensional state representation.
122
123        Parameters:
124        - index (int): The flat index representing the state.
125
126        Returns:
127        - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
128        """
129        totals = [self.position_bins, self.velocity_bins, len(self.angle_bins), self.angular_velocity_bins]
130        multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1]
131        components = [int((index // multipliers[i]) % totals[i]) for i in range(4)]
132        return components
133
134    def compute_next_state(self, position_idx, velocity_idx, angle_idx, angular_velocity_idx, action):
135        """
136        Computes the next state based on the current state indices and the action taken. Applies simplified physics calculations to determine the next state.
137
138        Parameters:
139        - position_idx (int): Current index of the cart's position.
140        - velocity_idx (int): Current index of the cart's velocity.
141        - angle_idx (int): Current index of the pole's angle.
142        - angular_velocity_idx (int): Current index of the pole's angular velocity.
143        - action (int): Action taken (0 for left, 1 for right).
144
145        Returns:
146        - tuple: Contains the next state index, the reward, and the done flag indicating if the episode has ended.
147        """
148        position = np.linspace(*self.position_range, self.position_bins)[position_idx]
149        velocity = np.linspace(*self.velocity_range, self.velocity_bins)[velocity_idx]
150        angle = self.angle_bins[angle_idx]
151        angular_velocity = np.linspace(*self.angular_velocity_range, self.angular_velocity_bins)[angular_velocity_idx]
152
153        # Simulate physics here (simplified)
154        force = 10 if action == 1 else -10
155        new_velocity = velocity + (force + np.cos(angle) * -10.0) * 0.02
156        new_position = position + new_velocity * 0.02
157        new_angular_velocity = angular_velocity + (-3.0 * np.sin(angle)) * 0.02
158        new_angle = angle + new_angular_velocity * 0.02
159
160        new_position_idx = np.clip(np.digitize(new_position, np.linspace(*self.position_range, self.position_bins)) - 1, 0, self.position_bins-1)
161        new_velocity_idx = np.clip(np.digitize(new_velocity, np.linspace(*self.velocity_range, self.velocity_bins)) - 1, 0, self.velocity_bins-1)
162        new_angle_idx = np.clip(np.digitize(new_angle, self.angle_bins) - 1, 0, len(self.angle_bins)-1)
163        new_angular_velocity_idx = np.clip(np.digitize(new_angular_velocity, np.linspace(*self.angular_velocity_range, self.angular_velocity_bins)) - 1, 0, self.angular_velocity_bins-1)
164
165        new_state_idx = np.ravel_multi_index((new_position_idx, new_velocity_idx, new_angle_idx, new_angular_velocity_idx),
166                                             (self.position_bins, self.velocity_bins, len(self.angle_bins), self.angular_velocity_bins))
167
168        reward = 1 if np.abs(new_angle) < 12 * np.pi / 180 else -1
169        done = True if np.abs(new_angle) >= 12 * np.pi / 180 or np.abs(new_position) > 2.4 else False
170
171        return new_state_idx, reward, done
DiscretizedCartPole( position_bins, velocity_bins, angular_velocity_bins, angular_center_resolution, angular_outer_resolution)
15    def __init__(self,
16                 position_bins,
17                 velocity_bins,
18                 angular_velocity_bins,
19                 angular_center_resolution,
20                 angular_outer_resolution):
21
22        """
23         Initializes the DiscretizedCartPole model.
24
25         Parameters:
26         - position_bins (int): Number of discrete bins for the cart's position.
27         - velocity_bins (int): Number of discrete bins for the cart's velocity.
28         - angular_velocity_bins (int): Number of discrete bins for the pole's angular velocity.
29         - angular_center_resolution (float): The resolution of angle bins near the center (around zero).
30         - angular_outer_resolution (float): The resolution of angle bins away from the center.
31
32         Attributes:
33         - state_space (int): Total number of discrete states in the environment.
34         - P (dict): Transition probability matrix where P[state][action] is a list of tuples (probability, next_state,
35         reward, done).
36         - transform_obs (lambda): Function to transform continuous observations into a discrete state index.
37         """
38        self.position_bins = position_bins
39        self.velocity_bins = velocity_bins
40        self.angular_velocity_bins = angular_velocity_bins
41        self.action_space = 2  # Left or Right
42
43        # Define the range for each variable
44        self.position_range = (-2.4, 2.4)
45        self.velocity_range = (-3, 3)
46        self.angle_range = (-12 * np.pi / 180, 12 * np.pi / 180)
47        self.angular_velocity_range = (-1.5, 1.5)
48        self.angular_center_resolution = angular_center_resolution
49        self.angular_outer_resolution = angular_outer_resolution
50
51        # Use adaptive binning for the pole angle
52        self.angle_bins = self.adaptive_angle_bins(self.angle_range, self.angular_center_resolution, self.angular_outer_resolution)  # Adjust these values as needed
53
54        self.state_space = np.prod([self.position_bins, self.velocity_bins, len(self.angle_bins), self.angular_velocity_bins])
55        self.P = {state: {action: [] for action in range(self.action_space)} for state in range(self.state_space)}
56        self.setup_transition_probabilities()
57        self.n_states = len(self.angle_bins)*self.velocity_bins*self.position_bins*self.angular_velocity_bins
58        """
59        Explanation of transform_obs lambda: 
60        This lambda function will take cartpole observations, determine which bins they fall into, 
61        and then convert bin coordinates into a single index.  This makes it possible 
62        to use traditional reinforcement learning and planning algorithms, designed for discrete spaces, with continuous 
63        state space environments. 
64        
65        Parameters:
66        - obs (list): A list of continuous observations [position, velocity, angle, angular_velocity].
67
68        Returns:
69        - int: A single integer representing the discretized state index.
70        """
71        self.transform_obs = lambda obs: (
72            np.ravel_multi_index((
73                np.clip(np.digitize(obs[0], np.linspace(*self.position_range, self.position_bins)) - 1, 0,
74                        self.position_bins - 1),
75                np.clip(np.digitize(obs[1], np.linspace(*self.velocity_range, self.velocity_bins)) - 1, 0,
76                        self.velocity_bins - 1),
77                np.clip(np.digitize(obs[2], self.angle_bins) - 1, 0, len(self.angle_bins) - 1),
78                # Use adaptive angle bins
79                np.clip(np.digitize(obs[3], np.linspace(*self.angular_velocity_range, self.angular_velocity_bins)) - 1,
80                        0, self.angular_velocity_bins - 1)
81            ), (self.position_bins, self.velocity_bins, len(self.angle_bins), self.angular_velocity_bins))
82        )

Initializes the DiscretizedCartPole model.

Parameters:

  • position_bins (int): Number of discrete bins for the cart's position.
  • velocity_bins (int): Number of discrete bins for the cart's velocity.
  • angular_velocity_bins (int): Number of discrete bins for the pole's angular velocity.
  • angular_center_resolution (float): The resolution of angle bins near the center (around zero).
  • angular_outer_resolution (float): The resolution of angle bins away from the center.

Attributes:

  • state_space (int): Total number of discrete states in the environment.
  • P (dict): Transition probability matrix where P[state][action] is a list of tuples (probability, next_state, reward, done).
  • transform_obs (lambda): Function to transform continuous observations into a discrete state index.
position_bins
velocity_bins
angular_velocity_bins
action_space
position_range
velocity_range
angle_range
angular_velocity_range
angular_center_resolution
angular_outer_resolution
angle_bins
state_space
P
n_states

Explanation of transform_obs lambda: This lambda function will take cartpole observations, determine which bins they fall into, and then convert bin coordinates into a single index. This makes it possible to use traditional reinforcement learning and planning algorithms, designed for discrete spaces, with continuous state space environments.

Parameters:

  • obs (list): A list of continuous observations [position, velocity, angle, angular_velocity].

Returns:

  • int: A single integer representing the discretized state index.
transform_obs
def adaptive_angle_bins(self, angle_range, center_resolution, outer_resolution):
 84    def adaptive_angle_bins(self, angle_range, center_resolution, outer_resolution):
 85        """
 86        Generates adaptive bins for the pole's angle to allow for finer resolution near the center and coarser
 87        resolution farther away.
 88
 89        Parameters:
 90        - angle_range (tuple): The minimum and maximum angles in radians.
 91        - center_resolution (float): Bin width near zero angle for higher resolution.
 92        - outer_resolution (float): Bin width away from zero for lower resolution.
 93
 94        Returns:
 95        - np.array: An array of bin edges with adaptive spacing.
 96        """
 97        min_angle, max_angle = angle_range
 98        # Generate finer bins around zero
 99        center_bins = np.arange(-center_resolution, center_resolution + 1e-6, center_resolution / 10)
100        # Generate sparser bins outside the center region
101        left_bins = np.linspace(min_angle, -center_resolution,
102                                num=int((center_resolution - min_angle) / outer_resolution) + 1, endpoint=False)
103        right_bins = np.linspace(center_resolution, max_angle,
104                                 num=int((max_angle - center_resolution) / outer_resolution) + 1, endpoint=True)
105        return np.unique(np.concatenate([left_bins, center_bins, right_bins]))

Generates adaptive bins for the pole's angle to allow for finer resolution near the center and coarser resolution farther away.

Parameters:

  • angle_range (tuple): The minimum and maximum angles in radians.
  • center_resolution (float): Bin width near zero angle for higher resolution.
  • outer_resolution (float): Bin width away from zero for lower resolution.

Returns:

  • np.array: An array of bin edges with adaptive spacing.
def setup_transition_probabilities(self):
107    def setup_transition_probabilities(self):
108        """
109        Sets up the transition probabilities for the environment. This method iterates through all possible
110        states and actions, simulates the next state, and records the transition probability
111        (deterministic in this setup), reward, and termination status.
112        """
113        for state in range(self.state_space):
114            position, velocity, angle, angular_velocity = self.index_to_state(state)
115            for action in range(self.action_space):
116                next_state, reward, done = self.compute_next_state(position, velocity, angle, angular_velocity, action)
117                self.P[state][action].append((1, next_state, reward, done))

Sets up the transition probabilities for the environment. This method iterates through all possible states and actions, simulates the next state, and records the transition probability (deterministic in this setup), reward, and termination status.

def index_to_state(self, index):
119    def index_to_state(self, index):
120        """
121        Converts a single index into a multidimensional state representation.
122
123        Parameters:
124        - index (int): The flat index representing the state.
125
126        Returns:
127        - list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
128        """
129        totals = [self.position_bins, self.velocity_bins, len(self.angle_bins), self.angular_velocity_bins]
130        multipliers = np.cumprod([1] + totals[::-1])[:-1][::-1]
131        components = [int((index // multipliers[i]) % totals[i]) for i in range(4)]
132        return components

Converts a single index into a multidimensional state representation.

Parameters:

  • index (int): The flat index representing the state.

Returns:

  • list: A list of indices representing the state in terms of position, velocity, angle, and angular velocity bins.
def compute_next_state( self, position_idx, velocity_idx, angle_idx, angular_velocity_idx, action):
134    def compute_next_state(self, position_idx, velocity_idx, angle_idx, angular_velocity_idx, action):
135        """
136        Computes the next state based on the current state indices and the action taken. Applies simplified physics calculations to determine the next state.
137
138        Parameters:
139        - position_idx (int): Current index of the cart's position.
140        - velocity_idx (int): Current index of the cart's velocity.
141        - angle_idx (int): Current index of the pole's angle.
142        - angular_velocity_idx (int): Current index of the pole's angular velocity.
143        - action (int): Action taken (0 for left, 1 for right).
144
145        Returns:
146        - tuple: Contains the next state index, the reward, and the done flag indicating if the episode has ended.
147        """
148        position = np.linspace(*self.position_range, self.position_bins)[position_idx]
149        velocity = np.linspace(*self.velocity_range, self.velocity_bins)[velocity_idx]
150        angle = self.angle_bins[angle_idx]
151        angular_velocity = np.linspace(*self.angular_velocity_range, self.angular_velocity_bins)[angular_velocity_idx]
152
153        # Simulate physics here (simplified)
154        force = 10 if action == 1 else -10
155        new_velocity = velocity + (force + np.cos(angle) * -10.0) * 0.02
156        new_position = position + new_velocity * 0.02
157        new_angular_velocity = angular_velocity + (-3.0 * np.sin(angle)) * 0.02
158        new_angle = angle + new_angular_velocity * 0.02
159
160        new_position_idx = np.clip(np.digitize(new_position, np.linspace(*self.position_range, self.position_bins)) - 1, 0, self.position_bins-1)
161        new_velocity_idx = np.clip(np.digitize(new_velocity, np.linspace(*self.velocity_range, self.velocity_bins)) - 1, 0, self.velocity_bins-1)
162        new_angle_idx = np.clip(np.digitize(new_angle, self.angle_bins) - 1, 0, len(self.angle_bins)-1)
163        new_angular_velocity_idx = np.clip(np.digitize(new_angular_velocity, np.linspace(*self.angular_velocity_range, self.angular_velocity_bins)) - 1, 0, self.angular_velocity_bins-1)
164
165        new_state_idx = np.ravel_multi_index((new_position_idx, new_velocity_idx, new_angle_idx, new_angular_velocity_idx),
166                                             (self.position_bins, self.velocity_bins, len(self.angle_bins), self.angular_velocity_bins))
167
168        reward = 1 if np.abs(new_angle) < 12 * np.pi / 180 else -1
169        done = True if np.abs(new_angle) >= 12 * np.pi / 180 or np.abs(new_position) > 2.4 else False
170
171        return new_state_idx, reward, done

Computes the next state based on the current state indices and the action taken. Applies simplified physics calculations to determine the next state.

Parameters:

  • position_idx (int): Current index of the cart's position.
  • velocity_idx (int): Current index of the cart's velocity.
  • angle_idx (int): Current index of the pole's angle.
  • angular_velocity_idx (int): Current index of the pole's angular velocity.
  • action (int): Action taken (0 for left, 1 for right).

Returns:

  • tuple: Contains the next state index, the reward, and the done flag indicating if the episode has ended.