Pytorch version of Stable Baselines, implementations of reinforcement learning algorithms.
Implementation of Hindsight Experience Replay for goal-conditioned reinforcement learning, enabling learning from failed attempts by treating them as successful attempts toward different goals. This approach dramatically improves sample efficiency in sparse reward environments.
Specialized replay buffer that implements the Hindsight Experience Replay algorithm by automatically generating additional training samples from failed episodes.
class HerReplayBuffer(ReplayBuffer):
    """
    Replay buffer with Hindsight Experience Replay (HER).

    On top of storing real transitions, HER relabels transitions from
    finished episodes with alternative ("hindsight") goals so that failed
    episodes still yield useful learning signal in sparse-reward settings.

    Args:
        buffer_size: Maximum buffer capacity.
        observation_space: Observation space (must include 'observation',
            'achieved_goal', 'desired_goal' keys).
        action_space: Action space.
        env_info: Additional environment information.
        device: PyTorch device placement.
        n_envs: Number of parallel environments.
        optimize_memory_usage: Enable memory optimizations.
        handle_timeout_termination: Handle timeout terminations properly.
        n_sampled_goal: Number of virtual (hindsight) transitions generated
            per real transition.
        goal_selection_strategy: Strategy for selecting goals
            ("future", "final", "episode", "random").
        wrapped_env: Environment wrapper for HER.
        online_sampling: Whether to sample goals online during training.
        max_episode_length: Maximum episode length for buffer management.
    """

    def __init__(
        self,
        buffer_size: int,
        observation_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        env_info: Optional[Dict[str, Any]] = None,
        device: Union[torch.device, str] = "auto",
        n_envs: int = 1,
        optimize_memory_usage: bool = False,
        handle_timeout_termination: bool = True,
        n_sampled_goal: int = 4,
        goal_selection_strategy: Union[GoalSelectionStrategy, str] = "future",
        wrapped_env: Optional[VecEnv] = None,
        online_sampling: bool = True,
        max_episode_length: Optional[int] = None,
    ): ...

    def add(
        self,
        obs: np.ndarray,
        next_obs: np.ndarray,
        actions: np.ndarray,
        rewards: np.ndarray,
        dones: np.ndarray,
        infos: List[Dict[str, Any]],
    ) -> None:
        """
        Add a transition to the replay buffer.

        Args:
            obs: Current observations (dict with 'observation',
                'achieved_goal', 'desired_goal').
            next_obs: Next observations.
            actions: Actions taken.
            rewards: Rewards received.
            dones: Episode termination flags.
            infos: Additional information from the environment.
        """

    def sample(self, batch_size: int, env: Optional[VecEnv] = None) -> ReplayBufferSamples:
        """
        Sample a batch of transitions, including hindsight-relabeled goals.

        Args:
            batch_size: Number of transitions to sample.
            env: Environment used for computing rewards
                (if None, uses ``wrapped_env``).

        Returns:
            Batch of experience samples with original and hindsight
            transitions.
        """

    def _sample_goals(
        self,
        episode_transitions: List[Dict[str, np.ndarray]],
        transition_idx: int,
        n_sampled_goal: int,
    ) -> np.ndarray:
        """
        Sample substitute goals for hindsight experience replay.

        Args:
            episode_transitions: List of transitions from an episode.
            transition_idx: Index of the current transition.
            n_sampled_goal: Number of goals to sample.

        Returns:
            Array of sampled goals.
        """

    def _store_episode(
        self,
        episode_transitions: List[Dict[str, np.ndarray]],
        is_success: bool,
    ) -> None:
        """
        Store a completed episode's transitions and generate HER samples.

        Args:
            episode_transitions: List of transitions from the completed
                episode.
            is_success: Whether the episode was successful.
        """

    def truncate_last_trajectory(self) -> None:
        """Truncate the last, incomplete trajectory from the buffer."""


# Different strategies for selecting which goals to use when creating
# hindsight experience, each with different trade-offs for learning
# efficiency.
class GoalSelectionStrategy:
    """
    Available goal-selection strategies for HER.

    FUTURE: sample goals achieved later in the same episode.
    FINAL: use the goal achieved at the end of the episode.
    EPISODE: sample goals achieved anywhere in the same episode.
    RANDOM: sample completely random goals.
    """

    FUTURE = "future"
    FINAL = "final"
    EPISODE = "episode"
    RANDOM = "random"


# Map the string shorthand accepted in configs to the strategy constant.
KEY_TO_GOAL_STRATEGY: Dict[str, GoalSelectionStrategy] = {
    key: getattr(GoalSelectionStrategy, key.upper())
    for key in ("future", "final", "episode", "random")
}

# HER requires specific environment structure and interfaces to function
# properly with goal-conditioned learning.
# Required observation space structure for HER.
# NOTE(review): the values below are the Box *class*, not Box instances —
# this reads as schema documentation; a real gym.spaces.Dict must be built
# from space instances, so executing this line as-is would fail. Confirm
# intent before running.
HER_OBSERVATION_SPACE = gym.spaces.Dict({
    'observation': gym.spaces.Box,    # Environment state
    'achieved_goal': gym.spaces.Box,  # Currently achieved goal
    'desired_goal': gym.spaces.Box,   # Desired goal for this episode
})

# Required info dict keys from the environment
REQUIRED_INFO_KEYS = [
    'is_success',  # Boolean indicating if the goal was achieved
]

# Optional info dict keys
OPTIONAL_INFO_KEYS = [
    'TimeLimit.truncated',  # Boolean indicating timeout termination
]

import gymnasium as gym
from stable_baselines3 import SAC
from stable_baselines3.her import HerReplayBuffer
from stable_baselines3.common.vec_env import DummyVecEnv

# Create a goal-conditioned environment (e.g., FetchReach-v1)
env = gym.make("FetchReach-v1")

# Verify the environment has the goal-conditioned structure HER needs
assert isinstance(env.observation_space, gym.spaces.Dict)
assert "observation" in env.observation_space.spaces
assert "achieved_goal" in env.observation_space.spaces
assert "desired_goal" in env.observation_space.spaces

# Wrap in a vectorized environment
env = DummyVecEnv([lambda: env])

# Configure SAC with the HER replay buffer
model = SAC(
    "MultiInputPolicy",  # Required for dict observations
    env,
    replay_buffer_class=HerReplayBuffer,
    replay_buffer_kwargs=dict(
        n_sampled_goal=4,
        goal_selection_strategy="future",
        online_sampling=True,
        max_episode_length=50,
    ),
    verbose=1
)

# Train the agent
model.learn(total_timesteps=100000)

from stable_baselines3.her import GoalSelectionStrategy
# Custom HER buffer configuration
her_kwargs = dict(
    n_sampled_goal=8,  # More hindsight goals per transition (higher replay ratio)
    goal_selection_strategy=GoalSelectionStrategy.FUTURE,
    online_sampling=True,
    max_episode_length=100,
    handle_timeout_termination=True,
    optimize_memory_usage=False,
)

# Use with TD3 (HER also works with DDPG and SAC)
from stable_baselines3 import TD3

model = TD3(
    "MultiInputPolicy",
    env,
    replay_buffer_class=HerReplayBuffer,
    replay_buffer_kwargs=her_kwargs,
    buffer_size=1000000,
    learning_starts=1000,
    batch_size=256,
    verbose=1
)

model.learn(total_timesteps=500000)

import numpy as np
class SimpleGoalEnv(gym.Env):
    """Simple goal-conditioned 2D point environment for HER demonstration.

    The agent moves a point inside [-5, 5]^2 toward a randomly drawn goal.
    Rewards are sparse: 0.0 when within ``goal_threshold`` of the goal,
    -1.0 otherwise.
    """

    def __init__(self):
        super().__init__()
        # 2D velocity command, each component in [-1, 1]
        self.action_space = gym.spaces.Box(-1, 1, (2,), dtype=np.float32)
        # Goal-conditioned observation space (the dict structure HER requires)
        self.observation_space = gym.spaces.Dict({
            'observation': gym.spaces.Box(-5, 5, (2,), dtype=np.float32),
            'achieved_goal': gym.spaces.Box(-5, 5, (2,), dtype=np.float32),
            'desired_goal': gym.spaces.Box(-5, 5, (2,), dtype=np.float32),
        })
        self.goal_threshold = 0.1  # success when within this distance of goal
        self.max_steps = 50        # episode truncation horizon

    def _get_obs(self):
        """Build the dict observation from the current position and goal."""
        return {
            'observation': self.position.copy(),
            'achieved_goal': self.position.copy(),
            'desired_goal': self.goal.copy(),
        }

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        # Random initial position and random goal, both in [-5, 5]^2
        self.position = self.np_random.uniform(-5, 5, (2,))
        self.goal = self.np_random.uniform(-5, 5, (2,))
        self.step_count = 0
        return self._get_obs(), {}

    def step(self, action):
        # Move based on action (scaled step), clipped to the arena bounds
        self.position += action * 0.1
        self.position = np.clip(self.position, -5, 5)
        # Check whether the goal is achieved
        distance = np.linalg.norm(self.position - self.goal)
        is_success = distance < self.goal_threshold
        # Sparse reward: 0 for success, -1 otherwise
        reward = 0.0 if is_success else -1.0
        self.step_count += 1
        terminated = is_success
        truncated = self.step_count >= self.max_steps
        info = {
            'is_success': is_success,
            'distance': distance,
        }
        return self._get_obs(), reward, terminated, truncated, info

    def compute_reward(self, achieved_goal, desired_goal, info):
        """Recompute the sparse reward for (relabeled) goals, vectorized.

        BUGFIX: this previously returned 1.0 on success and 0.0 otherwise,
        contradicting step()'s 0.0 / -1.0 convention — HER-relabeled
        transitions would have received a different reward scale than real
        transitions. Now returns 0.0 on success and -1.0 otherwise,
        matching step().
        """
        distance = np.linalg.norm(achieved_goal - desired_goal, axis=-1)
        return -(distance >= self.goal_threshold).astype(np.float32)
# Use the custom environment with HER
custom_env = SimpleGoalEnv()
vec_env = DummyVecEnv([lambda: custom_env])

model = SAC(
    "MultiInputPolicy",
    vec_env,
    replay_buffer_class=HerReplayBuffer,
    replay_buffer_kwargs=dict(
        n_sampled_goal=4,
        goal_selection_strategy="future",
    ),
    verbose=1
)

model.learn(total_timesteps=50000)

# Compare different goal selection strategies
# Train one model per HER goal-selection strategy on the same task.
strategies = ["future", "final", "episode", "random"]
models = {}

for strategy in strategies:
    print(f"Training with {strategy} strategy...")
    env = DummyVecEnv([lambda: gym.make("FetchReach-v1")])
    model = SAC(
        "MultiInputPolicy",
        env,
        replay_buffer_class=HerReplayBuffer,
        replay_buffer_kwargs=dict(
            n_sampled_goal=4,
            goal_selection_strategy=strategy,
        ),
        verbose=0
    )
    model.learn(total_timesteps=25000)
    models[strategy] = model

# Evaluate performance
from stable_baselines3.common.evaluation import evaluate_policy

# NOTE(review): `env` is the instance created in the *last* loop iteration;
# all models are evaluated on it (same env id, so results stay comparable).
for strategy, model in models.items():
    mean_reward, std_reward = evaluate_policy(
        model,
        env,
        n_eval_episodes=20,
        deterministic=True
    )
    print(f"{strategy}: {mean_reward:.2f} ± {std_reward:.2f}")

from stable_baselines3.common.callbacks import BaseCallback
import numpy as np
class HERMonitorCallback(BaseCallback):
    """Callback that periodically measures goal-success rate during training.

    Every 1000 training steps it rolls out 10 deterministic episodes on
    ``eval_env`` and logs the fraction that reach the goal.

    Args:
        eval_env: Plain gymnasium environment (not a VecEnv) used for
            evaluation rollouts.
        verbose: Verbosity level forwarded to BaseCallback.
    """

    def __init__(self, eval_env, verbose=0):
        super().__init__(verbose)
        self.eval_env = eval_env
        self.success_rates = []  # history of measured success rates

    def _on_step(self) -> bool:
        # Log HER-specific metrics every 1000 steps
        if self.n_calls % 1000 == 0:
            n_eval_episodes = 10
            successes = 0
            for _ in range(n_eval_episodes):
                # BUGFIX: gymnasium's reset() returns (obs, info) and step()
                # returns the 5-tuple (obs, reward, terminated, truncated,
                # info). The old code unpacked the legacy 4-tuple Gym API and
                # passed reset()'s (obs, info) tuple straight to predict(),
                # which would crash on the env created at usage time
                # (gym.make(...), same 5-tuple API as SimpleGoalEnv above).
                obs, _ = self.eval_env.reset()
                terminated = truncated = False
                while not (terminated or truncated):
                    action, _ = self.model.predict(obs, deterministic=True)
                    obs, reward, terminated, truncated, info = self.eval_env.step(action)
                    if info.get('is_success', False):
                        successes += 1
                        break
            success_rate = successes / n_eval_episodes
            self.success_rates.append(success_rate)
            # Log to tensorboard via the SB3 logger
            self.logger.record("eval/success_rate", success_rate)
            self.logger.record("eval/mean_success_rate", np.mean(self.success_rates[-10:]))
        return True
# Use the monitoring callback during training
eval_env = gym.make("FetchReach-v1")
monitor_callback = HERMonitorCallback(eval_env, verbose=1)
model.learn(total_timesteps=100000, callback=monitor_callback)

# For an environment to work with HER, it must satisfy the requirements below:
Observation Space: use gym.spaces.Dict with the keys:
- 'observation': the actual environment state
- 'achieved_goal': the goal currently achieved
- 'desired_goal': the target goal for the episode

Info Dictionary: return an 'is_success' boolean in the info dict.

Reward Function: ideally implement compute_reward(achieved_goal, desired_goal, info) so rewards can be recomputed efficiently for relabeled goals.

Tuning notes: n_sampled_goal controls the replay ratio — higher values improve learning but increase computation; online_sampling=True is more memory efficient but slightly slower.

from typing import Union, Optional, Type, Callable, Dict, Any, List, Tuple
import numpy as np
import gymnasium as gym
from stable_baselines3.common.type_aliases import GymEnv, ReplayBufferSamples
from stable_baselines3.common.buffers import ReplayBuffer
from stable_baselines3.her.her_replay_buffer import HerReplayBuffer, GoalSelectionStrategy
from stable_baselines3.common.vec_env import VecEnv
from stable_baselines3.common.base_class import BaseAlgorithm

Install with Tessl CLI:
npx tessl i tessl/pypi-stable-baselines3