pytorch - Why can't my DDPG solve MountainCarContinuous-v0 - Stack Overflow
I recently started learning the DDPG algorithm in reinforcement learning, and I tried to solve the MountainCarContinuous-v0 problem.
I used the tuned parameters from RL Zoo wherever possible, but it doesn't work. Can anyone help me find out where the problem is and get it to train on my code base?
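For comparison, here is how the same task could be sanity-checked with the reference DDPG from stable-baselines3 (a sketch only; the noise scale and timestep budget are guesses, not the exact RL Zoo values):
import numpy as np
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise

# Reference-run sketch: same environment and the same kind of OU action noise as my code.
# sigma and total_timesteps are assumptions, not the RL Zoo tuned values.
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(1), sigma=0.5 * np.ones(1))
model = DDPG("MlpPolicy", "MountainCarContinuous-v0", action_noise=action_noise, verbose=1)
model.learn(total_timesteps=50_000)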
Result
[policy loss plot]
[critic loss plot]
Code
import gymnasium as gym
import numpy as np
import torch.nn as nn
from torch.optim import Adam
import torch
import torch.nn.functional as F
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
from tensorboardX import SummaryWriter
from copy import deepcopy
logdir = "logs/scalars/"
file_writer = SummaryWriter(log_dir=logdir)
class Agent(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Sequential(
            nn.Linear(2, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
            nn.Tanh()
        )

    def forward(self, obs):
        return self.l1(obs)


class Qfunction(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Sequential(
            nn.Linear(3, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
        )

    def forward(self, obs, action):
        x = torch.cat((obs, action), -1)
        return self.l1(x)
class DDPG():
    def __init__(self):
        self.total_num_step = 100000
        self.device = 'cuda'
        self.policy = Agent().to(self.device)
        self.policy_target = deepcopy(self.policy).to(self.device)
        self.tau = 0.005
        self.Q = Qfunction().to(self.device)
        self.Q_target = deepcopy(self.Q).to(self.device)
        self.Policyoptimizer = Adam(self.policy.parameters(), lr=1e-3)
        self.Qoptimizer = Adam(self.Q.parameters(), lr=1e-3)
        self.ounoise = OrnsteinUhlenbeckActionNoise(mean=np.array([0]), sigma=np.array(0.5))
        self.buffer = ReplayBuffer(2, 1, 1000000)
        self.gamma = 0.99
        self.env = gym.make("MountainCarContinuous-v0")
        self.test_env = gym.make("MountainCarContinuous-v0")
    def run(self):
        state, info = self.env.reset()
        self.ounoise.reset()
        state = torch.from_numpy(state).to(self.device)
        n_step = 0
        # begin
        for i in range(self.total_num_step):
            # collect data
            with torch.no_grad():
                if n_step < 100:
                    action = self.env.action_space.sample()
                else:
                    action = self.policy(state).cpu().numpy()
                noise = self.ounoise()
                action = action + noise
                action = action.clip(-1, 1)
            next_state, reward, terminated, truncated, info = self.env.step(action)
            episode_over = 1 if (terminated or truncated) else 0
            self.buffer.store(state.cpu().numpy(), action, reward, next_state, episode_over)
            if episode_over:
                state, info = self.env.reset()
                state = torch.from_numpy(state).to(self.device)
                self.ounoise.reset()
            else:
                state = torch.from_numpy(next_state).to(self.device)
            n_step += 1
            # update
            if n_step > 100:
                lossq, lossp = self.train_once()
                file_writer.add_scalar('lossq', lossq, n_step)
                file_writer.add_scalar('lossp', lossp, n_step)
            if int(n_step) % 1000 == 0:
                print(n_step)
                print(self.test())
    def train_once(self):
        for i in range(1):
            self.policy.train()
            # sample train data
            state, action, reward, next_state, done = self.buffer.sample_batch(self.device, batch_size=256)
            with torch.no_grad():
                target = reward + self.gamma * self.Q_target(next_state, self.policy_target(next_state)) * (1 - done)
            lossq = F.mse_loss(self.Q(state, action), target)
            self.Qoptimizer.zero_grad()
            lossq.backward()
            self.Qoptimizer.step()
            self.Policyoptimizer.zero_grad()
            loss_p = -self.Q(state, self.policy(state)).mean()
            loss_p.backward()
            self.Policyoptimizer.step()
            # Finally, update target networks by polyak averaging.
            with torch.no_grad():
                # NB: We use the in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                for p, p_targ in zip(self.policy.parameters(), self.policy_target.parameters()):
                    p_targ.data.mul_(1 - self.tau)
                    p_targ.data.add_(self.tau * p.data)
                for p, p_targ in zip(self.Q.parameters(), self.Q_target.parameters()):
                    p_targ.data.mul_(1 - self.tau)
                    p_targ.data.add_(self.tau * p.data)
        return lossq.item(), loss_p.item()
    def test(self):
        self.policy.eval()
        state, info = self.test_env.reset()
        state = torch.from_numpy(state).to(self.device)
        episode_over = False
        rewards = 0
        with torch.no_grad():
            while not episode_over:
                action = self.policy(state)  # agent policy that uses the observation and info
                next_state, reward, terminated, truncated, info = self.test_env.step(action.cpu().numpy())
                episode_over = terminated or truncated
                next_state = torch.from_numpy(next_state).to(self.device)
                rewards += reward
                state = next_state
        return rewards
class ReplayBuffer:
    """
    A simple FIFO experience replay buffer for DDPG agents.
    """
    def __init__(self, obs_dim, act_dim, size):
        self.obs_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros((size, 1), dtype=np.float32)
        self.done_buf = np.zeros((size, 1), dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, device, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs],
                     rew=self.rew_buf[idxs],
                     act=self.act_buf[idxs],
                     obs2=self.obs2_buf[idxs],
                     done=self.done_buf[idxs])
        return (torch.as_tensor(v, dtype=torch.float32).to(device) for k, v in batch.items())
if __name__ == "__main__":
    trainer = DDPG()
    trainer.run()
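One detail I am not sure about: in train_once I unpack the batch as state, action, reward, next_state, done, while sample_batch yields tensors in the dict insertion order obs, rew, act, obs2, done. Below is a sketch of an alternative (the helper name sample_batch_explicit is just for illustration) that returns an explicit tuple so the two orders cannot drift apart:
def sample_batch_explicit(buffer, device, batch_size=32):
    # Sketch only: same uniform sampling as ReplayBuffer.sample_batch above,
    # but returns a fixed (obs, act, rew, obs2, done) tuple that matches the
    # "state, action, reward, next_state, done" unpacking in train_once.
    idxs = np.random.randint(0, buffer.size, size=batch_size)
    to_tensor = lambda a: torch.as_tensor(a, dtype=torch.float32).to(device)
    return (to_tensor(buffer.obs_buf[idxs]),
            to_tensor(buffer.act_buf[idxs]),
            to_tensor(buffer.rew_buf[idxs]),
            to_tensor(buffer.obs2_buf[idxs]),
            to_tensor(buffer.done_buf[idxs]))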