Why can't my DDPG solve MountainCarContinuous-v0?

I recently started learning the DDPG algorithm in reinforcement learning, and I tried to use it to solve the MountainCarContinuous-v0 problem.

I used the tuned hyperparameters from RL Zoo wherever possible, but it doesn't work. Can anyone help me find where the problem is so that it trains properly on my code base?
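
For reference, these are the settings my script below actually uses (just a summary dict I put together for this post; the key names are mine, not RL Zoo's):

hyperparams = dict(
    actor_lr=1e-3,         # Adam learning rate for the policy network
    critic_lr=1e-3,        # Adam learning rate for the Q network
    tau=0.005,             # polyak averaging coefficient for the target networks
    gamma=0.99,            # discount factor
    ou_mean=0.0,           # Ornstein-Uhlenbeck noise mean
    ou_sigma=0.5,          # Ornstein-Uhlenbeck noise sigma
    buffer_size=1_000_000, # replay buffer capacity
    batch_size=256,        # minibatch size per gradient step
    random_steps=100,      # purely random actions before learning starts
    total_steps=100_000,   # total environment steps
)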

Result

[plot: policy loss]

[plot: critic loss]

Code

import gymnasium as gym
import numpy as np
import torch.nn as nn
from torch.optim import Adam
import torch
import torch.nn.functional as F
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
from tensorboardX import SummaryWriter
from copy import deepcopy
logdir = "logs/scalars/"
file_writer = SummaryWriter(log_dir=logdir)


class Agent(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1=nn.Sequential(
            nn.Linear(2, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
            nn.Tanh()
        )
    def forward(self, obs):
        return self.l1(obs)
class Qfunction(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1=nn.Sequential(
            nn.Linear(3, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
        )

    def forward(self, obs,action):
        input=torch.cat((obs,action),-1)
        return self.l1(input)



class DDPG():
    def __init__(self):
        self.total_num_step=100000
        self.device = 'cuda'
        self.policy = Agent().to(self.device)
        self.policy_target = deepcopy(self.policy).to(self.device)

        self.tau=0.005
        self.Q = Qfunction().to(self.device)
        self.Q_target = deepcopy(self.Q).to(self.device)

        self.Policyoptimizer = Adam(self.policy.parameters(), lr=1e-3)
        self.Qoptimizer = Adam(self.Q.parameters(), lr=1e-3)
        self.ounoise=OrnsteinUhlenbeckActionNoise(mean=np.array([0]),sigma=np.array(0.5))
        self.buffer=ReplayBuffer(2,1,1000000)
        self.gamma=0.99
        self.env=gym.make("MountainCarContinuous-v0")
        self.test_env=gym.make("MountainCarContinuous-v0")
    def run(self):
        state,info=self.env.reset()
        self.ounoise.reset()
        state=torch.from_numpy(state).to(self.device)
        n_step=0

        # begin
        for i in range(self.total_num_step):
            #collect data
            with torch.no_grad():
                if n_step<100:
                    action=self.env.action_space.sample()
                else:
                    action=self.policy(state).cpu().numpy()
                    noise=self.ounoise()
                    action=action+noise
                action=action.clip(-1,1)
                next_state, reward, terminated, truncated, info = self.env.step(action)
                episode_over = 1 if (terminated or truncated) else 0
                self.buffer.store(state.cpu().numpy(), action, reward, next_state,episode_over)
                if episode_over:
                    state, info = self.env.reset()
                    state=torch.from_numpy(state).to(self.device)
                    self.ounoise.reset()
                else:
                    state = torch.from_numpy(next_state).to(self.device)

            n_step+=1
            #update
            if n_step>100:
                lossq ,lossp=self.train_once()
                file_writer.add_scalar('lossq', lossq, n_step)
                file_writer.add_scalar('lossp', lossp, n_step)
            if int(n_step)%1000==0:
                print(n_step)
                print(self.test())




    def train_once(self):
        for i in range(1):
            self.policy.train()
            # sample train data
            state, action, reward, next_state, done=self.buffer.sample_batch(self.device,batch_size=256)

            with torch.no_grad():
                target=reward+self.gamma*self.Q_target(next_state,self.policy_target(next_state))*(1-done)
            lossq=F.mse_loss(self.Q(state,action),target)
            self.Qoptimizer.zero_grad()
            lossq.backward()
            self.Qoptimizer.step()

            self.Policyoptimizer.zero_grad()
            loss_p=-self.Q(state,self.policy(state)).mean()
            loss_p.backward()
            self.Policyoptimizer.step()
            # Finally, update target networks by polyak averaging.
            with torch.no_grad():
                for p, p_targ in zip(self.policy.parameters(), self.policy_target.parameters()):
                    # NB: use the in-place ops "mul_" and "add_" to update the target
                    # params; "mul" and "add" would create new tensors instead.
                    p_targ.data.mul_(1 - self.tau)
                    p_targ.data.add_(self.tau* p.data)
                for p, p_targ in zip(self.Q.parameters(), self.Q_target.parameters()):
                    # Same in-place polyak update for the critic target network.
                    p_targ.data.mul_(1 - self.tau)
                    p_targ.data.add_(self.tau* p.data)
            return lossq.item(),loss_p.item()

    def test(self):
        self.policy.eval()
        state,info=self.test_env.reset()
        state=torch.from_numpy(state).to(self.device)
        episode_over = False
        rewards=0
        with torch.no_grad():
            while not episode_over:
                action = self.policy(state)  # agent policy that uses the observation and info
                next_state, reward, terminated, truncated, info = self.test_env.step(action.cpu().numpy())
                episode_over = terminated or truncated
                next_state=torch.from_numpy(next_state).to(self.device)
                rewards +=reward
                state = next_state
        return rewards



class ReplayBuffer:
    """
    A simple FIFO experience replay buffer for DDPG agents.
    """

    def __init__(self, obs_dim, act_dim, size):
        self.obs_buf = np.zeros((size,obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros((size,obs_dim), dtype=np.float32)
        self.act_buf = np.zeros((size,act_dim), dtype=np.float32)
        self.rew_buf = np.zeros((size,1), dtype=np.float32)
        self.done_buf = np.zeros((size,1), dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr+1) % self.max_size
        self.size = min(self.size+1, self.max_size)

    def sample_batch(self, device,batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs],
                     rew=self.rew_buf[idxs],
                     act=self.act_buf[idxs],
                     obs2=self.obs2_buf[idxs],
                     done=self.done_buf[idxs])
        # NB: this yields the tensors in the dict's insertion order: obs, rew, act, obs2, done
        return (torch.as_tensor(v, dtype=torch.float32).to(device) for k, v in batch.items())


if __name__=="__main__":
    trainer = DDPG()
    trainer.run()
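
In case it matters, this is roughly how I install the dependencies, run it, and look at the loss curves (the script name is just a placeholder for this post):

pip install gymnasium torch stable-baselines3 tensorboardX
python ddpg_mountaincar.py          # placeholder name for the script above
tensorboard --logdir logs/scalars/  # the same logdir the SummaryWriter writes to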