DQN model (game: Atari PongNoFrameskip) does not learn

I'm trying to implement a DQN model for the game Pong. However, even after about 1000 episodes it still behaves like a random policy, and the CNN training does not seem to improve the agent.

Here is my main code:

I create a CNN with three convolution layers, each followed by pooling, and three fully connected layers. The number of input channels is the number of stacked pre-processed frames (the observation goes from 3×210×160 to 4×84×84, so there are 4 channels):

# imports used by the snippets in this post
import random
from collections import deque

import gym
import numpy as np
import matplotlib.pyplot as plt
import torch as th
import torch.nn as nn
import torch.nn.functional as F


class CNN(nn.Module):
  def __init__(self, s_channels, a_space):
    super(CNN, self).__init__()
    self.pool = nn.MaxPool2d(kernel_size=2, stride=1)
    self.conv1 = nn.Conv2d(s_channels,out_channels=32,kernel_size=8,stride=4)
    self.conv2 = nn.Conv2d(32,64,4,2)
    self.conv3 = nn.Conv2d(64,64,3,1)
    self.fc1 = nn.Linear(64*4*4,1024)
    self.fc2 = nn.Linear(1024,512)
    self.fc3 = nn.Linear(512,a_space)

  def forward(self,input):
    output = self.pool(F.relu(self.conv1(input)))
    output = self.pool(F.relu(self.conv2(output)))
    output = self.pool(F.relu(self.conv3(output)))
    output = output.view(-1,64*4*4)
    output = F.relu(self.fc1(output))
    output = F.relu(self.fc2(output))
    output = F.relu(self.fc3(output))
    return output
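
As a quick sanity check of the layer arithmetic (a sketch, assuming the class above, the 6-action Pong action space, and a 4-channel 84×84 input), the spatial size shrinks 84 → 20 → 19 → 8 → 7 → 5 → 4 through the conv/pool stack, which is where the 64*4*4 flatten size comes from:

net = CNN(s_channels=4, a_space=6)  # Pong's action space is Discrete(6)
dummy = th.zeros(1, 4, 84, 84)      # one stacked, pre-processed observation
print(net(dummy).shape)             # expected: torch.Size([1, 6])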

After that, I construct an agent class with action-selection and CNN-training functions. In the training function, I compute the loss from the whole batch at once instead of looping over the batch data step by step. Before computing the loss and calling backward, I convert the sampled image data into batch-sized tensors. Here is the agent class:

class Agent():
  def __init__(self, s_space, a_space, device) -> None:

    # set GPU device to cuda
    self.device = device

    # define parameters
    self.epsilon = 1.0
    self.min_epsilon = 0.01
    self.dr = 0.995
    self.lr = 0.001
    self.gamma = 0.9

    # define models
    self.evl_net = CNN(s_space, a_space).to(self.device)
    self.tgt_net = CNN(s_space, a_space).to(self.device)
    self.cert = nn.SmoothL1Loss()
    self.optimal = th.optim.Adam(self.evl_net.parameters(),lr=self.lr)

    # define memory store
    self.memory = deque(maxlen=2000)

  # pre-process the input image data
  def data_pre_process(self,batch_size):
    s_v = []
    a_v = []
    next_s_v = []
    r_v = []
    dones = []
    materials = random.sample(self.memory,batch_size)
    for t in materials:
      s_v.append(t[0])
      a_v.append(t[1])
      next_s_v.append(t[2])
      r_v.append(t[3])
      dones.append(t[4])

    s_v = th.Tensor(s_v).to(self.device)         
    a_v = th.LongTensor(a_v).unsqueeze(1).to(self.device)                                 
    r_v = th.FloatTensor(r_v).to(device)        
    # print(r_v.shape)
    return s_v, a_v, next_s_v, r_v, dones      

  # record the transformed images
  def record(self,tpl):
    self.memory.append(tpl)
  
  # select actions according to the states (input images with 4 channels)
  def select(self,state,a_space):
    actions = self.evl_net(state).data.tolist()
    if(random.random() <= self.epsilon):
      action = random.randint(0,a_space-1)
    else:
      action = actions.index(max(actions))
    return action

  # save CNN model
  def save(self):
    th.save(self.evl_net.state_dict(), "./Pong.pth")

  # at the beginning load the saved CNN model
  def load(self,s_channels, a_space):
    self.evl_net = CNN(s_channels, a_space).to(self.device)
    self.evl_net.load_state_dict(th.load("./Pong.pth"))

  # DQN replay progression
  def train(self,state,batch_size):
    """
    s_v_size: [batch_size,4,84,84] type: Tensor
    s_a_size: [batch_size,1] type: Tensor
    next_s_v_size: [batch_size,4,84,84] type: List
    r_v_size: [1,batch_size] type: Tensor
    dones_size: [batch_size] type: List

    """
    s_v,a_v,next_s_v,r_v,dones = self.data_pre_process(batch_size)
    self.tgt_net.load_state_dict(self.evl_net.state_dict())

    # create evl_Q_value tensor
    evl_Q_value = self.evl_net(s_v).gather(0,a_v) #  size: [batch_size,6].gather() -> [batch_size,1] Type: Tensor
    
    # correctly transform next_s_v into tensor:
    nonDone_index = th.LongTensor(tuple([i for i,x in enumerate(dones) if x!=True])).to(self.device)
    tgt_Q_value = th.zeros(batch_size).to(device) 

    true_next_s_v = list(filter((None).__ne__,next_s_v)) # pop the "None" elements  
    true_next_s_v = th.FloatTensor(true_next_s_v).to(self.device) # size: [notDone_batch_size,4,84,84]
    # print(true_next_s_v.shape)

    tgt = self.tgt_net(true_next_s_v).max(1)[0].detach() # size [1,notDone_batch_size] Type: Tensor
    # print(tgt.shape)
    
    # update tgt_Q_value
    tgt_Q_value[nonDone_index] = tgt
    tgt_Q_value = r_v + self.gamma * tgt_Q_value
    tgt_Q_value = tgt_Q_value.reshape(batch_size,1) #  size: [batch_size, 1] cannot be back propagated
    # print(tgt_Q_value)
    
    self.optimal.zero_grad()
    loss = self.cert(evl_Q_value, tgt_Q_value)
    loss.backward()
    
    # constrain the gradient from explosion
    for p in self.evl_net.parameters():
      p.grad.data.clamp_(-1, 1)
    
    self.optimal.step()

    # decrease fire
    if(self.epsilon > self.min_epsilon):
      self.epsilon *= self.dr
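
To make the terminal-state handling in train easier to follow, here is a toy, standalone illustration (hypothetical values, not real replay data) of how the None next-states and the non-done indices are meant to line up:

dones = [False, True, False]
next_s_v = ["s1", None, "s3"]  # None marks a terminal transition
nonDone_index = th.LongTensor([i for i, d in enumerate(dones) if not d])
true_next_s_v = list(filter((None).__ne__, next_s_v))  # drop the None entries
print(nonDone_index.tolist(), true_next_s_v)           # [0, 2] ['s1', 's3']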

In the main training loop, I let the batch size grow from 32 to 64 to speed up training. The CNN is updated every four episodes, and statistics are printed every ten episodes.

# set GPU device to cuda
device = th.device("cuda:0" if th.cuda.is_available() else "cpu")

# set episode step and batch_size
episodes = 5000
batch_size = 32

env = gym.make("PongNoFrameskip-v4")
env = gym.wrappers.AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84, terminal_on_life_loss=True, grayscale_obs=True, grayscale_newaxis=False, scale_obs=False)
    
# create frame stack for the input image data (size: (4,84,84))
env = gym.wrappers.FrameStack(env, 4)
channels = env.observation_space.shape[0]
a_space = env.action_space.n

agent = Agent(channels, a_space, device)
agent.load(channels, a_space)

# testing start:

for e in range(episodes):

  # step 1: reset the agent at the beginning
  s = np.array(env.reset())
  img = plt.imshow(env.render('rgb_array'))
  done = False
  score = 0

  while not done:
  
    # step 2: iterate actions
    a = agent.select(th.Tensor(s).unsqueeze(0).to(device),a_space)
    next_s, reward, done, _ = env.step(a)
    if(done==True):
      reward = -1.0
      next_s = None
    else:
      next_s = np.array(next_s)  
    # print(next_s.shape)

    # step 3: record the data into buffer
    dataset = (s,a,next_s,reward,done)
    agent.record(dataset)

    # step 4: update state steps
    s = next_s
    score += reward

  # step 5: training and update CNN by each 4 episodes
  if(len(agent.memory) > batch_size and e % 4 == 0):
    agent.train(channels,batch_size)
    agent.save()
  
  # appendix 1: at the beginning increase batch_size from 32 to 64
  if(batch_size < 64):
    batch_size += 1

  # appendix 2: return score by each 10 episodes
  if(e % 10 == 0 and len(agent.memory)>batch_size):
    print("episodes:",e,"score:",score,"epsilon: {:.2}".format(agent.epsilon))
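
As a rough consistency check (a sketch using only the hyperparameters above), training fires every four episodes and each call to train multiplies epsilon by 0.995, so by episode 980 epsilon should be around 0.995 ** (980 / 4) ≈ 0.29, which matches the log further down:

train_calls = 980 // 4 + 1             # training runs at episodes 0, 4, ..., 980
print(round(0.995 ** train_calls, 2))  # ≈ 0.29, consistent with the printed epsilon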

No error messages appear while it runs. However, the agent does not perform as well as expected: after 1000 episodes it still returns negative scores, just as it did at the very start. The output looks like this:

episodes: 800 score: -20.0 epsilon: 0.37
episodes: 810 score: -21.0 epsilon: 0.36
episodes: 820 score: -21.0 epsilon: 0.36
episodes: 830 score: -21.0 epsilon: 0.35
episodes: 840 score: -21.0 epsilon: 0.35
episodes: 850 score: -21.0 epsilon: 0.34
episodes: 860 score: -21.0 epsilon: 0.34
episodes: 870 score: -21.0 epsilon: 0.34
episodes: 880 score: -20.0 epsilon: 0.33
episodes: 890 score: -21.0 epsilon: 0.33
episodes: 900 score: -20.0 epsilon: 0.32
episodes: 910 score: -21.0 epsilon: 0.32
episodes: 920 score: -21.0 epsilon: 0.31
episodes: 930 score: -21.0 epsilon: 0.31
episodes: 940 score: -21.0 epsilon: 0.31
episodes: 950 score: -21.0 epsilon: 0.3
episodes: 960 score: -21.0 epsilon: 0.3
episodes: 970 score: -21.0 epsilon: 0.3
episodes: 980 score: -21.0 epsilon: 0.29

I rechecked the structure of the model against the algorithm theory but found nothing out of place. I would appreciate any advice and help on how to deal with this problem.
