DQN型号(游戏:Atari Pongnoframeskip)不学习

我正在尝试实施乒乓球的DQN模型。但是,即使在大约1000集之后,它仍然像随机活动一样执行。 CNN培训似乎并不能改善代理。


我创建了一个CNN,包括汇总后三个卷积层和三个向前连接层。输入通道是预处理框架的数量(从3 210 160到4 84 84,通道为4):

class CNN(nn.Module):
  def __init__(self, s_channels, a_space):
    super(CNN, self).__init__()
    self.pool = nn.MaxPool2d(kernel_size=2, stride=1)
    self.conv1 = nn.Conv2d(s_channels,out_channels=32,kernel_size=8,stride=4)
    self.conv2 = nn.Conv2d(32,64,4,2)
    self.conv3 = nn.Conv2d(64,64,3,1)
    self.fc1 = nn.Linear(64*4*4,1024)
    self.fc2 = nn.Linear(1024,512)
    self.fc3 = nn.Linear(512,a_space)

  def forward(self,input):
    output = self.pool(F.relu(self.conv1(input)))
    output = self.pool(F.relu(self.conv2(output)))
    output = self.pool(F.relu(self.conv3(output)))
    output = output.view(-1,64*4*4)
    output = F.relu(self.fc1(output))
    output = F.relu(self.fc2(output))
    output = F.relu(self.fc3(output))
    return output


class Agent():
  def __init__(self, s_space, a_space, device) -> None:

    # set GPU device to cuda
    self.device = device

    # define parameters
    self.epsilon = 1.0
    self.min_epsilon = 0.01
    self.dr = 0.995
    self.lr = 0.001
    self.gamma = 0.9

    # define models
    self.evl_net = CNN(s_space, a_space).to(self.device)
    self.tgt_net = CNN(s_space, a_space).to(self.device)
    self.cert = nn.SmoothL1Loss()
    self.optimal = th.optim.Adam(self.evl_net.parameters(),lr=self.lr)

    # define memory store
    self.memory = deque(maxlen=2000)

  # pre-process the input image data
  def data_pre_process(self,batch_size):
    s_v = []
    a_v = []
    next_s_v = []
    r_v = []
    dones = []
    materials = random.sample(self.memory,batch_size)
    for t in materials:

    s_v = th.Tensor(s_v).to(self.device)         
    a_v = th.LongTensor(a_v).unsqueeze(1).to(self.device)                                 
    r_v = th.FloatTensor(r_v).to(device)        
    # print(r_v.shape)
    return s_v, a_v, next_s_v, r_v, dones      

  # record the transformed images
  def record(self,tpl):
  # select actions according to the states (input images with 4 channels)
  def select(self,state,a_space):
    actions = self.evl_net(state).data.tolist()
    if(random.random() <= self.epsilon):
      action = random.randint(0,a_space-1)
      action = actions.index(max(actions))
    return action

  # save CNN model
  def save(self):
    th.save(self.evl_net.state_dict(), "./Pong.pth")

  # at the beginning load the saved CNN model
  def load(self,s_channels, a_space):
    self.evl_net = CNN(s_channels, a_space).to(self.device)

  # DQN replay progression
  def train(self,state,batch_size):
    s_v_size: [batch_size,4,84,84] type: Tensor
    s_a_size: [batch_size,1] type: Tensor
    next_s_v_size: [batch_size,4,84,84] type: List
    r_v_size: [1,batch_size] type: Tensor
    dones_size: [batch_size] type: List

    s_v,a_v,next_s_v,r_v,dones = self.data_pre_process(batch_size)

    # create evl_Q_value tensor
    evl_Q_value = self.evl_net(s_v).gather(0,a_v) #  size: [batch_size,6].gather() -> [batch_size,1] Type: Tensor
    # correctly transform next_s_v into tensor:
    nonDone_index = th.LongTensor(tuple([i for i,x in enumerate(dones) if x!=True])).to(self.device)
    tgt_Q_value = th.zeros(batch_size).to(device) 

    true_next_s_v = list(filter((None).__ne__,next_s_v)) # pop the "None" elements  
    true_next_s_v = th.FloatTensor(true_next_s_v).to(self.device) # size: [notDone_batch_size,4,84,84]
    # print(true_next_s_v.shape)

    tgt = self.tgt_net(true_next_s_v).max(1)[0].detach() # size [1,notDone_batch_size] Type: Tensor
    # print(tgt.shape)
    # update tgt_Q_value
    tgt_Q_value[nonDone_index] = tgt
    tgt_Q_value = r_v + self.gamma * tgt_Q_value
    tgt_Q_value = tgt_Q_value.reshape(batch_size,1) #  size: [batch_size, 1] cannot be back propagated
    # print(tgt_Q_value)
    loss = self.cert(evl_Q_value, tgt_Q_value)
    # constrain the gradient from explosion
    for p in self.evl_net.parameters():
      p.grad.data.clamp_(-1, 1)

    # decrease fire
    if(self.epsilon > self.min_epsilon):
      self.epsilon *= self.dr

在主要培训进度中,我将批量尺寸从32增加到64,以加速操作。 CNN将每四集更新。统计信息将每十集打印。

# set GPU device to cuda
device = th.device("cuda:0" if th.cuda.is_available() else "cpu")

# set episode step and batch_size
episodes = 5000
batch_size = 32

env = gym.make("PongNoFrameskip-v4")
env = gym.wrappers.AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84, terminal_on_life_loss=True, grayscale_obs=True, grayscale_newaxis=False, scale_obs=False)
# create frame stack for the input image data (size: (4,84,84))
env = gym.wrappers.FrameStack(env, 4)
channels = env.observation_space.shape[0]
a_space = env.action_space.n

agent = Agent(channels, a_space, device)
agent.load(channels, a_space)

# testing start:

for e in range(episodes):

  # step 1: reset the agent at the beginning
  s = np.array(env.reset())
  img = plt.imshow(env.render('rgb_array'))
  done = False
  score = 0

  while not done:
    # step 2: iterate actions
    a = agent.select(th.Tensor(s).unsqueeze(0).to(device),a_space)
    next_s, reward, done, _ = env.step(a)
      reward = -1.0
      next_s = None
      next_s = np.array(next_s)  
    # print(next_s.shape)

    # step 3: record the data into buffer
    dataset = (s,a,next_s,reward,done)

    # step 4: update state steps
    s = next_s
    score += reward

  # step 5: training and update CNN by each 4 episodes
  if(len(agent.memory) > batch_size and e % 4 == 0):
  # appendix 1: at the beginning increase batch_size from 32 to 64
  if(batch_size < 64):
    batch_size += 1

  # appendix 2: return score by each 10 episodes
  if(e % 10 == 0 and len(agent.memory)>batch_size):
    print("episodes:",e,"score:",score,"epsilon: {:.2}".format(agent.epsilon))


episodes: 800 score: -20.0 epsilon: 0.37
episodes: 810 score: -21.0 epsilon: 0.36
episodes: 820 score: -21.0 epsilon: 0.36
episodes: 830 score: -21.0 epsilon: 0.35
episodes: 840 score: -21.0 epsilon: 0.35
episodes: 850 score: -21.0 epsilon: 0.34
episodes: 860 score: -21.0 epsilon: 0.34
episodes: 870 score: -21.0 epsilon: 0.34
episodes: 880 score: -20.0 epsilon: 0.33
episodes: 890 score: -21.0 epsilon: 0.33
episodes: 900 score: -20.0 epsilon: 0.32
episodes: 910 score: -21.0 epsilon: 0.32
episodes: 920 score: -21.0 epsilon: 0.31
episodes: 930 score: -21.0 epsilon: 0.31
episodes: 940 score: -21.0 epsilon: 0.31
episodes: 950 score: -21.0 epsilon: 0.3
episodes: 960 score: -21.0 epsilon: 0.3
episodes: 970 score: -21.0 epsilon: 0.3
episodes: 980 score: -21.0 epsilon: 0.29


