- Overview
- Installation
- Tutorials
- Algorithm API Documentation
- Simple and Efficient Parallel Interface
- APIs
- FREQUENTLY ASKED QUESTIONS
- EVOKIT
- Others
- parl.algorithms.paddle.policy_gradient
- parl.algorithms.paddle.dqn
- parl.algorithms.paddle.ddpg
- parl.algorithms.paddle.ddqn
- parl.algorithms.paddle.oac
- parl.algorithms.paddle.a2c
- parl.algorithms.paddle.qmix
- parl.algorithms.paddle.td3
- parl.algorithms.paddle.sac
- parl.algorithms.paddle.ppo
- parl.algorithms.paddle.maddpg
- parl.core.paddle.model
- parl.core.paddle.algorithm
- parl.remote.remote_decorator
- parl.core.paddle.agent
- parl.remote.client
parl.algorithms.paddle.a2c
Source code for parl.algorithms.paddle.a2c
```python
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import parl
import paddle
import paddle.nn.functional as F
from paddle.distribution import Categorical
from parl.utils.utils import check_model_method
import numpy as np

__all__ = ['A2C']


class A2C(parl.Algorithm):
    def __init__(self, model, vf_loss_coeff=None):
        """ A2C algorithm

        Args:
            model (parl.Model): forward network of policy and value
            vf_loss_coeff (float): coefficient of the value function loss
        """
        # check model and vf_loss_coeff input
        check_model_method(model, 'value', self.__class__.__name__)
        check_model_method(model, 'policy', self.__class__.__name__)
        check_model_method(model, 'policy_and_value', self.__class__.__name__)
        assert isinstance(vf_loss_coeff, (int, float))
        self.model = model
        self.vf_loss_coeff = vf_loss_coeff
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=40.0)
        self.optimizer = paddle.optimizer.Adam(
            learning_rate=0.001,
            parameters=self.model.parameters(),
            grad_clip=clip)

    def learn(self, obs, actions, advantages, target_values, learning_rate,
              entropy_coeff):
        """
        Args:
            obs: A float32 tensor of shape ([B] + observation_space).
                 E.g. [B, C, H, W] in Atari.
            actions: An int64 tensor of shape [B].
            advantages: A float32 tensor of shape [B].
            target_values: A float32 tensor of shape [B].
            learning_rate: float scalar of learning rate.
            entropy_coeff: float scalar of entropy coefficient.
        """
        # shape: [B, act_dim]
        logits = self.model.policy(obs)
        act_dim = logits.shape[-1]
        actions_onehot = F.one_hot(actions, act_dim)
        # [B, act_dim] --> [B]
        actions_log_probs = paddle.sum(
            F.log_softmax(logits) * actions_onehot, axis=-1)

        # The policy gradient loss
        pi_loss = -1.0 * paddle.sum(actions_log_probs * advantages)

        # The value function loss
        values = self.model.value(obs)
        delta = values - target_values
        vf_loss = 0.5 * paddle.sum(paddle.square(delta))

        # The entropy loss (We want to maximize entropy, so entropy_coeff < 0)
        # Using the Categorical just for calculating the entropy.
        # See https://github.com/PaddlePaddle/Paddle/blob/release/2.0/python/paddle/distribution.py for detail
        policy_distribution = Categorical(logits)
        policy_entropy = policy_distribution.entropy()
        entropy = paddle.sum(policy_entropy)

        total_loss = (
            pi_loss + vf_loss * self.vf_loss_coeff + entropy * entropy_coeff)

        self.optimizer.set_lr(learning_rate)
        total_loss.backward()
        self.optimizer.step()
        self.optimizer.clear_grad()

        return total_loss, pi_loss, vf_loss, entropy

    def prob_and_value(self, obs):
        """
        Args:
            obs: A float32 tensor of shape ([B] + observation_space).
                 E.g. [B, C, H, W] in Atari.
        """
        logits, values = self.model.policy_and_value(obs)
        probs = F.softmax(logits)
        return probs, values

    def predict(self, obs):
        """
        Args:
            obs: A float32 tensor of shape ([B] + observation_space).
                 E.g. [B, C, H, W] in Atari.
        """
        logits = self.model.policy(obs)
        probs = F.softmax(logits)
        predict_actions = paddle.argmax(probs, 1)
        return predict_actions

    def value(self, obs):
        """
        Args:
            obs: A float32 tensor of shape ([B] + observation_space).
                 E.g. [B, C, H, W] in Atari.
        """
        values = self.model.value(obs)
        return values
```