TF agent takes the same action for all test states after reinforcement learning training
I am trying to create a custom PyEnvironment so that an agent learns the optimum hour to send a notification to a user, based on the rewards received from clicks on the notifications sent in the previous 7 days.
After training is complete, the agent picks the same hour for every input, even when different states are fed in.
State structure: we consider the user's last 7 days of hour-response pairs.
E.g. 7,0,14,1,5,0,3,0,23,0,9,1,12,0 (reward 0 at hour 7, reward 1 at hour 14, and so on).
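For reference, such a history corresponds to the flat observation the environment exposes: a length 2*state_size float vector of alternating (hour, click) entries. A minimal sketch of how it could be built (build_state is a hypothetical helper, not part of the code below):
import numpy as np

def build_state(hour_click_pairs, state_size=7):
  # Hypothetical helper: flatten [(hour, click), ...] (oldest pair first)
  # into the (2*state_size,) float32 vector layout used by NotifEnv below.
  flat = [v for pair in hour_click_pairs for v in pair]
  return np.array(flat, dtype=np.float32)

state = build_state([(7, 0), (14, 1), (5, 0), (3, 0), (23, 0), (9, 1), (12, 0)])
# -> array([ 7., 0., 14., 1., 5., 0., 3., 0., 23., 0., 9., 1., 12., 0.], dtype=float32)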
Hyperparameters:
duration = 1 # @param {type:"integer"}
discount = 0.99 # @param {type:"number"}
state_size = 7 # @param {type:"integer"}
num_episodes = 1 # @param {type:"integer"}
learning_rate = 1e-5 # @param {type:"number"}
num_eval_episodes = 100 # @param {type:"integer"}
replay_buffer_max_length = 100000 # @param {type:"integer"}
initial_collect_steps = 100 # @param {type:"integer"}
batch_size = 10 # @param {type:"integer"}
num_iterations = 1000 # @param {type:"integer"}
collect_steps_per_iteration = 1 # @param {type:"integer"}
log_interval = 500 # @param {type:"integer"}
eval_interval = 500 # @param {type:"integer"}
Below is the code of the NotifEnv class:
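(For completeness, the snippets below assume roughly the standard TF-Agents imports:)
# Assumed imports (not shown in the original post).
import numpy as np
import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import py_environment, tf_py_environment
from tf_agents.networks import sequential
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import array_spec, tensor_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory
from tf_agents.utils import common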
class NotifEnv(py_environment.PyEnvironment):

  def __init__(self, duration, discount, state_size):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=23, name='action')
    self._state_spec = array_spec.BoundedArraySpec(
        shape=(2*state_size, ), dtype=np.float32, minimum=0.0, maximum=1.0, name='state')
    self._discount_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.float32, minimum=0.0, maximum=1.0, name='discount')
    self.step_count = 0
    self.duration = duration
    self.state_size = state_size
    self.discount = discount
    self._episode_ended = False
    self.state = np.array([0]*2*self.state_size, dtype=np.float32)

  def observation_spec(self):
    """Return state_spec."""
    return self._state_spec

  def action_spec(self):
    """Return action_spec."""
    return self._action_spec

  def _reset(self):
    """Return initial_time_step."""
    self.state = np.array([0]*2*self.state_size, dtype=np.float32)
    self._episode_ended = False
    self.step_count = 0
    return ts.restart(np.array(self.state, dtype=np.float32))

  def _step(self, action):
    """Apply action and return new time_step."""
    if action < 0 or action > 23:
      raise ValueError('`action` should be between 0 and 23.')
    self.step_count += 1
    curr_reward = get_reward(action)
    # Shift the 7-day window left by one (hour, click) pair and append the new one.
    self.state[:-2] = self.state[2:]
    self.state[-2:] = [action+1, curr_reward]
    if self.step_count >= self.duration:
      self._episode_ended = True
      self.step_count = 0
      return ts.termination(np.array(self.state, dtype=np.float32), reward=curr_reward)
    else:
      return ts.transition(np.array(self.state, dtype=np.float32), discount=self.discount, reward=curr_reward)
Reward function (a positive reward is given only if the hour of day is < 12; otherwise the agent is penalised):
def get_reward(action):
  if action < 12:
    return 1
  else:
    return -0.3
Model Architecture:
train_env = tf_py_environment.TFPyEnvironment(NotifEnv(duration=duration, discount=discount, state_size=state_size))
eval_env = tf_py_environment.TFPyEnvironment(NotifEnv(duration=duration, discount=discount, state_size=state_size))
def compute_avg_return(environment, policy, num_episodes=10):
  total_return = 0.0
  for _ in range(num_episodes):
    time_step = environment.reset()
    episode_return = 0.0
    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return
  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]
Training:
fc_layer_params = (6,)
action_tensor_spec = tensor_spec.from_spec(train_env.action_spec())
num_actions = 24

# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))

# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
train_step_counter = tf.Variable(0)
print("train_step_counter = ", train_step_counter)

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)
agent.initialize()
Initialising Replay Buffer:
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=20)  # replay_buffer_max_length

def collect_step(environment, policy, buffer):
  time_step = environment.current_time_step()
  print("time_step = " + str(time_step))
  action_step = policy.action(time_step)
  print("action_step = " + str(action_step))
  next_time_step = environment.step(action_step.action)
  print("next_time_step = " + str(next_time_step))
  traj = trajectory.from_transition(time_step, action_step, next_time_step)
  print("traj = " + str(traj))
  print("\n \n")
  # Add trajectory to the replay buffer
  buffer.add_batch(traj)

def collect_data(env, policy, buffer, steps):
  for _ in range(steps):
    print("NUM : " + str(_))
    collect_step(env, policy, buffer)

collect_data(train_env, random_policy, replay_buffer, initial_collect_steps)
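Note that random_policy is not defined in the snippets above; it is assumed to be the usual uniform random policy over the environment specs, along these lines:
from tf_agents.policies import random_tf_policy

# Assumed definition of random_policy (not shown above): a uniform random
# policy over the action spec, used only for the initial data collection.
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())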
Training Run:
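Likewise, dataset is not defined above; it is presumably built from the replay buffer as in the standard TF-Agents DQN tutorial, roughly:
# Assumed dataset definition (not shown above): sample mini-batches of
# adjacent time-step pairs from the replay buffer for DQN training.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2).prefetch(3)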
iterator = iter(dataset)

agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]
print("log_interval = ", log_interval)
print("eval_interval = ", eval_interval)
print("returns = ", returns)

for _ in range(num_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  # collect_data(train_env, agent.collect_policy, replay_buffer, collect_steps_per_iteration)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  # print("x = "+str(x))
  train_loss = agent.train(experience).loss

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))
    print(np.max(q_net.get_weights()[0]))

  if step % eval_interval == 0:
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)
Answer:
I think that 1000 training steps could be one of the reasons the agent is stuck picking one action. RL agents need hundreds of thousands of iterations, and this could be even higher depending on the task and some neural-network hyperparameters, like the learning rate.
I suggest you increase num_iterations by a lot, and then check whether something improves.
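For example (the exact numbers are only a starting point, not tuned values):
# Try a much longer training run; adjust to the task and learning rate.
num_iterations = 100000  # instead of 1000
log_interval = 5000
eval_interval = 5000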