TF agent takes the same action for all test states after training with reinforcement learning



I am trying to create a custom PyEnvironment to make an agent learn the optimal hour to send a notification to a user, based on the rewards received from clicks on the notifications sent over the previous 7 days.

After training is complete, the agent always picks the same hour as its action, even when different states are fed in.

State structure: we consider the user's hour-response pairs from the last 7 days.

E.g. 7,0,14,1,5,0,3,0,23,0,9,1,12,0 [reward 0 at hour 7, reward 1 at hour 14, and so on]
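
For concreteness, a minimal sketch (not from the original post; the names are illustrative) of how such hour-response pairs map onto the flat float32 vector that the environment below uses as its observation:

import numpy as np

# Illustrative only: pack the last 7 (hour, clicked) pairs into a flat
# float32 vector of length 2 * state_size, matching the shape=(2*state_size,)
# observation spec defined in NotifEnv below.
pairs = [(7, 0), (14, 1), (5, 0), (3, 0), (23, 0), (9, 1), (12, 0)]
state = np.array([value for pair in pairs for value in pair], dtype=np.float32)
print(state.shape)  # (14,)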

Hyperparameters:

duration = 1 # @param {type:"integer"}
discount = 0.99 # @param {type:"number"}
state_size = 7 # @param {type:"integer"}
num_episodes = 1 # @param {type:"integer"}
learning_rate = 1e-5 # @param {type:"number"}
num_eval_episodes = 100 # @param {type:"integer"}
replay_buffer_max_length = 100000  # @param {type:"integer"}
initial_collect_steps = 100 # @param {type:"integer"}
batch_size = 10 # @param {type:"integer"}
num_iterations = 1000 # @param {type:"integer"}
collect_steps_per_iteration = 1 # @param {type:"integer"}
log_interval = 500 # @param {type:"integer"}
eval_interval = 500 # @param {type:"integer"}

Below is the code for the NotifEnv class:

class NotifEnv(py_environment.PyEnvironment):

  def __init__(self, duration, discount, state_size):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=23, name='action')
    self._state_spec = array_spec.BoundedArraySpec(
        shape=(2*state_size, ), dtype=np.float32, minimum=0.0, maximum=1.0, name='state')
    self._discount_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.float32, minimum=0.0, maximum=1.0, name='discount')
    self.step_count = 0
    self.duration = duration
    self.state_size = state_size
    self.discount = discount
    self._episode_ended = False
    self.state = np.array([0]*2*self.state_size, dtype=np.float32)

  def observation_spec(self):
    """Return state_spec."""
    return self._state_spec

  def action_spec(self):
    """Return action_spec."""
    return self._action_spec

  def _reset(self):
    """Return initial_time_step."""
    self.state = np.array([0]*2*self.state_size, dtype=np.float32)
    self._episode_ended = False
    self.step_count = 0
    return ts.restart(np.array(self.state, dtype=np.float32))

  def _step(self, action):
    """Apply action and return new time_step."""
    if action < 0 or action > 23:
        raise ValueError('`action` should be between 0 and 23.')

    self.step_count += 1
    curr_reward = get_reward(action)

    # Drop the oldest (hour, reward) pair and append the newest one.
    self.state[:-2] = self.state[2:]
    self.state[-2:] = [action+1, curr_reward]

    if self.step_count >= self.duration:
        self._episode_ended = True
        self.step_count = 0
        return ts.termination(np.array(self.state, dtype=np.float32), reward=curr_reward)
    else:
        return ts.transition(np.array(self.state, dtype=np.float32), discount=self.discount, reward=curr_reward)

Reward function (a positive reward is given only if the chosen hour of day is < 12; otherwise the agent is penalised):

def get_reward(action):
    if action<12 :
        return 1
    else :
        return -0.3
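
As an aside (not part of the original post), TF-Agents ships an environment validator that drives a PyEnvironment with random actions and checks every emitted TimeStep against the declared specs. Since _step writes hours up to 24 into a state whose spec is bounded to [0.0, 1.0], a check like the following sketch would be expected to flag that mismatch:

from tf_agents.environments import utils

# Run a few random episodes and verify each observation against _state_spec.
# This should raise if observations fall outside the declared [0, 1] bounds.
utils.validate_py_environment(
    NotifEnv(duration=duration, discount=discount, state_size=state_size),
    episodes=5)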

Model Architecture:

train_env = tf_py_environment.TFPyEnvironment(NotifEnv(duration=duration, discount=discount, state_size=state_size))
eval_env = tf_py_environment.TFPyEnvironment(NotifEnv(duration=duration, discount=discount, state_size=state_size))

def compute_avg_return(environment, policy, num_episodes=10):
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]

Training:

fc_layer_params = (6,)
action_tensor_spec = tensor_spec.from_spec(train_env.action_spec())
num_actions = 24  # one Q-value per hour of day (0-23)

# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
    return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))

# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])


optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

train_step_counter = tf.Variable(0)
print("train_step_counter = ",train_step_counter)

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()

Initialising the Replay Buffer:

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=20)  # replay_buffer_max_length

def collect_step(environment, policy, buffer):
    time_step = environment.current_time_step()
    print("time_step = "+str(time_step))
    action_step = policy.action(time_step)
    print("action_step = "+str(action_step))
    next_time_step = environment.step(action_step.action)
    print("next_time_step = "+str(next_time_step))
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    print("traj = "+str(traj))
    print("\n \n")
    # Add trajectory to the replay buffer
    buffer.add_batch(traj)

def collect_data(env, policy, buffer, steps):
    for _ in range(steps):
        print("NUM : "+str(_))
        collect_step(env, policy, buffer)

collect_data(train_env, random_policy, replay_buffer, initial_collect_steps)
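
The definitions of random_policy and dataset are not shown in the post. A minimal sketch of what they presumably look like, following the standard TF-Agents DQN tutorial (the exact arguments here are assumptions):

from tf_agents.policies import random_tf_policy

# Assumed: a uniform-random policy over the 24 hourly actions, used only to
# seed the replay buffer with initial experience.
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())

# Assumed: sample mini-batches of two-step transitions from the replay buffer.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2).prefetch(3)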

Training Run:

iterator = iter(dataset)

agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]
print("log_interval = ",log_interval)
print("eval_interval = ",eval_interval)
print("returns = ",returns)

for _ in range(num_iterations):

    # Collect a few steps using collect_policy and save to the replay buffer.
    # collect_data(train_env, agent.collect_policy, replay_buffer, collect_steps_per_iteration)

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)

    # print("x = "+str(x))
    train_loss = agent.train(experience).loss

    step = agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))
        print(np.max(q_net.get_weights()[0]))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)
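
For reference, a minimal sketch (not from the original post; the two hand-built states are arbitrary) of how the symptom shows up when the trained greedy policy is queried on different observations:

import numpy as np
import tensorflow as tf
from tf_agents.trajectories import time_step as ts

def greedy_hour(raw_state):
    # Wrap a single observation into a batched TimeStep, as TFPyEnvironment would.
    obs = tf.constant(np.array([raw_state], dtype=np.float32))
    return int(agent.policy.action(ts.restart(obs, batch_size=1)).action.numpy()[0])

# Both calls print the same hour, even though the states differ.
print(greedy_hour([7, 0, 14, 1, 5, 0, 3, 0, 23, 0, 9, 1, 12, 0]))
print(greedy_hour([2, 1, 20, 0, 8, 1, 16, 0, 11, 1, 4, 0, 22, 0]))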


眼睛会笑 2025-02-03 17:12:03


I think that 1000 training steps could be one of the reasons the agent is stuck picking one action. RL agents often need hundreds of thousands of iterations, and this can be even higher depending on the task and on some neural network hyperparameters, such as the learning rate.

I suggest you increase num_iterations by a lot, and then check whether anything improves.
