0用于联合培训的准确性

发布于 2025-02-06 04:36:12 字数 4232 浏览 3 评论 0 原文

我正在尝试为到目前

以下 this 模型（在TFF之外）实现收敛性和准确的结果
试图将这种方法与联合框架进行集成，如下所示：

函数

def make_federated_data(client_data, client_ids):
  return [
      preprocess(client_data.create_tf_dataset_for_client(x))
      for x in client_ids]

def create_keras_model():
  return tf.keras.models.Sequential([
      tf.keras.layers.InputLayer(input_shape=(41,)),
      tf.keras.layers.Dense(82, activation='relu'),
      tf.keras.layers.Dense(41, activation='relu'),
      tf.keras.layers.Dense(1, activation= 'sigmoid')])


def model_fn():
  # We _must_ create a new model here, and _not_ capture it from an external
  # scope. TFF will call this within different graph contexts.
  keras_model = create_keras_model()
  return tff.learning.from_keras_model(
      keras_model,
      input_spec=preprocessed_example_dataset.element_spec,
      loss=tf.keras.losses.BinaryCrossentropy(),
      metrics=[tf.keras.metrics.Accuracy()])

def create_tf_dataset_for_client_fn(client_id):
      dataset = tf.data.Dataset.from_tensor_slices(train_set.to_dict("list"))
      return dataset
    
    
def create_tf_dataset_for_client_fn_2(client_id):
      dataset = tf.data.Dataset.from_tensor_slices(test.to_dict("list"))
      return dataset
    def preprocess(dataset):

def preprocess(dataset):
  def batch_format_fn(element):
    """converting each sample to an `OrderedDict` and then formatting it"""
    
    return collections.OrderedDict(
          x =  tf.reshape(tf.concat([element[i] for i in element.keys() if i!='class'],0), [-1, 41]) ,
        y = tf.reshape(element['class'], [-1, 1]))

  return dataset.repeat(5).shuffle(100, seed=1).batch(  # Repeat = number of rounds
      12).map(batch_format_fn).prefetch(10)

主

    # train_Set and test_Set are dataframes(scaled) with 41 features 
#and 1 label
    #adding client number to each sample
n_clients = 10
train_set['id'] = np.random.randint(0, n_clients ,train_set.shape[0])
test_set['id'] = np.random.randint(0, n_clients , test_set.shape[0])
# DataFrame  conversion
    train_data = tff.simulation.datasets.ClientData.from_clients_and_tf_fn(
            client_ids = list(client_ids),
            serializable_dataset_fn=create_tf_dataset_for_client_fn
        )
    
    test_data = tff.simulation.datasets.ClientData.from_clients_and_tf_fn(
            client_ids = list(client_ids),
            serializable_dataset_fn=create_tf_dataset_for_client_fn_2
        )

    # Sampling for tff
    example_dataset = train_data.create_tf_dataset_for_client(
                 train_data.client_ids[0])
    preprocessed_example_dataset = preprocess(example_dataset)
    iterative_process = tff.learning.build_federated_averaging_process(
        model_fn,
        client_optimizer_fn=lambda: tf.keras.optimizers.Adam(learning_rate=0.001),
        server_optimizer_fn=lambda: tf.keras.optimizers.Adam(learning_rate=0.001))

     federated_train_data = make_federated_data(train_data, train_data.client_ids)

我们具有41个功能和标签。我还在这里检查了标签，并且正确地在0到1之间。在这一点上，联合数据签名是：

<PrefetchDataset element_spec=OrderedDict([('x', TensorSpec(shape=(None, 41), dtype=tf.float32, name=None)), ('y', TensorSpec(shape=(None, 1), dtype=tf.float32, name=None))])>,
 <PrefetchDataset element_spec=OrderedDict([('x', TensorSpec(shape=(None, 41), dtype=tf.float32, name=None)), ('y', TensorSpec(shape=(None, 1), dtype=tf.float32, name=None))])>,
         ...

最后，

state = iterative_process.initialize()
NUM_ROUNDS = 10
for round_num in range(0, NUM_ROUNDS):
  state, metrics = iterative_process.next(state, federated_train_data)
  print('round {:2d}, metrics={}'.format(round_num, metrics))

上述循环的结果是每种迭代和损失的准确性0.0，并没有真正改变。
问题可能与模型和培训过程有关，因为我认为数据集的格式正确，但我无法弄清楚它是什么。
提到的结果可能表明该模型根本没有训练，而且它输出了与（0,1）完全不同的东西。
还指出，在每次迭代后，状态（模型的权重）确实会发生变化。
有什么想法吗？

原文

I am trying to implement a simple binary classifier for the KDD dataset using Tensorflow's federated learning framework.
Following this tutorial what i have done so far is :

implemented a classical centralized model(outside of tff) achieving convergence and accurate results
Tried to integrate this approach wthin the Federated framework as seen below :

Functions

def make_federated_data(client_data, client_ids):
  return [
      preprocess(client_data.create_tf_dataset_for_client(x))
      for x in client_ids]

def create_keras_model():
  return tf.keras.models.Sequential([
      tf.keras.layers.InputLayer(input_shape=(41,)),
      tf.keras.layers.Dense(82, activation='relu'),
      tf.keras.layers.Dense(41, activation='relu'),
      tf.keras.layers.Dense(1, activation= 'sigmoid')])


def model_fn():
  # We _must_ create a new model here, and _not_ capture it from an external
  # scope. TFF will call this within different graph contexts.
  keras_model = create_keras_model()
  return tff.learning.from_keras_model(
      keras_model,
      input_spec=preprocessed_example_dataset.element_spec,
      loss=tf.keras.losses.BinaryCrossentropy(),
      metrics=[tf.keras.metrics.Accuracy()])

def create_tf_dataset_for_client_fn(client_id):
      dataset = tf.data.Dataset.from_tensor_slices(train_set.to_dict("list"))
      return dataset
    
    
def create_tf_dataset_for_client_fn_2(client_id):
      dataset = tf.data.Dataset.from_tensor_slices(test.to_dict("list"))
      return dataset
    def preprocess(dataset):

def preprocess(dataset):
  def batch_format_fn(element):
    """converting each sample to an `OrderedDict` and then formatting it"""
    
    return collections.OrderedDict(
          x =  tf.reshape(tf.concat([element[i] for i in element.keys() if i!='class'],0), [-1, 41]) ,
        y = tf.reshape(element['class'], [-1, 1]))

  return dataset.repeat(5).shuffle(100, seed=1).batch(  # Repeat = number of rounds
      12).map(batch_format_fn).prefetch(10)

Main

    # train_Set and test_Set are dataframes(scaled) with 41 features 
#and 1 label
    #adding client number to each sample
n_clients = 10
train_set['id'] = np.random.randint(0, n_clients ,train_set.shape[0])
test_set['id'] = np.random.randint(0, n_clients , test_set.shape[0])
# DataFrame  conversion
    train_data = tff.simulation.datasets.ClientData.from_clients_and_tf_fn(
            client_ids = list(client_ids),
            serializable_dataset_fn=create_tf_dataset_for_client_fn
        )
    
    test_data = tff.simulation.datasets.ClientData.from_clients_and_tf_fn(
            client_ids = list(client_ids),
            serializable_dataset_fn=create_tf_dataset_for_client_fn_2
        )

    # Sampling for tff
    example_dataset = train_data.create_tf_dataset_for_client(
                 train_data.client_ids[0])
    preprocessed_example_dataset = preprocess(example_dataset)
    iterative_process = tff.learning.build_federated_averaging_process(
        model_fn,
        client_optimizer_fn=lambda: tf.keras.optimizers.Adam(learning_rate=0.001),
        server_optimizer_fn=lambda: tf.keras.optimizers.Adam(learning_rate=0.001))

     federated_train_data = make_federated_data(train_data, train_data.client_ids)

Where we have 41 features and a label.I have also checked the labels here and they correctly are between 0 and 1.
At this point federated data signature is :

<PrefetchDataset element_spec=OrderedDict([('x', TensorSpec(shape=(None, 41), dtype=tf.float32, name=None)), ('y', TensorSpec(shape=(None, 1), dtype=tf.float32, name=None))])>,
 <PrefetchDataset element_spec=OrderedDict([('x', TensorSpec(shape=(None, 41), dtype=tf.float32, name=None)), ('y', TensorSpec(shape=(None, 1), dtype=tf.float32, name=None))])>,
         ...

Finally,

state = iterative_process.initialize()
NUM_ROUNDS = 10
for round_num in range(0, NUM_ROUNDS):
  state, metrics = iterative_process.next(state, federated_train_data)
  print('round {:2d}, metrics={}'.format(round_num, metrics))

The result of the loop above is 0.0 accuracy for each iteration and loss that is not really changing.
The problem is probably related to the model and the training process since i think the dataset is in correct format but i can't figure out what it is.
The results mentioned probably indicate that the model is not training at all and also that it outputs something completely different from (0,1).
It is also noted that the state(model's weights) do change after each iteration.
Any ideas?