TensorFlow 对象检测 API - 如何在 COCO 数据集上进行训练并实现与报告相同的 mAP？

发布于 2025-01-12 04:57:13 字数 5349 浏览 0 评论 0原文

我正在尝试通过使用预训练的 EfficientNet 主干网在 COCO 上进行训练来重现对象检测 API 中官方报告的 EfficientDet D3 的 mAP。官方 COCO mAP 为 45.4%我能做到的就是 14% 左右。我不需要达到相同的值，但我希望至少接近它。

我正在加载在 ImageNet 上预训练的 EfficientNet B3 检查点，此处< /a>，并使用发现的配置文件此处。我更改的唯一参数是批量大小（以适应 RTX 3090）、学习率（0.08 产生损失 = NaN，因此我将其减少到 0.01）和步数（我将其增加到 600k）。这是我的 pipeline.config 文件：

  model {
  ssd {
    inplace_batchnorm_update: true
    freeze_batchnorm: false
    num_classes: 90
    add_background_class: false
    box_coder {
      faster_rcnn_box_coder {
        y_scale: 10.0
        x_scale: 10.0
        height_scale: 5.0
        width_scale: 5.0
      }
    }
    matcher {
      argmax_matcher {
        matched_threshold: 0.5
        unmatched_threshold: 0.5
        ignore_thresholds: false
        negatives_lower_than_unmatched: true
        force_match_for_each_row: true
        use_matmul_gather: true
      }
    }
    similarity_calculator {
      iou_similarity {
      }
    }
    encode_background_as_zeros: true
    anchor_generator {
      multiscale_anchor_generator {
        min_level: 3
        max_level: 7
        anchor_scale: 4.0
        aspect_ratios: [1.0, 2.0, 0.5]
        scales_per_octave: 3
      }
    }
    image_resizer {
      keep_aspect_ratio_resizer {
        min_dimension: 896
        max_dimension: 896
        pad_to_max_dimension: true
        }
    }
    box_predictor {
      weight_shared_convolutional_box_predictor {
        depth: 160
        class_prediction_bias_init: -4.6
        conv_hyperparams {
          force_use_bias: true
          activation: SWISH
          regularizer {
            l2_regularizer {
              weight: 0.00004
            }
          }
          initializer {
            random_normal_initializer {
              stddev: 0.01
              mean: 0.0
            }
          }
          batch_norm {
            scale: true
            decay: 0.99
            epsilon: 0.001
          }
        }
        num_layers_before_predictor: 4
        kernel_size: 3
        use_depthwise: true
      }
    }
    feature_extractor {
      type: 'ssd_efficientnet-b3_bifpn_keras'
      bifpn {
        min_level: 3
        max_level: 7
        num_iterations: 6
        num_filters: 160
      }
      conv_hyperparams {
        force_use_bias: true
        activation: SWISH
        regularizer {
          l2_regularizer {
            weight: 0.00004
          }
        }
        initializer {
          truncated_normal_initializer {
            stddev: 0.03
            mean: 0.0
          }
        }
        batch_norm {
          scale: true,
          decay: 0.99,
          epsilon: 0.001,
        }
      }
    }
    loss {
      classification_loss {
        weighted_sigmoid_focal {
          alpha: 0.25
          gamma: 1.5
        }
      }
      localization_loss {
        weighted_smooth_l1 {
        }
      }
      classification_weight: 1.0
      localization_weight: 1.0
    }
    normalize_loss_by_num_matches: true
    normalize_loc_loss_by_codesize: true
    post_processing {
      batch_non_max_suppression {
        score_threshold: 1e-8
        iou_threshold: 0.5
        max_detections_per_class: 100
        max_total_detections: 100
      }
      score_converter: SIGMOID
    }
  }
}

train_config: {
  fine_tune_checkpoint: "/API/Tensorflow/models/research/object_detection/test_data/efficientnet_b3/efficientnet_b3/ckpt-0"
  fine_tune_checkpoint_version: V2
  fine_tune_checkpoint_type: "classification"
  batch_size: 2
  sync_replicas: true
  startup_delay_steps: 0
  replicas_to_aggregate: 8
  use_bfloat16: false
  num_steps: 600000
  data_augmentation_options {
    random_horizontal_flip {
    }
  }
  data_augmentation_options {
    random_scale_crop_and_pad_to_square {
      output_size: 896
      scale_min: 0.1
      scale_max: 2.0
    }
  }
  optimizer {
    momentum_optimizer: {
      learning_rate: {
        cosine_decay_learning_rate {
          learning_rate_base: 1e-2
          total_steps: 600000
          warmup_learning_rate: .001
          warmup_steps: 2500
        }
      }
      momentum_optimizer_value: 0.9
    }
    use_moving_average: false
  }
  max_number_of_boxes: 100
  unpad_groundtruth_tensors: false
}

train_input_reader: {
  label_map_path: "/DATASETS/COCO/classes.pbtxt"
  tf_record_input_reader {
    input_path: "/DATASETS/COCO/coco_train.record-00000-of-00100"
  }
}

eval_config: {
  metrics_set: "coco_detection_metrics"
  use_moving_averages: false
  batch_size: 1;
}

eval_input_reader: {
  label_map_path: "/DATASETS/COCO/classes.pbtxt"
  shuffle: false
  num_epochs: 1
  tf_record_input_reader {
    input_path: "/DATASETS/COCO/coco_val.record-00000-of-00050"
  }
}

这些是结果：

原文

I'm trying to reproduce the officially reported mAP of EfficientDet D3 in the Object Detection API by training on COCO using a pretrained EfficientNet backbone. The official COCO mAP is 45.4% and yet all I can manage to achieve is around 14%. I don't need to reach the same value, but I wish to at least come close to it.

I am loading the EfficientNet B3 checkpoint pretrained on ImageNet found here, and using the config file found here. The only parameters I changed are batch size (to fit into an RTX 3090), learning rate (0.08 was yielding loss=NaN so I reduced it to 0.01), and steps, which I increased to 600k. This is my pipeline.config file:

  model {
  ssd {
    inplace_batchnorm_update: true
    freeze_batchnorm: false
    num_classes: 90
    add_background_class: false
    box_coder {
      faster_rcnn_box_coder {
        y_scale: 10.0
        x_scale: 10.0
        height_scale: 5.0
        width_scale: 5.0
      }
    }
    matcher {
      argmax_matcher {
        matched_threshold: 0.5
        unmatched_threshold: 0.5
        ignore_thresholds: false
        negatives_lower_than_unmatched: true
        force_match_for_each_row: true
        use_matmul_gather: true
      }
    }
    similarity_calculator {
      iou_similarity {
      }
    }
    encode_background_as_zeros: true
    anchor_generator {
      multiscale_anchor_generator {
        min_level: 3
        max_level: 7
        anchor_scale: 4.0
        aspect_ratios: [1.0, 2.0, 0.5]
        scales_per_octave: 3
      }
    }
    image_resizer {
      keep_aspect_ratio_resizer {
        min_dimension: 896
        max_dimension: 896
        pad_to_max_dimension: true
        }
    }
    box_predictor {
      weight_shared_convolutional_box_predictor {
        depth: 160
        class_prediction_bias_init: -4.6
        conv_hyperparams {
          force_use_bias: true
          activation: SWISH
          regularizer {
            l2_regularizer {
              weight: 0.00004
            }
          }
          initializer {
            random_normal_initializer {
              stddev: 0.01
              mean: 0.0
            }
          }
          batch_norm {
            scale: true
            decay: 0.99
            epsilon: 0.001
          }
        }
        num_layers_before_predictor: 4
        kernel_size: 3
        use_depthwise: true
      }
    }
    feature_extractor {
      type: 'ssd_efficientnet-b3_bifpn_keras'
      bifpn {
        min_level: 3
        max_level: 7
        num_iterations: 6
        num_filters: 160
      }
      conv_hyperparams {
        force_use_bias: true
        activation: SWISH
        regularizer {
          l2_regularizer {
            weight: 0.00004
          }
        }
        initializer {
          truncated_normal_initializer {
            stddev: 0.03
            mean: 0.0
          }
        }
        batch_norm {
          scale: true,
          decay: 0.99,
          epsilon: 0.001,
        }
      }
    }
    loss {
      classification_loss {
        weighted_sigmoid_focal {
          alpha: 0.25
          gamma: 1.5
        }
      }
      localization_loss {
        weighted_smooth_l1 {
        }
      }
      classification_weight: 1.0
      localization_weight: 1.0
    }
    normalize_loss_by_num_matches: true
    normalize_loc_loss_by_codesize: true
    post_processing {
      batch_non_max_suppression {
        score_threshold: 1e-8
        iou_threshold: 0.5
        max_detections_per_class: 100
        max_total_detections: 100
      }
      score_converter: SIGMOID
    }
  }
}

train_config: {
  fine_tune_checkpoint: "/API/Tensorflow/models/research/object_detection/test_data/efficientnet_b3/efficientnet_b3/ckpt-0"
  fine_tune_checkpoint_version: V2
  fine_tune_checkpoint_type: "classification"
  batch_size: 2
  sync_replicas: true
  startup_delay_steps: 0
  replicas_to_aggregate: 8
  use_bfloat16: false
  num_steps: 600000
  data_augmentation_options {
    random_horizontal_flip {
    }
  }
  data_augmentation_options {
    random_scale_crop_and_pad_to_square {
      output_size: 896
      scale_min: 0.1
      scale_max: 2.0
    }
  }
  optimizer {
    momentum_optimizer: {
      learning_rate: {
        cosine_decay_learning_rate {
          learning_rate_base: 1e-2
          total_steps: 600000
          warmup_learning_rate: .001
          warmup_steps: 2500
        }
      }
      momentum_optimizer_value: 0.9
    }
    use_moving_average: false
  }
  max_number_of_boxes: 100
  unpad_groundtruth_tensors: false
}

train_input_reader: {
  label_map_path: "/DATASETS/COCO/classes.pbtxt"
  tf_record_input_reader {
    input_path: "/DATASETS/COCO/coco_train.record-00000-of-00100"
  }
}

eval_config: {
  metrics_set: "coco_detection_metrics"
  use_moving_averages: false
  batch_size: 1;
}

eval_input_reader: {
  label_map_path: "/DATASETS/COCO/classes.pbtxt"
  shuffle: false
  num_epochs: 1
  tf_record_input_reader {
    input_path: "/DATASETS/COCO/coco_val.record-00000-of-00050"
  }
}

These are the results:

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

待＂谢繁草 2025-01-19 04:57:13

你的损失太大了。损失约为 1 表示您的模型尚未经过训练。它不学习权重。您可以检查以下几项内容：

数据集。训练期间是否使用了所有图像？还要看看注释。类和边界框是否正确？或者说有什么奇怪的吗？例如，COCO 的边界框应以绝对值给出。如果给出相对值，这可能表明您需要重新调整它们。
图像大小是否已调整？如果是这样，它的边界框也需要调整大小。
检查边界框。也许可以绘制一些带有边界框的图像。如果边界框的格式不正确或其值缩放不正确，您就会看到它。
为了缩小这个错误的来源，尝试加载在 COCO 上训练过的 EfficientNet 的权重，看看如果你尝试用非常低的 lr 进一步微调它们会发生什么。如果这不起作用，则强烈表明注释存在问题。

回复收藏 0 原文