ppshituV2训练时，出现classes(0)的相关报错 #3264

ca1wenha0 · 2024-09-30T10:20:26Z

我在使用 PP-ShiTuV1 时可以正常完成特征提取模型的训练，但在使用 PP-ShiTuV2 训练时出现了报错。对于配置文件 "PaddleClas/ppcls/configs/GeneralRecognitionV2/GeneralRecognitionV2_PPLCNetV2_base.yaml"，我只修改了数据集路径和类别数量，修改后的完整配置如下：

#########################配置文件#############################
# global configs
# Run-wide settings: starting weights, output location, device, save/eval
# cadence, epoch count, and how evaluation is carried out.
Global:
  checkpoints: null
  # pretrained_model: null
  # Start from the PPShiTuV2 general_PPLCNetV2_base pretrained weights at this URL.
  pretrained_model: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/PPShiTuV2/general_PPLCNetV2_base_pretrained_v1.0.pdparams
  output_dir: ./AMCS3_ppshituV2_output/
  device: gpu
  # NOTE(review): the two *_interval values are presumably counted in epochs — confirm.
  save_interval: 1
  eval_during_train: True
  eval_interval: 1
  epochs: 100
  print_batch_step: 20
  use_visualdl: False
  # Evaluation runs in retrieval mode, using the 'features' output.
  eval_mode: retrieval
  retrieval_feature_from: features # 'backbone' or 'features'
  re_ranking: False
  use_dali: False
  # used for static mode and model export
  image_shape: [3, 224, 224]
  save_inference_dir: ./inference


# Mixed-precision settings: initial loss scale of 65536 with dynamic loss
# scaling enabled, at level O1.
AMP:
  scale_loss: 65536
  use_dynamic_loss_scaling: True
  # O1: mixed fp16
  level: O1

# model architecture
# RecModel built from a PPLCNetV2_base_ShiTu backbone, a BNNeck and an FC head.
# The feature width is defined once via the `feat_dim` anchor (512) and reused
# by the neck and the head.
Arch:
  name: RecModel
  infer_output_key: features
  infer_add_softmax: False

  Backbone:
    name: PPLCNetV2_base_ShiTu
    pretrained: True
    use_ssld: True
    # Defines the `feat_dim` anchor referenced below.
    class_expand: &feat_dim 512
  BackboneStopLayer:
    name: flatten
  Neck:
    name: BNNeck
    num_features: *feat_dim
    # Weight starts at constant 1.0, bias at constant 0.0.
    weight_attr:
      initializer:
        name: Constant
        value: 1.0
    bias_attr:
      initializer:
        name: Constant
        value: 0.0
      learning_rate: 1.0e-20 # NOTE: Temporarily set lr small enough to freeze the bias to zero
  Head:
    name: FC
    embedding_size: *feat_dim
    # class_num: 192612
    # Changed from 192612 (kept above, commented out) to 5 for the custom dataset.
    # NOTE(review): the sample label lines quoted in this report show label `5`;
    # if valid labels are expected to be 0..class_num-1, that label would be out
    # of range — confirm the label numbering in train_label.txt.
    class_num: 5
    weight_attr:
      initializer:
        name: Normal
        std: 0.001
    bias_attr: False

# loss function config for training/eval process
# Training combines two losses, each with weight 1.0: CELoss (epsilon 0.1) and
# TripletAngularMarginLoss computed on the 'features' output. Evaluation uses
# CELoss only.
Loss:
  Train:
    - CELoss:
        weight: 1.0
        epsilon: 0.1
    - TripletAngularMarginLoss:
        weight: 1.0
        feature_from: features
        margin: 0.5
        reduction: mean
        add_absolute: True
        absolute_loss_weight: 0.1
        normalize_feature: True
        ap_value: 0.8
        an_value: 0.4
  Eval:
    - CELoss:
        weight: 1.0

# Momentum optimizer (momentum 0.9) with a Cosine learning-rate schedule,
# 5 warmup epochs, and L2 regularization (coeff 0.00001).
Optimizer:
  name: Momentum
  momentum: 0.9
  lr:
    name: Cosine
    # NOTE(review): this rate is annotated for 8gpu x 256bs, while the train
    # sampler in this file uses batch_size 4 — it may need scaling down; confirm.
    learning_rate: 0.06 # for 8gpu x 256bs
    warmup_epoch: 5
  regularizer:
    name: L2
    coeff: 0.00001

# data loader for train and eval
# Train reads an ImageNetDataset-style label file through a PKSampler; Eval
# defines Query and Gallery sets, both VeriWild-style datasets. All three
# share the same image_root (deploy/datasets/AMCS3).
DataLoader:
  Train:
    dataset:
      name: ImageNetDataset
      image_root: deploy/datasets/AMCS3
      cls_label_path: ./deploy/datasets/AMCS3/train_label.txt
      relabel: False
      # Augmentation order: decode -> resize to 224x224 -> random flip ->
      # pad by 10 -> random crop to 224x224 -> random rotation (prob 0.5,
      # degrees 90) -> resize to 224x224 again -> normalize.
      transform_ops:
        - DecodeImage:
            to_rgb: True
            channel_first: False
        - ResizeImage:
            size: [224, 224]
            return_numpy: False
            interpolation: bilinear
            backend: cv2
        - RandFlipImage:
            flip_code: 1
        - Pad:
            padding: 10
            backend: cv2
        - RandCropImageV2:
            size: [224, 224]
        - RandomRotation:
            prob: 0.5
            degrees: 90
            interpolation: bilinear
        - ResizeImage:
            size: [224, 224]
            return_numpy: False
            interpolation: bilinear
            backend: cv2
        # NOTE(review): `scale` is a plain YAML string ("1.0/255.0"), not a
        # number — presumably evaluated by the consumer; confirm.
        - NormalizeImage:
            scale: 1.0/255.0
            mean: [0.485, 0.456, 0.406]
            std: [0.229, 0.224, 0.225]
            order: hwc
    sampler:
      name: PKSampler
      # NOTE(review): batch_size equals sample_per_id, so batch_size /
      # sample_per_id = 1. If the sampler draws that many identities per batch,
      # every batch holds a single identity and the triplet loss would have no
      # negatives — confirm and consider a larger batch_size.
      batch_size: 4
      sample_per_id: 4
      drop_last: False
      shuffle: True
      sample_method: "id_avg_prob"
      # NOTE(review): these id ranges are far larger than class_num (5) in Arch
      # and look carried over from the original 192612-class setup — confirm
      # whether id_list/ratio should still be set for this dataset.
      id_list: [50030, 80700, 92019, 96015] # be careful when set relabel=True
      ratio: [4, 4]
    loader:
      num_workers: 4
      use_shared_memory: True

  Eval:
    # Query and Gallery are configured identically and read the same
    # val_label.txt file.
    Query:
      dataset:
        name: VeriWild
        image_root: deploy/datasets/AMCS3
        cls_label_path: ./deploy/datasets/AMCS3/val_label.txt
        # Deterministic preprocessing: decode -> resize to 224x224 -> normalize.
        transform_ops:
          - DecodeImage:
              to_rgb: True
              channel_first: False
          - ResizeImage:
              size: [224, 224]
              return_numpy: False
              interpolation: bilinear
              backend: cv2
          - NormalizeImage:
              scale: 1.0/255.0
              mean: [0.485, 0.456, 0.406]
              std: [0.229, 0.224, 0.225]
              order: hwc
      sampler:
        name: DistributedBatchSampler
        batch_size: 64
        drop_last: False
        shuffle: False
      loader:
        num_workers: 4
        use_shared_memory: True

    Gallery:
      dataset:
        name: VeriWild
        image_root: deploy/datasets/AMCS3
        cls_label_path: ./deploy/datasets/AMCS3/val_label.txt
        # Same preprocessing as Query.
        transform_ops:
          - DecodeImage:
              to_rgb: True
              channel_first: False
          - ResizeImage:
              size: [224, 224]
              return_numpy: False
              interpolation: bilinear
              backend: cv2
          - NormalizeImage:
              scale: 1.0/255.0
              mean: [0.485, 0.456, 0.406]
              std: [0.229, 0.224, 0.225]
              order: hwc
      sampler:
        name: DistributedBatchSampler
        batch_size: 64
        drop_last: False
        shuffle: False
      loader:
        num_workers: 4
        use_shared_memory: True

# Inference settings: input images, batch size, class-id mapping file,
# preprocessing pipeline and Topk post-processing.
Infer:
  infer_imgs: ./deploy/datasets/AMCS3/test
  batch_size: 32
  class_id_map_file: ./deploy/datasets/AMCS3/class_mapping2.txt
  # Preprocessing order: decode -> resize short side to 256 -> crop to 224 ->
  # normalize -> ToCHWImage.
  transforms:
    - DecodeImage:
        to_rgb: True
        channel_first: False
    - ResizeImage:
        resize_short: 256
    - CropImage:
        size: 224
    # NOTE(review): `order` is an empty string here (train/eval use 'hwc') and an
    # explicit ToCHWImage step follows — presumably normalization runs before
    # the layout change; confirm.
    - NormalizeImage:
        scale: 1.0/255.0
        mean: [0.485, 0.456, 0.406]
        std: [0.229, 0.224, 0.225]
        order: ''
    - ToCHWImage:
  PostProcess:
    name: Topk
    topk: 5



# Evaluation metrics: Recallk at top-1 and top-5, plus mAP with no extra options.
Metric:
  Eval:
    - Recallk:
        topk: [1, 5]
    - mAP: {}

    ###################################分割线#######################
虽然能够在终端打印出所有的train、val的图片信息：
train/5/1YH_20240101172813_1781.jpg 5 7307
train/5/2YH_20230501002030_1782.jpg 5 7308
train/5/1YH_20230201075635_1783.jpg 5 7309
...
但是在最后一行报错:"AssertionError: batch size(4) should not be bigger than than #classes(0)*sample_per_id(4)";
看起来是没有找到类别映射的路径，我的类别映射存放在"PaddleClas/deploy/datasets/AMCS3/label_list.txt"，
我参考了网上的其他配置文件，也是这样的存放方式，也没有在GeneralRecognitionV2_PPLCNetV2_base.yaml中
找到写入类别映射文件的参数，一直找不到解决办法，可以解答一下吗~
txt文件内容如下，共5类：
0 JueYuan
1 DiYa
2 GaoYa
3 DianLiu
4 DianYa

The text was updated successfully, but these errors were encountered:

liuhongen1234567 · 2024-10-12T12:27:35Z

您好，可以在 PaddleClas-develop/ppcls/data/dataloader/pk_sampler.py 文件里打印一下label看看吗？这个地方

这个类别数的计算应该和类别映射文件没有关系，它统计的是标签文件中实际出现的类别数（比如：0、1、2……）。

TingquanGao assigned liuhongen1234567 Sep 30, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

ppshituV2训练时，出现classes(0)的相关报错 #3264

ppshituV2训练时，出现classes(0)的相关报错 #3264

ca1wenha0 commented Sep 30, 2024 •

edited

Loading

liuhongen1234567 commented Oct 12, 2024

ppshituV2训练时，出现classes(0)的相关报错 #3264

ppshituV2训练时，出现classes(0)的相关报错 #3264

Comments

ca1wenha0 commented Sep 30, 2024 • edited Loading

liuhongen1234567 commented Oct 12, 2024

ca1wenha0 commented Sep 30, 2024 •

edited

Loading