PaddlePaddle · Yang-Changhui · Sep 5, 2024
diff --git a/configs/co_detr/README.md b/configs/co_detr/README.md
@@ -0,0 +1,39 @@
+# DETR
+
+## Introduction
+
+
+DETR is a Transformer-based object detection model. We reproduce the model described in the paper.
+
+
+## Model Zoo
+
+| Backbone | Model | Images/GPU  | Inf time (fps) | Box AP | Config | Download |
+|:------:|:--------:|:--------:|:--------------:|:------:|:------:|:--------:|
+| R-50 | DETR  | 4 | --- | 42.3 | [config](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/configs/detr/detr_r50_1x_coco.yml) | [model](https://paddledet.bj.bcebos.com/models/detr_r50_1x_coco.pdparams) |
+
+**Notes:**
+
+- DETR is trained on the COCO train2017 dataset and evaluated on val2017; the reported metric is `mAP(IoU=0.5:0.95)`.
+- DETR is trained for 500 epochs on 8 GPUs.
+
+Multi-GPU training:
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/detr/detr_r50_1x_coco.yml --fleet
+```
+
+## Citations
+```
+@inproceedings{detr,
+  author    = {Nicolas Carion and
+               Francisco Massa and
+               Gabriel Synnaeve and
+               Nicolas Usunier and
+               Alexander Kirillov and
+               Sergey Zagoruyko},
+  title     = {End-to-End Object Detection with Transformers},
+  booktitle = {ECCV},
+  year      = {2020}
+}
+```
diff --git a/configs/co_detr/_base_/co_detr_r50.yml b/configs/co_detr/_base_/co_detr_r50.yml
@@ -0,0 +1,187 @@
+architecture: CO_DETR
+# Use the published ResNet50 pretrained weights rather than a machine-local file, so the config works on any host.
+pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vb_normal_pretrained.pdparams
+
+# model settings
+num_dec_layer: &num_dec_layer 6  # aliased below as num_layers of both the transformer encoder and decoder
+lambda_2: &lambda_2 2.0  # NOTE(review): this anchor is not aliased anywhere in the visible part of the file — confirm it is still needed
+
+CO_DETR:  # component wiring for the architecture named by `architecture`
+  backbone: ResNet
+  neck: ChannelMapper
+  query_head: CoDeformDETRHead
+  rpn_head: RPNHead
+  roi_head: Co_RoiHead
+  bbox_head:  # configured inline here; the components above have their own top-level sections
+    name: CoATSSHead
+    num_classes: 80
+    in_channels: 256
+    stacked_convs: 1
+    feat_channels: 256
+    anchor_generator:
+      name: CoAnchorGenerator
+      octave_base_scale: 8
+      scales_per_octave: 1
+      aspect_ratios: [1.0]
+      strides: [8, 16, 32, 64, 128]
+    assigner:
+      name: ATSSAssigner
+      topk: 9
+    loss_cls:  # same focal-loss settings as CoDeformDETRHead.loss_cls, minus loss_weight
+      name: Weighted_FocalLoss
+      use_sigmoid: true
+      gamma: 2.0
+      alpha: 0.25
+    loss_bbox:
+      name: GIoULoss
+
+
+ResNet:
+  depth: 50
+  num_stages: 4
+  norm_type: bn
+  # stage indices below are zero-based: index 0 stands for res2
+  freeze_at: 0
+  return_idx: [1, 2, 3]
+  lr_mult_list: [0.0, 0.1, 0.1, 0.1]
+
+ChannelMapper:
+  in_channels: [512, 1024, 2048]
+  out_channels: 256
+  kernel_size: 1
+  num_outs: 4
+  norm_type: gn
+  norm_groups: 32
+  act: 'None'  # a YAML string, not null; quoted to make that explicit — NOTE(review): confirm the consumer expects the string
+
+
+CoDeformDETRHead:  # settings for the query_head referenced from the CO_DETR section
+  num_query: 300
+  num_classes: 80
+  in_channels: 2048
+  sync_cls_avg_factor: true
+  with_box_refine: true
+  as_two_stage: true  # repeated under `transformer` below; keep the two values in sync
+  mixed_selection: true  # repeated under `transformer` below; keep the two values in sync
+  transformer:
+    name: CoDeformableDetrTransformer
+    num_co_heads: 2
+    as_two_stage: true
+    mixed_selection: true
+    encoder:
+      name: CoTransformerEncoder
+      num_layers: *num_dec_layer
+      out_channel: 256
+      encoder_layer:
+        name: TransformerEncoderLayer
+        d_model: 256
+        dim_feedforward: 2048
+        dropout: 0.0
+        attn:
+          name: MSDeformableAttention
+          embed_dim: 256
+          num_heads: 8
+          num_levels: 4
+          num_points: 4
+    decoder:
+      name: CoDeformableDetrTransformerDecoder
+      num_layers: *num_dec_layer
+      return_intermediate: true
+      look_forward_twice: true
+      decoder_layer:
+        name: PETR_TransformerDecoderLayer
+        d_model: 256
+        dim_feedforward: 2048
+        dropout: 0.0
+        self_attn:
+          name: MultiHeadAttention
+          embed_dim: 256
+          num_heads: 8
+          dropout: 0.0
+        cross_attn:  # only embed_dim is set here, unlike the encoder's MSDeformableAttention
+          name: MSDeformableAttention
+          embed_dim: 256
+  positional_encoding:
+    name: PositionEmbedding
+    num_pos_feats: 128
+    normalize: true
+    offset: -0.5
+  loss_cls:
+    name: Weighted_FocalLoss
+    use_sigmoid: true
+    gamma: 2.0
+    alpha: 0.25
+    loss_weight: 2.0
+  loss_bbox:
+    name: L1Loss
+    loss_weight: 5.0
+  loss_iou:
+    name: GIoULoss
+    loss_weight: 2.0
+  assigner:  # cost weights below mirror the loss weights above (2.0 / 5.0 / 2.0)
+    name: HungarianAssigner
+    cls_cost:
+      name: FocalLossCost
+      weight: 2.0
+    reg_cost:
+      name: BBoxL1Cost
+      weight: 5.0
+      box_format: xywh
+    iou_cost:
+      name: IoUCost
+      iou_mode: giou
+      weight: 2.0
+  test_cfg:
+    max_per_img: 100
+    score_thr: 0.0
+    nms: false  # NOTE(review): an `nms` section is also configured below — confirm which one takes effect
+  nms:
+    name: MultiClassNMS
+    keep_top_k: 100
+    score_threshold: 0.05
+    nms_threshold: 0.6
+
+RPNHead:  # settings for the rpn_head referenced from the CO_DETR section
+  in_channel: 256
+  loss_rpn_bbox: L1Loss
+  anchor_generator:
+    name: RetinaAnchorGenerator
+    octave_base_scale: 4
+    scales_per_octave: 3
+    aspect_ratios: [0.5, 1.0, 2.0]
+    strides: [8.0, 16.0, 32.0, 64.0, 128.0]
+  rpn_target_assign:
+    batch_size_per_im: 256
+    fg_fraction: 0.5
+    positive_overlap: 0.7
+    negative_overlap: 0.3
+    use_random: true
+  train_proposal:  # differs from test_proposal in pre_nms_top_n and topk_after_collect
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 4000
+    post_nms_top_n: 1000
+    topk_after_collect: true
+  test_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 1000
+    post_nms_top_n: 1000
+
+Co_RoiHead:  # settings for the roi_head referenced from the CO_DETR section
+  num_classes: 80
+  in_channel: 256
+  head: TwoFCHead
+  roi_extractor:
+    resolution: 7
+    sampling_ratio: 0
+    aligned: true
+  bbox_assigner:
+    name: BBoxAssigner
+    batch_size_per_im: 512
+    fg_thresh: 0.5
+    bg_thresh: 0.5
+    fg_fraction: 0.25
+    use_random: true
+  bbox_loss:
+    name: GIoULoss
diff --git a/configs/co_detr/_base_/co_detr_reader.yml b/configs/co_detr/_base_/co_detr_reader.yml
@@ -0,0 +1,47 @@
+worker_num: 0  # NOTE(review): presumably the number of data-loading worker processes — confirm 0 is intended outside debugging
+TrainReader:  # training input pipeline: per-sample transforms, then per-batch transforms
+  sample_transforms:
+  - Decode: {}
+  - RandomFlip: {prob: 0.5}
+  - RandomSelect: { transforms1: [ RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ],
+                    transforms2: [
+                        RandomShortSideResize: { short_side_sizes: [ 400, 500, 600 ] },
+                        RandomSizeCrop: { min_size: 384, max_size: 600 },
+                        RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ] }
+  - NormalizeImage: {is_scale: true, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
+  - NormalizeBox: {}
+  - BboxXYXY2XYWH: {}
+  - Permute: {}
+  batch_transforms:
+  - PadMaskBatch: {pad_to_stride: -1, return_pad_mask: true}
+  batch_size: 2
+  shuffle: true  # randomize sample order for training; a fixed order is only useful when debugging
+  drop_last: true
+  collate_batch: false
+  use_shared_memory: false
+
+
+EvalReader:  # evaluation input pipeline; deterministic (no random transforms, no shuffling)
+  sample_transforms:
+  - Decode: {}
+  # - PETR_Resize: {img_scale: [[800, 1333]], keep_ratio: True}
+  - Resize: {target_size: [800, 1333], keep_ratio: true, interp: 1}
+  - NormalizeImage:
+      is_scale: true
+      mean: [0.485, 0.456, 0.406]
+      std: [0.229, 0.224, 0.225]
+  - Permute: {}
+  batch_size: 1
+  shuffle: false
+  drop_last: false
+
+
+TestReader:  # inference input pipeline; same transform names as EvalReader, but Resize sets no interp
+  sample_transforms:
+  - Decode: {}
+  - Resize: {target_size: [800, 1333], keep_ratio: true}
+  - NormalizeImage: {is_scale: true, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
+  - Permute: {}
+  batch_size: 1
+  shuffle: false
+  drop_last: false
diff --git a/configs/co_detr/_base_/optimizer_1x.yml b/configs/co_detr/_base_/optimizer_1x.yml
@@ -0,0 +1,16 @@
+epoch: 500  # NOTE(review): this file is named optimizer_1x.yml yet sets 500 epochs — confirm the intended schedule
+
+LearningRate:
+  base_lr: 0.0001
+  schedulers:
+  - !PiecewiseDecay
+    gamma: 0.1  # NOTE(review): presumably the LR multiplier applied at each milestone — confirm
+    milestones: [400]  # NOTE(review): presumably epoch indices; must stay below `epoch` to have any effect — confirm
+    use_warmup: false
+
+OptimizerBuilder:
+  clip_grad_by_norm: 0.1
+  regularizer: false
+  optimizer:
+    type: AdamW
+    weight_decay: 0.0001
diff --git a/configs/co_detr/co_detr_r50_1x_coco.yml b/configs/co_detr/co_detr_r50_1x_coco.yml
@@ -0,0 +1,9 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_1x.yml',
+  '_base_/co_detr_r50.yml',
+  '_base_/co_detr_reader.yml',
+]
+weights: output/co_detr_r50_1x_coco/model_final  # repo-relative path; a machine-local absolute path breaks the config on other hosts
+find_unused_parameters: true
diff --git a/ppdet/modeling/architectures/__init__.py b/ppdet/modeling/architectures/__init__.py
@@ -45,6 +45,7 @@
 from . import detr_ssod
 from . import multi_stream_detector
 from . import clrnet
+from . import co_detr
 
 from .meta_arch import *
 from .faster_rcnn import *
@@ -68,6 +69,7 @@
 from .gfl import *
 from .picodet import *
 from .detr import *
+from .co_detr import *
 from .sparse_rcnn import *
 from .tood import *
 from .retinanet import *