NVIDIA · JanuszL · Sep 11, 2024 · Sep 9, 2024 · Sep 10, 2024 · Sep 10, 2024
diff --git a/dali/operators/reader/loader/video/video_loader_decoder_base.h b/dali/operators/reader/loader/video/video_loader_decoder_base.h
@@ -35,7 +35,8 @@ template <typename Backend>
 class VideoSample {
  public:
   Tensor<Backend> data_;
-  int label_;
+  int label_ = -1;  // stays -1 unless the loader assigns a label
+  int first_frame_ = -1;  // index of the sequence's first frame; -1 when not recorded
 };
 
 class VideoLoaderDecoderBase {
@@ -46,6 +47,7 @@ class VideoLoaderDecoderBase {
     stride_(spec.GetArgument<int>("stride")),
     step_(spec.GetArgument<int>("step")) {
     has_labels_ = spec.TryGetRepeatedArgument(labels_, "labels");
+    has_frame_idx_ = spec.GetArgument<bool>("enable_frame_num");
     DALI_ENFORCE(
         !has_labels_ || labels_.size() == filenames_.size(),
         make_string(
@@ -61,6 +63,7 @@ class VideoLoaderDecoderBase {
   std::vector<std::string> filenames_;
   std::vector<int> labels_;
   bool has_labels_ = false;
+  bool has_frame_idx_ = false;
 
   Index current_index_ = 0;
 

diff --git a/dali/operators/reader/loader/video/video_loader_decoder_cpu.cc b/dali/operators/reader/loader/video/video_loader_decoder_cpu.cc
@@ -45,6 +45,9 @@ void VideoLoaderDecoderCpu::ReadSample(VideoSample<CPUBackend> &sample) {
   if (has_labels_) {
     sample.label_ = labels_[sample_span.video_idx_];
   }
+  if (has_frame_idx_) {
+    sample.first_frame_ = sample_span.start_;
+  }
 }
 
 Index VideoLoaderDecoderCpu::SizeImpl() {

diff --git a/dali/operators/reader/video_reader_decoder_cpu_op.cc b/dali/operators/reader/video_reader_decoder_cpu_op.cc
@@ -20,7 +20,8 @@ namespace dali {
 
 VideoReaderDecoderCpu::VideoReaderDecoderCpu(const OpSpec &spec)
     : DataReader<CPUBackend, VideoSampleCpu, VideoSampleCpu, true>(spec),
-      has_labels_(spec.HasArgument("labels")) {
+      has_labels_(spec.HasArgument("labels")),  // gates the label output in RunImpl
+      has_frame_idx_(spec.GetArgument<bool>("enable_frame_num")) {  // gates frame index output
       loader_ = InitLoader<VideoLoaderDecoderCpu>(spec);
       this->SetInitialSnapshot();
 }
@@ -32,16 +33,26 @@ void VideoReaderDecoderCpu::RunImpl(SampleWorkspace &ws) {
   video_output.Copy(sample.data_);
   video_output.SetSourceInfo(sample.data_.GetSourceInfo());
 
+  int out_index = 1;
   if (has_labels_) {
-    auto &label_output = ws.Output<CPUBackend>(1);
+    auto &label_output = ws.Output<CPUBackend>(out_index);
     label_output.Resize({}, DALIDataType::DALI_INT32);
     label_output.mutable_data<int>()[0] = sample.label_;
+    out_index++;
+  }
+  if (has_frame_idx_) {
+    auto &frame_idx_output = ws.Output<CPUBackend>(out_index);
+    frame_idx_output.Resize({}, DALIDataType::DALI_INT32);
+    frame_idx_output.mutable_data<int>()[0] = sample.first_frame_;
+    out_index++;
   }
 }
 
 namespace detail {
 inline int VideoReaderDecoderOutputFn(const OpSpec &spec) {
-  return spec.HasArgument("labels") ? 2 : 1;
+  bool has_labels = spec.HasArgument("labels");  // adds the label output
+  bool has_frame_num_output = spec.GetArgument<bool>("enable_frame_num");  // adds frame index
+  return 1 + has_labels + has_frame_num_output;  // video output + each enabled optional one
 }
 }  // namespace detail
 
@@ -68,6 +79,10 @@ even in the variable frame rate scenario.)code")
   .AddArg("sequence_length",
       R"code(Frames to load per sequence.)code",
       DALI_INT32)
+  .AddOptionalArg("enable_frame_num",
+      R"code(If set, returns the index of the first frame in the decoded sequence
+as an additional output.)code",
+      false)
   .AddOptionalArg("step",
       R"code(Frame interval between each sequence.
 

diff --git a/dali/operators/reader/video_reader_decoder_cpu_op.h b/dali/operators/reader/video_reader_decoder_cpu_op.h
@@ -29,6 +29,7 @@ class VideoReaderDecoderCpu
 
  private:
   bool has_labels_ = false;
+  bool has_frame_idx_ = false;
 };
 
 }  // namespace dali

diff --git a/dali/operators/reader/video_reader_decoder_gpu_op.cc b/dali/operators/reader/video_reader_decoder_gpu_op.cc
@@ -20,7 +20,8 @@ namespace dali {
 
 VideoReaderDecoderGpu::VideoReaderDecoderGpu(const OpSpec &spec)
     : DataReader<GPUBackend, VideoSampleGpu, VideoSampleGpu, true>(spec),
-      has_labels_(spec.HasArgument("labels")) {
+      has_labels_(spec.HasArgument("labels")),  // gates the label output
+      has_frame_idx_(spec.GetArgument<bool>("enable_frame_num")) {  // gates frame index output
       loader_ = InitLoader<VideoLoaderDecoderGpu>(spec);
       this->SetInitialSnapshot();
 }
@@ -50,14 +51,21 @@ bool VideoReaderDecoderGpu::SetupImpl(
 
   output_desc[0] = { video_shape, DALI_UINT8 };
 
-  if (!has_labels_) {
-    return true;
+  int out_index = 1;
+  if (has_labels_) {
+    output_desc[out_index] = {
+      uniform_list_shape<1>(batch_size, {1}),
+      DALI_INT32
+    };
+    out_index++;
+  }
+  if (has_frame_idx_) {
+    output_desc[out_index] = {
+      uniform_list_shape<1>(batch_size, {1}),
+      DALI_INT32
+    };
+    out_index++;
   }
-
-  output_desc[1] = {
-    uniform_list_shape<1>(batch_size, {1}),
-    DALI_INT32
-  };
 
   return true;
 }
@@ -80,23 +88,41 @@ void VideoReaderDecoderGpu::RunImpl(Workspace &ws) {
     video_output.SetSourceInfo(sample_id, sample.data_.GetSourceInfo());
   }
 
-  if (!has_labels_) {
-    return;
-  }
+  int out_index = 1;  // output 0 is the video; optional outputs follow in order
+  if (has_labels_) {
+    auto &labels_output = ws.Output<GPUBackend>(out_index);
+    SmallVector<int, 32> labels_cpu;
+    labels_cpu.resize(batch_size);  // size the host staging buffer before indexed writes
 
-  auto &labels_output = ws.Output<GPUBackend>(1);
-  SmallVector<int, 32> labels_cpu;
+    for (int sample_id = 0; sample_id < batch_size; ++sample_id) {
+      auto &sample = GetSample(sample_id);
+      labels_cpu[sample_id] = sample.label_;
+    }
 
-  for (int sample_id = 0; sample_id < batch_size; ++sample_id) {
-    auto &sample = GetSample(sample_id);
-    labels_cpu[sample_id] = sample.label_;
+    MemCopy(
+      labels_output.AsTensor().raw_mutable_data(),
+      labels_cpu.data(),
+      batch_size * sizeof(int),  // element type of the staging buffer
+      ws.stream());
+    out_index++;
   }
+  if (has_frame_idx_) {
+    auto &frame_idx_output = ws.Output<GPUBackend>(out_index);
+    SmallVector<int, 32> frame_idx_output_cpu;
+    frame_idx_output_cpu.resize(batch_size);  // size the host staging buffer before indexed writes
+
+    for (int sample_id = 0; sample_id < batch_size; ++sample_id) {
+      auto &sample = GetSample(sample_id);
+      frame_idx_output_cpu[sample_id] = sample.span_ ? sample.span_->start_ : -1;
+    }
 
-  MemCopy(
-    labels_output.AsTensor().raw_mutable_data(),
-    labels_cpu.data(),
-    batch_size * sizeof(DALI_INT32),
-    ws.stream());
+    MemCopy(
+      frame_idx_output.AsTensor().raw_mutable_data(),
+      frame_idx_output_cpu.data(),
+      batch_size * sizeof(int),  // element type of the staging buffer
+      ws.stream());
+    out_index++;
+  }
 }
 
 DALI_REGISTER_OPERATOR(experimental__readers__Video, VideoReaderDecoderGpu, GPU);

diff --git a/dali/operators/reader/video_reader_decoder_gpu_op.h b/dali/operators/reader/video_reader_decoder_gpu_op.h
@@ -35,6 +35,7 @@ class VideoReaderDecoderGpu : public DataReader<GPUBackend, VideoSampleGpu, Vide
 
  private:
   bool has_labels_ = false;
+  bool has_frame_idx_  = false;
 };
 
 }  // namespace dali

diff --git a/dali/operators/reader/video_reader_decoder_op_test.cc b/dali/operators/reader/video_reader_decoder_op_test.cc
@@ -40,6 +40,9 @@ class VideoReaderDecoderBaseTest : public VideoTestBase {
   virtual void AssertFrame(
     int frame_id, const uint8_t *frame, TestVideo &ground_truth) = 0;
 
+  template<typename Backend>
+  int GetFrameIdx(dali::TensorList<Backend> &device_frame_idx);
+
  private:
   template<typename Backend>
   void RunTestImpl(
@@ -129,15 +132,15 @@ class VideoReaderDecoderBaseTest : public VideoTestBase {
       .AddArg("device", backend)
       .AddArg("sequence_length", sequence_length)
       .AddArg("random_shuffle", true)
+      .AddArg("enable_frame_num", true)
       .AddArg("initial_fill", cfr_videos_[0].NumFrames())
       .AddArg(
         "filenames",
         std::vector<std::string>{cfr_videos_paths_[0]})
-      .AddOutput("frames", backend));
-
-    pipe.Build({{"frames", backend}});
+      .AddOutput("frames", backend)
+      .AddOutput("frame_idx", backend));
 
-    std::vector<int> expected_order = {29, 46, 33, 6, 37};
+    pipe.Build({{"frames", backend}, {"frame_idx", backend}});
 
     int num_sequences = 5;
 
@@ -148,9 +151,10 @@ class VideoReaderDecoderBaseTest : public VideoTestBase {
 
       auto &frame_video_output = ws.Output<Backend>(0);
       const auto sample = frame_video_output.template tensor<uint8_t>(0);
+      int frame_idx = GetFrameIdx(ws.Output<Backend>(1));
 
-      // We want to access correct order, so we comapre only the first frame of the sequence
-      AssertFrame(expected_order[sequence_id], sample, ground_truth_video);
+      // We want to access correct order, so we compare only the first frame of the sequence
+      AssertFrame(frame_idx, sample, ground_truth_video);
     }
   }
 };
@@ -168,6 +172,15 @@ void VideoReaderDecoderBaseTest::RunShuffleTest<dali::CPUBackend>() {
     RunShuffleTestImpl<dali::CPUBackend>("cpu", dali::CPU_ONLY_DEVICE_ID);
 }
 
+template<>
+int VideoReaderDecoderBaseTest::GetFrameIdx(
+  dali::TensorList<dali::CPUBackend> &device_frame_idx) {
+    const auto first_sample = device_frame_idx.template tensor<int>(0);
+    // CPU backend: read the first element of sample 0 directly on the host.
+    const int host_frame_idx = *first_sample;
+    return host_frame_idx;
+}
+
 template<>
 void VideoReaderDecoderBaseTest::RunTest<dali::GPUBackend>(
   std::vector<std::string> &videos_paths,
@@ -181,6 +194,15 @@ void VideoReaderDecoderBaseTest::RunShuffleTest<dali::GPUBackend>() {
     RunShuffleTestImpl<dali::GPUBackend>("gpu", 0);
 }
 
+template<>
+int VideoReaderDecoderBaseTest::GetFrameIdx(
+  dali::TensorList<dali::GPUBackend> &device_frame_idx) {
+    int host_frame_idx = -1;
+    const auto sample_ptr = device_frame_idx.template tensor<int>(0);
+    MemCopy(&host_frame_idx, sample_ptr, sizeof(int));  // one int: GPU output -> host variable
+    return host_frame_idx;
+}
+
 class VideoReaderDecoderCpuTest : public VideoReaderDecoderBaseTest {
  public:
   void AssertLabel(const int *label, int ground_truth_label) override {