in RtpSenderVideo add support for writing DependencyDescriptor header extension

Bug: webrtc:10342
Change-Id: I12cca9c5e1606338bb914e58e13d268bbc6961f9
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/166532
Commit-Queue: Danil Chapovalov <danilchap@webrtc.org>
Reviewed-by: Philip Eliasson <philipel@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#30427}
diff --git a/modules/rtp_rtcp/BUILD.gn b/modules/rtp_rtcp/BUILD.gn
index daaac94..099c066 100644
--- a/modules/rtp_rtcp/BUILD.gn
+++ b/modules/rtp_rtcp/BUILD.gn
@@ -251,6 +251,7 @@
     "../../api/rtc_event_log",
     "../../api/transport:field_trial_based_config",
     "../../api/transport:webrtc_key_value_config",
+    "../../api/transport/rtp:dependency_descriptor",
     "../../api/transport/rtp:rtp_source",
     "../../api/units:data_rate",
     "../../api/units:time_delta",
@@ -332,6 +333,7 @@
   ]
   deps = [
     "../../:webrtc_common",
+    "../../api/transport/rtp:dependency_descriptor",
     "../../api/video:video_frame",
     "../../api/video:video_frame_type",
     "../../api/video:video_rtp_headers",
@@ -508,6 +510,7 @@
       "../../api:transport_api",
       "../../api/rtc_event_log",
       "../../api/transport:field_trial_based_config",
+      "../../api/transport/rtp:dependency_descriptor",
       "../../api/units:timestamp",
       "../../api/video:encoded_image",
       "../../api/video:video_bitrate_allocation",
@@ -518,6 +521,7 @@
       "../../api/video_codecs:video_codecs_api",
       "../../call:rtp_receiver",
       "../../common_video",
+      "../../common_video/generic_frame_descriptor",
       "../../common_video/test:utilities",
       "../../logging:mocks",
       "../../rtc_base:checks",
diff --git a/modules/rtp_rtcp/source/rtp_sender_video.cc b/modules/rtp_rtcp/source/rtp_sender_video.cc
index 9779df1..fc176c9 100644
--- a/modules/rtp_rtcp/source/rtp_sender_video.cc
+++ b/modules/rtp_rtcp/source/rtp_sender_video.cc
@@ -18,12 +18,15 @@
 #include <string>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "absl/strings/match.h"
 #include "api/crypto/frame_encryptor_interface.h"
+#include "api/transport/rtp/dependency_descriptor.h"
 #include "modules/remote_bitrate_estimator/test/bwe_test_logging.h"
 #include "modules/rtp_rtcp/include/rtp_rtcp_defines.h"
 #include "modules/rtp_rtcp/source/absolute_capture_time_sender.h"
 #include "modules/rtp_rtcp/source/byte_io.h"
+#include "modules/rtp_rtcp/source/rtp_dependency_descriptor_extension.h"
 #include "modules/rtp_rtcp/source/rtp_format.h"
 #include "modules/rtp_rtcp/source/rtp_generic_frame_descriptor_extension.h"
 #include "modules/rtp_rtcp/source/rtp_header_extensions.h"
@@ -72,6 +75,7 @@
     const RTPVideoHeader& video_header,
     const absl::optional<PlayoutDelay>& playout_delay,
     const absl::optional<AbsoluteCaptureTime>& absolute_capture_time,
+    FrameDependencyStructure* video_structure,
     bool set_video_rotation,
     bool set_color_space,
     bool set_frame_marking,
@@ -115,34 +119,71 @@
   }
 
   if (video_header.generic) {
-    RtpGenericFrameDescriptor generic_descriptor;
-    generic_descriptor.SetFirstPacketInSubFrame(first_packet);
-    generic_descriptor.SetLastPacketInSubFrame(last_packet);
-    generic_descriptor.SetDiscardable(video_header.generic->discardable);
-
-    if (first_packet) {
-      generic_descriptor.SetFrameId(
-          static_cast<uint16_t>(video_header.generic->frame_id));
+    bool extension_is_set = false;
+    if (video_structure != nullptr) {
+      DependencyDescriptor descriptor;
+      descriptor.first_packet_in_frame = first_packet;
+      descriptor.last_packet_in_frame = last_packet;
+      descriptor.frame_number = video_header.generic->frame_id & 0xFFFF;
+      descriptor.frame_dependencies.spatial_id =
+          video_header.generic->spatial_index;
+      descriptor.frame_dependencies.temporal_id =
+          video_header.generic->temporal_index;
       for (int64_t dep : video_header.generic->dependencies) {
-        generic_descriptor.AddFrameDependencyDiff(
+        descriptor.frame_dependencies.frame_diffs.push_back(
             video_header.generic->frame_id - dep);
       }
+      descriptor.frame_dependencies.decode_target_indications =
+          video_header.generic->decode_target_indications;
+      RTC_DCHECK_EQ(
+          descriptor.frame_dependencies.decode_target_indications.size(),
+          video_structure->num_decode_targets);
 
-      uint8_t spatial_bimask = 1 << video_header.generic->spatial_index;
-      generic_descriptor.SetSpatialLayersBitmask(spatial_bimask);
-
-      generic_descriptor.SetTemporalLayer(video_header.generic->temporal_index);
-
-      if (video_header.frame_type == VideoFrameType::kVideoFrameKey) {
-        generic_descriptor.SetResolution(video_header.width,
-                                         video_header.height);
+      // To avoid an extra structure copy, temporarily share ownership of the
+      // video_structure with the dependency descriptor.
+      if (video_header.frame_type == VideoFrameType::kVideoFrameKey &&
+          first_packet) {
+        descriptor.attached_structure = absl::WrapUnique(video_structure);
       }
+      extension_is_set = packet->SetExtension<RtpDependencyDescriptorExtension>(
+          *video_structure, descriptor);
+
+      // Remove the temporary shared ownership.
+      descriptor.attached_structure.release();
     }
 
-    if (!packet->SetExtension<RtpGenericFrameDescriptorExtension01>(
-            generic_descriptor)) {
-      packet->SetExtension<RtpGenericFrameDescriptorExtension00>(
-          generic_descriptor);
+    // Do not use v0/v1 generic frame descriptor when v2 is stored.
+    if (!extension_is_set) {
+      RtpGenericFrameDescriptor generic_descriptor;
+      generic_descriptor.SetFirstPacketInSubFrame(first_packet);
+      generic_descriptor.SetLastPacketInSubFrame(last_packet);
+      generic_descriptor.SetDiscardable(video_header.generic->discardable);
+
+      if (first_packet) {
+        generic_descriptor.SetFrameId(
+            static_cast<uint16_t>(video_header.generic->frame_id));
+        for (int64_t dep : video_header.generic->dependencies) {
+          generic_descriptor.AddFrameDependencyDiff(
+              video_header.generic->frame_id - dep);
+        }
+
+        uint8_t spatial_bimask = 1 << video_header.generic->spatial_index;
+        generic_descriptor.SetSpatialLayersBitmask(spatial_bimask);
+
+        generic_descriptor.SetTemporalLayer(
+            video_header.generic->temporal_index);
+
+        if (video_header.frame_type == VideoFrameType::kVideoFrameKey) {
+          generic_descriptor.SetResolution(video_header.width,
+                                           video_header.height);
+        }
+      }
+
+      if (!packet->SetExtension<RtpGenericFrameDescriptorExtension01>(
+              generic_descriptor)) {
+        packet->SetExtension<RtpGenericFrameDescriptorExtension00>(
+            generic_descriptor);
+      }
     }
   }
 }
@@ -417,6 +458,38 @@
   return absl::nullopt;
 }
 
+void RTPSenderVideo::SetVideoStructure(
+    const FrameDependencyStructure* video_structure) {
+  RTC_DCHECK_RUNS_SERIALIZED(&send_checker_);
+  if (video_structure == nullptr) {
+    video_structure_ = nullptr;
+    return;
+  }
+  // Simple sanity checks that the video structure is set up.
+  RTC_DCHECK_GT(video_structure->num_decode_targets, 0);
+  RTC_DCHECK_GT(video_structure->templates.size(), 0);
+
+  int structure_id = 0;
+  if (video_structure_) {
+    if (*video_structure_ == *video_structure) {
+      // Same structure (just a new key frame), no update is required.
+      return;
+    }
+    // When setting a different video structure, advance structure_id past the
+    // previous templates so templates of different structures do not collide.
+    static constexpr int kMaxTemplates = 64;
+    structure_id =
+        (video_structure_->structure_id + video_structure_->templates.size()) %
+        kMaxTemplates;
+  }
+
+  video_structure_ =
+      std::make_unique<FrameDependencyStructure>(*video_structure);
+  video_structure_->structure_id = structure_id;
+  // TODO(bugs.webrtc.org/10342): Support chains; until then send none.
+  video_structure_->num_chains = 0;
+}
+
 bool RTPSenderVideo::SendVideo(
     int payload_type,
     absl::optional<VideoCodecType> codec_type,
@@ -523,16 +596,20 @@
   auto last_packet = std::make_unique<RtpPacketToSend>(*single_packet);
   // Simplest way to estimate how much extensions would occupy is to set them.
   AddRtpHeaderExtensions(video_header, playout_delay, absolute_capture_time,
-                         set_video_rotation, set_color_space, set_frame_marking,
+                         video_structure_.get(), set_video_rotation,
+                         set_color_space, set_frame_marking,
                          /*first=*/true, /*last=*/true, single_packet.get());
   AddRtpHeaderExtensions(video_header, playout_delay, absolute_capture_time,
-                         set_video_rotation, set_color_space, set_frame_marking,
+                         video_structure_.get(), set_video_rotation,
+                         set_color_space, set_frame_marking,
                          /*first=*/true, /*last=*/false, first_packet.get());
   AddRtpHeaderExtensions(video_header, playout_delay, absolute_capture_time,
-                         set_video_rotation, set_color_space, set_frame_marking,
+                         video_structure_.get(), set_video_rotation,
+                         set_color_space, set_frame_marking,
                          /*first=*/false, /*last=*/false, middle_packet.get());
   AddRtpHeaderExtensions(video_header, playout_delay, absolute_capture_time,
-                         set_video_rotation, set_color_space, set_frame_marking,
+                         video_structure_.get(), set_video_rotation,
+                         set_color_space, set_frame_marking,
                          /*first=*/false, /*last=*/true, last_packet.get());
 
   RTC_DCHECK_GT(packet_capacity, single_packet->headers_size());
diff --git a/modules/rtp_rtcp/source/rtp_sender_video.h b/modules/rtp_rtcp/source/rtp_sender_video.h
index 3f4c676..053877e 100644
--- a/modules/rtp_rtcp/source/rtp_sender_video.h
+++ b/modules/rtp_rtcp/source/rtp_sender_video.h
@@ -18,6 +18,7 @@
 #include "absl/strings/string_view.h"
 #include "absl/types/optional.h"
 #include "api/array_view.h"
+#include "api/transport/rtp/dependency_descriptor.h"
 #include "api/video/video_codec_type.h"
 #include "api/video/video_frame_type.h"
 #include "modules/include/module_common_types.h"
@@ -103,6 +104,13 @@
                  const RTPFragmentationHeader* fragmentation,
                  RTPVideoHeader video_header,
                  absl::optional<int64_t> expected_retransmission_time_ms);
+  // Sets the encoder's video structure to send in the dependency descriptor
+  // rtp header extension; passing nullptr clears it. The structure is copied.
+  // Next call to SendVideo should have frame_type == kVideoFrameKey, and all
+  // later calls to SendVideo must use a video_header compatible with the
+  // video_structure.
+  void SetVideoStructure(const FrameDependencyStructure* video_structure);
+
   // FlexFEC/ULPFEC.
   // Set FEC rates, max frames before FEC is sent, and type of FEC masks.
   // Returns false on failure.
@@ -184,6 +192,8 @@
   VideoRotation last_rotation_ RTC_GUARDED_BY(send_checker_);
   absl::optional<ColorSpace> last_color_space_ RTC_GUARDED_BY(send_checker_);
   bool transmit_color_space_next_frame_ RTC_GUARDED_BY(send_checker_);
+  std::unique_ptr<FrameDependencyStructure> video_structure_
+      RTC_GUARDED_BY(send_checker_);
 
   // Tracks the current request for playout delay limits from application
   // and decides whether the current RTP frame should include the playout
diff --git a/modules/rtp_rtcp/source/rtp_sender_video_unittest.cc b/modules/rtp_rtcp/source/rtp_sender_video_unittest.cc
index 7ccd0ac..867e05b 100644
--- a/modules/rtp_rtcp/source/rtp_sender_video_unittest.cc
+++ b/modules/rtp_rtcp/source/rtp_sender_video_unittest.cc
@@ -10,15 +10,20 @@
 
 #include "modules/rtp_rtcp/source/rtp_sender_video.h"
 
+#include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 
+#include "api/transport/rtp/dependency_descriptor.h"
 #include "api/video/video_codec_constants.h"
 #include "api/video/video_timing.h"
+#include "common_video/generic_frame_descriptor/generic_frame_info.h"
 #include "modules/rtp_rtcp/include/rtp_cvo.h"
 #include "modules/rtp_rtcp/include/rtp_header_extension_map.h"
 #include "modules/rtp_rtcp/include/rtp_rtcp.h"
 #include "modules/rtp_rtcp/include/rtp_rtcp_defines.h"
+#include "modules/rtp_rtcp/source/rtp_dependency_descriptor_extension.h"
 #include "modules/rtp_rtcp/source/rtp_format_video_generic.h"
 #include "modules/rtp_rtcp/source/rtp_generic_frame_descriptor.h"
 #include "modules/rtp_rtcp/source/rtp_generic_frame_descriptor_extension.h"
@@ -35,12 +40,15 @@
 namespace {
 
 using ::testing::ElementsAre;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
 
 enum : int {  // The first valid value is 1.
   kAbsoluteSendTimeExtensionId = 1,
   kFrameMarkingExtensionId,
   kGenericDescriptorId00,
   kGenericDescriptorId01,
+  kGenericDescriptorId02,
   kTransmissionTimeOffsetExtensionId,
   kTransportSequenceNumberExtensionId,
   kVideoRotationExtensionId,
@@ -73,6 +81,8 @@
         kGenericDescriptorId00);
     receivers_extensions_.Register<RtpGenericFrameDescriptorExtension01>(
         kGenericDescriptorId01);
+    receivers_extensions_.Register<RtpDependencyDescriptorExtension>(
+        kGenericDescriptorId02);
     receivers_extensions_.Register<FrameMarkingExtension>(
         kFrameMarkingExtensionId);
     receivers_extensions_.Register<AbsoluteCaptureTimeExtension>(
@@ -522,6 +532,148 @@
   EXPECT_TRUE(rtp_sender_video_.AllowRetransmission(header, kSettings, kRttMs));
 }
 
+TEST_P(RtpSenderVideoTest, SendsDependencyDescriptorWhenVideoStructureIsSet) {
+  const int64_t kFrameId = 100000;
+  uint8_t kFrame[100];
+  rtp_module_->RegisterRtpHeaderExtension(
+      RtpDependencyDescriptorExtension::kUri, kGenericDescriptorId02);
+  FrameDependencyStructure video_structure;
+  video_structure.num_decode_targets = 2;
+  video_structure.templates = {
+      GenericFrameInfo::Builder().S(0).T(0).Dtis("SS").Build(),
+      GenericFrameInfo::Builder().S(1).T(0).Dtis("-S").Build(),
+      GenericFrameInfo::Builder().S(1).T(1).Dtis("-D").Build(),
+  };
+  rtp_sender_video_.SetVideoStructure(&video_structure);
+
+  // Send a key frame; it is expected to carry the attached structure.
+  RTPVideoHeader hdr;
+  RTPVideoHeader::GenericDescriptorInfo& generic = hdr.generic.emplace();
+  generic.frame_id = kFrameId;
+  generic.temporal_index = 0;
+  generic.spatial_index = 0;
+  generic.decode_target_indications = {DecodeTargetIndication::kSwitch,
+                                       DecodeTargetIndication::kSwitch};
+  hdr.frame_type = VideoFrameType::kVideoFrameKey;
+  rtp_sender_video_.SendVideo(kPayload, kType, kTimestamp, 0, kFrame, nullptr,
+                              hdr, kDefaultExpectedRetransmissionTimeMs);
+
+  ASSERT_EQ(transport_.packets_sent(), 1);
+  DependencyDescriptor descriptor_key;
+  ASSERT_TRUE(transport_.last_sent_packet()
+                  .GetExtension<RtpDependencyDescriptorExtension>(
+                      nullptr, &descriptor_key));
+  ASSERT_TRUE(descriptor_key.attached_structure);
+  EXPECT_EQ(descriptor_key.attached_structure->num_decode_targets, 2);
+  EXPECT_THAT(descriptor_key.attached_structure->templates, SizeIs(3));
+  EXPECT_EQ(descriptor_key.frame_number, kFrameId & 0xFFFF);
+  EXPECT_EQ(descriptor_key.frame_dependencies.spatial_id, 0);
+  EXPECT_EQ(descriptor_key.frame_dependencies.temporal_id, 0);
+  EXPECT_EQ(descriptor_key.frame_dependencies.decode_target_indications,
+            generic.decode_target_indications);
+  EXPECT_THAT(descriptor_key.frame_dependencies.frame_diffs, IsEmpty());
+
+  // Send a delta frame that depends on the key frame and an older frame.
+  generic.frame_id = kFrameId + 1;
+  generic.temporal_index = 1;
+  generic.spatial_index = 1;
+  generic.dependencies = {kFrameId, kFrameId - 500};
+  generic.decode_target_indications = {DecodeTargetIndication::kNotPresent,
+                                       DecodeTargetIndication::kRequired};
+  hdr.frame_type = VideoFrameType::kVideoFrameDelta;
+  rtp_sender_video_.SendVideo(kPayload, kType, kTimestamp, 0, kFrame, nullptr,
+                              hdr, kDefaultExpectedRetransmissionTimeMs);
+
+  EXPECT_EQ(transport_.packets_sent(), 2);
+  DependencyDescriptor descriptor_delta;
+  ASSERT_TRUE(
+      transport_.last_sent_packet()
+          .GetExtension<RtpDependencyDescriptorExtension>(
+              descriptor_key.attached_structure.get(), &descriptor_delta));
+  EXPECT_EQ(descriptor_delta.attached_structure, nullptr);
+  EXPECT_EQ(descriptor_delta.frame_number, (kFrameId + 1) & 0xFFFF);
+  EXPECT_EQ(descriptor_delta.frame_dependencies.spatial_id, 1);
+  EXPECT_EQ(descriptor_delta.frame_dependencies.temporal_id, 1);
+  EXPECT_EQ(descriptor_delta.frame_dependencies.decode_target_indications,
+            generic.decode_target_indications);
+  EXPECT_THAT(descriptor_delta.frame_dependencies.frame_diffs,
+              ElementsAre(1, 501));
+}
+
+TEST_P(RtpSenderVideoTest,
+       SetDiffentVideoStructureAvoidsCollisionWithThePreviousStructure) {
+  const int64_t kFrameId = 100000;
+  uint8_t kFrame[100];
+  rtp_module_->RegisterRtpHeaderExtension(
+      RtpDependencyDescriptorExtension::kUri, kGenericDescriptorId02);
+  FrameDependencyStructure video_structure1;
+  video_structure1.num_decode_targets = 2;
+  video_structure1.templates = {
+      GenericFrameInfo::Builder().S(0).T(0).Dtis("SS").Build(),
+      GenericFrameInfo::Builder().S(0).T(1).Dtis("D-").Build(),
+  };
+  FrameDependencyStructure video_structure2;
+  video_structure2.num_decode_targets = 2;
+  video_structure2.templates = {
+      GenericFrameInfo::Builder().S(0).T(0).Dtis("SS").Build(),
+      GenericFrameInfo::Builder().S(0).T(1).Dtis("R-").Build(),
+  };
+
+  // Send the 1st key frame.
+  RTPVideoHeader hdr;
+  RTPVideoHeader::GenericDescriptorInfo& generic = hdr.generic.emplace();
+  generic.frame_id = kFrameId;
+  generic.decode_target_indications = {DecodeTargetIndication::kSwitch,
+                                       DecodeTargetIndication::kSwitch};
+  hdr.frame_type = VideoFrameType::kVideoFrameKey;
+  rtp_sender_video_.SetVideoStructure(&video_structure1);
+  rtp_sender_video_.SendVideo(kPayload, kType, kTimestamp, 0, kFrame, nullptr,
+                              hdr, kDefaultExpectedRetransmissionTimeMs);
+  // Parse the extension of the 1st key frame.
+  ASSERT_EQ(transport_.packets_sent(), 1);
+  DependencyDescriptor descriptor_key1;
+  ASSERT_TRUE(transport_.last_sent_packet()
+                  .GetExtension<RtpDependencyDescriptorExtension>(
+                      nullptr, &descriptor_key1));
+  ASSERT_TRUE(descriptor_key1.attached_structure);
+
+  // Send the delta frame.
+  generic.frame_id = kFrameId + 1;
+  generic.temporal_index = 1;
+  generic.decode_target_indications = {DecodeTargetIndication::kDiscardable,
+                                       DecodeTargetIndication::kNotPresent};
+  hdr.frame_type = VideoFrameType::kVideoFrameDelta;
+  rtp_sender_video_.SendVideo(kPayload, kType, kTimestamp, 0, kFrame, nullptr,
+                              hdr, kDefaultExpectedRetransmissionTimeMs);
+
+  ASSERT_EQ(transport_.packets_sent(), 2);
+  RtpPacket delta_packet = transport_.last_sent_packet();
+
+  // Send the 2nd key frame, using a different video structure.
+  generic.frame_id = kFrameId + 2;
+  generic.decode_target_indications = {DecodeTargetIndication::kSwitch,
+                                       DecodeTargetIndication::kSwitch};
+  hdr.frame_type = VideoFrameType::kVideoFrameKey;
+  rtp_sender_video_.SetVideoStructure(&video_structure2);
+  rtp_sender_video_.SendVideo(kPayload, kType, kTimestamp, 0, kFrame, nullptr,
+                              hdr, kDefaultExpectedRetransmissionTimeMs);
+  // Parse the 2nd key frame.
+  ASSERT_EQ(transport_.packets_sent(), 3);
+  DependencyDescriptor descriptor_key2;
+  ASSERT_TRUE(transport_.last_sent_packet()
+                  .GetExtension<RtpDependencyDescriptorExtension>(
+                      nullptr, &descriptor_key2));
+  ASSERT_TRUE(descriptor_key2.attached_structure);
+
+  // Try to parse the 1st delta frame. It should be parseable using the
+  // structure from the 1st key frame, but not the one from the 2nd key frame.
+  DependencyDescriptor descriptor_delta;
+  EXPECT_TRUE(delta_packet.GetExtension<RtpDependencyDescriptorExtension>(
+      descriptor_key1.attached_structure.get(), &descriptor_delta));
+  EXPECT_FALSE(delta_packet.GetExtension<RtpDependencyDescriptorExtension>(
+      descriptor_key2.attached_structure.get(), &descriptor_delta));
+}
+
 void RtpSenderVideoTest::PopulateGenericFrameDescriptor(int version) {
   const absl::string_view ext_uri =
       (version == 0) ? RtpGenericFrameDescriptorExtension00::kUri
diff --git a/modules/rtp_rtcp/source/rtp_video_header.h b/modules/rtp_rtcp/source/rtp_video_header.h
index b66cba8..714d1eb 100644
--- a/modules/rtp_rtcp/source/rtp_video_header.h
+++ b/modules/rtp_rtcp/source/rtp_video_header.h
@@ -15,6 +15,7 @@
 #include "absl/container/inlined_vector.h"
 #include "absl/types/optional.h"
 #include "absl/types/variant.h"
+#include "api/transport/rtp/dependency_descriptor.h"
 #include "api/video/color_space.h"
 #include "api/video/video_codec_type.h"
 #include "api/video/video_content_type.h"
@@ -50,6 +51,7 @@
     int64_t frame_id = 0;
     int spatial_index = 0;
     int temporal_index = 0;
+    absl::InlinedVector<DecodeTargetIndication, 10> decode_target_indications;
     absl::InlinedVector<int64_t, 5> dependencies;
     bool discardable = false;
   };