[VP9] Shift spatial layers on RTP level to always start from 0.

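This CL adds |first_active_layer| to the VP9 codec specific info and to
RTPVideoHeaderVP9 to pass information about enabled layers from encoder to
packetizer. The packetizer uses it to shift |width|, |height|, |spatial_idx|
and |num_spatial_layers| so that the layers sent on RTP level start from 0.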

Bug: webrtc:11319
Change-Id: Idc1c337f8dfb3f7631506acb784d2a634b41b955
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/167724
Reviewed-by: Danil Chapovalov <danilchap@webrtc.org>
Reviewed-by: Niels Moller <nisse@webrtc.org>
Commit-Queue: Ilya Nikolaevskiy <ilnik@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#30428}
diff --git a/call/rtp_payload_params.cc b/call/rtp_payload_params.cc
index 70b156a..408a2a8 100644
--- a/call/rtp_payload_params.cc
+++ b/call/rtp_payload_params.cc
@@ -61,6 +61,7 @@
           info.codecSpecific.VP9.inter_layer_predicted;
       vp9_header.gof_idx = info.codecSpecific.VP9.gof_idx;
       vp9_header.num_spatial_layers = info.codecSpecific.VP9.num_spatial_layers;
+      vp9_header.first_active_layer = info.codecSpecific.VP9.first_active_layer;
       if (vp9_header.num_spatial_layers > 1) {
         vp9_header.spatial_idx = spatial_index.value_or(kNoSpatialIdx);
       } else {
diff --git a/modules/rtp_rtcp/source/rtp_format_vp9.cc b/modules/rtp_rtcp/source/rtp_format_vp9.cc
index 57ac447..15e059e 100644
--- a/modules/rtp_rtcp/source/rtp_format_vp9.cc
+++ b/modules/rtp_rtcp/source/rtp_format_vp9.cc
@@ -280,15 +280,42 @@
   }
   return true;
 }
+
+// TODO(https://bugs.webrtc.org/11319): Workaround for switching off spatial
+// layers on the fly. Layers sent on RTP level must start from SL0, but the
+// encoder can start from any spatial layer because WebRTC-SVC api isn't
+// implemented yet and current API to invoke SVC is not flexible enough.
+// Returns a copy of |original_header| with |width|, |height|, |spatial_idx|
+// and |num_spatial_layers| shifted down by |first_active_layer|.
+RTPVideoHeaderVP9 RemoveInactiveSpatialLayers(
+    const RTPVideoHeaderVP9& original_header) {
+  RTPVideoHeaderVP9 hdr(original_header);
+  const size_t shift = hdr.first_active_layer;
+  if (shift == 0)
+    return hdr;
+  RTC_CHECK_LE(shift, hdr.num_spatial_layers);  // Avoids size_t wrap below.
+  const size_t num_sent_layers = hdr.num_spatial_layers - shift;
+  for (size_t i = 0; i < hdr.num_spatial_layers; ++i) {
+    // Reads stay ahead of writes, so the in-place shift is safe.
+    hdr.width[i] = i < num_sent_layers ? hdr.width[i + shift] : 0;
+    hdr.height[i] = i < num_sent_layers ? hdr.height[i + shift] : 0;
+  }
+  hdr.num_spatial_layers = num_sent_layers;
+  hdr.spatial_idx -= shift;
+  hdr.first_active_layer = 0;
+  return hdr;
+}
 }  // namespace
 
 RtpPacketizerVp9::RtpPacketizerVp9(rtc::ArrayView<const uint8_t> payload,
                                    PayloadSizeLimits limits,
                                    const RTPVideoHeaderVP9& hdr)
-    : hdr_(hdr),
+    : hdr_(RemoveInactiveSpatialLayers(hdr)),
       header_size_(PayloadDescriptorLengthMinusSsData(hdr_)),
       first_packet_extra_header_size_(SsDataLength(hdr_)),
       remaining_payload_(payload) {
+  RTC_DCHECK_EQ(hdr_.first_active_layer, 0);
+
   limits.max_payload_len -= header_size_;
   limits.first_packet_reduction_len += first_packet_extra_header_size_;
   limits.single_packet_reduction_len += first_packet_extra_header_size_;
diff --git a/modules/rtp_rtcp/source/rtp_format_vp9_unittest.cc b/modules/rtp_rtcp/source/rtp_format_vp9_unittest.cc
index 7fd5135..0dc6566 100644
--- a/modules/rtp_rtcp/source/rtp_format_vp9_unittest.cc
+++ b/modules/rtp_rtcp/source/rtp_format_vp9_unittest.cc
@@ -169,6 +169,21 @@
       expected_.ss_data_available = false;
     }
   }
+
+  // Pulls |num_packets_| packets from the packetizer, parses each payload and
+  // expects the given spatial layer count and spatial index in the VP9 header.
+  void CreateParseAndCheckPacketsLayers(size_t num_spatial_layers,
+                                        size_t expected_layer) {
+    ASSERT_TRUE(packetizer_ != nullptr);
+    for (size_t packet_idx = 0; packet_idx < num_packets_; ++packet_idx) {
+      EXPECT_TRUE(packetizer_->NextPacket(&packet_));
+      RTPVideoHeader header;
+      VideoRtpDepacketizerVp9::ParseRtpPayload(packet_.payload(), &header);
+      const auto& vp9 = absl::get<RTPVideoHeaderVP9>(header.video_type_header);
+      EXPECT_EQ(vp9.num_spatial_layers, num_spatial_layers);
+      EXPECT_EQ(vp9.spatial_idx, expected_layer);
+    }
+  }
 };
 
 TEST_F(RtpPacketizerVp9Test, TestEqualSizedMode_OnePacket) {
@@ -546,5 +561,48 @@
   CreateParseAndCheckPackets(kExpectedHdrSizes, kExpectedSizes);
 }
 
+TEST_F(RtpPacketizerVp9Test,
+       ShiftsSpatialLayersTowardZeroWhenFirstLayersAreDisabled) {
+  const size_t kFrameSize = 25;
+  const size_t kPacketSize = 1024;
+  // SL0 is disabled: it has no resolution and |first_active_layer| is 1.
+  expected_.width[0] = 0;
+  expected_.height[0] = 0;
+  expected_.width[1] = 640;
+  expected_.height[1] = 360;
+  expected_.width[2] = 1280;
+  expected_.height[2] = 720;
+  expected_.num_spatial_layers = 3;
+  expected_.first_active_layer = 1;
+  expected_.ss_data_available = true;
+  expected_.spatial_layer_resolution_present = true;
+  expected_.gof.num_frames_in_gof = 3;
+  expected_.gof.temporal_idx[0] = 0;
+  expected_.gof.temporal_idx[1] = 1;
+  expected_.gof.temporal_idx[2] = 2;
+  expected_.gof.temporal_up_switch[0] = true;
+  expected_.gof.temporal_up_switch[1] = true;
+  expected_.gof.temporal_up_switch[2] = false;
+  expected_.gof.num_ref_pics[0] = 0;
+  expected_.gof.num_ref_pics[1] = 3;
+  expected_.gof.num_ref_pics[2] = 2;
+  expected_.gof.pid_diff[1][0] = 5;
+  expected_.gof.pid_diff[1][1] = 6;
+  expected_.gof.pid_diff[1][2] = 7;
+  expected_.gof.pid_diff[2][0] = 8;
+  expected_.gof.pid_diff[2][1] = 9;
+  // SL1 is the first active layer, so it is expected on the wire as SL0.
+  expected_.spatial_idx = 1;
+  Init(kFrameSize, kPacketSize);
+  CreateParseAndCheckPacketsLayers(/*num_spatial_layers=*/2,
+                                   /*expected_layer=*/0);
+
+  // Now check SL2, which is expected on the wire as SL1.
+  expected_.spatial_idx = 2;
+  Init(kFrameSize, kPacketSize);
+  CreateParseAndCheckPacketsLayers(/*num_spatial_layers=*/2,
+                                   /*expected_layer=*/1);
+}
+
 }  // namespace
 }  // namespace webrtc
diff --git a/modules/video_coding/codecs/vp9/include/vp9_globals.h b/modules/video_coding/codecs/vp9/include/vp9_globals.h
index 96b976e..c685312 100644
--- a/modules/video_coding/codecs/vp9/include/vp9_globals.h
+++ b/modules/video_coding/codecs/vp9/include/vp9_globals.h
@@ -173,6 +173,7 @@
     gof_idx = kNoGofIdx;
     num_ref_pics = 0;
     num_spatial_layers = 1;
+    first_active_layer = 0;
     end_of_picture = true;
   }
 
@@ -208,6 +209,7 @@
 
   // SS data.
   size_t num_spatial_layers;  // Always populated.
+  size_t first_active_layer;  // Not sent on wire, used to adjust ss data.
   bool spatial_layer_resolution_present;
   uint16_t width[kMaxVp9NumberOfSpatialLayers];
   uint16_t height[kMaxVp9NumberOfSpatialLayers];
diff --git a/modules/video_coding/codecs/vp9/vp9_impl.cc b/modules/video_coding/codecs/vp9/vp9_impl.cc
index 3d9211f..fe6c912 100644
--- a/modules/video_coding/codecs/vp9/vp9_impl.cc
+++ b/modules/video_coding/codecs/vp9/vp9_impl.cc
@@ -1113,6 +1113,7 @@
   // Always populate this, so that the packetizer can properly set the marker
   // bit.
   vp9_info->num_spatial_layers = num_active_spatial_layers_;
+  vp9_info->first_active_layer = first_active_layer_;
 
   vp9_info->num_ref_pics = 0;
   FillReferenceIndices(pkt, pics_since_key_, vp9_info->inter_layer_predicted,
diff --git a/modules/video_coding/include/video_codec_interface.h b/modules/video_coding/include/video_codec_interface.h
index 54839e1..c7b116f 100644
--- a/modules/video_coding/include/video_codec_interface.h
+++ b/modules/video_coding/include/video_codec_interface.h
@@ -69,6 +69,7 @@
 
   // SS data.
   size_t num_spatial_layers;  // Always populated.
+  size_t first_active_layer;
   bool spatial_layer_resolution_present;
   uint16_t width[kMaxVp9NumberOfSpatialLayers];
   uint16_t height[kMaxVp9NumberOfSpatialLayers];