Reland "codec2: add dequeue thread loop to recycle output buffers"

This reverts commit cb35176c4c402a7d77cd1c472002976be4021cb8.

Add a dequeue thread to C2VDAComponent that runs a loop dequeuing buffers from
C2VdaBqBlockPool. When a buffer (along with its slot index) is dequeued from
the block pool, C2VDAComponent looks up the corresponding buffer through its
slot-index mapping, and that buffer then becomes reusable by VDA.

Note: this change will break C2VDAComponent_test, which still needs to be fixed.

Note: this only supports output surface mode; we need another solution for
byte-buffer mode.

Bug: 79239042
Test: CtsMediaTestCases android.media.cts.MediaPlayerTest#testLocalVideo_MP4_H264_480x360_1000kbps_25fps_AAC_Stereo_128kbps_44110Hz
Change-Id: Icb42c425f90e0615c63728f75627a7a0cb661df3
diff --git a/C2VDAComponent.cpp b/C2VDAComponent.cpp
index 2b3abde..2e8f9d3 100644
--- a/C2VDAComponent.cpp
+++ b/C2VDAComponent.cpp
@@ -12,11 +12,14 @@
 #endif
 
 #define __C2_GENERATE_GLOBAL_VARS__
+#include <C2VDAAllocatorStore.h>
+#include <C2VdaBqBlockPool.h>
 #include <C2VDAComponent.h>
 #include <C2VDASupport.h>  // to getParamReflector from vda store
 
 #include <videodev2.h>
 
+#include <C2AllocatorGralloc.h>
 #include <C2ComponentFactory.h>
 #include <C2PlatformSupport.h>
 
@@ -52,6 +55,9 @@
 const C2String kVP8DecoderName = "c2.vda.vp8.decoder";
 const C2String kVP9DecoderName = "c2.vda.vp9.decoder";
 
+const uint32_t kDpbOutputBufferExtraCount = 3;  // Use the same number as ACodec.
+const int kDequeueRetryDelayUs = 10000;  // Wait time of dequeue buffer retry in microseconds.
+
 }  // namespace
 
 C2VDAComponent::IntfImpl::IntfImpl(C2String name, const std::shared_ptr<C2ReflectorHelper>& helper)
@@ -170,28 +176,6 @@
         CHECK_NE(mComponentState, ComponentState::UNINITIALIZED); \
     } while (0)
 
-class C2VDAGraphicBuffer : public C2Buffer {
-public:
-    C2VDAGraphicBuffer(const std::shared_ptr<C2GraphicBlock>& block, const media::Rect& visibleRect,
-                       const base::Closure& releaseCB);
-    ~C2VDAGraphicBuffer() override;
-
-private:
-    base::Closure mReleaseCB;
-};
-
-C2VDAGraphicBuffer::C2VDAGraphicBuffer(const std::shared_ptr<C2GraphicBlock>& block,
-                                       const media::Rect& visibleRect,
-                                       const base::Closure& releaseCB)
-      : C2Buffer({block->share(C2Rect(visibleRect.width(), visibleRect.height()), C2Fence())}),
-        mReleaseCB(releaseCB) {}
-
-C2VDAGraphicBuffer::~C2VDAGraphicBuffer() {
-    if (!mReleaseCB.is_null()) {
-        mReleaseCB.Run();
-    }
-}
-
 C2VDAComponent::VideoFormat::VideoFormat(HalPixelFormat pixelFormat, uint32_t minNumBuffers,
                                          media::Size codedSize, media::Rect visibleRect)
       : mPixelFormat(pixelFormat),
@@ -199,11 +183,22 @@
         mCodedSize(codedSize),
         mVisibleRect(visibleRect) {}
 
+static uint32_t getSlotFromGraphicBlockHandle(const C2Handle* const handle) {
+    uint32_t width = 0, height = 0, format = 0, stride = 0, igbp_slot = 0, generation = 0;
+    uint64_t usage = 0, igbp_id = 0;  // Zeroed so an unwrap that fills nothing stays defined.
+    _UnwrapNativeCodec2GrallocMetadata(  // Reads the metadata fields out of |handle|.
+            handle, &width, &height, &format, &usage, &stride, &generation, &igbp_id, &igbp_slot);
+    ALOGV("Unwrap Metadata: igbp[%" PRIu64 ", %u] (%u*%u, fmt %#x, usage %" PRIx64 ", stride %u)",
+          igbp_id, igbp_slot, width, height, format, usage, stride);
+    return igbp_slot;  // Only the slot index is handed back to the caller.
+}
+
 C2VDAComponent::C2VDAComponent(C2String name, c2_node_id_t id,
                                const std::shared_ptr<C2ReflectorHelper>& helper)
       : mIntfImpl(std::make_shared<IntfImpl>(name, helper)),
         mIntf(std::make_shared<SimpleInterface<IntfImpl>>(name.c_str(), id, mIntfImpl)),
         mThread("C2VDAComponentThread"),
+        mDequeueThread("C2VDAComponentDequeueThread"),
         mVDAInitResult(VideoDecodeAcceleratorAdaptor::Result::ILLEGAL_STATE),
         mComponentState(ComponentState::UNINITIALIZED),
         mDrainWithEOS(false),
@@ -241,6 +236,7 @@
         mVDAAdaptor->destroy();
         mVDAAdaptor.reset(nullptr);
     }
+    stopDequeueThread();
 }
 
 void C2VDAComponent::onStart(media::VideoCodecProfile profile, base::WaitableEvent* done) {
@@ -351,16 +347,9 @@
     reportFinishedWorkIfAny();
 }
 
-// This is used as callback while output buffer is released by client.
-// TODO(johnylin): consider to use C2Buffer::registerOnDestroyNotify instead
-void C2VDAComponent::returnOutputBuffer(int32_t pictureBufferId) {
-    mTaskRunner->PostTask(FROM_HERE, base::Bind(&C2VDAComponent::onOutputBufferReturned,
-                                                base::Unretained(this), pictureBufferId));
-}
-
-void C2VDAComponent::onOutputBufferReturned(int32_t pictureBufferId) {
+void C2VDAComponent::onOutputBufferReturned(uint32_t slotId) {
     DCHECK(mTaskRunner->BelongsToCurrentThread());
-    ALOGV("onOutputBufferReturned: picture id=%d", pictureBufferId);
+    ALOGV("onOutputBufferReturned: slot id=%u", slotId);
     if (mComponentState == ComponentState::UNINITIALIZED) {
         // Output buffer is returned from client after component is stopped. Just let the buffer be
         // released.
@@ -369,7 +358,7 @@
 
     // TODO(johnylin): when buffer is returned, we should confirm that output format is not changed
     //                 yet. If changed, just let the buffer be released.
-    GraphicBlockInfo* info = getGraphicBlockById(pictureBufferId);
+    GraphicBlockInfo* info = getGraphicBlockBySlot(slotId);
     if (!info) {
         reportError(C2_CORRUPTED);
         return;
@@ -402,12 +391,14 @@
     CHECK_EQ(info->mState, GraphicBlockInfo::State::OWNED_BY_ACCELERATOR);
     // Output buffer will be passed to client soon along with mListener->onWorkDone_nb().
     info->mState = GraphicBlockInfo::State::OWNED_BY_CLIENT;
+    mBuffersInClient++;
 
     // Attach output buffer to the work corresponded to bitstreamId.
-    work->worklets.front()->output.buffers.emplace_back(std::make_shared<C2VDAGraphicBuffer>(
-            info->mGraphicBlock, mOutputFormat.mVisibleRect,
-            base::Bind(&C2VDAComponent::returnOutputBuffer, mWeakThisFactory.GetWeakPtr(),
-                       pictureBufferId)));
+    auto block = info->mGraphicBlock;
+    work->worklets.front()->output.buffers.emplace_back(C2Buffer::CreateGraphicBuffer(
+            block->share(C2Rect(mOutputFormat.mVisibleRect.width(),
+                                mOutputFormat.mVisibleRect.height()),
+                         C2Fence())));
 
     // TODO: this does not work for timestamps as they can wrap around
     int64_t currentTimestamp = base::checked_cast<int64_t>(work->input.ordinal.timestamp.peek());
@@ -552,6 +543,8 @@
 
     mGraphicBlocks.clear();
 
+    stopDequeueThread();
+
     mStopDoneEvent->Signal();
     mStopDoneEvent = nullptr;
     mComponentState = ComponentState::UNINITIALIZED;
@@ -606,6 +599,19 @@
     return &mGraphicBlocks[blockId];
 }
 
+C2VDAComponent::GraphicBlockInfo* C2VDAComponent::getGraphicBlockBySlot(uint32_t slotId) {
+    // Linear scan of the tracked blocks for the first one whose slot index equals |slotId|.
+    for (GraphicBlockInfo& candidate : mGraphicBlocks) {
+        if (candidate.mSlotId == slotId) {
+            return &candidate;
+        }
+    }
+
+    // No tracked block uses this slot: log it and return null so the caller can react.
+    ALOGE("getGraphicBlockBySlot failed: slot=%u", slotId);
+    return nullptr;
+}
+
 void C2VDAComponent::onOutputFormatChanged(std::unique_ptr<VideoFormat> format) {
     DCHECK(mTaskRunner->BelongsToCurrentThread());
     ALOGV("onOutputFormatChanged");
@@ -684,7 +690,25 @@
         }
     }
 
+    stopDequeueThread();
     mGraphicBlocks.clear();
+
+    // Set requested buffer count to C2VdaBqBlockPool.
+    std::shared_ptr<C2VdaBqBlockPool> bqPool =
+            std::static_pointer_cast<C2VdaBqBlockPool>(mOutputBlockPool);
+    if (bqPool) {
+        err = bqPool->requestNewBufferSet(static_cast<uint32_t>(bufferCount));
+        if (err != C2_OK) {
+            ALOGE("failed to set buffer count magic to block pool: %d", err);
+            reportError(err);
+            return err;
+        }
+    } else {
+        ALOGE("Component only supports C2VdaBqBlockPool");
+        reportError(C2_CORRUPTED);
+        return C2_CORRUPTED;
+    }
+
     for (size_t i = 0; i < bufferCount; ++i) {
         std::shared_ptr<C2GraphicBlock> block;
         C2MemoryUsage usage = {C2MemoryUsage::CPU_READ, 0};
@@ -699,6 +723,11 @@
         appendOutputBuffer(std::move(block));
     }
     mOutputFormat.mMinNumBuffers = bufferCount;
+
+    if (!startDequeueThread(size, pixelFormat)) {
+        reportError(C2_CORRUPTED);
+        return C2_CORRUPTED;
+    }
     return C2_OK;
 }
 
@@ -766,6 +795,8 @@
     info.mHandle = std::move(passedHandle);
     info.mPlanes = std::move(passedPlanes);
 
+    info.mSlotId = getSlotFromGraphicBlockHandle(info.mGraphicBlock->handle());
+
     mGraphicBlocks.push_back(std::move(info));
 }
 
@@ -1070,6 +1101,56 @@
     mListener->onError_nb(shared_from_this(), static_cast<uint32_t>(error));
 }
 
+bool C2VDAComponent::startDequeueThread(const media::Size& size, uint32_t pixelFormat) {
+    CHECK(!mDequeueThread.IsRunning());  // Starting while already running is a caller bug.
+    if (!mDequeueThread.Start()) {
+        ALOGE("failed to start dequeue thread!!");
+        return false;
+    }
+    mDequeueLoopStop.store(false);  // Clear any stop request before the loop task is posted.
+    mBuffersInClient.store(0u);  // Restart the count of buffers handed to the client.
+    mDequeueThread.task_runner()->PostTask(  // Run dequeueThreadLoop on the dequeue thread.
+            FROM_HERE, base::Bind(&C2VDAComponent::dequeueThreadLoop, base::Unretained(this),
+                                  size, pixelFormat));
+    return true;  // Thread started and loop task queued.
+}
+
+void C2VDAComponent::stopDequeueThread() {
+    // Nothing to do when the dequeue thread is not running.
+    if (!mDequeueThread.IsRunning()) return;
+    mDequeueLoopStop.store(true);  // Signal the loop to exit, then stop the thread.
+    mDequeueThread.Stop();
+}
+
+void C2VDAComponent::dequeueThreadLoop(const media::Size& size, uint32_t pixelFormat) {
+    ALOGV("dequeueThreadLoop starts");
+    DCHECK(mDequeueThread.task_runner()->BelongsToCurrentThread());
+
+    while (!mDequeueLoopStop.load()) {  // Keep polling until the stop flag is set.
+        if (mBuffersInClient.load() == 0) {  // No buffers are counted as held by the client.
+            ::usleep(kDequeueRetryDelayUs);  // Back off before checking the counter again.
+            continue;
+        }
+        std::shared_ptr<C2GraphicBlock> block;
+        C2MemoryUsage usage = {C2MemoryUsage::CPU_READ, 0};
+        auto err = mOutputBlockPool->fetchGraphicBlock(size.width(), size.height(), pixelFormat,
+                                                       usage, &block);
+        if (err == C2_TIMED_OUT) {
+            continue;  // Retry right away. NOTE(review): assumes the fetch itself waited.
+        }
+        if (err == C2_OK) {
+            auto slot = getSlotFromGraphicBlockHandle(block->handle());
+            mTaskRunner->PostTask(FROM_HERE, base::Bind(&C2VDAComponent::onOutputBufferReturned,
+                                                        base::Unretained(this), slot));
+            mBuffersInClient--;  // One fewer buffer outstanding at the client.
+        } else {
+            ALOGE("dequeueThreadLoop got error: %d", err);
+            break;  // Any other error ends the loop. NOTE(review): not reported to listener.
+        }
+    }
+    ALOGV("dequeueThreadLoop terminates");
+}
+
 class C2VDAComponentFactory : public C2ComponentFactory {
 public:
     C2VDAComponentFactory(C2String decoderName)