arm_compute v19.11
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index e78395f..8d46014 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -36,6 +36,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/float_ops.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
@@ -44,12 +45,15 @@
 {
 using namespace arm_compute::misc::shape_calculator;
 using namespace arm_compute::cl_gemm;
+using namespace arm_compute::utils::cast;
 
-CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
     : _memory_group(std::move(memory_manager)),
+      _weights_manager(weights_manager),
       _mm_kernel(),
       _reshape_lhs_kernel(),
       _reshape_rhs_kernel(),
+      _reshape_rhs_kernel_managed(),
       _mm_reshaped_kernel(),
       _mm_reshaped_only_rhs_kernel(),
       _tmp_a(),
@@ -65,37 +69,53 @@
 {
     GEMMType gemm_type = GEMMType::RESHAPED_V1;
 
-    if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
+    if(gpu_target_is_in(gpu_target, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+                        GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72,
+                        GPUTarget::G76, GPUTarget::G77))
     {
-        if((m > 1) && (n < 16))
+        if(data_type == DataType::F32)
         {
-            gemm_type = GEMMType::RESHAPED_V1;
-        }
-        else if((m == 1) && (data_type == DataType::F32))
-        {
-            gemm_type = GEMMType::RESHAPED_ONLY_RHS;
-        }
-        else
-        {
-            // COMPMID-852
-            if((k > 256) && (m > 4) && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+            if((m > 1) && (n < 16))
             {
-                constexpr float alpha = 3.2f;
-                constexpr float fact0 = 1.51f;
-                constexpr float fact1 = 1.66f;
-                constexpr float ops   = 12.0f;
-                const float     scale = k > 1024 ? 1.07f : 1.0f;
-                gemm_type             = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
+                gemm_type = GEMMType::RESHAPED_V1;
+            }
+            else if(m == 1)
+            {
+                gemm_type = GEMMType::RESHAPED_ONLY_RHS;
             }
             else
             {
-                gemm_type = GEMMType::NATIVE;
+                // COMPMID-852
+                if((k > 256) && (m > 4) && reshape_b_only_on_first_run)
+                {
+                    constexpr float alpha = 3.2f;
+                    constexpr float fact0 = 1.51f;
+                    constexpr float fact1 = 1.66f;
+                    constexpr float ops   = 12.0f;
+                    const float     scale = k > 1024 ? 1.07f : 1.0f;
+                    gemm_type             = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
+                }
+                else
+                {
+                    gemm_type = GEMMType::NATIVE;
+                }
+            }
+
+            const auto workload = static_cast<float>((m * n) / 20.0f);
+
+            gemm_type = ((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1) && (data_type == DataType::F32)) ? GEMMType::RESHAPED_V2 : gemm_type;
+        }
+        else
+        {
+            if((m == 1) || (!reshape_b_only_on_first_run))
+            {
+                gemm_type = GEMMType::RESHAPED_ONLY_RHS;
+            }
+            else
+            {
+                gemm_type = GEMMType::RESHAPED_V2;
             }
         }
-
-        const auto workload = static_cast<float>((m * n) / 20.0f);
-
-        gemm_type = ((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1) && (data_type == DataType::F32)) ? GEMMType::RESHAPED_V2 : gemm_type;
     }
     else
     {
@@ -162,8 +182,12 @@
 
     GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());
 
+    const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b));
+
+    // Manage intermediate buffers
     _memory_group.manage(&_tmp_a);
-    if(!_reshape_b_only_on_first_run)
+
+    if(!_reshape_b_only_on_first_run && use_mm_b)
     {
         _memory_group.manage(&_tmp_b);
     }
@@ -172,16 +196,26 @@
     _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
 
     // Configure transpose kernel
-    _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+    ICLTensor *reshaped_rhs = &_tmp_b;
+    if(_weights_manager && _weights_manager->are_weights_managed(b))
+    {
+        _reshape_rhs_kernel_managed.configure(b, rhs_info);
+        reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
+    }
+    else
+    {
+        _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+    }
 
     // Configure and tune matrix multiply kernel
-    _mm_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
+    _mm_kernel.configure(&_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
 
     CLScheduler::get().tune_kernel_static(_mm_kernel);
 
     // Allocate intermediate tensors
     _tmp_a.allocator()->allocate();
-    if(!_reshape_b_only_on_first_run)
+
+    if(!_reshape_b_only_on_first_run && use_mm_b)
     {
         _tmp_b.allocator()->allocate();
     }
@@ -212,12 +246,16 @@
     _reshape_lhs_kernel.set_target(gpu_target);
     _mm_kernel.set_target(gpu_target);
 
+    const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b));
+
     // Manage intermediate buffers
     _memory_group.manage(&_tmp_a);
-    if(!_reshape_b_only_on_first_run)
+
+    if(!_reshape_b_only_on_first_run && use_mm_b)
     {
         _memory_group.manage(&_tmp_b);
     }
+
     // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
 
     GEMMLHSMatrixInfo lhs_info{};
@@ -231,14 +269,25 @@
     std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
 
     _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
-    _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+    ICLTensor *reshaped_rhs = &_tmp_b;
+    if(_weights_manager && _weights_manager->are_weights_managed(b))
+    {
+        _reshape_rhs_kernel_managed.configure(b, rhs_info);
+        reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
+    }
+    else
+    {
+        _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+    }
 
     // Configure and tune matrix multiply kernel
-    _mm_reshaped_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+    _mm_reshaped_kernel.configure(&_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
 
     // Allocate intermediate tensors
     _tmp_a.allocator()->allocate();
-    if(!_reshape_b_only_on_first_run)
+
+    if(!_reshape_b_only_on_first_run && use_mm_b)
     {
         _tmp_b.allocator()->allocate();
     }
@@ -268,8 +317,10 @@
     // Set the target for the kernels
     _mm_kernel.set_target(gpu_target);
 
+    const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b));
+
     // Manage intermediate buffers
-    if(!_reshape_b_only_on_first_run)
+    if(!_reshape_b_only_on_first_run && use_mm_b)
     {
         _memory_group.manage(&_tmp_b);
     }
@@ -284,12 +335,21 @@
     // Configure lhs_info and rhs_info
     std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
 
-    _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+    ICLTensor *reshaped_rhs = &_tmp_b;
+    if(_weights_manager && _weights_manager->are_weights_managed(b))
+    {
+        _reshape_rhs_kernel_managed.configure(b, rhs_info);
+        reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
+    }
+    else
+    {
+        _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+    }
 
     // Configure and tune matrix multiply kernel
-    _mm_reshaped_only_rhs_kernel.configure(a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+    _mm_reshaped_only_rhs_kernel.configure(a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
 
-    if(!_reshape_b_only_on_first_run)
+    if(!_reshape_b_only_on_first_run && use_mm_b)
     {
         _tmp_b.allocator()->allocate();
     }
@@ -591,7 +651,14 @@
             if(!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
-                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
+                {
+                    _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed);
+                }
+                else
+                {
+                    CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                }
             }
 
             CLScheduler::get().enqueue(_mm_kernel, true);
@@ -605,7 +672,14 @@
             if(!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
-                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
+                {
+                    _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed);
+                }
+                else
+                {
+                    CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                }
             }
 
             CLScheduler::get().enqueue(_mm_reshaped_kernel, true);
@@ -616,7 +690,14 @@
             if(!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
-                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
+                {
+                    _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed);
+                }
+                else
+                {
+                    CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                }
             }
 
             CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, true);
@@ -635,10 +716,17 @@
     {
         if(_gemm_type != GEMMType::NATIVE && _reshape_b_only_on_first_run)
         {
-            // Run transpose kernel and mark original weights tensor as unused
-            _tmp_b.allocator()->allocate();
-            CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
-            _original_b->mark_as_unused();
+            if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
+            {
+                _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed);
+            }
+            else
+            {
+                // Run transpose kernel and mark original weights tensor as unused
+                _tmp_b.allocator()->allocate();
+                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                _original_b->mark_as_unused();
+            }
         }
         CLScheduler::get().queue().finish();
         _is_prepared = true;