arm_compute v17.06
diff --git a/tests/benchmark/CL/ActivationLayer.cpp b/tests/benchmark/CL/ActivationLayer.cpp
new file mode 100644
index 0000000..5180d3d
--- /dev/null
+++ b/tests/benchmark/CL/ActivationLayer.cpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/ActivationLayer.h"
+
+namespace
+{
+using ActivationLayerAlexNet   = ActivationLayer<AlexNetActivationLayerDataset, CLTensor, CLAccessor, CLActivationLayer>;
+using ActivationLayerLeNet5    = ActivationLayer<LeNet5ActivationLayerDataset, CLTensor, CLAccessor, CLActivationLayer>;
+using ActivationLayerGoogLeNet = ActivationLayer<GoogLeNetActivationLayerDataset, CLTensor, CLAccessor, CLActivationLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(ActivationLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        act_layer.run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 4, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ActivationLayerLeNet5, cl_lenet5)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        act_layer.run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ActivationLayerDataset, 0, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ActivationLayerGoogLeNet, cl_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        act_layer.run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 16, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 17, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 18, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 19, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 20, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 21, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 22, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 23, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 24, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 25, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 26, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 27, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 28, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 29, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 30, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 31, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 32, 1, 4, 8>);
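The per-network fixture pulled in above via "benchmark/common/ActivationLayer.h" is not part of this hunk. To make the registrations easier to follow, here is a minimal sketch of what such a fixture plausibly looks like, modelled on the GEMM fixture added later in this patch (tests/benchmark/CL/GEMM.h). The dataset item member names (shape, info) and the F32 data type are assumptions, not the actual contents of the shared header.

/* Hypothetical sketch -- the real fixture lives in tests/benchmark/common/ActivationLayer.h
 * and is not shown in this patch. Member names marked "assumed" are illustrative only. */
#include "TensorLibrary.h"
#include "Utils.h"
#include "benchmark/Profiler.h"
#include "benchmark/WallClockTimer.h"

#include "arm_compute/core/Types.h"

#include "benchmark/benchmark_api.h"

#include <memory>

using namespace arm_compute;
using namespace arm_compute::test;
using namespace arm_compute::test::benchmark;

template <typename DataSet, typename TensorType, typename Accessor, typename Function>
class ActivationLayer : public ::benchmark::Fixture
{
public:
    void SetUp(::benchmark::State &state) override
    {
        ::benchmark::Fixture::SetUp(state);

        profiler.add(std::make_shared<WallClockTimer>());

        // state.range(0) selects the dataset entry (see DataSetArgBatched above);
        // state.range(1) is the batch size (1, 4 or 8).
        const auto act_obj = *(DataSet().begin() + state.range(0));

        TensorShape shape = act_obj.shape;                  // assumed member name
        shape.set(shape.num_dimensions(), state.range(1));  // append batch dimension

        // Create, configure and allocate input/output tensors
        src = create_tensor(shape, DataType::F32);
        dst = create_tensor(shape, DataType::F32);
        act_layer.configure(&src, &dst, act_obj.info);      // assumed member name
        src.allocator()->allocate();
        dst.allocator()->allocate();
    }

    void TearDown(::benchmark::State &state) override
    {
        profiler.submit(state);

        ::benchmark::Fixture::TearDown(state);
    }

    Function act_layer{};   // matches act_layer.run() in the benchmark bodies above
    Profiler profiler{};

private:
    TensorType src{};
    TensorType dst{};
};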
diff --git a/tests/benchmark/CL/BitwiseAnd.cpp b/tests/benchmark/CL/BitwiseAnd.cpp
new file mode 100644
index 0000000..a3deb3e
--- /dev/null
+++ b/tests/benchmark/CL/BitwiseAnd.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+namespace
+{
+template <typename DataSet>
+class BitwiseAnd : public ::benchmark::Fixture
+{
+public:
+    void SetUp(::benchmark::State &state) override
+    {
+        ::benchmark::Fixture::SetUp(state);
+
+        profiler.add(std::make_shared<WallClockTimer>());
+
+        const std::string image_name = *(DataSet().begin() + state.range(0));
+
+        // Create tensors
+        src1 = create_tensor(image_name, DataType::U8);
+        src2 = create_tensor(image_name, DataType::U8);
+        dst  = create_tensor(image_name, DataType::U8);
+
+        // Create and configure function
+        band.configure(&src1, &src2, &dst);
+
+        // Allocate tensors
+        src1.allocator()->allocate();
+        src2.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        // Fill source tensors
+        library->fill(CLAccessor(src1), image_name, Channel::R);
+        library->fill(CLAccessor(src2), image_name, Channel::G);
+    }
+
+    void TearDown(::benchmark::State &state) override
+    {
+        profiler.submit(state);
+
+        ::benchmark::Fixture::TearDown(state);
+    }
+
+    CLBitwiseAnd band{};
+    Profiler     profiler{};
+
+private:
+    CLTensor src1{};
+    CLTensor src2{};
+    CLTensor dst{};
+};
+
+using BitwiseAndSmall = BitwiseAnd<SmallImages>;
+using BitwiseAndLarge = BitwiseAnd<LargeImages>;
+} // namespace
+
+BENCHMARK_DEFINE_F(BitwiseAndSmall, cl_bitwise_and)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        band.run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(BitwiseAndSmall, cl_bitwise_and)
+->Threads(1)
+->Apply(DataSetArgs<SmallImages>);
+
+BENCHMARK_DEFINE_F(BitwiseAndLarge, cl_bitwise_and)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        band.run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(BitwiseAndLarge, cl_bitwise_and)
+->Threads(1)
+->Apply(DataSetArgs<LargeImages>);
diff --git a/tests/benchmark/CL/ConvolutionLayer.cpp b/tests/benchmark/CL/ConvolutionLayer.cpp
new file mode 100644
index 0000000..e1f4fab
--- /dev/null
+++ b/tests/benchmark/CL/ConvolutionLayer.cpp
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/ConvolutionLayer.h"
+
+namespace
+{
+using ConvolutionLayerAlexNet    = ConvolutionLayer<AlexNetConvolutionLayerDataset, CLTensor, CLAccessor, CLConvolutionLayer>;
+using ConvolutionLayerLeNet5     = ConvolutionLayer<LeNet5ConvolutionLayerDataset, CLTensor, CLAccessor, CLConvolutionLayer>;
+using ConvolutionLayerGoogLeNet1 = ConvolutionLayer<GoogLeNetConvolutionLayerDataset1, CLTensor, CLAccessor, CLConvolutionLayer>;
+using ConvolutionLayerGoogLeNet2 = ConvolutionLayer<GoogLeNetConvolutionLayerDataset2, CLTensor, CLAccessor, CLConvolutionLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(ConvolutionLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        conv_layer->run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 4, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ConvolutionLayerLeNet5, cl_lenet5)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        conv_layer->run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ConvolutionLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        conv_layer->run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_DEFINE_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        conv_layer->run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 16, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 17, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 18, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 19, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 20, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 21, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 22, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 23, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 24, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 25, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 26, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 27, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 28, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 29, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 30, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 31, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 16, 1, 4, 8>);
diff --git a/tests/benchmark/CL/FullyConnectedLayer.cpp b/tests/benchmark/CL/FullyConnectedLayer.cpp
new file mode 100644
index 0000000..6e8c89f
--- /dev/null
+++ b/tests/benchmark/CL/FullyConnectedLayer.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/FullyConnectedLayer.h"
+
+namespace
+{
+using FullyConnectedLayerAlexNet   = FullyConnectedLayer<AlexNetFullyConnectedLayerDataset, CLTensor, CLAccessor, CLFullyConnectedLayer>;
+using FullyConnectedLayerLeNet5    = FullyConnectedLayer<LeNet5FullyConnectedLayerDataset, CLTensor, CLAccessor, CLFullyConnectedLayer>;
+using FullyConnectedLayerGoogLeNet = FullyConnectedLayer<GoogLeNetFullyConnectedLayerDataset, CLTensor, CLAccessor, CLFullyConnectedLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        fc_layer->run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 2, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerLeNet5, cl_lenet5)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        fc_layer->run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5FullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5FullyConnectedLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerGoogLeNet, cl_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        fc_layer->run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
diff --git a/tests/benchmark/CL/GEMM.cpp b/tests/benchmark/CL/GEMM.cpp
new file mode 100644
index 0000000..b90556d
--- /dev/null
+++ b/tests/benchmark/CL/GEMM.cpp
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/CL/GEMM.h"
+
+namespace
+{
+using GEMMFP16GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, CLTensor, CLAccessor, CLGEMM, DataType::F16>;
+using GEMMFP16GoogLeNet2 = GEMM<GoogLeNetGEMMDataset2, CLTensor, CLAccessor, CLGEMM, DataType::F16>;
+using GEMMFP32GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, CLTensor, CLAccessor, CLGEMM, DataType::F32>;
+using GEMMFP32GoogLeNet2 = GEMM<GoogLeNetGEMMDataset2, CLTensor, CLAccessor, CLGEMM, DataType::F32>;
+} // namespace
+
+BENCHMARK_DEFINE_F(GEMMFP16GoogLeNet1, cl_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        gemm_layer->run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_DEFINE_F(GEMMFP16GoogLeNet2, cl_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        gemm_layer->run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
+
+BENCHMARK_DEFINE_F(GEMMFP32GoogLeNet1, cl_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        gemm_layer->run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_DEFINE_F(GEMMFP32GoogLeNet2, cl_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        gemm_layer->run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
diff --git a/tests/benchmark/CL/GEMM.h b/tests/benchmark/CL/GEMM.h
new file mode 100644
index 0000000..25f920f
--- /dev/null
+++ b/tests/benchmark/CL/GEMM.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_CL_GEMM_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_CL_GEMM_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/GEMMDataset.h"
+
+#include <memory>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType data_type>
+class GEMM : public ::benchmark::Fixture
+{
+public:
+    void SetUp(::benchmark::State &state) override
+    {
+        ARM_COMPUTE_ERROR_ON_MSG(data_type != DataType::F16 && data_type != DataType::F32, "Unsupported data type for GEMM operation");
+
+        profiler.add(std::make_shared<WallClockTimer>());
+
+        const GEMMDataObject gemm_obj = *(DataSet().begin() + state.range(0));
+
+        TensorShape shape_a = gemm_obj.shape_a;
+        TensorShape shape_b = gemm_obj.shape_b;
+        TensorShape shape_c = gemm_obj.shape_c;
+        TensorShape shape_d = gemm_obj.shape_d;
+
+        // Create tensors
+        a = create_tensor(shape_a, data_type);
+        b = create_tensor(shape_b, data_type);
+        c = create_tensor(shape_c, data_type);
+        d = create_tensor(shape_d, data_type);
+
+        // Create and configure function
+        gemm_layer = std::unique_ptr<Function>(new Function());
+        gemm_layer->configure(&a, &b, &c, &d, gemm_obj.alpha, gemm_obj.beta);
+
+        // Allocate tensors
+        a.allocator()->allocate();
+        b.allocator()->allocate();
+        c.allocator()->allocate();
+        d.allocator()->allocate();
+    }
+
+    void TearDown(::benchmark::State &state) override
+    {
+        gemm_layer.reset();
+
+        a.allocator()->free();
+        b.allocator()->free();
+        c.allocator()->free();
+        d.allocator()->free();
+
+        profiler.submit(state);
+    }
+
+    std::unique_ptr<Function> gemm_layer{ nullptr };
+    Profiler                  profiler{};
+
+private:
+    TensorType a{};
+    TensorType b{};
+    TensorType c{};
+    TensorType d{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_CL_GEMM_H__
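The files in this patch only define and register fixtures; the Google Benchmark entry point that creates the OpenCL context and actually runs them lives elsewhere in the test suite and is not included here. A minimal, hypothetical driver for these registrations is sketched below: the CLScheduler::default_init() call and the benchmark_api.h interface match what the benchmarks rely on, but the suite's real main() (which, among other things, also constructs the global TensorLibrary "library" used by the fixtures) may differ.

/* Hypothetical driver sketch -- the benchmark suite's actual main() is not part of this patch. */
#include "arm_compute/runtime/CL/CLScheduler.h"

#include "benchmark/benchmark_api.h"

int main(int argc, char **argv)
{
    // Create a default OpenCL context and command queue so the CL functions
    // configured in the fixtures have a scheduler to enqueue kernels on.
    arm_compute::CLScheduler::get().default_init();

    // NOTE: the real entry point must also initialise the global TensorLibrary
    // ("library" in Globals.h) that fixtures such as BitwiseAnd use to fill
    // their input tensors; that step is omitted in this sketch.

    // Parse the --benchmark_* options (e.g. --benchmark_filter=cl_googlenet)
    // and run every case registered with BENCHMARK_REGISTER_F above.
    ::benchmark::Initialize(&argc, argv);
    ::benchmark::RunSpecifiedBenchmarks();

    return 0;
}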
diff --git a/tests/benchmark/CL/NormalizationLayer.cpp b/tests/benchmark/CL/NormalizationLayer.cpp
new file mode 100644
index 0000000..81d3c65
--- /dev/null
+++ b/tests/benchmark/CL/NormalizationLayer.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/NormalizationLayer.h"
+
+namespace
+{
+using NormalizationLayerAlexNet   = NormalizationLayer<AlexNetNormalizationLayerDataset, CLTensor, CLAccessor, CLNormalizationLayer>;
+using NormalizationLayerGoogLeNet = NormalizationLayer<GoogLeNetNormalizationLayerDataset, CLTensor, CLAccessor, CLNormalizationLayer>;
+
+} // namespace
+
+BENCHMARK_DEFINE_F(NormalizationLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        norm_layer->run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(NormalizationLayerGoogLeNet, cl_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        norm_layer->run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(NormalizationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetNormalizationLayerDataset, 1, 1, 4, 8>);
diff --git a/tests/benchmark/CL/PoolingLayer.cpp b/tests/benchmark/CL/PoolingLayer.cpp
new file mode 100644
index 0000000..d8a8e45
--- /dev/null
+++ b/tests/benchmark/CL/PoolingLayer.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/PoolingLayer.h"
+
+namespace
+{
+using PoolingLayerAlexNet   = PoolingLayer<AlexNetPoolingLayerDataset, CLTensor, CLAccessor, CLPoolingLayer>;
+using PoolingLayerLeNet5    = PoolingLayer<LeNet5PoolingLayerDataset, CLTensor, CLAccessor, CLPoolingLayer>;
+using PoolingLayerGoogLeNet = PoolingLayer<GoogLeNetPoolingLayerDataset, CLTensor, CLAccessor, CLPoolingLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(PoolingLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        pool_layer.run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 2, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(PoolingLayerLeNet5, cl_lenet5)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        pool_layer.run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5PoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5PoolingLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(PoolingLayerGoogLeNet, cl_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        pool_layer.run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 9, 1, 4, 8>);
diff --git a/tests/benchmark/Datasets.h b/tests/benchmark/Datasets.h
new file mode 100644
index 0000000..e7bfb6f
--- /dev/null
+++ b/tests/benchmark/Datasets.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_DATASETS_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_DATASETS_H__
+
+#include "dataset/ActivationLayerDataset.h"
+#include "dataset/BorderModeDataset.h"
+#include "dataset/ConvolutionLayerDataset.h"
+#include "dataset/DataTypeDatasets.h"
+#include "dataset/FullyConnectedLayerDataset.h"
+#include "dataset/GEMMDataset.h"
+#include "dataset/ImageDatasets.h"
+#include "dataset/InterpolationPolicyDataset.h"
+#include "dataset/NormalizationLayerDataset.h"
+#include "dataset/PoolingLayerDataset.h"
+#include "dataset/ShapeDatasets.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <array>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
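+/** Register the N-th entry of @p DataSet as the benchmark argument and use the
+ *  entry's string representation as the argument name. */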
+template <typename DataSet, int N>
+void DataSetArg(::benchmark::internal::Benchmark *b)
+{
+    b->Arg(N);
+    b->ArgName(std::string(*(DataSet().begin() + N)));
+}
+
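+/** Register the N-th entry of @p DataSet once per batch size in @p Args, passing
+ *  {N, batch_size} as benchmark arguments and naming them after the entry and "batch_size". */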
+template <typename DataSet, int N, unsigned int... Args>
+void DataSetArgBatched(::benchmark::internal::Benchmark *b)
+{
+    constexpr std::array<unsigned int, sizeof...(Args)> batches{ { Args... } };
+    for(const auto &el : batches)
+    {
+        b->Args({ N, static_cast<int>(el) });
+    }
+    b->ArgNames({ std::string(*(DataSet().begin() + N)), "batch_size" });
+}
+
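+/** Register every entry of @p DataSet as a separate benchmark argument, named after the entry. */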
+template <typename DataSet>
+void DataSetArgs(::benchmark::internal::Benchmark *b)
+{
+    for(size_t i = 0; i < DataSet().size(); ++i)
+    {
+        b->Arg(i);
+        b->ArgName(*(DataSet().begin() + i));
+    }
+}
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_BENCHMARK_DATASETS_H__ */
diff --git a/tests/benchmark/Instrument.h b/tests/benchmark/Instrument.h
new file mode 100644
index 0000000..39b0088
--- /dev/null
+++ b/tests/benchmark/Instrument.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_INSTRUMENT_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_INSTRUMENT_H__
+
+#include "Utils.h"
+
+#include <memory>
+#include <string>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+/** Interface for classes that can be used to measure performance. */
+class Instrument
+{
+public:
+    /** Interface defining a measurement, e.g. time, cycles, ... */
+    class IMeasurement
+    {
+    public:
+        IMeasurement()                     = default;
+        IMeasurement(const IMeasurement &) = default;
+        IMeasurement(IMeasurement &&)      = default;
+        IMeasurement &operator=(const IMeasurement &) = default;
+        IMeasurement &operator=(IMeasurement &&) = default;
+        virtual ~IMeasurement()                  = default;
+
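+        /** Convert the measurement to a double value. */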
+        virtual operator double() const = 0;
+    };
+
+    /** Implementation of a Measurement class for arithmetic types. */
+    template <typename T>
+    class Measurement : public IMeasurement
+    {
+    public:
+        /** Store the given value as the measurement.
+         *
+         * @param[in] value Measured value.
+         */
+        Measurement(T value);
+
+        operator double() const override;
+
+    private:
+        T _value;
+    };
+
+    Instrument()                   = default;
+    Instrument(const Instrument &) = default;
+    Instrument(Instrument &&)      = default;
+    Instrument &operator=(const Instrument &) = default;
+    Instrument &operator=(Instrument &&) = default;
+    virtual ~Instrument()                = default;
+
+    /** Identifier for the instrument. */
+    virtual std::string id() const = 0;
+
+    /** Start measuring. */
+    virtual void start() = 0;
+
+    /** Stop measuring. */
+    virtual void stop() = 0;
+
+    /** Return the latest measurement. */
+    virtual std::unique_ptr<IMeasurement> get_measurement() const = 0;
+};
+
+template <typename T>
+Instrument::Measurement<T>::Measurement(T value)
+    : _value{ value }
+{
+}
+
+template <typename T>
+Instrument::Measurement<T>::operator double() const
+{
+    return _value;
+}
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_BENCHMARK_INSTRUMENT_H__ */
diff --git a/tests/benchmark/NEON/ActivationLayer.cpp b/tests/benchmark/NEON/ActivationLayer.cpp
new file mode 100644
index 0000000..8faed9f
--- /dev/null
+++ b/tests/benchmark/NEON/ActivationLayer.cpp
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/ActivationLayer.h"
+
+namespace
+{
+using ActivationLayerAlexNetF32 = ActivationLayer<AlexNetActivationLayerDataset, Tensor, NEAccessor, NEActivationLayer>;
+using ActivationLayerAlexNetQS8 = ActivationLayer<AlexNetActivationLayerDataset, Tensor, NEAccessor, NEActivationLayer, DataType::QS8>;
+using ActivationLayerLeNet5     = ActivationLayer<LeNet5ActivationLayerDataset, Tensor, NEAccessor, NEActivationLayer, DataType::F32>;
+using ActivationLayerGoogLeNet  = ActivationLayer<GoogLeNetActivationLayerDataset, Tensor, NEAccessor, NEActivationLayer, DataType::F32>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(ActivationLayerAlexNetF32, neon_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        act_layer.run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 4, 1, 4, 8>);
+
+// QS8
+BENCHMARK_DEFINE_F(ActivationLayerAlexNetQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        act_layer.run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 4, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ActivationLayerLeNet5, neon_lenet5)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        act_layer.run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ActivationLayerDataset, 0, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ActivationLayerGoogLeNet, neon_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        act_layer.run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 16, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 17, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 18, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 19, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 20, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 21, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 22, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 23, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 24, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 25, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 26, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 27, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 28, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 29, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 30, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 31, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 32, 1, 4, 8>);
diff --git a/tests/benchmark/NEON/BitwiseAnd.cpp b/tests/benchmark/NEON/BitwiseAnd.cpp
new file mode 100644
index 0000000..dba3d1e
--- /dev/null
+++ b/tests/benchmark/NEON/BitwiseAnd.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+namespace
+{
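+/* Fixture that creates, configures and fills two U8 source tensors and a destination
+ * tensor for the image selected by the benchmark argument, so that the benchmark loop
+ * only measures NEBitwiseAnd::run(). */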
+template <typename DataSet>
+class BitwiseAnd : public ::benchmark::Fixture
+{
+public:
+    void SetUp(::benchmark::State &state) override
+    {
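+        // Measure wall clock time for each run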
+        profiler.add(std::make_shared<WallClockTimer>());
+
+        const std::string image_name = *(DataSet().begin() + state.range(0));
+
+        // Create tensors
+        src1 = create_tensor(image_name, DataType::U8);
+        src2 = create_tensor(image_name, DataType::U8);
+        dst  = create_tensor(image_name, DataType::U8);
+
+        // Create and configure function
+        band.configure(&src1, &src2, &dst);
+
+        // Allocate tensors
+        src1.allocator()->allocate();
+        src2.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        // Fill source tensors
+        library->fill(NEAccessor(src1), image_name, Channel::R);
+        library->fill(NEAccessor(src2), image_name, Channel::G);
+    }
+
+    void TearDown(::benchmark::State &state) override
+    {
+        profiler.submit(state);
+    }
+
+    NEBitwiseAnd band{};
+    Profiler     profiler{};
+
+private:
+    Tensor src1{};
+    Tensor src2{};
+    Tensor dst{};
+};
+
+using BitwiseAndSmall = BitwiseAnd<SmallImages>;
+using BitwiseAndLarge = BitwiseAnd<LargeImages>;
+} // namespace
+
+BENCHMARK_DEFINE_F(BitwiseAndSmall, neon_bitwise_and)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        band.run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(BitwiseAndSmall, neon_bitwise_and)
+->Threads(1)
+->Apply(DataSetArgs<SmallImages>);
+
+BENCHMARK_DEFINE_F(BitwiseAndLarge, neon_bitwise_and)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        band.run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(BitwiseAndLarge, neon_bitwise_and)
+->Threads(1)
+->Apply(DataSetArgs<LargeImages>);
diff --git a/tests/benchmark/NEON/ConvolutionLayer.cpp b/tests/benchmark/NEON/ConvolutionLayer.cpp
new file mode 100644
index 0000000..0cfff84
--- /dev/null
+++ b/tests/benchmark/NEON/ConvolutionLayer.cpp
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/ConvolutionLayer.h"
+
+namespace
+{
+using ConvolutionLayerAlexNetF32 = ConvolutionLayer<AlexNetConvolutionLayerDataset, Tensor, NEAccessor, NEConvolutionLayer>;
+using ConvolutionLayerAlexNetQS8 = ConvolutionLayer<AlexNetConvolutionLayerDataset, Tensor, NEAccessor, NEConvolutionLayer, DataType::QS8>;
+using ConvolutionLayerLeNet5     = ConvolutionLayer<LeNet5ConvolutionLayerDataset, Tensor, NEAccessor, NEConvolutionLayer>;
+using ConvolutionLayerGoogLeNet1 = ConvolutionLayer<GoogLeNetConvolutionLayerDataset1, Tensor, NEAccessor, NEConvolutionLayer>;
+using ConvolutionLayerGoogLeNet2 = ConvolutionLayer<GoogLeNetConvolutionLayerDataset2, Tensor, NEAccessor, NEConvolutionLayer>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        conv_layer->run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 4, 1, 4, 8>);
+
+// QS8
+BENCHMARK_DEFINE_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        conv_layer->run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 4, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ConvolutionLayerLeNet5, neon_lenet5)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        conv_layer->run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ConvolutionLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        conv_layer->run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_DEFINE_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        conv_layer->run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 16, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 17, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 18, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 19, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 20, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 21, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 22, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 23, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 24, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 25, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 26, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 27, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 28, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 29, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 30, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 31, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 16, 1, 4, 8>);
diff --git a/tests/benchmark/NEON/ConvolutionLayerDirect.cpp b/tests/benchmark/NEON/ConvolutionLayerDirect.cpp
new file mode 100644
index 0000000..bc56e84
--- /dev/null
+++ b/tests/benchmark/NEON/ConvolutionLayerDirect.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+#include "dataset/ConvolutionLayerDataset.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/ConvolutionLayer.h"
+
+namespace
+{
+using ConvolutionLayerDirectAlexNet = ConvolutionLayer<AlexNetConvolutionLayerDataset, Tensor, NEAccessor, NEDirectConvolutionLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(ConvolutionLayerDirectAlexNet, neon_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        conv_layer->run();
+        profiler.stop();
+    }
+}
+
+// Register only the 3x3 convolution layers
+BENCHMARK_REGISTER_F(ConvolutionLayerDirectAlexNet, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerDirectAlexNet, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerDirectAlexNet, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 4, 1, 4, 8>);
diff --git a/tests/benchmark/NEON/FullyConnectedLayer.cpp b/tests/benchmark/NEON/FullyConnectedLayer.cpp
new file mode 100644
index 0000000..8597920
--- /dev/null
+++ b/tests/benchmark/NEON/FullyConnectedLayer.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/FullyConnectedLayer.h"
+
+namespace
+{
+using FullyConnectedLayerAlexNetF32 = FullyConnectedLayer<AlexNetFullyConnectedLayerDataset, Tensor, NEAccessor, NEFullyConnectedLayer>;
+using FullyConnectedLayerAlexNetQS8 = FullyConnectedLayer<AlexNetFullyConnectedLayerDataset, Tensor, NEAccessor, NEFullyConnectedLayer, DataType::QS8>;
+using FullyConnectedLayerLeNet5     = FullyConnectedLayer<LeNet5FullyConnectedLayerDataset, Tensor, NEAccessor, NEFullyConnectedLayer>;
+using FullyConnectedLayerGoogLeNet  = FullyConnectedLayer<GoogLeNetFullyConnectedLayerDataset, Tensor, NEAccessor, NEFullyConnectedLayer>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(FullyConnectedLayerAlexNetF32, neon_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        fc_layer->run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 2, 1, 4, 8>);
+
+// QS8
+BENCHMARK_DEFINE_F(FullyConnectedLayerAlexNetQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        fc_layer->run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 2, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerLeNet5, neon_lenet5)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        fc_layer->run();
+        profiler.stop();
+    }
+}
+BENCHMARK_REGISTER_F(FullyConnectedLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5FullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5FullyConnectedLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerGoogLeNet, neon_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        fc_layer->run();
+        profiler.stop();
+    }
+}
+BENCHMARK_REGISTER_F(FullyConnectedLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
diff --git a/tests/benchmark/NEON/GEMM.cpp b/tests/benchmark/NEON/GEMM.cpp
new file mode 100644
index 0000000..9190309
--- /dev/null
+++ b/tests/benchmark/NEON/GEMM.cpp
@@ -0,0 +1,709 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/NEON/GEMM.h"
+
+namespace
+{
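+// FP16 GEMM benchmarks are only compiled when the library is built with ENABLE_FP16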
+#ifdef ENABLE_FP16
+using GEMMFP16GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, Tensor, NEAccessor, NEGEMM, DataType::F16>;
+using GEMMFP16GoogLeNet2 = GEMM<GoogLeNetGEMMDataset2, Tensor, NEAccessor, NEGEMM, DataType::F16>;
+#endif /* ENABLE_FP16 */
+using GEMMFP32GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, Tensor, NEAccessor, NEGEMM, DataType::F32>;
+using GEMMFP32GoogLeNet2 = GEMM<GoogLeNetGEMMDataset2, Tensor, NEAccessor, NEGEMM, DataType::F32>;
+using GEMMQS8GoogLeNet1  = GEMM<GoogLeNetGEMMDataset1, Tensor, NEAccessor, NEGEMM, DataType::QS8>;
+using GEMMQS8GoogLeNet2  = GEMM<GoogLeNetGEMMDataset2, Tensor, NEAccessor, NEGEMM, DataType::QS8>;
+} // namespace
+#ifdef ENABLE_FP16
+BENCHMARK_DEFINE_F(GEMMFP16GoogLeNet1, neon_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        gemm_layer->run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_DEFINE_F(GEMMFP16GoogLeNet2, neon_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        gemm_layer->run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
+#endif /* ENABLE_FP16 */
+
+BENCHMARK_DEFINE_F(GEMMFP32GoogLeNet1, neon_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        gemm_layer->run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_DEFINE_F(GEMMFP32GoogLeNet2, neon_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        gemm_layer->run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
+
+BENCHMARK_DEFINE_F(GEMMQS8GoogLeNet1, neon_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        gemm_layer->run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_DEFINE_F(GEMMQS8GoogLeNet2, neon_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        gemm_layer->run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
diff --git a/tests/benchmark/NEON/GEMM.h b/tests/benchmark/NEON/GEMM.h
new file mode 100644
index 0000000..0e24fe9
--- /dev/null
+++ b/tests/benchmark/NEON/GEMM.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_NEON_GEMM_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_NEON_GEMM_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/GEMMDataset.h"
+
+#include <memory>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType data_type>
+class GEMM : public ::benchmark::Fixture
+{
+public:
+    void SetUp(::benchmark::State &state) override
+    {
+#ifdef ENABLE_FP16
+        ARM_COMPUTE_ERROR_ON_MSG(data_type != DataType::F16 && data_type != DataType::F32 && data_type != DataType::QS8, "Unsupported data type for GEMM operation");
+#else  /* ENABLE_FP16 */
+        ARM_COMPUTE_ERROR_ON_MSG(data_type != DataType::F32 && data_type != DataType::QS8, "Unsupported data type for GEMM operation");
+#endif /* ENABLE_FP16 */
+
+        profiler.add(std::make_shared<WallClockTimer>());
+
+        const GEMMDataObject gemm_obj = *(DataSet().begin() + state.range(0));
+
+        TensorShape shape_a = gemm_obj.shape_a;
+        TensorShape shape_b = gemm_obj.shape_b;
+        TensorShape shape_c = gemm_obj.shape_c;
+        TensorShape shape_d = gemm_obj.shape_d;
+
+        // Create tensors
+        a = create_tensor(shape_a, data_type, 1, 4);
+        b = create_tensor(shape_b, data_type, 1, 4);
+        c = create_tensor(shape_c, data_type, 1, 4);
+        d = create_tensor(shape_d, data_type, 1, 4);
+
+        // Create and configure function
+        gemm_layer = std::unique_ptr<Function>(new Function());
+        gemm_layer->configure(&a, &b, &c, &d, gemm_obj.alpha, gemm_obj.beta);
+
+        // Allocate tensors
+        a.allocator()->allocate();
+        b.allocator()->allocate();
+        c.allocator()->allocate();
+        d.allocator()->allocate();
+    }
+
+    void TearDown(::benchmark::State &state) override
+    {
+        gemm_layer.reset();
+
+        a.allocator()->free();
+        b.allocator()->free();
+        c.allocator()->free();
+        d.allocator()->free();
+
+        profiler.submit(state);
+    }
+
+    std::unique_ptr<Function> gemm_layer{ nullptr };
+    Profiler                  profiler{};
+
+private:
+    TensorType a{};
+    TensorType b{};
+    TensorType c{};
+    TensorType d{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_NEON_GEMM_H__
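Editorial note: the registrations earlier in this patch bind this fixture to a concrete dataset, tensor type, accessor, function and data type through type aliases. The sketch below shows what those aliases and one registration look like; the alias names mirror the ones used in the registrations above, and NEGEMM is the library's NEON GEMM function, but the authoritative definitions live in tests/benchmark/NEON/GEMM.cpp.

// Illustrative only: binding the GEMM fixture above to the NEON backend.
using GEMMFP32GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, Tensor, NEAccessor, NEGEMM, DataType::F32>;
using GEMMQS8GoogLeNet1  = GEMM<GoogLeNetGEMMDataset1, Tensor, NEAccessor, NEGEMM, DataType::QS8>;

// One registration per dataset entry, as in the .cpp above; DataSetArg passes
// the entry index to the fixture via state.range(0).
BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
->Threads(1)
->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);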
diff --git a/tests/benchmark/NEON/NormalizationLayer.cpp b/tests/benchmark/NEON/NormalizationLayer.cpp
new file mode 100644
index 0000000..46dc56b
--- /dev/null
+++ b/tests/benchmark/NEON/NormalizationLayer.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/NormalizationLayer.h"
+
+namespace
+{
+using NormalizationLayerAlexNetF32 = NormalizationLayer<AlexNetNormalizationLayerDataset, Tensor, NEAccessor, NENormalizationLayer>;
+using NormalizationLayerAlexNetQS8 = NormalizationLayer<AlexNetNormalizationLayerDataset, Tensor, NEAccessor, NENormalizationLayer, DataType::QS8>;
+using NormalizationLayerGoogLeNet  = NormalizationLayer<GoogLeNetNormalizationLayerDataset, Tensor, NEAccessor, NENormalizationLayer>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(NormalizationLayerAlexNetF32, neon_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        norm_layer->run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 1, 1, 4, 8>);
+
+// QS8
+BENCHMARK_DEFINE_F(NormalizationLayerAlexNetQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        norm_layer->run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(NormalizationLayerGoogLeNet, neon_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        norm_layer->run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(NormalizationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetNormalizationLayerDataset, 1, 1, 4, 8>);
diff --git a/tests/benchmark/NEON/PoolingLayer.cpp b/tests/benchmark/NEON/PoolingLayer.cpp
new file mode 100644
index 0000000..5a7f883
--- /dev/null
+++ b/tests/benchmark/NEON/PoolingLayer.cpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/PoolingLayer.h"
+
+namespace
+{
+using PoolingLayerAlexNetF32 = PoolingLayer<AlexNetPoolingLayerDataset, Tensor, NEAccessor, NEPoolingLayer>;
+using PoolingLayerAlexNetQS8 = PoolingLayer<AlexNetPoolingLayerDataset, Tensor, NEAccessor, NEPoolingLayer, DataType::QS8>;
+using PoolingLayerLeNet5     = PoolingLayer<LeNet5PoolingLayerDataset, Tensor, NEAccessor, NEPoolingLayer>;
+using PoolingLayerGoogLeNet  = PoolingLayer<GoogLeNetPoolingLayerDataset, Tensor, NEAccessor, NEPoolingLayer>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(PoolingLayerAlexNetF32, neon_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        pool_layer.run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 2, 1, 4, 8>);
+
+// QS8
+BENCHMARK_DEFINE_F(PoolingLayerAlexNetQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        pool_layer.run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 2, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(PoolingLayerLeNet5, neon_lenet5)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        pool_layer.run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5PoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5PoolingLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(PoolingLayerGoogLeNet, neon_googlenet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run function
+        profiler.start();
+        pool_layer.run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 9, 1, 4, 8>);
diff --git a/tests/benchmark/PMUCounter.cpp b/tests/benchmark/PMUCounter.cpp
new file mode 100644
index 0000000..e87dae8
--- /dev/null
+++ b/tests/benchmark/PMUCounter.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "PMUCounter.h"
+
+#include "Utils.h"
+
+#define _GNU_SOURCE 1
+#include <asm/unistd.h>
+#include <csignal>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fcntl.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/perf_event.h>
+#include <stdexcept>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+CycleCounter::CycleCounter()
+{
+    const pid_t pid = getpid();
+
+    struct perf_event_attr perf_config
+    {
+    };
+    memset(&perf_config, 0, sizeof(struct perf_event_attr));
+
+    perf_config.config = PERF_COUNT_HW_CPU_CYCLES;
+    perf_config.size   = sizeof(struct perf_event_attr);
+    perf_config.type   = PERF_TYPE_HARDWARE;
+    // The inherit bit specifies that this counter should count events of child
+    // tasks as well as the task specified
+    perf_config.inherit = 1;
+    // Enables saving of event counts on context switch for inherited tasks
+    perf_config.inherit_stat = 1;
+
+    _fd = syscall(__NR_perf_event_open, &perf_config, pid, -1, -1, 0);
+
+    if(_fd < 0)
+    {
+        throw std::runtime_error("perf_event_open for cycles failed");
+    }
+}
+
+std::string CycleCounter::id() const
+{
+    return "Cycle Counter";
+}
+
+void CycleCounter::start()
+{
+    ioctl(_fd, PERF_EVENT_IOC_RESET, 0);
+    ioctl(_fd, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+void CycleCounter::stop()
+{
+    ioctl(_fd, PERF_EVENT_IOC_DISABLE, 0);
+    read(_fd, &_cycles, sizeof(_cycles));
+}
+
+std::unique_ptr<Instrument::IMeasurement> CycleCounter::get_measurement() const
+{
+    return ::arm_compute::test::cpp14::make_unique<Instrument::Measurement<long long>>(_cycles);
+}
+
+InstructionCounter::InstructionCounter()
+{
+    const pid_t pid = getpid();
+
+    struct perf_event_attr perf_config
+    {
+    };
+    memset(&perf_config, 0, sizeof(struct perf_event_attr));
+
+    perf_config.config = PERF_COUNT_HW_INSTRUCTIONS;
+    perf_config.size   = sizeof(struct perf_event_attr);
+    perf_config.type   = PERF_TYPE_HARDWARE;
+    // The inherit bit specifies that this counter should count events of child
+    // tasks as well as the task specified
+    perf_config.inherit = 1;
+    // Enables saving of event counts on context switch for inherited tasks
+    perf_config.inherit_stat = 1;
+
+    _fd = syscall(__NR_perf_event_open, &perf_config, pid, -1, -1, 0);
+
+    if(_fd < 0)
+    {
+        throw std::runtime_error("perf_event_open for instructions failed");
+    }
+}
+
+std::string InstructionCounter::id() const
+{
+    return "Instruction Counter";
+}
+
+void InstructionCounter::start()
+{
+    ioctl(_fd, PERF_EVENT_IOC_RESET, 0);
+    ioctl(_fd, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+void InstructionCounter::stop()
+{
+    ioctl(_fd, PERF_EVENT_IOC_DISABLE, 0);
+    read(_fd, &_instructions, sizeof(_instructions));
+}
+
+std::unique_ptr<Instrument::IMeasurement> InstructionCounter::get_measurement() const
+{
+    return std::unique_ptr<Instrument::IMeasurement>(new Instrument::Measurement<long long>(_instructions));
+}
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/benchmark/PMUCounter.h b/tests/benchmark/PMUCounter.h
new file mode 100644
index 0000000..de45f31
--- /dev/null
+++ b/tests/benchmark/PMUCounter.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_PMU_COUNTER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_PMU_COUNTER_H__
+
+#include "Instrument.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+/** Implementation of an instrument to count CPU cycles. */
+class CycleCounter : public Instrument
+{
+public:
+    /** Initialise the cycle counter. */
+    CycleCounter();
+
+    std::string                               id() const override;
+    void                                      start() override;
+    void                                      stop() override;
+    std::unique_ptr<Instrument::IMeasurement> get_measurement() const override;
+
+private:
+    long      _fd{ -1 };
+    long long _cycles{ 0 };
+};
+
+/** Implementation of an instrument to count executed CPU instructions. */
+class InstructionCounter : public Instrument
+{
+public:
+    /** Initialise the instruction counter. */
+    InstructionCounter();
+
+    std::string                               id() const override;
+    void                                      start() override;
+    void                                      stop() override;
+    std::unique_ptr<Instrument::IMeasurement> get_measurement() const override;
+
+private:
+    long      _fd{ -1 };
+    long long _instructions{ 0 };
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif
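Editorial note: both PMU instruments plug into the same Profiler as the wall-clock timer, so a fixture only has to add them in SetUp(). A minimal sketch, assuming the fixture already owns a Profiler member named profiler as in the GEMM fixture above:

// Illustrative only: counters are added next to the wall-clock timer;
// Profiler::start()/stop() then drive all registered instruments together.
profiler.add(std::make_shared<WallClockTimer>());
profiler.add(std::make_shared<CycleCounter>());       // CPU cycles via perf_event
profiler.add(std::make_shared<InstructionCounter>()); // retired instructions via perf_event

Note that both constructors throw std::runtime_error when perf_event_open fails, for example when the kernel restricts access to hardware performance counters.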
diff --git a/tests/benchmark/PerformanceProgramOptions.cpp b/tests/benchmark/PerformanceProgramOptions.cpp
new file mode 100644
index 0000000..b4becc3
--- /dev/null
+++ b/tests/benchmark/PerformanceProgramOptions.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "PerformanceProgramOptions.h"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Weffc++"
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#pragma GCC diagnostic ignored "-Wctor-dtor-privacy"
+#include "boost/program_options.hpp"
+#pragma GCC diagnostic pop
+
+namespace arm_compute
+{
+namespace test
+{
+namespace performance
+{
+PerformanceProgramOptions::PerformanceProgramOptions()
+{
+    boost::program_options::options_description options("Performance options");
+    options.add_options()("runs", boost::program_options::value<unsigned int>()->default_value(1), "Repetitions per test");
+    options.add_options()("threads", boost::program_options::value<unsigned int>()->default_value(1), "Number of parallel CPU threads");
+    add_options(options);
+}
+} // namespace performance
+} // namespace test
+} // namespace arm_compute
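Editorial note: the two options defined here reach the benchmarks in two steps: a ProgramOptions parser collects the command line and PerformanceUserConfiguration (below) copies the values into typed Option fields. A rough sketch of that wiring; the parse method name is an assumption, only ProgramOptions::get is shown in this patch.

// Illustrative sketch; parse_commandline is an assumed entry point on ProgramOptions.
PerformanceProgramOptions options;
options.parse_commandline(argc, argv);                      // fills "runs" and "threads"
performance::PerformanceUserConfiguration config(options);  // copies "runs" into config.runs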
diff --git a/tests/benchmark/PerformanceProgramOptions.h b/tests/benchmark/PerformanceProgramOptions.h
new file mode 100644
index 0000000..671e263
--- /dev/null
+++ b/tests/benchmark/PerformanceProgramOptions.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_PERFORMANCE_PROGRAM_OPTIONS_H__
+#define __ARM_COMPUTE_TEST_PERFORMANCE_PROGRAM_OPTIONS_H__
+
+#include "ProgramOptions.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace performance
+{
+/** Subclass of @ref ProgramOptions that adds performance-specific options. */
+class PerformanceProgramOptions : public ProgramOptions
+{
+public:
+    /** Defines additional options. */
+    PerformanceProgramOptions();
+};
+} // namespace performance
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/benchmark/PerformanceUserConfiguration.cpp b/tests/benchmark/PerformanceUserConfiguration.cpp
new file mode 100644
index 0000000..ca412d6
--- /dev/null
+++ b/tests/benchmark/PerformanceUserConfiguration.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "PerformanceUserConfiguration.h"
+
+#include "ProgramOptions.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace performance
+{
+PerformanceUserConfiguration::PerformanceUserConfiguration(const ProgramOptions &options)
+    : UserConfiguration(options)
+{
+    unsigned int tmp_runs = 0;
+    if(options.get("runs", tmp_runs))
+    {
+        runs = tmp_runs;
+    }
+}
+} // namespace performance
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/benchmark/PerformanceUserConfiguration.h b/tests/benchmark/PerformanceUserConfiguration.h
new file mode 100644
index 0000000..a140d40
--- /dev/null
+++ b/tests/benchmark/PerformanceUserConfiguration.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_PERFORMANCE_PERFORMANCE_USER_CONFIGURATION_H__
+#define __ARM_COMPUTE_TEST_PERFORMANCE_PERFORMANCE_USER_CONFIGURATION_H__
+
+#include "UserConfiguration.h"
+
+namespace arm_compute
+{
+namespace test
+{
+class ProgramOptions;
+
+namespace performance
+{
+/** Specialisation of @ref UserConfiguration to provide performance-specific
+ * configuration options.
+ */
+struct PerformanceUserConfiguration : public UserConfiguration
+{
+    PerformanceUserConfiguration() = default;
+
+    /** Initialise the configuration according to the program options.
+     *
+     * @param[in] options Parsed command line options.
+     */
+    PerformanceUserConfiguration(const ProgramOptions &options);
+
+    Option<unsigned int> runs{};
+};
+} // namespace performance
+
+extern performance::PerformanceUserConfiguration user_config;
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/benchmark/Profiler.cpp b/tests/benchmark/Profiler.cpp
new file mode 100644
index 0000000..f3ce941
--- /dev/null
+++ b/tests/benchmark/Profiler.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Profiler.h"
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <utility>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+void Profiler::add(const std::shared_ptr<Instrument> &instrument)
+{
+    _instruments.push_back(instrument);
+}
+
+void Profiler::start()
+{
+    for(auto &instrument : _instruments)
+    {
+        instrument->start();
+    }
+}
+
+void Profiler::stop()
+{
+    for(auto &instrument : _instruments)
+    {
+        instrument->stop();
+    }
+
+    for(const auto &instrument : _instruments)
+    {
+        _measurements[instrument->id()].push_back(*instrument->get_measurement());
+    }
+}
+
+void Profiler::submit(::benchmark::State &state)
+{
+    for(auto &instrument : _measurements)
+    {
+        double sum_values = std::accumulate(instrument.second.begin(), instrument.second.end(), 0.);
+        size_t num_values = instrument.second.size();
+
+        if(num_values > 2)
+        {
+            auto minmax_values                        = std::minmax_element(instrument.second.begin(), instrument.second.end());
+            state.counters[instrument.first + "_min"] = *minmax_values.first;
+            state.counters[instrument.first + "_max"] = *minmax_values.second;
+            sum_values -= *minmax_values.first + *minmax_values.second;
+            num_values -= 2;
+        }
+        state.counters[instrument.first] = sum_values / num_values;
+        instrument.second.clear();
+    }
+}
+
+const Profiler::MeasurementsMap &Profiler::measurements() const
+{
+    return _measurements;
+}
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
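Editorial note: submit() reports a trimmed average per instrument: when more than two samples were collected, the single smallest and largest values are exported as "_min"/"_max" counters and excluded from the mean. A standalone illustration of that arithmetic:

// Standalone illustration of the trimming performed in Profiler::submit().
#include <algorithm>
#include <numeric>
#include <vector>

double trimmed_mean(const std::vector<double> &samples)
{
    double sum = std::accumulate(samples.begin(), samples.end(), 0.);
    size_t n   = samples.size();
    if(n > 2)
    {
        auto mm = std::minmax_element(samples.begin(), samples.end());
        sum -= *mm.first + *mm.second; // drop the single smallest and largest sample
        n -= 2;
    }
    return sum / n; // e.g. {9, 10, 12, 50} -> (10 + 12) / 2 = 11
}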
diff --git a/tests/benchmark/Profiler.h b/tests/benchmark/Profiler.h
new file mode 100644
index 0000000..03922f4
--- /dev/null
+++ b/tests/benchmark/Profiler.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_PROFILER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_PROFILER_H__
+
+#include "Instrument.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+class Profiler
+{
+public:
+    /** Mapping from instrument ids to their measurements. */
+    using MeasurementsMap = std::map<std::string, std::vector<double>>;
+
+    /** Add @p instrument to the performance monitor.
+     *
+     * All added instruments will be used when @ref start or @ref stop is
+     * called to make measurements.
+     *
+     * @param[in] instrument Instrument to be used to measure performance.
+     */
+    void add(const std::shared_ptr<Instrument> &instrument);
+
+    /** Start all added instruments to measure performance. */
+    void start();
+
+    /** Stop all added instruments. */
+    void stop();
+
+    /** Commit all measured values to the currently active test. */
+    void submit(::benchmark::State &state);
+
+    /** Return measurements for all instruments. */
+    const MeasurementsMap &measurements() const;
+
+private:
+    std::vector<std::shared_ptr<Instrument>> _instruments{};
+    MeasurementsMap                          _measurements{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif
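Editorial note: the intended lifecycle of a Profiler follows the same pattern in every fixture of this patch: add instruments once in SetUp(), bracket each run with start()/stop() in the benchmark body, and hand the results to the benchmark state in TearDown(). A condensed sketch (the fixture name and the function under test are placeholders):

// Condensed from the fixtures in this patch; ExampleFixture is illustrative.
class ExampleFixture : public ::benchmark::Fixture
{
public:
    void SetUp(::benchmark::State &state) override
    {
        profiler.add(std::make_shared<WallClockTimer>());
        // ... create tensors and configure the function under test ...
    }

    void TearDown(::benchmark::State &state) override
    {
        profiler.submit(state); // exports one counter (plus _min/_max) per instrument
    }

    Profiler profiler{};
};

BENCHMARK_DEFINE_F(ExampleFixture, example)
(::benchmark::State &state)
{
    while(state.KeepRunning())
    {
        profiler.start();
        // function.run();
        profiler.stop();
    }
}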
diff --git a/tests/benchmark/WallClockTimer.cpp b/tests/benchmark/WallClockTimer.cpp
new file mode 100644
index 0000000..9ab53d0
--- /dev/null
+++ b/tests/benchmark/WallClockTimer.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "WallClockTimer.h"
+
+#include "Utils.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+std::string WallClockTimer::id() const
+{
+    return "Wall clock";
+}
+
+void WallClockTimer::start()
+{
+    _start = std::chrono::high_resolution_clock::now();
+}
+
+void WallClockTimer::stop()
+{
+    _stop = std::chrono::high_resolution_clock::now();
+}
+
+std::unique_ptr<Instrument::IMeasurement> WallClockTimer::get_measurement() const
+{
+    const std::chrono::duration<float, std::milli> delta = _stop - _start;
+    return ::arm_compute::test::cpp14::make_unique<Instrument::Measurement<float>>(delta.count());
+}
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/benchmark/WallClockTimer.h b/tests/benchmark/WallClockTimer.h
new file mode 100644
index 0000000..cf6828e
--- /dev/null
+++ b/tests/benchmark/WallClockTimer.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_WALL_CLOCK_TIMER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_WALL_CLOCK_TIMER_H__
+
+#include "Instrument.h"
+
+#include <chrono>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+/** Implementation of an instrument to measure elapsed wall-clock time in milliseconds. */
+class WallClockTimer : public Instrument
+{
+public:
+    std::string                               id() const override;
+    void                                      start() override;
+    void                                      stop() override;
+    std::unique_ptr<Instrument::IMeasurement> get_measurement() const override;
+
+private:
+    std::chrono::high_resolution_clock::time_point _start{};
+    std::chrono::high_resolution_clock::time_point _stop{};
+};
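+//
+// In the benchmark fixtures this instrument is driven through a Profiler
+// (illustrative sketch based on the fixtures in tests/benchmark/common):
+//
+//   profiler.add(std::make_shared<WallClockTimer>());
+//   profiler.start();
+//   function.run();
+//   profiler.stop();
+//   profiler.submit(state);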
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/benchmark/common/ActivationLayer.h b/tests/benchmark/common/ActivationLayer.h
new file mode 100644
index 0000000..7edfb6e
--- /dev/null
+++ b/tests/benchmark/common/ActivationLayer.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_ACTIVATION_LAYER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_ACTIVATION_LAYER_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/ActivationLayerDataset.h"
+
+#include <memory>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType dt = DataType::F32>
+class ActivationLayer : public ::benchmark::Fixture
+{
+public:
+    void SetUp(::benchmark::State &state) override
+    {
+        profiler.add(std::make_shared<WallClockTimer>());
+
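+        // state.range(0) selects the dataset entry; state.range(1) provides the batch count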
+        const ActivationLayerDataObject act_obj = *(DataSet().begin() + state.range(0));
+
+        // Set the batch dimension in the source and destination shapes
+        const unsigned int batches              = state.range(1);
+        const unsigned int fixed_point_position = 4;
+        TensorShape        shape                = act_obj.shape;
+        shape.set(shape.num_dimensions(), batches);
+
+        // Create tensors
+        src = create_tensor(shape, dt, 1, fixed_point_position);
+        dst = create_tensor(shape, dt, 1, fixed_point_position);
+
+        // Create and configure function
+        act_layer.configure(&src, &dst, act_obj.info);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        // Fill tensors
+        library->fill_tensor_uniform(Accessor(src), 0);
+    }
+
+    void TearDown(::benchmark::State &state) override
+    {
+        src.allocator()->free();
+        dst.allocator()->free();
+
+        profiler.submit(state);
+    }
+
+    Function act_layer{};
+    Profiler profiler{};
+
+private:
+    TensorType src{};
+    TensorType dst{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_ACTIVATION_LAYER_H__
diff --git a/tests/benchmark/common/ConvolutionLayer.h b/tests/benchmark/common/ConvolutionLayer.h
new file mode 100644
index 0000000..594c62c
--- /dev/null
+++ b/tests/benchmark/common/ConvolutionLayer.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_CONVOLUTION_LAYER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_CONVOLUTION_LAYER_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/ConvolutionLayerDataset.h"
+
+#include <memory>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType dt = DataType::F32>
+class ConvolutionLayer : public ::benchmark::Fixture
+{
+public:
+    void SetUp(::benchmark::State &state) override
+    {
+        profiler.add(std::make_shared<WallClockTimer>());
+
+        const ConvolutionLayerDataObject conv_obj = *(DataSet().begin() + state.range(0));
+
+        // Set the batch dimension in the source and destination shapes
+        const unsigned int batches              = state.range(1);
+        const unsigned int fixed_point_position = 4;
+        TensorShape        src_shape            = conv_obj.src_shape;
+        TensorShape        dst_shape            = conv_obj.dst_shape;
+        src_shape.set(3 /* batch */, batches);
+        dst_shape.set(3 /* batch */, batches);
+
+        // Create tensors
+        src     = create_tensor(src_shape, dt, 1, fixed_point_position);
+        weights = create_tensor(conv_obj.weights_shape, dt, 1, fixed_point_position);
+        bias    = create_tensor(conv_obj.bias_shape, dt, 1, fixed_point_position);
+        dst     = create_tensor(dst_shape, dt, 1, fixed_point_position);
+
+        // Create and configure function
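+        // (the function lives in a unique_ptr so TearDown can destroy it before the tensors are freed)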
+        conv_layer = std::unique_ptr<Function>(new Function());
+        conv_layer->configure(&src, &weights, &bias, &dst, conv_obj.info);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        weights.allocator()->allocate();
+        bias.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        // Fill tensors with random data (the numeric argument keeps the data distinct per tensor)
+        library->fill_tensor_uniform(Accessor(src), 0);
+        library->fill_tensor_uniform(Accessor(weights), 1);
+        library->fill_tensor_uniform(Accessor(bias), 2);
+    }
+
+    void TearDown(::benchmark::State &state) override
+    {
+        conv_layer.reset();
+
+        src.allocator()->free();
+        weights.allocator()->free();
+        bias.allocator()->free();
+        dst.allocator()->free();
+
+        profiler.submit(state);
+    }
+
+    std::unique_ptr<Function> conv_layer{ nullptr };
+    Profiler                  profiler{};
+
+private:
+    TensorType src{};
+    TensorType weights{};
+    TensorType bias{};
+    TensorType dst{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_CONVOLUTION_LAYER_H__
diff --git a/tests/benchmark/common/FullyConnectedLayer.h b/tests/benchmark/common/FullyConnectedLayer.h
new file mode 100644
index 0000000..88adf83
--- /dev/null
+++ b/tests/benchmark/common/FullyConnectedLayer.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_FULLYCONNECTED_LAYER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_FULLYCONNECTED_LAYER_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/ConvolutionLayerDataset.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType dt = DataType::F32>
+class FullyConnectedLayer : public ::benchmark::Fixture
+{
+public:
+    void SetUp(::benchmark::State &state) override
+    {
+        profiler.add(std::make_shared<WallClockTimer>());
+
+        const FullyConnectedLayerDataObject fc_obj = *(DataSet().begin() + state.range(0));
+
+        // Set the batch dimension in the source and destination shapes
+        const unsigned int batches              = state.range(1);
+        const unsigned int fixed_point_position = 4;
+        TensorShape        src_shape            = fc_obj.src_shape;
+        TensorShape        dst_shape            = fc_obj.dst_shape;
+        src_shape.set(src_shape.num_dimensions(), batches);
+        dst_shape.set(dst_shape.num_dimensions(), batches);
+
+        // Create tensors
+        src     = create_tensor(src_shape, dt, 1, fixed_point_position);
+        weights = create_tensor(fc_obj.weights_shape, dt, 1, fixed_point_position);
+        bias    = create_tensor(fc_obj.bias_shape, dt, 1, fixed_point_position);
+        dst     = create_tensor(dst_shape, dt, 1, fixed_point_position);
+
+        // Create and configure function
+        fc_layer = std::unique_ptr<Function>(new Function());
+        fc_layer->configure(&src, &weights, &bias, &dst);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        weights.allocator()->allocate();
+        bias.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        // Fill tensors
+        library->fill_tensor_uniform(Accessor(src), 0);
+        library->fill_tensor_uniform(Accessor(weights), 1);
+        library->fill_tensor_uniform(Accessor(bias), 2);
+    }
+
+    void TearDown(::benchmark::State &state) override
+    {
+        fc_layer.reset();
+
+        src.allocator()->free();
+        weights.allocator()->free();
+        bias.allocator()->free();
+        dst.allocator()->free();
+
+        profiler.submit(state);
+    }
+
+    std::unique_ptr<Function> fc_layer{ nullptr };
+    Profiler                  profiler{};
+
+private:
+    TensorType src{};
+    TensorType weights{};
+    TensorType bias{};
+    TensorType dst{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_FULLYCONNECTED_LAYER_H__
diff --git a/tests/benchmark/common/NormalizationLayer.h b/tests/benchmark/common/NormalizationLayer.h
new file mode 100644
index 0000000..4593fb7
--- /dev/null
+++ b/tests/benchmark/common/NormalizationLayer.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_NORMALIZATION_LAYER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_NORMALIZATION_LAYER_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/NormalizationLayerDataset.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType dt = DataType::F32>
+class NormalizationLayer : public ::benchmark::Fixture
+{
+public:
+    void SetUp(::benchmark::State &state) override
+    {
+        profiler.add(std::make_shared<WallClockTimer>());
+
+        const NormalizationLayerDataObject norm_obj = *(DataSet().begin() + state.range(0));
+
+        // Set the batch dimension in the source and destination shapes
+        const unsigned int batches              = state.range(1);
+        const unsigned int fixed_point_position = 4;
+        TensorShape        shape                = norm_obj.shape;
+        shape.set(shape.num_dimensions(), batches);
+
+        // Create tensors
+        src = create_tensor(shape, dt, 1, fixed_point_position);
+        dst = create_tensor(shape, dt, 1, fixed_point_position);
+
+        // Create and configure function
+        norm_layer = std::unique_ptr<Function>(new Function());
+        norm_layer->configure(&src, &dst, norm_obj.info);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        // Fill tensors
+        library->fill_tensor_uniform(Accessor(src), 0);
+    }
+
+    void TearDown(::benchmark::State &state) override
+    {
+        norm_layer.reset();
+
+        src.allocator()->free();
+        dst.allocator()->free();
+
+        profiler.submit(state);
+    }
+
+    std::unique_ptr<Function> norm_layer{ nullptr };
+    Profiler                  profiler{};
+
+private:
+    TensorType src{};
+    TensorType dst{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_NORMALIZATION_LAYER_H__
diff --git a/tests/benchmark/common/PoolingLayer.h b/tests/benchmark/common/PoolingLayer.h
new file mode 100644
index 0000000..5bb332f
--- /dev/null
+++ b/tests/benchmark/common/PoolingLayer.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_POOLING_LAYER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_POOLING_LAYER_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/PoolingLayerDataset.h"
+
+#include <memory>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType dt = DataType::F32>
+class PoolingLayer : public ::benchmark::Fixture
+{
+public:
+    void SetUp(::benchmark::State &state) override
+    {
+        profiler.add(std::make_shared<WallClockTimer>());
+
+        const PoolingLayerDataObject pool_obj = *(DataSet().begin() + state.range(0));
+
+        // Set the batch dimension in the source and destination shapes
+        const unsigned int batches              = state.range(1);
+        const unsigned int fixed_point_position = 4;
+        TensorShape        src_shape            = pool_obj.src_shape;
+        TensorShape        dst_shape            = pool_obj.dst_shape;
+        src_shape.set(src_shape.num_dimensions(), batches);
+        dst_shape.set(dst_shape.num_dimensions(), batches);
+
+        // Create tensors
+        src = create_tensor(src_shape, dt, 1, fixed_point_position);
+        dst = create_tensor(dst_shape, dt, 1, fixed_point_position);
+
+        // Create and configure function
+        pool_layer.configure(&src, &dst, pool_obj.info);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        // Fill tensors
+        library->fill_tensor_uniform(Accessor(src), 0);
+    }
+
+    void TearDown(::benchmark::State &state) override
+    {
+        // Free tensors
+        src.allocator()->free();
+        dst.allocator()->free();
+
+        profiler.submit(state);
+    }
+
+    Function pool_layer{};
+    Profiler profiler{};
+
+private:
+    TensorType src{};
+    TensorType dst{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_POOLING_LAYER_H__
diff --git a/tests/benchmark/main.cpp b/tests/benchmark/main.cpp
new file mode 100644
index 0000000..acde259
--- /dev/null
+++ b/tests/benchmark/main.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "PMUCounter.h"
+#include "PerformanceProgramOptions.h"
+#include "PerformanceUserConfiguration.h"
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "WallClockTimer.h"
+
+#include "benchmark/benchmark_api.h"
+
+#ifdef OPENCL
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#endif
+#include "arm_compute/runtime/Scheduler.h"
+
+#include <iostream>
+#include <memory>
+
+using namespace arm_compute::test;
+using namespace arm_compute::test::performance;
+
+namespace arm_compute
+{
+namespace test
+{
+PerformanceUserConfiguration   user_config;
+std::unique_ptr<TensorLibrary> library;
+} // namespace test
+} // namespace arm_compute
+
+int main(int argc, char **argv)
+{
+    PerformanceProgramOptions options;
+    try
+    {
+        options.parse_commandline(argc, argv);
+
+        if(options.wants_help())
+        {
+            std::cout << "Usage: " << argv[0] << " [options] PATH\n";
+            std::cout << options.get_help() << "\n";
+            return 0;
+        }
+
+        user_config = PerformanceUserConfiguration(options);
+    }
+    catch(const boost::program_options::required_option &err)
+    {
+        std::cerr << "Error: " << err.what() << "\n";
+        std::cout << "\nUsage: " << argv[0] << " [options] PATH\n";
+        std::cout << options.get_help() << "\n";
+        return 1;
+    }
+
+    ::benchmark::Initialize(&argc, argv);
+
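+    // Create the tensor library, reusing the user-provided seed when one was given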
+    if(user_config.seed.is_set())
+    {
+        library = cpp14::make_unique<TensorLibrary>(user_config.path.get(), user_config.seed);
+    }
+    else
+    {
+        library = cpp14::make_unique<TensorLibrary>(user_config.path.get());
+    }
+
+#ifdef OPENCL
+    arm_compute::CLScheduler::get().default_init();
+#endif
+
+    std::cout << "Using " << user_config.threads << " CPU " << (user_config.threads == 1 ? "thread" : "threads") << "\n";
+    arm_compute::Scheduler::get().set_num_threads(user_config.threads);
+
+    ::benchmark::RunSpecifiedBenchmarks();
+}
diff --git a/tests/benchmark/system_tests/CL/AlexNet.cpp b/tests/benchmark/system_tests/CL/AlexNet.cpp
new file mode 100644
index 0000000..fe0b991
--- /dev/null
+++ b/tests/benchmark/system_tests/CL/AlexNet.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/system_tests/common/AlexNet.h"
+
+namespace
+{
+using AlexNetSystemTest = AlexNetFixture<ICLTensor,
+      CLTensor,
+      CLSubTensor,
+      CLAccessor,
+      CLActivationLayer,
+      CLConvolutionLayer,
+      CLFullyConnectedLayer,
+      CLNormalizationLayer,
+      CLPoolingLayer,
+      CLSoftmaxLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(AlexNetSystemTest, cl_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run AlexNet
+        profiler.start();
+        network.run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(AlexNetSystemTest, cl_alexnet)
+->Threads(1)
+->Iterations(10)
+->ArgName("batch_size")
+->Arg(1)
+->Arg(4)
+->Arg(8);
\ No newline at end of file
diff --git a/tests/benchmark/system_tests/CL/LeNet5.cpp b/tests/benchmark/system_tests/CL/LeNet5.cpp
new file mode 100644
index 0000000..d65a7dd
--- /dev/null
+++ b/tests/benchmark/system_tests/CL/LeNet5.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/system_tests/common/LeNet5.h"
+
+namespace
+{
+using LeNet5SystemTest = LeNet5Fixture<CLTensor,
+      CLAccessor,
+      CLActivationLayer,
+      CLConvolutionLayer,
+      CLFullyConnectedLayer,
+      CLPoolingLayer,
+      CLSoftmaxLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(LeNet5SystemTest, cl_lenet5)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run LeNet5
+        profiler.start();
+        network.run();
+        CLScheduler::get().sync();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(LeNet5SystemTest, cl_lenet5)
+->Threads(1)
+->Iterations(10)
+->ArgName("batch_size")
+->Arg(1)
+->Arg(16)
+->Arg(32);
diff --git a/tests/benchmark/system_tests/NEON/AlexNet.cpp b/tests/benchmark/system_tests/NEON/AlexNet.cpp
new file mode 100644
index 0000000..2d222e7
--- /dev/null
+++ b/tests/benchmark/system_tests/NEON/AlexNet.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+#include "arm_compute/runtime/SubTensor.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/system_tests/common/AlexNet.h"
+
+namespace
+{
+using AlexNetSystemTestF32 = AlexNetFixture<ITensor,
+      Tensor,
+      SubTensor,
+      NEAccessor,
+      NEActivationLayer,
+      NEConvolutionLayer,
+      NEFullyConnectedLayer,
+      NENormalizationLayer,
+      NEPoolingLayer,
+      NESoftmaxLayer,
+      DataType::F32>;
+
+using AlexNetSystemTestQS8 = AlexNetFixture<ITensor,
+      Tensor,
+      SubTensor,
+      NEAccessor,
+      NEActivationLayer,
+      NEConvolutionLayer,
+      NEFullyConnectedLayer,
+      NENormalizationLayer,
+      NEPoolingLayer,
+      NESoftmaxLayer,
+      DataType::QS8>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(AlexNetSystemTestF32, neon_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run AlexNet
+        profiler.start();
+        network.run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(AlexNetSystemTestF32, neon_alexnet)
+->Threads(1)
+->Iterations(10)
+->ArgName("batch_size")
+->Arg(1)
+->Arg(4)
+->Arg(8);
+
+// QS8
+BENCHMARK_DEFINE_F(AlexNetSystemTestQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run AlexNet
+        profiler.start();
+        network.run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(AlexNetSystemTestQS8, neon_alexnet)
+->Threads(1)
+->Iterations(10)
+->ArgName("batch_size")
+->Arg(1)
+->Arg(4)
+->Arg(8);
\ No newline at end of file
diff --git a/tests/benchmark/system_tests/NEON/LeNet5.cpp b/tests/benchmark/system_tests/NEON/LeNet5.cpp
new file mode 100644
index 0000000..5170f05
--- /dev/null
+++ b/tests/benchmark/system_tests/NEON/LeNet5.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/system_tests/common/LeNet5.h"
+
+namespace
+{
+using LeNet5SystemTest = LeNet5Fixture<Tensor,
+      NEAccessor,
+      NEActivationLayer,
+      NEConvolutionLayer,
+      NEFullyConnectedLayer,
+      NEPoolingLayer,
+      NESoftmaxLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(LeNet5SystemTest, neon_lenet5)
+(::benchmark::State &state)
+{
+    while(state.KeepRunning())
+    {
+        // Run LeNet5
+        profiler.start();
+        network.run();
+        profiler.stop();
+    }
+}
+
+BENCHMARK_REGISTER_F(LeNet5SystemTest, neon_lenet5)
+->Threads(1)
+->Iterations(10)
+->ArgName("batch_size")
+->Arg(1)
+->Arg(16)
+->Arg(32);
diff --git a/tests/benchmark/system_tests/common/AlexNet.h b/tests/benchmark/system_tests/common/AlexNet.h
new file mode 100644
index 0000000..9c93dc7
--- /dev/null
+++ b/tests/benchmark/system_tests/common/AlexNet.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_ALEXNET_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_ALEXNET_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "model_objects/AlexNet.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename ITensorType,
+          typename TensorType,
+          typename SubTensorType,
+          typename Accessor,
+          typename ActivationLayerFunction,
+          typename ConvolutionLayerFunction,
+          typename FullyConnectedLayerFunction,
+          typename NormalizationLayerFunction,
+          typename PoolingLayerFunction,
+          typename SoftmaxLayerFunction,
+          DataType dt = DataType::F32>
+class AlexNetFixture : public ::benchmark::Fixture
+{
+public:
+    void SetUp(::benchmark::State &state) override
+    {
+        profiler.add(std::make_shared<WallClockTimer>());
+
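+        // state.range(0) carries the batch size supplied via ->Arg() at registration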
+        const unsigned int batches            = static_cast<unsigned int>(state.range(0));
+        const bool         weights_transposed = true;
+
+        network.init_weights(batches, weights_transposed);
+        network.build();
+        network.allocate();
+        network.fill_random();
+    }
+
+    void TearDown(::benchmark::State &state) override
+    {
+        profiler.submit(state);
+        network.clear();
+    }
+
+    Profiler profiler{};
+    model_objects::AlexNet<ITensorType,
+                  TensorType,
+                  SubTensorType,
+                  Accessor,
+                  ActivationLayerFunction,
+                  ConvolutionLayerFunction,
+                  FullyConnectedLayerFunction,
+                  NormalizationLayerFunction,
+                  PoolingLayerFunction,
+                  SoftmaxLayerFunction,
+                  dt>
+                  network{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_ALEXNET_H__
diff --git a/tests/benchmark/system_tests/common/LeNet5.h b/tests/benchmark/system_tests/common/LeNet5.h
new file mode 100644
index 0000000..db34f68
--- /dev/null
+++ b/tests/benchmark/system_tests/common/LeNet5.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_LENET5_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_LENET5_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "model_objects/LeNet5.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename TensorType,
+          typename Accessor,
+          typename ActivationLayerFunction,
+          typename ConvolutionLayerFunction,
+          typename FullyConnectedLayerFunction,
+          typename PoolingLayerFunction,
+          typename SoftmaxLayerFunction>
+class LeNet5Fixture : public ::benchmark::Fixture
+{
+public:
+    void SetUp(::benchmark::State &state) override
+    {
+        profiler.add(std::make_shared<WallClockTimer>());
+
+        network.build(static_cast<unsigned int>(state.range(0)));
+        network.fill_random();
+    }
+
+    void TearDown(::benchmark::State &state) override
+    {
+        profiler.submit(state);
+        network.clear();
+    }
+
+    Profiler profiler{};
+    model_objects::LeNet5<TensorType,
+                  Accessor,
+                  ActivationLayerFunction,
+                  ConvolutionLayerFunction,
+                  FullyConnectedLayerFunction,
+                  PoolingLayerFunction,
+                  SoftmaxLayerFunction>
+                  network{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_LENET5_H__