arm_compute v18.05
diff --git a/examples/graph_inception_v4.cpp b/examples/graph_inception_v4.cpp
index f004b41..ed95baa 100644
--- a/examples/graph_inception_v4.cpp
+++ b/examples/graph_inception_v4.cpp
@@ -21,9 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/Nodes.h"
-#include "arm_compute/graph/SubGraph.h"
+#include "arm_compute/graph.h"
 #include "support/ToolchainSupport.h"
 #include "utils/GraphUtils.h"
 #include "utils/Utils.h"
@@ -32,19 +30,22 @@
 #include <tuple>
 
 using namespace arm_compute::utils;
-using namespace arm_compute::graph;
+using namespace arm_compute::graph::frontend;
 using namespace arm_compute::graph_utils;
 
 /** Example demonstrating how to implement InceptionV4's network using the Compute Library's graph API
  *
  * @param[in] argc Number of arguments
- * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels )
+ * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) )
  */
 class InceptionV4Example final : public Example
 {
 public:
     void do_setup(int argc, char **argv) override
     {
+        // Disabled on non-aarch64 builds for now: the process gets killed on 32-bit Linux Firefly even when using ConvolutionMethodHint::DIRECT.
+        // This needs to be reviewed/reworked before the code below can be enabled there.
+#if __aarch64__
         std::string data_path; /* Path to the trainable data */
         std::string image;     /* Image data */
         std::string label;     /* Label data */
@@ -52,45 +53,56 @@
         // Create a preprocessor object
         std::unique_ptr<IPreprocessor> preprocessor = arm_compute::support::cpp14::make_unique<TFPreproccessor>();
 
-        // Set target. 0 (NEON), 1 (OpenCL), 2 (OpenCL with Tuner). By default it is NEON
-        const int  int_target_hint = argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0;
-        TargetHint target_hint     = set_target_hint(int_target_hint);
+        // Set target. 0 (NEON), 1 (OpenCL), 2 (OpenCL with Tuner). By default it is NEON
+        const int    target         = argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0;
+        Target       target_hint    = set_target_hint(target);
+        FastMathHint fast_math_hint = FastMathHint::DISABLED;
 
         // Parse arguments
         if(argc < 2)
         {
             // Print help
-            std::cout << "Usage: " << argv[0] << " [target] [path_to_data] [image] [labels]\n\n";
+            std::cout << "Usage: " << argv[0] << " [target] [path_to_data] [image] [labels] [fast_math_hint]\n\n";
             std::cout << "No data folder provided: using random values\n\n";
         }
         else if(argc == 2)
         {
-            std::cout << "Usage: " << argv[0] << " " << argv[1] << " [path_to_data] [image] [labels]\n\n";
+            std::cout << "Usage: " << argv[0] << " " << argv[1] << " [path_to_data] [image] [labels] [fast_math_hint]\n\n";
             std::cout << "No data folder provided: using random values\n\n";
         }
         else if(argc == 3)
         {
             data_path = argv[2];
-            std::cout << "Usage: " << argv[0] << " " << argv[1] << " " << argv[2] << " [image] [labels]\n\n";
+            std::cout << "Usage: " << argv[0] << " " << argv[1] << " " << argv[2] << " [image] [labels] [fast_math_hint]\n\n";
             std::cout << "No image provided: using random values\n\n";
         }
         else if(argc == 4)
         {
             data_path = argv[2];
             image     = argv[3];
-            std::cout << "Usage: " << argv[0] << " " << argv[1] << " " << argv[2] << " " << argv[3] << " [labels]\n\n";
+            std::cout << "Usage: " << argv[0] << " " << argv[1] << " " << argv[2] << " " << argv[3] << " [labels] [fast_math_hint]\n\n";
             std::cout << "No text file with labels provided: skipping output accessor\n\n";
         }
-        else
+        else if(argc == 5)
         {
             data_path = argv[2];
             image     = argv[3];
             label     = argv[4];
+            std::cout << "Usage: " << argv[0] << " " << argv[1] << " " << argv[2] << " " << argv[3] << " " << argv[4] << " [fast_math_hint]\n\n";
+            std::cout << "No fast math info provided: disabling fast math\n\n";
+        }
+        else // argc >= 6: every optional argument, including the fast math hint, was supplied
+        {
+            data_path      = argv[2];
+            image          = argv[3];
+            label          = argv[4];
+            fast_math_hint = (std::strtol(argv[5], nullptr, 10) == 0) ? FastMathHint::DISABLED : FastMathHint::ENABLED;
         }
 
-        graph << target_hint << Tensor(TensorInfo(TensorShape(299U, 299U, 3U, 1U), 1, DataType::F32),
-                                       get_input_accessor(image, std::move(preprocessor), false))
-
+        graph << target_hint
+              << fast_math_hint
+              << InputLayer(TensorDescriptor(TensorShape(299U, 299U, 3U, 1U), DataType::F32),
+                            get_input_accessor(image, std::move(preprocessor), false))
               // Conv2d_1a_3x3
               << ConvolutionLayer(3U, 3U, 32U,
                                   get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_weights.npy"),
@@ -120,64 +132,71 @@
                                          get_random_accessor(1.f, 1.f),
                                          get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_BatchNorm_beta.npy"),
                                          0.001f)
-              << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+              << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-              << get_mixed_3a(data_path)
-              << get_mixed_4a(data_path)
-              << get_mixed_5a(data_path)
-              // 4 inception A blocks
-              << get_inceptionA_block(data_path, "Mixed_5b")
-              << get_inceptionA_block(data_path, "Mixed_5c")
-              << get_inceptionA_block(data_path, "Mixed_5d")
-              << get_inceptionA_block(data_path, "Mixed_5e")
-              // reduction A block
-              << get_reductionA_block(data_path)
-              // 7 inception B blocks
-              << get_inceptionB_block(data_path, "Mixed_6b")
-              << get_inceptionB_block(data_path, "Mixed_6c")
-              << get_inceptionB_block(data_path, "Mixed_6d")
-              << get_inceptionB_block(data_path, "Mixed_6e")
-              << get_inceptionB_block(data_path, "Mixed_6f")
-              << get_inceptionB_block(data_path, "Mixed_6g")
-              << get_inceptionB_block(data_path, "Mixed_6h")
-              // reduction B block
-              << get_reductionB_block(data_path)
-              // 3 inception C blocks
-              << get_inceptionC_block(data_path, "Mixed_7b")
-              << get_inceptionC_block(data_path, "Mixed_7c")
-              << get_inceptionC_block(data_path, "Mixed_7d")
-              << PoolingLayer(PoolingLayerInfo(PoolingType::AVG))
+        graph << get_mixed_3a(data_path);
+        graph << get_mixed_4a(data_path);
+        graph << get_mixed_5a(data_path);
+        // 4 inception A blocks
+        graph << get_inceptionA_block(data_path, "Mixed_5b");
+        graph << get_inceptionA_block(data_path, "Mixed_5c");
+        graph << get_inceptionA_block(data_path, "Mixed_5d");
+        graph << get_inceptionA_block(data_path, "Mixed_5e");
+        // reduction A block
+        graph << get_reductionA_block(data_path);
+        // 7 inception B blocks
+        graph << get_inceptionB_block(data_path, "Mixed_6b");
+        graph << get_inceptionB_block(data_path, "Mixed_6c");
+        graph << get_inceptionB_block(data_path, "Mixed_6d");
+        graph << get_inceptionB_block(data_path, "Mixed_6e");
+        graph << get_inceptionB_block(data_path, "Mixed_6f");
+        graph << get_inceptionB_block(data_path, "Mixed_6g");
+        graph << get_inceptionB_block(data_path, "Mixed_6h");
+        // reduction B block
+        graph << get_reductionB_block(data_path);
+        // 3 inception C blocks
+        graph << get_inceptionC_block(data_path, "Mixed_7b");
+        graph << get_inceptionC_block(data_path, "Mixed_7c");
+        graph << get_inceptionC_block(data_path, "Mixed_7d");
+        graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG))
               << FlattenLayer()
               << FullyConnectedLayer(
                   1001U,
                   get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Logits_Logits_weights.npy"),
                   get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Logits_Logits_biases.npy"))
               << SoftmaxLayer()
-              << Tensor(get_output_accessor(label, 5));
+              << OutputLayer(get_output_accessor(label, 5));
 
-        // In order to enable the OpenCL tuner, graph_init() has to be called only when all nodes have been instantiated
-        graph.graph_init(int_target_hint == 2);
+        // Finalize graph
+        GraphConfig config;
+        config.use_tuner = (target == 2);
+        graph.finalize(target_hint, config);
+#else  /* __aarch64__ */
+        using namespace arm_compute;
+        ARM_COMPUTE_UNUSED(argc);
+        ARM_COMPUTE_UNUSED(argv);
+#endif /* __aarch64__ */
     }
 
     void do_run() override
     {
+#if __aarch64__ /* The graph is only built on aarch64 (see do_setup), so there is nothing to run on other targets */
         graph.run();
+#endif /* __aarch64__ */
     }
 
 private:
-    Graph graph{};
+    Stream graph{ 0, "InceptionV4" };
 
 private:
     BranchLayer get_mixed_3a(const std::string &data_path)
     {
         std::string total_path = "/cnn_data/inceptionv4_model/Mixed_3a_";
 
-        SubGraph i_a;
-        i_a << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true))
-            // TODO (geopin01) : Remove once we understand why a single node graph does not run in CL
-            << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f, 0.f));
+        SubStream i_a(graph);
+        i_a << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true));
 
-        SubGraph i_b;
+        SubStream i_b(graph);
         i_b << ConvolutionLayer(3U, 3U, 96U,
                                 get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_weights.npy"),
                                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
@@ -195,7 +214,7 @@
     {
         std::string total_path = "/cnn_data/inceptionv4_model/Mixed_4a_";
 
-        SubGraph i_a;
+        SubStream i_a(graph);
         i_a << ConvolutionLayer(1U, 1U, 64U,
                                 get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"),
                                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
@@ -215,7 +234,7 @@
                                        0.001f)
             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-        SubGraph i_b;
+        SubStream i_b(graph);
         i_b << ConvolutionLayer(1U, 1U, 64U,
                                 get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"),
                                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
@@ -260,7 +279,7 @@
     {
         std::string total_path = "/cnn_data/inceptionv4_model/Mixed_5a_";
 
-        SubGraph i_a;
+        SubStream i_a(graph);
         i_a << ConvolutionLayer(3U, 3U, 192U,
                                 get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy"),
                                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
@@ -271,10 +290,8 @@
                                        0.001f)
             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-        SubGraph i_b;
-        i_b << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true))
-            // TODO (geopin01) : Remove once we understand why a single node graph does not run in CL
-            << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f, 0.f));
+        SubStream i_b(graph);
+        i_b << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true));
 
         return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b));
     }
@@ -283,7 +300,7 @@
     {
         std::string total_path = "/cnn_data/inceptionv4_model/" + param_path + "_";
 
-        SubGraph i_a;
+        SubStream i_a(graph);
         i_a << ConvolutionLayer(1U, 1U, 96U,
                                 get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"),
                                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
@@ -294,7 +311,7 @@
                                        0.001f)
             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-        SubGraph i_b;
+        SubStream i_b(graph);
         i_b << ConvolutionLayer(1U, 1U, 64U,
                                 get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"),
                                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
@@ -314,7 +331,7 @@
                                        0.001f)
             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-        SubGraph i_c;
+        SubStream i_c(graph);
         i_c << ConvolutionLayer(1U, 1U, 64U,
                                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy"),
                                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
@@ -343,7 +360,7 @@
                                        0.001f)
             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-        SubGraph i_d;
+        SubStream i_d(graph);
         i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true))
             << ConvolutionLayer(1U, 1U, 96U,
                                 get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy"),
@@ -362,7 +379,7 @@
     {
         std::string total_path = "/cnn_data/inceptionv4_model/Mixed_6a_";
 
-        SubGraph i_a;
+        SubStream i_a(graph);
         i_a << ConvolutionLayer(3U, 3U, 384U,
                                 get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy"),
                                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
@@ -373,7 +390,7 @@
                                        0.001f)
             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-        SubGraph i_b;
+        SubStream i_b(graph);
         i_b << ConvolutionLayer(1U, 1U, 192U,
                                 get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"),
                                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
@@ -402,10 +419,9 @@
                                        0.001f)
             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-        SubGraph i_c;
-        i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true))
-            // TODO (geopin01) : Remove once we understand why a single node graph does not run in CL
-            << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f, 0.f));
+        SubStream i_c(graph);
+        i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true));
+
         return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b), std::move(i_c));
     }
 
@@ -413,7 +429,7 @@
     {
         std::string total_path = "/cnn_data/inceptionv4_model/" + param_path + "_";
 
-        SubGraph i_a;
+        SubStream i_a(graph);
         i_a << ConvolutionLayer(1U, 1U, 384U,
                                 get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"),
                                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
@@ -424,7 +440,7 @@
                                        0.001f)
             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-        SubGraph i_b;
+        SubStream i_b(graph);
         i_b << ConvolutionLayer(1U, 1U, 192U,
                                 get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"),
                                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
@@ -453,7 +469,7 @@
                                        0.001f)
             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-        SubGraph i_c;
+        SubStream i_c(graph);
         i_c << ConvolutionLayer(1U, 1U, 192U,
                                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy"),
                                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
@@ -500,7 +516,7 @@
                                        0.001f)
             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-        SubGraph i_d;
+        SubStream i_d(graph);
         i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true))
             << ConvolutionLayer(1U, 1U, 128U,
                                 get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy"),
@@ -519,7 +535,7 @@
     {
         std::string total_path = "/cnn_data/inceptionv4_model/Mixed_7a_";
 
-        SubGraph i_a;
+        SubStream i_a(graph);
         i_a << ConvolutionLayer(1U, 1U, 192U,
                                 get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"),
                                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
@@ -539,7 +555,7 @@
                                        0.001f)
             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-        SubGraph i_b;
+        SubStream i_b(graph);
         i_b << ConvolutionLayer(1U, 1U, 256U,
                                 get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"),
                                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
@@ -577,10 +593,9 @@
                                        0.001f)
             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-        SubGraph i_c;
-        i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true))
-            // TODO (geopin01) : Remove once we understand why a single node graph does not run in CL
-            << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f, 0.f));
+        SubStream i_c(graph);
+        i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true));
+
         return BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_a), std::move(i_b), std::move(i_c));
     }
 
@@ -588,7 +603,7 @@
     {
         std::string total_path = "/cnn_data/inceptionv4_model/" + param_path + "_";
 
-        SubGraph i_a;
+        SubStream i_a(graph);
         i_a << ConvolutionLayer(1U, 1U, 256U,
                                 get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy"),
                                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
@@ -599,7 +614,21 @@
                                        0.001f)
             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-        SubGraph i_b1;
+        SubStream i_b(graph);
+        i_b << ConvolutionLayer(
+                1U, 1U, 384U,
+                get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"),
+                std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+                PadStrideInfo(1, 1, 0, 0))
+            << BatchNormalizationLayer(
+                get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+                get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+                get_random_accessor(1.f, 1.f),
+                get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+                0.001f)
+            << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+
+        SubStream i_b1(static_cast<IStream &>(i_b));
         i_b1 << ConvolutionLayer(
                  3U, 1U, 256U,
                  get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_weights.npy"),
@@ -613,7 +642,7 @@
                  0.001f)
              << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-        SubGraph i_b2;
+        SubStream i_b2(static_cast<IStream &>(i_b));
         i_b2 << ConvolutionLayer(
                  1U, 3U, 256U,
                  get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_weights.npy"),
@@ -627,50 +656,10 @@
                  0.001f)
              << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-        SubGraph i_b;
-        i_b << ConvolutionLayer(
-                1U, 1U, 384U,
-                get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy"),
-                std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
-                PadStrideInfo(1, 1, 0, 0))
-            << BatchNormalizationLayer(
-                get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
-                get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
-                get_random_accessor(1.f, 1.f),
-                get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
-                0.001f)
-            << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
-            << BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_b1), std::move(i_b2));
+        // Merge b1 and b2
+        i_b << BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_b1), std::move(i_b2));
 
-        SubGraph i_c1;
-        i_c1 << ConvolutionLayer(
-                 3U, 1U, 256U,
-                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_weights.npy"),
-                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
-                 PadStrideInfo(1, 1, 1, 0))
-             << BatchNormalizationLayer(
-                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_moving_mean.npy"),
-                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_moving_variance.npy"),
-                 get_random_accessor(1.f, 1.f),
-                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_beta.npy"),
-                 0.001f)
-             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
-
-        SubGraph i_c2;
-        i_c2 << ConvolutionLayer(
-                 1U, 3U, 256U,
-                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_weights.npy"),
-                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
-                 PadStrideInfo(1, 1, 0, 1))
-             << BatchNormalizationLayer(
-                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_moving_mean.npy"),
-                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_moving_variance.npy"),
-                 get_random_accessor(1.f, 1.f),
-                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_beta.npy"),
-                 0.001f)
-             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
-
-        SubGraph i_c;
+        SubStream i_c(graph);
         i_c << ConvolutionLayer(
                 1U, 1U, 384U,
                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy"),
@@ -706,10 +695,40 @@
                 get_random_accessor(1.f, 1.f),
                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_beta.npy"),
                 0.001f)
-            << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
-            << BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_c1), std::move(i_c2));
+            << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
 
-        SubGraph i_d;
+        SubStream i_c1(static_cast<IStream &>(i_c));
+        i_c1 << ConvolutionLayer(
+                 3U, 1U, 256U,
+                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_weights.npy"),
+                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+                 PadStrideInfo(1, 1, 1, 0))
+             << BatchNormalizationLayer(
+                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_moving_mean.npy"),
+                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_moving_variance.npy"),
+                 get_random_accessor(1.f, 1.f),
+                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_beta.npy"),
+                 0.001f)
+             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+
+        SubStream i_c2(static_cast<IStream &>(i_c));
+        i_c2 << ConvolutionLayer(
+                 1U, 3U, 256U,
+                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_weights.npy"),
+                 std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+                 PadStrideInfo(1, 1, 0, 1))
+             << BatchNormalizationLayer(
+                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_moving_mean.npy"),
+                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_moving_variance.npy"),
+                 get_random_accessor(1.f, 1.f),
+                 get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_beta.npy"),
+                 0.001f)
+             << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+
+        // Merge i_c1 and i_c2
+        i_c << BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(i_c1), std::move(i_c2));
+
+        SubStream i_d(graph);
         i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true))
             << ConvolutionLayer(1U, 1U, 256U,
                                 get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy"),
@@ -728,7 +747,7 @@
 /** Main program for Inception V4
  *
  * @param[in] argc Number of arguments
- * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels )
+ * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels, [optional] Fast math for convolution layer (0 = DISABLED, 1 = ENABLED) )
  */
 int main(int argc, char **argv)
 {