use NEON 8-bit stages on ARMv7 too

We don't really use anything very ARMv8-specific in the 8-bit NEON stages, so we can just naturally extend what we're doing to ARMv7 too.

Note that unlike the float stages, we're not requiring VFPv4 either, just NEON. VFPv4 is for FMA and F16<->F32 conversion, both of which are unnecessary for the integer pipeline.

GMs and perf improvement are similar to the previous ARMv8 change.

Change-Id: Id618801ea1920564c1deee144a640a4133c4505f
Reviewed-on: https://skia-review.googlesource.com/39840
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>

commit: b561b764d894260b77d3c44f8fa182802897f2e1 [log] [tgz]
author: Mike Klein <mtklein@chromium.org> Mon Aug 28 17:53:34 2017 -0400
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> Tue Aug 29 18:27:51 2017 +0000
tree: 3d3e22e7f0c76bbb7775dba0e566aee28d3322a2
parent: fe75930ce0b8d9451d29162942badfd568a1ec47 [diff]
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 315110f..8f3e6a7 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp

@@ -175,7 +175,7 @@
         SK_RASTER_PIPELINE_STAGES(M)
     #undef M
 
-#if defined(__clang__) && defined(__aarch64__)
+#if defined(JUMPER_HAS_NEON_8BIT)
     // We also compile 8-bit stages on ARMv8 as a normal part of Skia when compiled with Clang.
     StartPipelineFn sk_start_pipeline_8bit;
     StageFn sk_just_return_8bit;
@@ -208,13 +208,13 @@
         }
         LOWP_STAGES(M)
     #undef M
-#elif defined(__clang__) && defined(__aarch64__)
+#elif defined(JUMPER_HAS_NEON_8BIT)
     template <SkRasterPipeline::StockStage st>
-    static constexpr StageFn* aarch64_8bit() { return nullptr; }
+    static constexpr StageFn* neon_8bit() { return nullptr; }
 
-    #define M(st)                                                               \
-        template <> constexpr StageFn* aarch64_8bit<SkRasterPipeline::st>() {   \
-            return sk_##st##_8bit;                                              \
+    #define M(st)                                                            \
+        template <> constexpr StageFn* neon_8bit<SkRasterPipeline::st>() {   \
+            return sk_##st##_8bit;                                           \
         }
         LOWP_STAGES(M)
     #undef M
@@ -346,9 +346,9 @@
             #undef M
             };
         }
-    #elif defined(__clang__) && defined(__aarch64__)
+    #elif defined(JUMPER_HAS_NEON_8BIT)
         return {
-        #define M(st) aarch64_8bit<SkRasterPipeline::st>(),
+        #define M(st) neon_8bit<SkRasterPipeline::st>(),
             { SK_RASTER_PIPELINE_STAGES(M) },
             sk_start_pipeline_8bit,
             sk_just_return_8bit,

diff --git a/src/jumper/SkJumper.h b/src/jumper/SkJumper.h
index 20b8d32..4bb851f 100644
--- a/src/jumper/SkJumper.h
+++ b/src/jumper/SkJumper.h

@@ -50,6 +50,15 @@
     #include <stdint.h>
 #endif
 
+// When compiled with Clang on ARM, we'll have 8-bit NEON stages.
+#if defined(__clang__)
+    #if defined(__aarch64__)
+        #define JUMPER_HAS_NEON_8BIT
+    #elif defined(__arm__) && defined(__ARM_NEON__)
+        #define JUMPER_HAS_NEON_8BIT
+    #endif
+#endif
+
 static const int SkJumper_kMaxStride = 8;
 
 struct SkJumper_constants {

diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp
index 0c019f8..b6d94e3 100644
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp

@@ -14,9 +14,9 @@
 
 // As an experiment we bake ARMv8 8-bit code in as normally compiled Skia code.
 // Any other platform (so far) is offline-only.
-#if defined(JUMPER_IS_OFFLINE) || (defined(__clang__) && defined(__aarch64__))
+#if defined(JUMPER_IS_OFFLINE) || defined(JUMPER_HAS_NEON_8BIT)
 
-#if defined(__aarch64__)
+#if defined(JUMPER_HAS_NEON_8BIT)
     #include <arm_neon.h>
 #else
     #include <immintrin.h>
@@ -24,8 +24,6 @@
 
 #if !defined(JUMPER_IS_OFFLINE)
     #define WRAP(name) sk_##name##_8bit
-#elif defined(__aarch64__)
-    #define WRAP(name) sk_##name##_aarch64_8bit
 #elif defined(__AVX2__)
     #define WRAP(name) sk_##name##_hsw_8bit
 #elif defined(__SSE4_1__)
@@ -166,7 +164,7 @@
       b_lo, b_hi;
     split(a.u8x4, &a_lo, &a_hi);
     split(b.u8x4, &b_lo, &b_hi);
-#if defined(__aarch64__)
+#if defined(JUMPER_HAS_NEON_8BIT)
     return join(vqaddq_u8(a_lo, b_lo),
                 vqaddq_u8(a_hi, b_hi));
 #elif defined(__AVX2__)

diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py
index 688ad60..728b0a5 100755
--- a/src/jumper/build_stages.py
+++ b/src/jumper/build_stages.py

@@ -109,10 +109,6 @@
 subprocess.check_call(clang + cflags + vfp4 +
                       ['-c', stages] +
                       ['-o', 'vfp4.o'])
-# TODO: should work fine... I just want to turn this one on separately from x86
-#subprocess.check_call(clang + cflags + vfp4 +
-#                      ['-c', stages_8bit] +
-#                      ['-o', '8bit_vfp4.o'])
 
 def parse_object_file(dot_o, directive, target=None):
   globl, hidden, label, comment, align = \
@@ -223,12 +219,11 @@
 print '.text'
 print '#if defined(__arm__)'
 print 'BALIGN4'
-parse_object_file(     'vfp4.o', '.long', target='elf32-littlearm')
-#parse_object_file('8bit_vfp4.o', '.long', target='elf32-littlearm')
+parse_object_file('vfp4.o', '.long', target='elf32-littlearm')
 
 print '#elif defined(__x86_64__)'
 print 'BALIGN32'
-parse_object_file('merged.o',   '.byte')
+parse_object_file('merged.o', '.byte')
 
 print '#elif defined(__i386__)'
 print 'BALIGN32'
commit	b561b764d894260b77d3c44f8fa182802897f2e1	[log] [tgz]
author	Mike Klein <mtklein@chromium.org>	Mon Aug 28 17:53:34 2017 -0400
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>	Tue Aug 29 18:27:51 2017 +0000
tree	3d3e22e7f0c76bbb7775dba0e566aee28d3322a2
parent	fe75930ce0b8d9451d29162942badfd568a1ec47 [diff]