use NEON 8-bit stages on ARMv7 too
We don't really use anything ARMv8-specific in the 8-bit NEON
stages, so we can just naturally extend what we're doing to ARMv7 too.
Note that unlike the float stages, we're not requiring VFPv4 either,
just NEON. VFPv4 is for FMA and F16<->F32 conversion, both of which are
unnecessary for the integer pipeline.
GMs and perf improvements are similar to those from the previous ARMv8 change.
Change-Id: Id618801ea1920564c1deee144a640a4133c4505f
Reviewed-on: https://skia-review.googlesource.com/39840
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 315110f..8f3e6a7 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -175,7 +175,7 @@
SK_RASTER_PIPELINE_STAGES(M)
#undef M
-#if defined(__clang__) && defined(__aarch64__)
+#if defined(JUMPER_HAS_NEON_8BIT)
// We also compile 8-bit stages on ARMv8 as a normal part of Skia when compiled with Clang.
StartPipelineFn sk_start_pipeline_8bit;
StageFn sk_just_return_8bit;
@@ -208,13 +208,13 @@
}
LOWP_STAGES(M)
#undef M
-#elif defined(__clang__) && defined(__aarch64__)
+#elif defined(JUMPER_HAS_NEON_8BIT)
template <SkRasterPipeline::StockStage st>
- static constexpr StageFn* aarch64_8bit() { return nullptr; }
+ static constexpr StageFn* neon_8bit() { return nullptr; }
- #define M(st) \
- template <> constexpr StageFn* aarch64_8bit<SkRasterPipeline::st>() { \
- return sk_##st##_8bit; \
+ #define M(st) \
+ template <> constexpr StageFn* neon_8bit<SkRasterPipeline::st>() { \
+ return sk_##st##_8bit; \
}
LOWP_STAGES(M)
#undef M
@@ -346,9 +346,9 @@
#undef M
};
}
- #elif defined(__clang__) && defined(__aarch64__)
+ #elif defined(JUMPER_HAS_NEON_8BIT)
return {
- #define M(st) aarch64_8bit<SkRasterPipeline::st>(),
+ #define M(st) neon_8bit<SkRasterPipeline::st>(),
{ SK_RASTER_PIPELINE_STAGES(M) },
sk_start_pipeline_8bit,
sk_just_return_8bit,
diff --git a/src/jumper/SkJumper.h b/src/jumper/SkJumper.h
index 20b8d32..4bb851f 100644
--- a/src/jumper/SkJumper.h
+++ b/src/jumper/SkJumper.h
@@ -50,6 +50,15 @@
#include <stdint.h>
#endif
+// When compiled with Clang on ARM, we'll have 8-bit NEON stages.
+#if defined(__clang__)
+ #if defined(__aarch64__)
+ #define JUMPER_HAS_NEON_8BIT
+ #elif defined(__arm__) && defined(__ARM_NEON__)
+ #define JUMPER_HAS_NEON_8BIT
+ #endif
+#endif
+
static const int SkJumper_kMaxStride = 8;
struct SkJumper_constants {
diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp
index 0c019f8..b6d94e3 100644
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp
@@ -14,9 +14,9 @@
// As an experiment we bake ARMv8 8-bit code in as normally compiled Skia code.
// Any other platform (so far) is offline-only.
-#if defined(JUMPER_IS_OFFLINE) || (defined(__clang__) && defined(__aarch64__))
+#if defined(JUMPER_IS_OFFLINE) || defined(JUMPER_HAS_NEON_8BIT)
-#if defined(__aarch64__)
+#if defined(JUMPER_HAS_NEON_8BIT)
#include <arm_neon.h>
#else
#include <immintrin.h>
@@ -24,8 +24,6 @@
#if !defined(JUMPER_IS_OFFLINE)
#define WRAP(name) sk_##name##_8bit
-#elif defined(__aarch64__)
- #define WRAP(name) sk_##name##_aarch64_8bit
#elif defined(__AVX2__)
#define WRAP(name) sk_##name##_hsw_8bit
#elif defined(__SSE4_1__)
@@ -166,7 +164,7 @@
b_lo, b_hi;
split(a.u8x4, &a_lo, &a_hi);
split(b.u8x4, &b_lo, &b_hi);
-#if defined(__aarch64__)
+#if defined(JUMPER_HAS_NEON_8BIT)
return join(vqaddq_u8(a_lo, b_lo),
vqaddq_u8(a_hi, b_hi));
#elif defined(__AVX2__)
diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py
index 688ad60..728b0a5 100755
--- a/src/jumper/build_stages.py
+++ b/src/jumper/build_stages.py
@@ -109,10 +109,6 @@
subprocess.check_call(clang + cflags + vfp4 +
['-c', stages] +
['-o', 'vfp4.o'])
-# TODO: should work fine... I just want to turn this one on separately from x86
-#subprocess.check_call(clang + cflags + vfp4 +
-# ['-c', stages_8bit] +
-# ['-o', '8bit_vfp4.o'])
def parse_object_file(dot_o, directive, target=None):
globl, hidden, label, comment, align = \
@@ -223,12 +219,11 @@
print '.text'
print '#if defined(__arm__)'
print 'BALIGN4'
-parse_object_file( 'vfp4.o', '.long', target='elf32-littlearm')
-#parse_object_file('8bit_vfp4.o', '.long', target='elf32-littlearm')
+parse_object_file('vfp4.o', '.long', target='elf32-littlearm')
print '#elif defined(__x86_64__)'
print 'BALIGN32'
-parse_object_file('merged.o', '.byte')
+parse_object_file('merged.o', '.byte')
print '#elif defined(__i386__)'
print 'BALIGN32'