Always inline Filter_32_*_neon functions
These functions are performance-critical and are already marked
'inline'. However, Chrome for Android does not inline them because it
is compiled with -Os and a small -finline-limit.
Marking them SK_ALWAYS_INLINE avoids one call in the filtering functions
and does not increase the library size.
BUG=chromium:363073
R=mtklein@google.com
Author: kkinnunen@nvidia.com
Review URL: https://codereview.chromium.org/280403005
git-svn-id: http://skia.googlecode.com/svn/trunk@14709 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/src/opts/SkBitmapProcState_filter_neon.h b/src/opts/SkBitmapProcState_filter_neon.h
index e56b683..0887145 100644
--- a/src/opts/SkBitmapProcState_filter_neon.h
+++ b/src/opts/SkBitmapProcState_filter_neon.h
@@ -17,12 +17,15 @@
* exact results for the color components, but if the 4 incoming colors are
* all opaque, then the output color must also be opaque. Subsequent parts of
* the drawing pipeline may rely on this (e.g. which blitrow proc to use).
+ *
*/
-
-static inline void Filter_32_opaque_neon(unsigned x, unsigned y,
- SkPMColor a00, SkPMColor a01,
- SkPMColor a10, SkPMColor a11,
- SkPMColor *dst) {
+// Chrome on Android uses -Os, so we need to force these inline. Otherwise,
+// calling these functions in the inner loops will cause significant overhead
+// on some platforms.
+static SK_ALWAYS_INLINE void Filter_32_opaque_neon(unsigned x, unsigned y,
+ SkPMColor a00, SkPMColor a01,
+ SkPMColor a10, SkPMColor a11,
+ SkPMColor *dst) {
uint8x8_t vy, vconst16_8, v16_y, vres;
uint16x4_t vx, vconst16_16, v16_x, tmp;
uint32x2_t va0, va1;
@@ -53,10 +56,11 @@
vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result
}
-static inline void Filter_32_alpha_neon(unsigned x, unsigned y,
- SkPMColor a00, SkPMColor a01,
- SkPMColor a10, SkPMColor a11,
- SkPMColor *dst, uint16_t scale) {
+static SK_ALWAYS_INLINE void Filter_32_alpha_neon(unsigned x, unsigned y,
+ SkPMColor a00, SkPMColor a01,
+ SkPMColor a10, SkPMColor a11,
+ SkPMColor *dst,
+ uint16_t scale) {
uint8x8_t vy, vconst16_8, v16_y, vres;
uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
uint32x2_t va0, va1;