more neon work:
- remove neon-specific functions that were not faster than the C version
- move the neon filter function to its own header, so it can be used in more places
git-svn-id: http://skia.googlecode.com/svn/trunk@309 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp
index 3e5b927..3c5833d 100644
--- a/src/core/SkBitmapProcState.cpp
+++ b/src/core/SkBitmapProcState.cpp
@@ -1,4 +1,5 @@
#include "SkBitmapProcState.h"
+#include "SkBitmapProcState_filter.h"
#include "SkColorPriv.h"
#include "SkFilterProc.h"
#include "SkPaint.h"
@@ -12,33 +13,6 @@
#define UNPACK_SECONDARY_SHORT(packed) ((uint32_t)(packed) >> 16)
#endif
-static inline SkPMColor Filter_32(unsigned x, unsigned y,
- SkPMColor a00, SkPMColor a01,
- SkPMColor a10, SkPMColor a11) {
- SkASSERT((unsigned)x <= 0xF);
- SkASSERT((unsigned)y <= 0xF);
-
- int xy = x * y;
- uint32_t mask = gMask_00FF00FF; //0xFF00FF;
-
- int scale = 256 - 16*y - 16*x + xy;
- uint32_t lo = (a00 & mask) * scale;
- uint32_t hi = ((a00 >> 8) & mask) * scale;
-
- scale = 16*x - xy;
- lo += (a01 & mask) * scale;
- hi += ((a01 >> 8) & mask) * scale;
-
- scale = 16*y - xy;
- lo += (a10 & mask) * scale;
- hi += ((a10 >> 8) & mask) * scale;
-
- lo += (a11 & mask) * xy;
- hi += ((a11 >> 8) & mask) * xy;
-
- return ((lo >> 8) & mask) | (hi & ~mask);
-}
-
// returns expanded * 5bits
static inline uint32_t Filter_565_Expanded(unsigned x, unsigned y,
uint32_t a00, uint32_t a01,
@@ -111,7 +85,7 @@
// SRC == 8888
-#define FILTER_PROC(x, y, a, b, c, d) Filter_32(x, y, a, b, c, d)
+#define FILTER_PROC(x, y, a, b, c, d, dst) Filter_32_opaque(x, y, a, b, c, d, dst)
#define MAKENAME(suffix) S32_opaque_D32 ## suffix
#define DSTSIZE 32
@@ -120,24 +94,29 @@
SkASSERT(state.fAlphaScale == 256)
#define RETURNDST(src) src
#define SRC_TO_FILTER(src) src
-#define FILTER_TO_DST(c) c
#include "SkBitmapProcState_sample.h"
+#undef FILTER_PROC
+#define FILTER_PROC(x, y, a, b, c, d, dst) Filter_32_alpha(x, y, a, b, c, d, dst, alphaScale)
+
#define MAKENAME(suffix) S32_alpha_D32 ## suffix
#define DSTSIZE 32
#define SRCTYPE SkPMColor
#define CHECKSTATE(state) SkASSERT(state.fBitmap->config() == SkBitmap::kARGB_8888_Config); \
SkASSERT(state.fAlphaScale < 256)
-#define PREAMBLE(state) unsigned scale = state.fAlphaScale
-#define RETURNDST(src) SkAlphaMulQ(src, scale)
+#define PREAMBLE(state) unsigned alphaScale = state.fAlphaScale
+#define RETURNDST(src) SkAlphaMulQ(src, alphaScale)
#define SRC_TO_FILTER(src) src
-#define FILTER_TO_DST(c) SkAlphaMulQ(c, scale)
#include "SkBitmapProcState_sample.h"
// SRC == 565
#undef FILTER_PROC
-#define FILTER_PROC(x, y, a, b, c, d) Filter_565_Expanded(x, y, a, b, c, d)
+#define FILTER_PROC(x, y, a, b, c, d, dst) \
+ do { \
+ uint32_t tmp = Filter_565_Expanded(x, y, a, b, c, d); \
+ *(dst) = SkExpanded_565_To_PMColor(tmp); \
+ } while (0)
#define MAKENAME(suffix) S16_opaque_D32 ## suffix
#define DSTSIZE 32
@@ -146,24 +125,29 @@
SkASSERT(state.fAlphaScale == 256)
#define RETURNDST(src) SkPixel16ToPixel32(src)
#define SRC_TO_FILTER(src) src
-#define FILTER_TO_DST(c) SkExpanded_565_To_PMColor(c)
#include "SkBitmapProcState_sample.h"
+#undef FILTER_PROC
+#define FILTER_PROC(x, y, a, b, c, d, dst) \
+ do { \
+ uint32_t tmp = Filter_565_Expanded(x, y, a, b, c, d); \
+ *(dst) = SkAlphaMulQ(SkExpanded_565_To_PMColor(tmp), alphaScale); \
+ } while (0)
+
#define MAKENAME(suffix) S16_alpha_D32 ## suffix
#define DSTSIZE 32
#define SRCTYPE uint16_t
#define CHECKSTATE(state) SkASSERT(state.fBitmap->config() == SkBitmap::kRGB_565_Config); \
SkASSERT(state.fAlphaScale < 256)
-#define PREAMBLE(state) unsigned scale = state.fAlphaScale
-#define RETURNDST(src) SkAlphaMulQ(SkPixel16ToPixel32(src), scale)
+#define PREAMBLE(state) unsigned alphaScale = state.fAlphaScale
+#define RETURNDST(src) SkAlphaMulQ(SkPixel16ToPixel32(src), alphaScale)
#define SRC_TO_FILTER(src) src
-#define FILTER_TO_DST(c) SkAlphaMulQ(SkExpanded_565_To_PMColor(c), scale)
#include "SkBitmapProcState_sample.h"
// SRC == Index8
#undef FILTER_PROC
-#define FILTER_PROC(x, y, a, b, c, d) Filter_32(x, y, a, b, c, d)
+#define FILTER_PROC(x, y, a, b, c, d, dst) Filter_32_opaque(x, y, a, b, c, d, dst)
#define MAKENAME(suffix) SI8_opaque_D32 ## suffix
#define DSTSIZE 32
@@ -173,53 +157,63 @@
#define PREAMBLE(state) const SkPMColor* SK_RESTRICT table = state.fBitmap->getColorTable()->lockColors()
#define RETURNDST(src) table[src]
#define SRC_TO_FILTER(src) table[src]
-#define FILTER_TO_DST(c) c
#define POSTAMBLE(state) state.fBitmap->getColorTable()->unlockColors(false)
#include "SkBitmapProcState_sample.h"
+#undef FILTER_PROC
+#define FILTER_PROC(x, y, a, b, c, d, dst) Filter_32_alpha(x, y, a, b, c, d, dst, alphaScale)
+
#define MAKENAME(suffix) SI8_alpha_D32 ## suffix
#define DSTSIZE 32
#define SRCTYPE uint8_t
#define CHECKSTATE(state) SkASSERT(state.fBitmap->config() == SkBitmap::kIndex8_Config); \
SkASSERT(state.fAlphaScale < 256)
-#define PREAMBLE(state) unsigned scale = state.fAlphaScale; \
+#define PREAMBLE(state) unsigned alphaScale = state.fAlphaScale; \
const SkPMColor* SK_RESTRICT table = state.fBitmap->getColorTable()->lockColors()
-#define RETURNDST(src) SkAlphaMulQ(table[src], scale)
+#define RETURNDST(src) SkAlphaMulQ(table[src], alphaScale)
#define SRC_TO_FILTER(src) table[src]
-#define FILTER_TO_DST(c) SkAlphaMulQ(c, scale)
#define POSTAMBLE(state) state.fBitmap->getColorTable()->unlockColors(false)
#include "SkBitmapProcState_sample.h"
// SRC == 4444
#undef FILTER_PROC
-#define FILTER_PROC(x, y, a, b, c, d) Filter_4444_D32(x, y, a, b, c, d)
+#define FILTER_PROC(x, y, a, b, c, d, dst) *(dst) = Filter_4444_D32(x, y, a, b, c, d)
#define MAKENAME(suffix) S4444_opaque_D32 ## suffix
#define DSTSIZE 32
#define SRCTYPE SkPMColor16
#define CHECKSTATE(state) SkASSERT(state.fBitmap->config() == SkBitmap::kARGB_4444_Config); \
-SkASSERT(state.fAlphaScale == 256)
+ SkASSERT(state.fAlphaScale == 256)
#define RETURNDST(src) SkPixel4444ToPixel32(src)
#define SRC_TO_FILTER(src) src
-#define FILTER_TO_DST(c) c
#include "SkBitmapProcState_sample.h"
+#undef FILTER_PROC
+#define FILTER_PROC(x, y, a, b, c, d, dst) \
+ do { \
+ uint32_t tmp = Filter_4444_D32(x, y, a, b, c, d); \
+ *(dst) = SkAlphaMulQ(tmp, alphaScale); \
+ } while (0)
+
#define MAKENAME(suffix) S4444_alpha_D32 ## suffix
#define DSTSIZE 32
#define SRCTYPE SkPMColor16
#define CHECKSTATE(state) SkASSERT(state.fBitmap->config() == SkBitmap::kARGB_4444_Config); \
-SkASSERT(state.fAlphaScale < 256)
-#define PREAMBLE(state) unsigned scale = state.fAlphaScale
-#define RETURNDST(src) SkAlphaMulQ(SkPixel4444ToPixel32(src), scale)
+ SkASSERT(state.fAlphaScale < 256)
+#define PREAMBLE(state) unsigned alphaScale = state.fAlphaScale
+#define RETURNDST(src) SkAlphaMulQ(SkPixel4444ToPixel32(src), alphaScale)
#define SRC_TO_FILTER(src) src
-#define FILTER_TO_DST(c) SkAlphaMulQ(c, scale)
#include "SkBitmapProcState_sample.h"
// SRC == A8
#undef FILTER_PROC
-#define FILTER_PROC(x, y, a, b, c, d) Filter_8(x, y, a, b, c, d)
+#define FILTER_PROC(x, y, a, b, c, d, dst) \
+ do { \
+ unsigned tmp = Filter_8(x, y, a, b, c, d); \
+ *(dst) = SkAlphaMulQ(pmColor, SkAlpha255To256(tmp)); \
+ } while (0)
#define MAKENAME(suffix) SA8_alpha_D32 ## suffix
#define DSTSIZE 32
@@ -229,7 +223,6 @@
#define PREAMBLE(state) const SkPMColor pmColor = state.fPaintPMColor;
#define RETURNDST(src) SkAlphaMulQ(pmColor, SkAlpha255To256(src))
#define SRC_TO_FILTER(src) src
-#define FILTER_TO_DST(c) SkAlphaMulQ(pmColor, SkAlpha255To256(c))
#include "SkBitmapProcState_sample.h"
/*****************************************************************************
@@ -241,7 +234,12 @@
// SRC == 8888
#undef FILTER_PROC
-#define FILTER_PROC(x, y, a, b, c, d) Filter_32(x, y, a, b, c, d)
+// SRC == 8888 with a 16-bit destination: Filter_32_opaque blends the four
+// source pixels into a temporary SkPMColor, which SkPixel32ToPixel16 then
+// narrows before it is stored through dst. The dst parameter is parenthesized
+// as *(dst), like the other FILTER_PROC variants in this file, so that an
+// expression argument is dereferenced as a whole.
+#define FILTER_PROC(x, y, a, b, c, d, dst) \
+    do { \
+        SkPMColor dstColor; \
+        Filter_32_opaque(x, y, a, b, c, d, &dstColor); \
+        *(dst) = SkPixel32ToPixel16(dstColor); \
+    } while (0)
#define MAKENAME(suffix) S32_D16 ## suffix
#define DSTSIZE 16
@@ -250,13 +248,16 @@
SkASSERT(state.fBitmap->isOpaque())
#define RETURNDST(src) SkPixel32ToPixel16(src)
#define SRC_TO_FILTER(src) src
-#define FILTER_TO_DST(c) SkPixel32ToPixel16(c)
#include "SkBitmapProcState_sample.h"
// SRC == 565
#undef FILTER_PROC
-#define FILTER_PROC(x, y, a, b, c, d) Filter_565_Expanded(x, y, a, b, c, d)
+#define FILTER_PROC(x, y, a, b, c, d, dst) \
+ do { \
+ uint32_t tmp = Filter_565_Expanded(x, y, a, b, c, d); \
+ *(dst) = SkCompact_rgb_16((tmp) >> 5); \
+ } while (0)
#define MAKENAME(suffix) S16_D16 ## suffix
#define DSTSIZE 16
@@ -264,13 +265,16 @@
#define CHECKSTATE(state) SkASSERT(state.fBitmap->config() == SkBitmap::kRGB_565_Config)
#define RETURNDST(src) src
#define SRC_TO_FILTER(src) src
-#define FILTER_TO_DST(c) SkCompact_rgb_16((c) >> 5)
#include "SkBitmapProcState_sample.h"
// SRC == Index8
#undef FILTER_PROC
-#define FILTER_PROC(x, y, a, b, c, d) Filter_565_Expanded(x, y, a, b, c, d)
+#define FILTER_PROC(x, y, a, b, c, d, dst) \
+ do { \
+ uint32_t tmp = Filter_565_Expanded(x, y, a, b, c, d); \
+ *(dst) = SkCompact_rgb_16((tmp) >> 5); \
+ } while (0)
#define MAKENAME(suffix) SI8_D16 ## suffix
#define DSTSIZE 16
@@ -280,14 +284,18 @@
#define PREAMBLE(state) const uint16_t* SK_RESTRICT table = state.fBitmap->getColorTable()->lock16BitCache()
#define RETURNDST(src) table[src]
#define SRC_TO_FILTER(src) table[src]
-#define FILTER_TO_DST(c) SkCompact_rgb_16(c >> 5)
#define POSTAMBLE(state) state.fBitmap->getColorTable()->unlock16BitCache()
#include "SkBitmapProcState_sample.h"
///////////////////////////////////////////////////////////////////////////////
#undef FILTER_PROC
-#define FILTER_PROC(x, y, a, b, c, d) Filter_565_Expanded(x, y, a, b, c, d)
+#define FILTER_PROC(x, y, a, b, c, d, dst) \
+ do { \
+ uint32_t tmp = Filter_565_Expanded(x, y, a, b, c, d); \
+ *(dst) = SkCompact_rgb_16((tmp) >> 5); \
+ } while (0)
+
#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max)
@@ -299,7 +307,6 @@
#define DSTTYPE uint16_t
#define CHECKSTATE(state) SkASSERT(state.fBitmap->config() == SkBitmap::kRGB_565_Config)
#define SRC_TO_FILTER(src) src
-#define FILTER_TO_DST(c) SkCompact_rgb_16((c) >> 5)
#include "SkBitmapProcState_shaderproc.h"
@@ -313,7 +320,6 @@
#define DSTTYPE uint16_t
#define CHECKSTATE(state) SkASSERT(state.fBitmap->config() == SkBitmap::kRGB_565_Config)
#define SRC_TO_FILTER(src) src
-#define FILTER_TO_DST(c) SkCompact_rgb_16((c) >> 5)
#include "SkBitmapProcState_shaderproc.h"
///////////////////////////////////////////////////////////////////////////////
diff --git a/src/core/SkBitmapProcState_filter.h b/src/core/SkBitmapProcState_filter.h
new file mode 100644
index 0000000..706ff7d
--- /dev/null
+++ b/src/core/SkBitmapProcState_filter.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __arm__
+ #include <machine/cpu-features.h>
+#endif
+#include "SkColorPriv.h"
+
+#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
+static inline void Filter_32_opaque_neon(unsigned x, unsigned y, // NEON blend of four 32-bit pixels; x, y are the blend weights (the asm also forms 16-x and 16-y; presumably 0..15 as the portable variant asserts - confirm)
+ SkPMColor a00, SkPMColor a01, // pair scaled by (16-y); a00 is further scaled by (16-x), a01 by x
+ SkPMColor a10, SkPMColor a11, // pair scaled by y; a10 is further scaled by (16-x), a11 by x
+ SkPMColor *dst) { // the single blended pixel is stored through this pointer
+ asm volatile(
+ "vdup.8 d0, %[y] \n\t" // duplicate y into d0
+ "vmov.u8 d16, #16 \n\t" // set up constant in d16
+ "vsub.u8 d1, d16, d0 \n\t" // d1 = 16-y
+ // pack the pixels two per D register: a00/a10 in the low word, a01/a11 in the high word
+ "vdup.32 d4, %[a00] \n\t" // duplicate a00 into d4
+ "vdup.32 d5, %[a10] \n\t" // duplicate a10 into d5
+ "vmov.32 d4[1], %[a01] \n\t" // set top of d4 to a01
+ "vmov.32 d5[1], %[a11] \n\t" // set top of d5 to a11
+ // widening multiplies: every 8-bit channel becomes a 16-bit product with its y weight
+ "vmull.u8 q3, d4, d1 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d0 \n\t" // q0 = [a11|a10] * y
+ // d5 and d16 are reused from here on; the pixels d5 held are already folded into q0
+ "vdup.16 d5, %[x] \n\t" // duplicate x into d5
+ "vmov.u16 d16, #16 \n\t" // set up constant in d16
+ "vsub.u16 d3, d16, d5 \n\t" // d3 = 16-x
+ // sum the four terms per channel: d7/d1 are the high (a01/a11) halves of q3/q0, d6/d0 the low (a00/a10) halves
+ "vmul.i16 d4, d7, d5 \n\t" // d4 = a01 * x
+ "vmla.i16 d4, d1, d5 \n\t" // d4 += a11 * x
+ "vmla.i16 d4, d6, d3 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d4, d0, d3 \n\t" // d4 += a10 * (16-x)
+ "vshrn.i16 d0, q2, #8 \n\t" // shift down result by 8 and narrow to bytes; only the d4 half of q2 carries the pixel
+ "vst1.32 {d0[0]}, [%[dst]] \n\t" // store result (low 32 bits of d0 only)
+ : // no output operands: the pixel is written by vst1, which the "memory" clobber accounts for
+ : [x] "r" (x), [y] "r" (y), [a00] "r" (a00), [a01] "r" (a01), [a10] "r" (a10), [a11] "r" (a11), [dst] "r" (dst)
+ : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16" // NOTE(review): r4 and d2 are listed but no instruction above names them - confirm before trimming
+ );
+}
+
+static inline void Filter_32_alpha_neon(unsigned x, unsigned y, // same NEON blend as Filter_32_opaque_neon, followed by a per-channel multiply by scale; x, y are the blend weights (presumably 0..15 - confirm)
+ SkPMColor a00, SkPMColor a01, // pair scaled by (16-y); a00 is further scaled by (16-x), a01 by x
+ SkPMColor a10, SkPMColor a11, // pair scaled by y; a10 is further scaled by (16-x), a11 by x
+ SkPMColor *dst, uint16_t scale) { // dst receives the pixel; each blended channel is multiplied by scale and shifted down by 8 (callers in this patch pass state.fAlphaScale)
+ asm volatile(
+ "vdup.8 d0, %[y] \n\t" // duplicate y into d0
+ "vmov.u8 d16, #16 \n\t" // set up constant in d16
+ "vsub.u8 d1, d16, d0 \n\t" // d1 = 16-y
+ // pack the pixels two per D register: a00/a10 in the low word, a01/a11 in the high word
+ "vdup.32 d4, %[a00] \n\t" // duplicate a00 into d4
+ "vdup.32 d5, %[a10] \n\t" // duplicate a10 into d5
+ "vmov.32 d4[1], %[a01] \n\t" // set top of d4 to a01
+ "vmov.32 d5[1], %[a11] \n\t" // set top of d5 to a11
+ // widening multiplies: every 8-bit channel becomes a 16-bit product with its y weight
+ "vmull.u8 q3, d4, d1 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d0 \n\t" // q0 = [a11|a10] * y
+ // d5 and d16 are reused from here on; the pixels d5 held are already folded into q0
+ "vdup.16 d5, %[x] \n\t" // duplicate x into d5
+ "vmov.u16 d16, #16 \n\t" // set up constant in d16
+ "vsub.u16 d3, d16, d5 \n\t" // d3 = 16-x
+ // sum the four terms per channel: d7/d1 are the high (a01/a11) halves of q3/q0, d6/d0 the low (a00/a10) halves
+ "vmul.i16 d4, d7, d5 \n\t" // d4 = a01 * x
+ "vmla.i16 d4, d1, d5 \n\t" // d4 += a11 * x
+ "vmla.i16 d4, d6, d3 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d4, d0, d3 \n\t" // d4 += a10 * (16-x)
+ "vdup.16 d3, %[scale] \n\t" // duplicate scale into d3 (16-x is no longer needed)
+ "vshr.u16 d4, d4, #8 \n\t" // shift down result by 8
+ "vmul.i16 d4, d4, d3 \n\t" // multiply result by scale
+ "vshrn.i16 d0, q2, #8 \n\t" // shift down result by 8 and narrow to bytes; only the d4 half of q2 carries the pixel
+ "vst1.32 {d0[0]}, [%[dst]] \n\t" // store result (low 32 bits of d0 only)
+ : // no output operands: the pixel is written by vst1, which the "memory" clobber accounts for
+ : [x] "r" (x), [y] "r" (y), [a00] "r" (a00), [a01] "r" (a01), [a10] "r" (a10), [a11] "r" (a11), [dst] "r" (dst), [scale] "r" (scale)
+ : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16" // NOTE(review): r4 and d2 are listed but no instruction above names them - confirm before trimming
+ );
+}
+#define Filter_32_opaque Filter_32_opaque_neon
+#define Filter_32_alpha Filter_32_alpha_neon
+#else
+// Portable blend of four 32-bit pixels into *dstColor.
+//
+// x and y are blend weights, asserted to be <= 0xF. Each source pixel gets a
+// weight expressed in 1/256 units:
+//     a00: (16-x)*(16-y)      a01: x*(16-y)
+//     a10: (16-x)*y           a11: x*y
+// The four weights always add up to 256, so the final ">> 8" renormalizes.
+static inline void Filter_32_opaque_portable(unsigned x, unsigned y,
+                                             SkPMColor a00, SkPMColor a01,
+                                             SkPMColor a10, SkPMColor a11,
+                                             SkPMColor* dstColor) {
+    SkASSERT((unsigned)x <= 0xF);
+    SkASSERT((unsigned)y <= 0xF);
+
+    const uint32_t pairMask = gMask_00FF00FF; //0xFF00FF;
+
+    const int xy  = x * y;
+    const int w00 = 256 - 16*y - 16*x + xy;
+    const int w01 = 16*x - xy;
+    const int w10 = 16*y - xy;
+    const int w11 = xy;
+
+    // Two channels are scaled per multiply: "c & pairMask" selects one pair of
+    // bytes, "(c >> 8) & pairMask" the other pair.
+    const uint32_t lo = (a00 & pairMask) * w00
+                      + (a01 & pairMask) * w01
+                      + (a10 & pairMask) * w10
+                      + (a11 & pairMask) * w11;
+    const uint32_t hi = ((a00 >> 8) & pairMask) * w00
+                      + ((a01 >> 8) & pairMask) * w01
+                      + ((a10 >> 8) & pairMask) * w10
+                      + ((a11 >> 8) & pairMask) * w11;
+
+    // lo needs the shift back down; hi already sits in the upper byte of each lane.
+    *dstColor = ((lo >> 8) & pairMask) | (hi & ~pairMask);
+}
+
+// Portable blend of four 32-bit pixels, followed by a multiply of every
+// channel by alphaScale (asserted <= 256, applied in 1/256 units). The result
+// is written to *dstColor.
+//
+// x and y are blend weights, asserted to be <= 0xF. Per-pixel weights, in
+// 1/256 units and always summing to 256:
+//     a00: (16-x)*(16-y)      a01: x*(16-y)
+//     a10: (16-x)*y           a11: x*y
+static inline void Filter_32_alpha_portable(unsigned x, unsigned y,
+                                            SkPMColor a00, SkPMColor a01,
+                                            SkPMColor a10, SkPMColor a11,
+                                            SkPMColor* dstColor,
+                                            unsigned alphaScale) {
+    SkASSERT((unsigned)x <= 0xF);
+    SkASSERT((unsigned)y <= 0xF);
+    SkASSERT(alphaScale <= 256);
+
+    const uint32_t pairMask = gMask_00FF00FF; //0xFF00FF;
+
+    const int xy  = x * y;
+    const int w00 = 256 - 16*y - 16*x + xy;
+    const int w01 = 16*x - xy;
+    const int w10 = 16*y - xy;
+    const int w11 = xy;
+
+    // Two channels are scaled per multiply: "c & pairMask" selects one pair of
+    // bytes, "(c >> 8) & pairMask" the other pair.
+    uint32_t lo = (a00 & pairMask) * w00
+                + (a01 & pairMask) * w01
+                + (a10 & pairMask) * w10
+                + (a11 & pairMask) * w11;
+    uint32_t hi = ((a00 >> 8) & pairMask) * w00
+                + ((a01 >> 8) & pairMask) * w01
+                + ((a10 >> 8) & pairMask) * w10
+                + ((a11 >> 8) & pairMask) * w11;
+
+    // Bring both halves back to 8 bits per channel, then apply the alpha scale.
+    lo = ((lo >> 8) & pairMask) * alphaScale;
+    hi = ((hi >> 8) & pairMask) * alphaScale;
+
+    // lo needs the shift back down; hi already sits in the upper byte of each lane.
+    *dstColor = ((lo >> 8) & pairMask) | (hi & ~pairMask);
+}
+#define Filter_32_opaque Filter_32_opaque_portable
+#define Filter_32_alpha Filter_32_alpha_portable
+#endif
+
diff --git a/src/core/SkBitmapProcState_sample.h b/src/core/SkBitmapProcState_sample.h
index 954afca..4e1f139 100644
--- a/src/core/SkBitmapProcState_sample.h
+++ b/src/core/SkBitmapProcState_sample.h
@@ -76,6 +76,7 @@
SkASSERT((unsigned)xy[0] < (unsigned)s.fBitmap->height());
srcAddr = (const SRCTYPE*)((const char*)srcAddr +
xy[0] * s.fBitmap->rowBytes());
+ xy += 1;
SRCTYPE src;
@@ -85,20 +86,20 @@
BITMAPPROC_MEMSET(colors, dstValue, count);
} else {
int i;
- const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy + 1);
for (i = (count >> 2); i > 0; --i) {
- SkASSERT(*xx < (unsigned)s.fBitmap->width());
- src = srcAddr[*xx++]; *colors++ = RETURNDST(src);
+ uint32_t xx0 = *xy++;
+ uint32_t xx1 = *xy++;
+ SRCTYPE x0 = srcAddr[UNPACK_PRIMARY_SHORT(xx0)];
+ SRCTYPE x1 = srcAddr[UNPACK_SECONDARY_SHORT(xx0)];
+ SRCTYPE x2 = srcAddr[UNPACK_PRIMARY_SHORT(xx1)];
+ SRCTYPE x3 = srcAddr[UNPACK_SECONDARY_SHORT(xx1)];
- SkASSERT(*xx < (unsigned)s.fBitmap->width());
- src = srcAddr[*xx++]; *colors++ = RETURNDST(src);
-
- SkASSERT(*xx < (unsigned)s.fBitmap->width());
- src = srcAddr[*xx++]; *colors++ = RETURNDST(src);
-
- SkASSERT(*xx < (unsigned)s.fBitmap->width());
- src = srcAddr[*xx++]; *colors++ = RETURNDST(src);
+ *colors++ = RETURNDST(x0);
+ *colors++ = RETURNDST(x1);
+ *colors++ = RETURNDST(x2);
+ *colors++ = RETURNDST(x3);
}
+ const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy);
for (i = (count & 3); i > 0; --i) {
SkASSERT(*xx < (unsigned)s.fBitmap->width());
src = srcAddr[*xx++]; *colors++ = RETURNDST(src);
@@ -144,12 +145,13 @@
unsigned subX = x0 & 0xF;
x0 >>= 4;
- uint32_t c = FILTER_PROC(subX, subY,
- SRC_TO_FILTER(row0[x0]),
- SRC_TO_FILTER(row0[x1]),
- SRC_TO_FILTER(row1[x0]),
- SRC_TO_FILTER(row1[x1]));
- *colors++ = FILTER_TO_DST(c);
+ FILTER_PROC(subX, subY,
+ SRC_TO_FILTER(row0[x0]),
+ SRC_TO_FILTER(row0[x1]),
+ SRC_TO_FILTER(row1[x0]),
+ SRC_TO_FILTER(row1[x1]),
+ colors);
+ colors += 1;
} while (--count != 0);
@@ -186,12 +188,13 @@
const SRCTYPE* SK_RESTRICT row0 = (const SRCTYPE*)(srcAddr + y0 * rb);
const SRCTYPE* SK_RESTRICT row1 = (const SRCTYPE*)(srcAddr + y1 * rb);
- uint32_t c = FILTER_PROC(subX, subY,
- SRC_TO_FILTER(row0[x0]),
- SRC_TO_FILTER(row0[x1]),
- SRC_TO_FILTER(row1[x0]),
- SRC_TO_FILTER(row1[x1]));
- *colors++ = FILTER_TO_DST(c);
+ FILTER_PROC(subX, subY,
+ SRC_TO_FILTER(row0[x0]),
+ SRC_TO_FILTER(row0[x1]),
+ SRC_TO_FILTER(row1[x0]),
+ SRC_TO_FILTER(row1[x1]),
+ colors);
+ colors += 1;
} while (--count != 0);
#ifdef POSTAMBLE
diff --git a/src/core/SkBitmapProcState_shaderproc.h b/src/core/SkBitmapProcState_shaderproc.h
index 070a62b..b4a53e4 100644
--- a/src/core/SkBitmapProcState_shaderproc.h
+++ b/src/core/SkBitmapProcState_shaderproc.h
@@ -41,12 +41,13 @@
unsigned x0 = TILEX_PROCF(fx, maxX);
unsigned x1 = TILEX_PROCF((fx + oneX), maxX);
- uint32_t c = FILTER_PROC(subX, subY,
- SRC_TO_FILTER(row0[x0]),
- SRC_TO_FILTER(row0[x1]),
- SRC_TO_FILTER(row1[x0]),
- SRC_TO_FILTER(row1[x1]));
- *colors++ = FILTER_TO_DST(c);
+ FILTER_PROC(subX, subY,
+ SRC_TO_FILTER(row0[x0]),
+ SRC_TO_FILTER(row0[x1]),
+ SRC_TO_FILTER(row1[x0]),
+ SRC_TO_FILTER(row1[x1]),
+ colors);
+ colors += 1;
fx += dx;
} while (--count != 0);
diff --git a/src/opts/SkBitmapProcState_opts_arm.cpp b/src/opts/SkBitmapProcState_opts_arm.cpp
index 1bd4d91..707304c 100644
--- a/src/opts/SkBitmapProcState_opts_arm.cpp
+++ b/src/opts/SkBitmapProcState_opts_arm.cpp
@@ -19,264 +19,6 @@
#include "SkColorPriv.h"
#include "SkUtils.h"
-#if __ARM_ARCH__ >= 5 && !defined(SK_CPU_BENDIAN)
-void S16_D16_nofilter_DX_arm(const SkBitmapProcState& s,
- const uint32_t* SK_RESTRICT xy,
- int count, uint16_t* SK_RESTRICT colors) {
- SkASSERT(count > 0 && colors != NULL);
- SkASSERT(s.fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask));
- SkASSERT(s.fDoFilter == false);
-
- const uint16_t* SK_RESTRICT srcAddr = (const uint16_t*)s.fBitmap->getPixels();
-
- // buffer is y32, x16, x16, x16, x16, x16
- // bump srcAddr to the proper row, since we're told Y never changes
- SkASSERT((unsigned)xy[0] < (unsigned)s.fBitmap->height());
- srcAddr = (const uint16_t*)((const char*)srcAddr +
- xy[0] * s.fBitmap->rowBytes());
-
- uint16_t src;
-
- if (1 == s.fBitmap->width()) {
- src = srcAddr[0];
- uint16_t dstValue = src;
- sk_memset16(colors, dstValue, count);
- } else {
- int i;
- const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy + 1);
-
- if((count >> 2) > 0) {
- asm volatile (
- "mov r8, %[count], lsr #2 \n\t" // shift down count so we iterate in fours
- "1: \n\t"
- "subs r8, r8, #1 \n\t" // decrement loop counter
- "ldrh r4, [%[xx]], #2 \n\t" // load xx value, update ptr
- "ldrh r5, [%[xx]], #2 \n\t" // load xx value, update ptr
- "ldrh r6, [%[xx]], #2 \n\t" // load xx value, update ptr
- "add r4, r4, r4 \n\t" // double offset for half word addressing
- "ldrh r7, [%[xx]], #2 \n\t" // load xx value, update ptr
- "add r5, r5, r5 \n\t" // double offset for half word addressing
- "ldrh r4, [%[srcAddr], r4] \n\t" // load value from srcAddr[*xx]
- "add r6, r6, r6 \n\t" // double offset for half word addressing
- "ldrh r5, [%[srcAddr], r5] \n\t" // load value from srcAddr[*xx]
- "add r7, r7, r7 \n\t" // double offset for half word addressing
- "ldrh r6, [%[srcAddr], r6] \n\t" // load value from srcAddr[*xx]
- "ldrh r7, [%[srcAddr], r7] \n\t" // load value from srcAddr[*xx]
- "strh r4, [%[colors]], #2 \n\t" // store value to colors, update ptr
- "strh r5, [%[colors]], #2 \n\t" // store value to colors, update ptr
- "strh r6, [%[colors]], #2 \n\t" // store value to colors, update ptr
- "strh r7, [%[colors]], #2 \n\t" // store value to colors, update ptr
- "bgt 1b \n\t" // branch if loop counter > 0
- : [count] "+r" (count), [xx] "+r" (xx), [srcAddr] "+r" (srcAddr), [colors] "+r" (colors)
- :
- : "cc", "memory", "r4", "r5", "r6", "r7", "r8"
- );
- }
- for (i = (count & 3); i > 0; --i) {
- SkASSERT(*xx < (unsigned)s.fBitmap->width());
- src = srcAddr[*xx++]; *colors++ = src;
- }
- }
-}
-#endif //__ARM_ARCH__ >= 5 && !defined(SK_CPU_BENDIAN)
-
-#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
-void S16_D16_filter_DX_arm(const SkBitmapProcState& s,
- const uint32_t* SK_RESTRICT xy,
- int count, uint16_t* SK_RESTRICT colors)
-{
- SkASSERT(count > 0 && colors != NULL);
- SkASSERT(s.fDoFilter);
-
- const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels();
- unsigned rb = s.fBitmap->rowBytes();
- unsigned subY;
- const uint16_t* SK_RESTRICT row0;
- const uint16_t* SK_RESTRICT row1;
- unsigned int rowgap;
- const uint32_t c7ffe = 0x7ffe;
-
- // setup row ptrs and update proc_table
- {
- uint32_t XY = *xy++;
- unsigned y0 = XY >> 14;
- row0 = (const uint16_t*)(srcAddr + (y0 >> 4) * rb);
- row1 = (const uint16_t*)(srcAddr + (XY & 0x3FFF) * rb);
- rowgap = (unsigned int)row1 - (unsigned int)row0;
- subY = y0 & 0xF;
- }
-
- unsigned int count4 = ((count >> 2) << 4) | subY;
- count &= 3;
-
- asm volatile (
- "and r4, %[count4], #0xF \n\t" // mask off subY
- "vmov.u16 d2[0], r4 \n\t" // move subY to Neon
- "rsb r4, r4, #16 \n\t" // r4 = 16-subY
- "vmov.u16 d2[1], r4 \n\t" // move 16-subY to Neon
- "movs %[count4], %[count4], lsr #4 \n\t" // shift count down, lose subY
- "vmov.u16 d3, #16 \n\t" // create constant
- "vmov.u16 q2, #31 \n\t" // set up blue mask
- "beq 2f \n\t" // if count4 == 0, exit
-
- "1: \n\t"
- "ldmia %[xy]!, {r4, r5, r6, r7} \n\t" // load four xy values
- // xy = [ x0:14 | subX:4 | x1:14 ]
- // extract subX for iter 0-3
- "vmov d0, r4, r5 \n\t" // move xy to Neon, iter 0-1
- "vmov d1, r6, r7 \n\t" // move xy to Neon, iter 2-3
-
- // Load 16 pixels for four filter iterations from memory.
- // Because the source pixels are potentially scattered, each lane
- // of each vector is loaded separately. Also, the X sub pixel
- // offset is extracted.
-
- // iter 0
- "mov r8, r4, lsr #18 \n\t" // extract x0
- "and r4, %[c7ffe], r4, lsl #1 \n\t" // extract x1 and make byte offset
- "add r8, %[row0], r8, lsl #1 \n\t" // calculate address of row0[x0]
- "add r4, %[row0], r4 \n\t" // calculate address of row0[x1]
- "vld1.u16 {d16[0]}, [r8], %[rowgap] \n\t" // load row0[x0] and move ptr to row1
- "vld1.u16 {d17[0]}, [r4], %[rowgap] \n\t" // load row0[x1] and move ptr to row1
- "vld1.u16 {d18[0]}, [r8] \n\t" // load row1[x0]
- "vld1.u16 {d19[0]}, [r4] \n\t" // load row1[x1]
-
- // iter 1
- "mov r8, r5, lsr #18 \n\t" // extract x0
- "and r5, %[c7ffe], r5, lsl #1 \n\t" // extract x1 and make byte offset
- "add r8, %[row0], r8, lsl #1 \n\t" // calculate address of row0[x0]
- "add r5, %[row0], r5 \n\t" // calculate address of row0[x1]
- "vld1.u16 {d16[1]}, [r8], %[rowgap] \n\t" // load row0[x0] and move ptr to row1
- "vld1.u16 {d17[1]}, [r5], %[rowgap] \n\t" // load row0[x1] and move ptr to row1
- "vld1.u16 {d18[1]}, [r8] \n\t" // load row1[x0]
- "vld1.u16 {d19[1]}, [r5] \n\t" // load row1[x1]
-
- "vshrn.u32 d0, q0, #2 \n\t" // shift right subX by 2 and narrow
- // iter 2
- "mov r8, r6, lsr #18 \n\t" // extract x0
- "and r6, %[c7ffe], r6, lsl #1 \n\t" // extract x1 and make byte offset
- "add r8, %[row0], r8, lsl #1 \n\t" // calculate address of row0[x0]
- "add r6, %[row0], r6 \n\t" // calculate address of row0[x1]
- "vld1.u16 {d16[2]}, [r8], %[rowgap] \n\t" // load row0[x0] and move ptr to row1
- "vld1.u16 {d17[2]}, [r6], %[rowgap] \n\t" // load row0[x1] and move ptr to row1
- "vld1.u16 {d18[2]}, [r8] \n\t" // load row1[x0]
- "vld1.u16 {d19[2]}, [r6] \n\t" // load row1[x1]
-
- "vshr.u16 d0, d0, #12 \n\t" // shift right subX to bottom 4 bits
- // iter 3
- "mov r8, r7, lsr #18 \n\t" // extract x0
- "and r7, %[c7ffe], r7, lsl #1 \n\t" // extract x1 and make byte offset
- "add r8, %[row0], r8, lsl #1 \n\t" // calculate address of row0[x0]
- "add r7, %[row0], r7 \n\t" // calculate address of row0[x1]
- "vld1.u16 {d16[3]}, [r8], %[rowgap] \n\t" // load row0[x0] and move ptr to row1
- "vld1.u16 {d17[3]}, [r7], %[rowgap] \n\t" // load row0[x1] and move ptr to row1
- "vld1.u16 {d18[3]}, [r8] \n\t" // load row1[x0]
- "vld1.u16 {d19[3]}, [r7] \n\t" // load row1[x1]
-
- // Registers d16-d19 now contain pixels a00-a11 for 4 iterations:
- // d16 = [ a00_3 | a00_2 | a00_1 | a00_0 ]
- // d17 = [ a01_3 | a01_2 | a01_1 | a01_0 ]
- // d18 = [ a10_3 | a10_2 | a10_1 | a10_0 ]
- // d19 = [ a11_3 | a11_2 | a11_1 | a11_0 ]
- //
- // Extract RGB channels from each 565 pixel.
-
- "vshl.i16 q11, q8, #5 \n\t" // shift greens to top of each lane
- "vand q12, q8, q2 \n\t" // mask blues
- "vshr.u16 q10, q8, #11 \n\t" // shift reds to bottom of each lane
- "vshr.u16 q11, q11, #10 \n\t" // shift greens to bottom of each lane
- "vshl.i16 q14, q9, #5 \n\t" // shift greens to top of each lane
- "vand q15, q9, q2 \n\t" // mask blues
- "vshr.u16 q13, q9, #11 \n\t" // shift reds to bottom of each lane
- "vshr.u16 q14, q14, #10 \n\t" // shift greens to bottom of each lane
-
- // There are now six Q regs, containing
- // q10 = [ a01r3 | a01r2 | a01r1 | a01r0 | a00r3 | a00r2 | a00r1 | a00r0 ]
- // q11 = [ a01g3 | a01g2 | a01g1 | a01g0 | a00g3 | a00g2 | a00g1 | a00g0 ]
- // q12 = [ a01b3 | a01b2 | a01b1 | a01b0 | a00b3 | a00b2 | a00b1 | a00b0 ]
- // q13 = [ a11r3 | a11r2 | a11r1 | a11r0 | a01r3 | a01r2 | a01r1 | a01r0 ]
- // q14 = [ a11g3 | a11g2 | a11g1 | a11g0 | a01g3 | a01g2 | a01g1 | a01g0 ]
- // q15 = [ a11b3 | a11b2 | a11b1 | a11b0 | a01b3 | a01b2 | a01b1 | a01b0 ]
- // where aXXyZ: XX = pixel position, y = colour channel, Z = iteration
- // d0 = subX, d1 = 16-subX
- // d2[0] = subY, d2[1] = 16-subY
- // d3 = 16, q2(d4d5) = 31
-
- // The filter:
- //
- // | |
- // ---- a00 ---- a01 ----> * (16-y)
- // | |
- // -----a10 ---- a11 ----> * y
- // | |
- // V V
- // * (16-x) * x
- //
- // result = (a00.(16-y).(16-x) + a01.(16-y).x + a10.(16-x).y + a11.x.y) >> 8
- //
-
- "vsub.u16 d1, d3, d0 \n\t" // calculate 16-subX
- // multiply top pixel pair by (16-y)
- "vmul.i16 q10, q10, d2[1] \n\t" // top reds multiplied by (16-y)
- "vmul.i16 q11, q11, d2[1] \n\t" // top greens multiplied by (16-y)
- "vmul.i16 q12, q12, d2[1] \n\t" // top blues multiplied by (16-y)
- // multiply bottom pixel pair by y
- "vmul.i16 q13, q13, d2[0] \n\t" // bottom reds multiplied by y
- "vmul.i16 q14, q14, d2[0] \n\t" // bottom greens multiplied by y
- "vmul.i16 q15, q15, d2[0] \n\t" // bottom blues multiplied by y
- // mul/acc left pixels by (16-x)
- "vmul.i16 d16, d20, d1 \n\t" // resultr = a00r * (16-x)
- "vmul.i16 d17, d22, d1 \n\t" // resultg = a00g * (16-x)
- "vmul.i16 d18, d24, d1 \n\t" // resultb = a00b * (16-x)
- "vmla.i16 d16, d26, d1 \n\t" // resultr += a00r * (16-x)
- "vmla.i16 d17, d28, d1 \n\t" // resultg += a00g * (16-x)
- "vmla.i16 d18, d30, d1 \n\t" // resultb += a00b * (16-x)
- // mul/acc right pixels by x
- "vmla.i16 d16, d21, d0 \n\t" // resultr += a01r * x
- "vmla.i16 d17, d23, d0 \n\t" // resultg += a01g * x
- "vmla.i16 d18, d25, d0 \n\t" // resultb += a01b * x
- "vmla.i16 d16, d27, d0 \n\t" // resultr += a11r * x
- "vmla.i16 d17, d29, d0 \n\t" // resultg += a11g * x
- "vmla.i16 d18, d31, d0 \n\t" // resultb += a11b * x
- "subs %[count4], %[count4], #1 \n\t" // decrement counter
- // shift results down 8 bits
- "vshr.u16 q8, q8, #8 \n\t" // resultr >>= 8, resultg >>=8
- "vshr.u16 d18, d18, #8 \n\t" // resultb >>= 8
- // put rgb into 565
- "vsli.i16 d18, d17, #5 \n\t" // shift greens into blues
- "vsli.i16 d18, d16, #11 \n\t" // shift reds into greens and blues
- "vst1.i16 {d18}, [%[colors]]! \n\t" // store result
- "bgt 1b \n\t" // if counter > 0, loop
- "2: \n\t" // exit
- : [xy] "+r" (xy), [count4] "+r" (count4), [colors] "+r" (colors)
- : [row0] "r" (row0), [rowgap] "r" (rowgap), [c7ffe] "r" (c7ffe)
- : "cc", "memory", "r4", "r5", "r6", "r7", "r8", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
- );
-
- while (count != 0) {
- uint32_t XX = *xy++; // x0:14 | subX:4 | x1:14
- unsigned x0 = XX >> 14;
- unsigned x1 = XX & 0x3FFF;
- unsigned subX = x0 & 0xF;
- x0 >>= 4;
-
- uint32_t a00 = SkExpand_rgb_16(row0[x0]);
- uint32_t a01 = SkExpand_rgb_16(row0[x1]);
- uint32_t a10 = SkExpand_rgb_16(row1[x0]);
- uint32_t a11 = SkExpand_rgb_16(row1[x1]);
-
- int xy = subX * subY >> 3;
- uint32_t c = a00 * (32 - 2*subY - 2*subX + xy) +
- a01 * (2*subX - xy) +
- a10 * (2*subY - xy) +
- a11 * xy;
-
- *colors++ = SkCompact_rgb_16(c>>5);
- count--;
- }
-}
-#endif //defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
-
#if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
void SI8_D16_nofilter_DX_arm(const SkBitmapProcState& s,
const uint32_t* SK_RESTRICT xy,
@@ -438,229 +180,6 @@
}
#endif //__ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
-#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
-static inline void Filter_32_direct(unsigned x, unsigned y,
- SkPMColor a00, SkPMColor a01,
- SkPMColor a10, SkPMColor a11,
- SkPMColor *dst) {
- asm volatile(
- "vdup.8 d0, %[y] \n\t" // duplicate y into d0
- "vmov.u8 d16, #16 \n\t" // set up constant in d16
- "vsub.u8 d1, d16, d0 \n\t" // d1 = 16-y
-
- "vdup.32 d4, %[a00] \n\t" // duplicate a00 into d4
- "vdup.32 d5, %[a10] \n\t" // duplicate a10 into d5
- "vmov.32 d4[1], %[a01] \n\t" // set top of d4 to a01
- "vmov.32 d5[1], %[a11] \n\t" // set top of d5 to a11
-
- "vmull.u8 q3, d4, d1 \n\t" // q3 = [a01|a00] * (16-y)
- "vmull.u8 q0, d5, d0 \n\t" // q0 = [a11|a10] * y
-
- "vdup.16 d5, %[x] \n\t" // duplicate x into d5
- "vmov.u16 d16, #16 \n\t" // set up constant in d16
- "vsub.u16 d3, d16, d5 \n\t" // d3 = 16-x
-
- "vmul.i16 d4, d7, d5 \n\t" // d4 = a01 * x
- "vmla.i16 d4, d1, d5 \n\t" // d4 += a11 * x
- "vmla.i16 d4, d6, d3 \n\t" // d4 += a00 * (16-x)
- "vmla.i16 d4, d0, d3 \n\t" // d4 += a10 * (16-x)
- "vshrn.i16 d0, q2, #8 \n\t" // shift down result by 8
- "vst1.32 {d0[0]}, [%[dst]] \n\t" // store result
- :
- : [x] "r" (x), [y] "r" (y), [a00] "r" (a00), [a01] "r" (a01), [a10] "r" (a10), [a11] "r" (a11), [dst] "r" (dst)
- : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16"
- );
-}
-
-static inline void Filter_32_direct_alpha(unsigned x, unsigned y,
- SkPMColor a00, SkPMColor a01,
- SkPMColor a10, SkPMColor a11,
- SkPMColor *dst, uint16_t scale) {
- asm volatile(
- "vdup.8 d0, %[y] \n\t" // duplicate y into d0
- "vmov.u8 d16, #16 \n\t" // set up constant in d16
- "vsub.u8 d1, d16, d0 \n\t" // d1 = 16-y
-
- "vdup.32 d4, %[a00] \n\t" // duplicate a00 into d4
- "vdup.32 d5, %[a10] \n\t" // duplicate a10 into d5
- "vmov.32 d4[1], %[a01] \n\t" // set top of d4 to a01
- "vmov.32 d5[1], %[a11] \n\t" // set top of d5 to a11
-
- "vmull.u8 q3, d4, d1 \n\t" // q3 = [a01|a00] * (16-y)
- "vmull.u8 q0, d5, d0 \n\t" // q0 = [a11|a10] * y
-
- "vdup.16 d5, %[x] \n\t" // duplicate x into d5
- "vmov.u16 d16, #16 \n\t" // set up constant in d16
- "vsub.u16 d3, d16, d5 \n\t" // d3 = 16-x
-
- "vmul.i16 d4, d7, d5 \n\t" // d4 = a01 * x
- "vmla.i16 d4, d1, d5 \n\t" // d4 += a11 * x
- "vmla.i16 d4, d6, d3 \n\t" // d4 += a00 * (16-x)
- "vmla.i16 d4, d0, d3 \n\t" // d4 += a10 * (16-x)
- "vdup.16 d3, %[scale] \n\t" // duplicate scale into d3
- "vshr.u16 d4, d4, #8 \n\t" // shift down result by 8
- "vmul.i16 d4, d4, d3 \n\t" // multiply result by scale
- "vshrn.i16 d0, q2, #8 \n\t" // shift down result by 8
- "vst1.32 {d0[0]}, [%[dst]] \n\t" // store result
- :
- : [x] "r" (x), [y] "r" (y), [a00] "r" (a00), [a01] "r" (a01), [a10] "r" (a10), [a11] "r" (a11), [dst] "r" (dst), [scale] "r" (scale)
- : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16"
- );
-}
-
-void SI8_opaque_D32_filter_DX_arm(const SkBitmapProcState& s,
- const uint32_t* SK_RESTRICT xy,
- int count, SkPMColor* SK_RESTRICT colors) {
- SkASSERT(count > 0 && colors != NULL);
- SkASSERT(s.fDoFilter);
-
- const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors();
- const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels();
- unsigned rb = s.fBitmap->rowBytes();
- unsigned subY;
- const uint8_t* SK_RESTRICT row0;
- const uint8_t* SK_RESTRICT row1;
-
- // setup row ptrs and update proc_table
- {
- uint32_t XY = *xy++;
- unsigned y0 = XY >> 14;
- row0 = (const uint8_t*)(srcAddr + (y0 >> 4) * rb);
- row1 = (const uint8_t*)(srcAddr + (XY & 0x3FFF) * rb);
- subY = y0 & 0xF;
- }
-
- do {
- uint32_t XX = *xy++; // x0:14 | 4 | x1:14
- unsigned x0 = XX >> 14;
- unsigned x1 = XX & 0x3FFF;
- unsigned subX = x0 & 0xF;
- x0 >>= 4;
-
- Filter_32_direct(subX, subY, table[row0[x0]],
- table[row0[x1]],
- table[row1[x0]],
- table[row1[x1]], colors);
- colors++;
- } while (--count != 0);
-
- s.fBitmap->getColorTable()->unlockColors(false);
-}
-
-void SI8_opaque_D32_filter_DXDY_arm(const SkBitmapProcState& s,
- const uint32_t* SK_RESTRICT xy,
- int count, SkPMColor* SK_RESTRICT colors) {
- SkASSERT(count > 0 && colors != NULL);
- SkASSERT(s.fDoFilter);
-
- const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors();
- const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels();
- int rb = s.fBitmap->rowBytes();
-
- do {
- uint32_t data = *xy++;
- unsigned y0 = data >> 14;
- unsigned y1 = data & 0x3FFF;
- unsigned subY = y0 & 0xF;
- y0 >>= 4;
-
- data = *xy++;
- unsigned x0 = data >> 14;
- unsigned x1 = data & 0x3FFF;
- unsigned subX = x0 & 0xF;
- x0 >>= 4;
-
- const uint8_t* SK_RESTRICT row0 = (const uint8_t*)(srcAddr + y0 * rb);
- const uint8_t* SK_RESTRICT row1 = (const uint8_t*)(srcAddr + y1 * rb);
-
- Filter_32_direct(subX, subY, table[row0[x0]],
- table[row0[x1]],
- table[row1[x0]],
- table[row1[x1]], colors);
- colors++;
- } while (--count != 0);
-
- s.fBitmap->getColorTable()->unlockColors(false);
-}
-
-void SI8_alpha_D32_filter_DX_arm(const SkBitmapProcState& s,
- const uint32_t* SK_RESTRICT xy,
- int count, SkPMColor* SK_RESTRICT colors) {
- SkASSERT(count > 0 && colors != NULL);
- SkASSERT(s.fDoFilter);
-
- unsigned scale = s.fAlphaScale;
- const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors();
- const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels();
- unsigned rb = s.fBitmap->rowBytes();
- unsigned subY;
- const uint8_t* SK_RESTRICT row0;
- const uint8_t* SK_RESTRICT row1;
-
- // setup row ptrs and update proc_table
- {
- uint32_t XY = *xy++;
- unsigned y0 = XY >> 14;
- row0 = (const uint8_t*)(srcAddr + (y0 >> 4) * rb);
- row1 = (const uint8_t*)(srcAddr + (XY & 0x3FFF) * rb);
- subY = y0 & 0xF;
- }
-
- do {
- uint32_t XX = *xy++; // x0:14 | 4 | x1:14
- unsigned x0 = XX >> 14;
- unsigned x1 = XX & 0x3FFF;
- unsigned subX = x0 & 0xF;
- x0 >>= 4;
-
- Filter_32_direct_alpha(subX, subY, table[row0[x0]],
- table[row0[x1]],
- table[row1[x0]],
- table[row1[x1]], colors, scale);
- colors++;
- } while (--count != 0);
-
- s.fBitmap->getColorTable()->unlockColors(false);
-}
-
-void SI8_alpha_D32_filter_DXDY_arm(const SkBitmapProcState& s,
- const uint32_t* SK_RESTRICT xy,
- int count, SkPMColor* SK_RESTRICT colors) {
- SkASSERT(count > 0 && colors != NULL);
- SkASSERT(s.fDoFilter);
-
- unsigned scale = s.fAlphaScale;
- const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors();
- const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels();
- int rb = s.fBitmap->rowBytes();
-
- do {
- uint32_t data = *xy++;
- unsigned y0 = data >> 14;
- unsigned y1 = data & 0x3FFF;
- unsigned subY = y0 & 0xF;
- y0 >>= 4;
-
- data = *xy++;
- unsigned x0 = data >> 14;
- unsigned x1 = data & 0x3FFF;
- unsigned subX = x0 & 0xF;
- x0 >>= 4;
-
- const uint8_t* SK_RESTRICT row0 = (const uint8_t*)(srcAddr + y0 * rb);
- const uint8_t* SK_RESTRICT row1 = (const uint8_t*)(srcAddr + y1 * rb);
-
- Filter_32_direct_alpha(subX, subY, table[row0[x0]],
- table[row0[x1]],
- table[row1[x0]],
- table[row1[x1]], colors, scale);
- colors++;
- } while (--count != 0);
-
- s.fBitmap->getColorTable()->unlockColors(false);
-}
-#endif //defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
-
///////////////////////////////////////////////////////////////////////////////
/* If we replace a sampleproc, then we null-out the associated shaderproc,
@@ -676,20 +195,6 @@
}
switch (fBitmap->config()) {
- case SkBitmap::kRGB_565_Config:
-#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
- if (justDx && doFilter) {
- fSampleProc16 = S16_D16_filter_DX_arm;
- fShaderProc16 = NULL;
- }
-#endif
-#if __ARM_ARCH__ >= 5 && !defined(SK_CPU_BENDIAN)
- if (justDx && !doFilter) {
- fSampleProc16 = S16_D16_nofilter_DX_arm;
- fShaderProc16 = NULL;
- }
-#endif
- break; // k565
case SkBitmap::kIndex8_Config:
#if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
if (justDx && !doFilter) {
@@ -698,30 +203,13 @@
fShaderProc16 = NULL;
#endif
if (isOpaque) {
+ // this one is only very slightly faster than the C version
fSampleProc32 = SI8_opaque_D32_nofilter_DX_arm;
fShaderProc32 = NULL;
}
}
#endif
-#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
- if (doFilter) {
- if (isOpaque) {
- if (justDx) {
- fSampleProc32 = SI8_opaque_D32_filter_DX_arm;
- } else {
- fSampleProc32 = SI8_opaque_D32_filter_DXDY_arm;
- }
- } else { // !isOpaque
- if (justDx) {
- fSampleProc32 = SI8_alpha_D32_filter_DX_arm;
- } else {
- fSampleProc32 = SI8_alpha_D32_filter_DXDY_arm;
- }
- }
- fShaderProc32 = NULL;
- }
-#endif
- break; // kIndex8
+ break;
default:
break;
}