Add VP9 inter-frame prediction intrinsic
Change-Id: If8985a6200fb6d34083eff711ccdf2f1b3c374e6
diff --git a/cpp/ScriptIntrinsics.cpp b/cpp/ScriptIntrinsics.cpp
index c5013b6..f9a1d97 100644
--- a/cpp/ScriptIntrinsics.cpp
+++ b/cpp/ScriptIntrinsics.cpp
@@ -66,6 +66,52 @@
Script::setVar(0, lut);
}
+sp<ScriptIntrinsicVP9InterPred> ScriptIntrinsicVP9InterPred::create(sp<RS> rs, sp<const Element> e) {
+ if (e->isCompatible(Element::U8(rs)) == false) {
+ rs->throwError(RS_ERROR_INVALID_ELEMENT, "Element not supported for intrinsic");
+ return NULL;
+ }
+ return new ScriptIntrinsicVP9InterPred(rs, e);
+}
+
+ScriptIntrinsicVP9InterPred::ScriptIntrinsicVP9InterPred(sp<RS> rs, sp<const Element> e)
+ : ScriptIntrinsic(rs, RS_SCRIPT_INTRINSIC_ID_INTER_PRED, e) {
+}
+
+void ScriptIntrinsicVP9InterPred::forEach(sp<Allocation> asize) {
+ if (asize->getType()->getElement()->isCompatible(mElement) == false) {
+ mRS->throwError(RS_ERROR_INVALID_ELEMENT, "InterPred forEach element mismatch");
+ return;
+ }
+ Script::forEach(0, asize, NULL, NULL, 0);
+}
+
+void ScriptIntrinsicVP9InterPred::setRef(sp<Allocation> ref) {
+ sp<const Type> t = ref->getType();
+ if (!t->getElement()->isCompatible(mElement)) {
+ mRS->throwError(RS_ERROR_INVALID_ELEMENT, "setRef element does not match");
+ return;
+ }
+ Script::setVar(0, ref);
+}
+
+void ScriptIntrinsicVP9InterPred::setParam(sp<Allocation> param) {
+ sp<const Type> t = param->getType();
+ if (!t->getElement()->isCompatible(mElement)) {
+ mRS->throwError(RS_ERROR_INVALID_ELEMENT, "setParam element does not match");
+ return;
+ }
+ Script::setVar(1, param);
+}
+
+void ScriptIntrinsicVP9InterPred::setParamCount(int fri, int sec, int offset) {
+ FieldPacker fp(12);
+ fp.add(fri);
+ fp.add(sec);
+ fp.add(offset);
+ Script::setVar(2, fp.getData(), fp.getLength());
+}
+
sp<ScriptIntrinsicBlend> ScriptIntrinsicBlend::create(sp<RS> rs, sp<const Element> e) {
if (e->isCompatible(Element::U8_4(rs)) == false) {
rs->throwError(RS_ERROR_INVALID_ELEMENT, "Element not supported for intrinsic");
diff --git a/cpp/rsCppStructs.h b/cpp/rsCppStructs.h
index 6c14e8c..805f072 100644
--- a/cpp/rsCppStructs.h
+++ b/cpp/rsCppStructs.h
@@ -1435,6 +1435,20 @@
*/
void setLUT(sp<Allocation> lut);
};
+/**
+ * Intrinsic for VP9InterPrediction
+ */
+class ScriptIntrinsicVP9InterPred : public ScriptIntrinsic {
+ private:
+ ScriptIntrinsicVP9InterPred(sp<RS> rs, sp<const Element> e);
+ public:
+ static sp<ScriptIntrinsicVP9InterPred> create(sp<RS> rs, sp<const Element> e);
+
+ void forEach(sp<Allocation> asize);
+ void setRef(sp<Allocation> ref);
+ void setParamCount(int fri, int sec, int offset);
+ void setParam(sp<Allocation> param);
+};
/**
* Intrinsic kernel for blending two Allocations.
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index df9ac09..0199eee 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -29,8 +29,10 @@
rsCpuIntrinsicConvolve3x3.cpp \
rsCpuIntrinsicConvolve5x5.cpp \
rsCpuIntrinsicHistogram.cpp \
+ rsCpuIntrinsicInterPred.cpp \
rsCpuIntrinsicLUT.cpp \
- rsCpuIntrinsicYuvToRGB.cpp
+ rsCpuIntrinsicYuvToRGB.cpp \
+ convolve/convolve.c
LOCAL_CFLAGS_arm64 += -DARCH_ARM_HAVE_NEON
LOCAL_SRC_FILES_arm64 += \
@@ -50,6 +52,11 @@
rsCpuIntrinsics_neon_Blend.S \
rsCpuIntrinsics_neon_Blur.S \
-rsCpuIntrinsics_neon_YuvToRGB.S
+rsCpuIntrinsics_neon_YuvToRGB.S \
+ convolve/convolve_copy_neon.s \
+ convolve/convolve_avg_neon.s \
+ convolve/convolve8_neon.s \
+ convolve/convolve8_avg_neon.s \
+ convolve/convolve_neon.c
LOCAL_ASFLAGS_arm := -mfpu=neon
endif
diff --git a/cpu_ref/convolve/convolve.c b/cpu_ref/convolve/convolve.c
new file mode 100644
index 0000000..c85db92
--- /dev/null
+++ b/cpu_ref/convolve/convolve.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "vp9_common.h"
+#include "vp9_filter.h"
+#include <string.h>
+#include <stdio.h>
+
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const subpel_kernel *x_filters,
+ int x0_q4, int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_x[k] * x_filter[k];
+ dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const subpel_kernel *x_filters,
+ int x0_q4, int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_x[k] * x_filter[k];
+ dst[x] = ROUND_POWER_OF_TWO(dst[x] +
+ clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const subpel_kernel *y_filters,
+ int y0_q4, int y_step_q4, int w, int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const subpel_kernel *y_filters,
+ int y0_q4, int y_step_q4, int w, int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
+ clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void convolve(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const subpel_kernel *const x_filters,
+ int x0_q4, int x_step_q4,
+ const subpel_kernel *const y_filters,
+ int y0_q4, int y_step_q4,
+ int w, int h) {
+ // Fixed size intermediate buffer places limits on parameters.
+ // Maximum intermediate_height is 324, for y_step_q4 == 80,
+ // h == 64, taps == 8.
+ // y_step_q4 of 80 allows for 1/10 scale for 5 layer svc
+ uint8_t temp[64 * 324];
+ int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS;
+
+ if (intermediate_height < h)
+ intermediate_height = h;
+
+ convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+ x_filters, x0_q4, x_step_q4, w, intermediate_height);
+ convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+ y_filters, y0_q4, y_step_q4, w, h);
+}
+
+static const subpel_kernel *get_filter_base(const int16_t *filter) {
+ // NOTE: This assumes that the filter table is 256-byte aligned.
+ // TODO(agrange) Modify to make independent of table alignment.
+ return (const subpel_kernel *)(filter);
+}
+
+static int get_filter_offset(const int16_t *f, const subpel_kernel *base) {
+ return 0;
+}
+
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const subpel_kernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
+ x0_q4, x_step_q4, w, h);
+}
+
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const subpel_kernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
+ x0_q4, x_step_q4, w, h);
+}
+
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const subpel_kernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+ convolve_vert(src, src_stride, dst, dst_stride, filters_y,
+ y0_q4, y_step_q4, w, h);
+}
+
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const subpel_kernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+ convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
+ y0_q4, y_step_q4, w, h);
+}
+
+void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const subpel_kernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ const subpel_kernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ convolve(src, src_stride, dst, dst_stride,
+ filters_x, x0_q4, x_step_q4,
+ filters_y, y0_q4, y_step_q4, w, h);
+}
+
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int x, y;
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x)
+ dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ /* Fixed size intermediate buffer places limits on parameters. */
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 64 * 64);
+
+ vp9_convolve8_c(src, src_stride, temp, 64,
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+ vp9_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+}
+
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int r;
+
+ for (r = h; r > 0; --r) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
diff --git a/cpu_ref/convolve/convolve8_avg_neon.s b/cpu_ref/convolve/convolve8_avg_neon.s
new file mode 100644
index 0000000..7821446
--- /dev/null
+++ b/cpu_ref/convolve/convolve8_avg_neon.s
@@ -0,0 +1,323 @@
+@ This file was created from a .asm file
+@ using the ads2gas.pl script.
+ .equ DO1STROUNDING, 0
+@
+@ Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS. All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+@ Copyright (c) 2014 The Android Open Source Project
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+
+
+
+ @ These functions are only valid when:
+ @ x_step_q4 == 16
+ @ w%4 == 0
+ @ h%4 == 0
+ @ taps == 8
+ @ VP9_FILTER_WEIGHT == 128
+ @ VP9_FILTER_SHIFT == 7
+
+ .global vp9_convolve8_avg_horiz_neon
+ .type vp9_convolve8_avg_horiz_neon, function
+ .global vp9_convolve8_avg_vert_neon
+ .type vp9_convolve8_avg_vert_neon, function
+ .global vp9_convolve8_avg_horiz_c
+ .global vp9_convolve8_avg_vert_c
+ .arm
+ .eabi_attribute 24, 1 @Tag_ABI_align_needed
+ .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+
+.text
+.p2align 2
+
+ @ Multiply and accumulate by q0
+.macro MULTIPLY_BY_Q0 dst, src0, src1, src2, src3, src4, src5, src6, src7
+ vmull.s16 \dst, \src0, d0[0]
+ vmlal.s16 \dst, \src1, d0[1]
+ vmlal.s16 \dst, \src2, d0[2]
+ vmlal.s16 \dst, \src3, d0[3]
+ vmlal.s16 \dst, \src4, d1[0]
+ vmlal.s16 \dst, \src5, d1[1]
+ vmlal.s16 \dst, \src6, d1[2]
+ vmlal.s16 \dst, \src7, d1[3]
+ .endm
+
+@ r0 const uint8_t *src
+@ r1 int src_stride
+@ r2 uint8_t *dst
+@ r3 int dst_stride
+@ sp[]const int16_t *filter_x
+@ sp[]int x_step_q4
+@ sp[]const int16_t *filter_y ; unused
+@ sp[]int y_step_q4 ; unused
+@ sp[]int w
+@ sp[]int h
+
+_vp9_convolve8_avg_horiz_neon:
+ vp9_convolve8_avg_horiz_neon: @ PROC
+ ldr r12, [sp, #4] @ x_step_q4
+ cmp r12, #16
+ bne vp9_convolve8_avg_horiz_c
+
+ push {r4-r10, lr}
+
+ sub r0, r0, #3 @ adjust for taps
+
+ ldr r5, [sp, #32] @ filter_x
+ ldr r6, [sp, #48] @ w
+ ldr r7, [sp, #52] @ h
+
+ vld1.s16 {q0}, [r5] @ filter_x
+
+ sub r8, r1, r1, lsl #2 @ -src_stride * 3
+ add r8, r8, #4 @ -src_stride * 3 + 4
+
+ sub r4, r3, r3, lsl #2 @ -dst_stride * 3
+ add r4, r4, #4 @ -dst_stride * 3 + 4
+
+ rsb r9, r6, r1, lsl #2 @ reset src for outer loop
+ sub r9, r9, #7
+ rsb r12, r6, r3, lsl #2 @ reset dst for outer loop
+
+ mov r10, r6 @ w loop counter
+
+loop_horiz_v:
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d27}, [r0], r8
+
+ vtrn.16 q12, q13
+ vtrn.8 d24, d25
+ vtrn.8 d26, d27
+
+ pld [r0, r1, lsl #2]
+
+ vmovl.u8 q8, d24
+ vmovl.u8 q9, d25
+ vmovl.u8 q10, d26
+ vmovl.u8 q11, d27
+
+ @ save a few instructions in the inner loop
+ vswp d17, d18
+ vmov d23, d21
+
+ add r0, r0, #3
+
+loop_horiz:
+ add r5, r0, #64
+
+ vld1.32 {d28[]}, [r0], r1
+ vld1.32 {d29[]}, [r0], r1
+ vld1.32 {d31[]}, [r0], r1
+ vld1.32 {d30[]}, [r0], r8
+
+ pld [r5]
+
+ vtrn.16 d28, d31
+ vtrn.16 d29, d30
+ vtrn.8 d28, d29
+ vtrn.8 d31, d30
+
+ pld [r5, r1]
+
+ @ extract to s16
+ vtrn.32 q14, q15
+ vmovl.u8 q12, d28
+ vmovl.u8 q13, d29
+
+ pld [r5, r1, lsl #1]
+
+ @ slightly out of order load to match the existing data
+ vld1.u32 {d6[0]}, [r2], r3
+ vld1.u32 {d7[0]}, [r2], r3
+ vld1.u32 {d6[1]}, [r2], r3
+ vld1.u32 {d7[1]}, [r2], r3
+
+ sub r2, r2, r3, lsl #2 @ reset for store
+
+ @ src[] * filter_x
+ MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
+ MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
+ MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
+ MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+ pld [r5, -r8]
+
+ @ += 64 >> 7
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d4, q14, #7
+ vqrshrun.s32 d5, q15, #7
+
+ @ saturate
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d3, q2
+
+ @ transpose
+ vtrn.16 d2, d3
+ vtrn.32 d2, d3
+ vtrn.8 d2, d3
+
+ @ average the new value and the dst value
+ vrhadd.u8 q1, q1, q3
+
+ vst1.u32 {d2[0]}, [r2,:32], r3
+ vst1.u32 {d3[0]}, [r2,:32], r3
+ vst1.u32 {d2[1]}, [r2,:32], r3
+ vst1.u32 {d3[1]}, [r2,:32], r4
+
+ vmov q8, q9
+ vmov d20, d23
+ vmov q11, q12
+ vmov q9, q13
+
+ subs r6, r6, #4 @ w -= 4
+ bgt loop_horiz
+
+ @ outer loop
+ mov r6, r10 @ restore w counter
+ add r0, r0, r9 @ src += src_stride * 4 - w
+ add r2, r2, r12 @ dst += dst_stride * 4 - w
+ subs r7, r7, #4 @ h -= 4
+ bgt loop_horiz_v
+
+ pop {r4-r10, pc}
+
+ .size vp9_convolve8_avg_horiz_neon, .-vp9_convolve8_avg_horiz_neon @ ENDP
+
+_vp9_convolve8_avg_vert_neon:
+ vp9_convolve8_avg_vert_neon: @ PROC
+ ldr r12, [sp, #12]
+ cmp r12, #16
+ bne vp9_convolve8_avg_vert_c
+
+ push {r4-r8, lr}
+
+ @ adjust for taps
+ sub r0, r0, r1
+ sub r0, r0, r1, lsl #1
+
+ ldr r4, [sp, #32] @ filter_y
+ ldr r6, [sp, #40] @ w
+ ldr lr, [sp, #44] @ h
+
+ vld1.s16 {q0}, [r4] @ filter_y
+
+ lsl r1, r1, #1
+ lsl r3, r3, #1
+
+loop_vert_h:
+ mov r4, r0
+ add r7, r0, r1, asr #1
+ mov r5, r2
+ add r8, r2, r3, asr #1
+ mov r12, lr @ h loop counter
+
+ vld1.u32 {d16[0]}, [r4], r1
+ vld1.u32 {d16[1]}, [r7], r1
+ vld1.u32 {d18[0]}, [r4], r1
+ vld1.u32 {d18[1]}, [r7], r1
+ vld1.u32 {d20[0]}, [r4], r1
+ vld1.u32 {d20[1]}, [r7], r1
+ vld1.u32 {d22[0]}, [r4], r1
+
+ vmovl.u8 q8, d16
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+
+loop_vert:
+ @ always process a 4x4 block at a time
+ vld1.u32 {d24[0]}, [r7], r1
+ vld1.u32 {d26[0]}, [r4], r1
+ vld1.u32 {d26[1]}, [r7], r1
+ vld1.u32 {d24[1]}, [r4], r1
+
+ @ extract to s16
+ vmovl.u8 q12, d24
+ vmovl.u8 q13, d26
+
+ vld1.u32 {d6[0]}, [r5,:32], r3
+ vld1.u32 {d6[1]}, [r8,:32], r3
+ vld1.u32 {d7[0]}, [r5,:32], r3
+ vld1.u32 {d7[1]}, [r8,:32], r3
+
+ pld [r7]
+ pld [r4]
+
+ @ src[] * filter_y
+ MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24
+
+ pld [r7, r1]
+ pld [r4, r1]
+
+ MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26
+
+ pld [r5]
+ pld [r8]
+
+ MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+ pld [r5, r3]
+ pld [r8, r3]
+
+ MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25
+
+ @ += 64 >> 7
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d4, q14, #7
+ vqrshrun.s32 d5, q15, #7
+
+ @ saturate
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d3, q2
+
+ @ average the new value and the dst value
+ vrhadd.u8 q1, q1, q3
+
+ sub r5, r5, r3, lsl #1 @ reset for store
+ sub r8, r8, r3, lsl #1
+
+ vst1.u32 {d2[0]}, [r5,:32], r3
+ vst1.u32 {d2[1]}, [r8,:32], r3
+ vst1.u32 {d3[0]}, [r5,:32], r3
+ vst1.u32 {d3[1]}, [r8,:32], r3
+
+ vmov q8, q10
+ vmov d18, d22
+ vmov d19, d24
+ vmov q10, q13
+ vmov d22, d25
+
+ subs r12, r12, #4 @ h -= 4
+ bgt loop_vert
+
+ @ outer loop
+ add r0, r0, #4
+ add r2, r2, #4
+ subs r6, r6, #4 @ w -= 4
+ bgt loop_vert_h
+
+ pop {r4-r8, pc}
+
+ .size vp9_convolve8_avg_vert_neon, .-vp9_convolve8_avg_vert_neon @ ENDP
+ .section .note.GNU-stack,"",%progbits
diff --git a/cpu_ref/convolve/convolve8_neon.s b/cpu_ref/convolve/convolve8_neon.s
new file mode 100644
index 0000000..0bc15d9
--- /dev/null
+++ b/cpu_ref/convolve/convolve8_neon.s
@@ -0,0 +1,300 @@
+@ This file was created from a .asm file
+@ using the ads2gas.pl script.
+ .equ DO1STROUNDING, 0
+@
+@ Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS. All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+@ Copyright (c) 2014 The Android Open Source Project
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+
+
+ @ These functions are only valid when:
+ @ x_step_q4 == 16
+ @ w%4 == 0
+ @ h%4 == 0
+ @ taps == 8
+ @ VP9_FILTER_WEIGHT == 128
+ @ VP9_FILTER_SHIFT == 7
+
+ .global vp9_convolve8_horiz_neon
+ .type vp9_convolve8_horiz_neon, function
+ .global vp9_convolve8_vert_neon
+ .type vp9_convolve8_vert_neon, function
+ .global vp9_convolve8_horiz_c
+ .global vp9_convolve8_vert_c
+ .arm
+ .eabi_attribute 24, 1 @Tag_ABI_align_needed
+ .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+
+.text
+.p2align 2
+
+ @ Multiply and accumulate by q0
+.macro MULTIPLY_BY_Q0 dst, src0, src1, src2, src3, src4, src5, src6, src7
+ vmull.s16 \dst, \src0, d0[0]
+ vmlal.s16 \dst, \src1, d0[1]
+ vmlal.s16 \dst, \src2, d0[2]
+ vmlal.s16 \dst, \src3, d0[3]
+ vmlal.s16 \dst, \src4, d1[0]
+ vmlal.s16 \dst, \src5, d1[1]
+ vmlal.s16 \dst, \src6, d1[2]
+ vmlal.s16 \dst, \src7, d1[3]
+ .endm
+
+@ r0 const uint8_t *src
+@ r1 int src_stride
+@ r2 uint8_t *dst
+@ r3 int dst_stride
+@ sp[]const int16_t *filter_x
+@ sp[]int x_step_q4
+@ sp[]const int16_t *filter_y ; unused
+@ sp[]int y_step_q4 ; unused
+@ sp[]int w
+@ sp[]int h
+
+_vp9_convolve8_horiz_neon:
+ vp9_convolve8_horiz_neon: @ PROC
+ ldr r12, [sp, #4] @ x_step_q4
+ cmp r12, #16
+ bne vp9_convolve8_horiz_c
+
+ push {r4-r10, lr}
+
+ sub r0, r0, #3 @ adjust for taps
+
+ ldr r5, [sp, #32] @ filter_x
+ ldr r6, [sp, #48] @ w
+ ldr r7, [sp, #52] @ h
+
+ vld1.s16 {q0}, [r5] @ filter_x
+
+ sub r8, r1, r1, lsl #2 @ -src_stride * 3
+ add r8, r8, #4 @ -src_stride * 3 + 4
+
+ sub r4, r3, r3, lsl #2 @ -dst_stride * 3
+ add r4, r4, #4 @ -dst_stride * 3 + 4
+
+ rsb r9, r6, r1, lsl #2 @ reset src for outer loop
+ sub r9, r9, #7
+ rsb r12, r6, r3, lsl #2 @ reset dst for outer loop
+
+ mov r10, r6 @ w loop counter
+
+loop_horiz_v:
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d27}, [r0], r8
+
+ vtrn.16 q12, q13
+ vtrn.8 d24, d25
+ vtrn.8 d26, d27
+
+ pld [r0, r1, lsl #2]
+
+ vmovl.u8 q8, d24
+ vmovl.u8 q9, d25
+ vmovl.u8 q10, d26
+ vmovl.u8 q11, d27
+
+ @ save a few instructions in the inner loop
+ vswp d17, d18
+ vmov d23, d21
+
+ add r0, r0, #3
+
+loop_horiz:
+ add r5, r0, #64
+
+ vld1.32 {d28[]}, [r0], r1
+ vld1.32 {d29[]}, [r0], r1
+ vld1.32 {d31[]}, [r0], r1
+ vld1.32 {d30[]}, [r0], r8
+
+ pld [r5]
+
+ vtrn.16 d28, d31
+ vtrn.16 d29, d30
+ vtrn.8 d28, d29
+ vtrn.8 d31, d30
+
+ pld [r5, r1]
+
+ @ extract to s16
+ vtrn.32 q14, q15
+ vmovl.u8 q12, d28
+ vmovl.u8 q13, d29
+
+ pld [r5, r1, lsl #1]
+
+ @ src[] * filter_x
+ MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
+ MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
+ MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
+ MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+ pld [r5, -r8]
+
+ @ += 64 >> 7
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d4, q14, #7
+ vqrshrun.s32 d5, q15, #7
+
+ @ saturate
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d3, q2
+
+ @ transpose
+ vtrn.16 d2, d3
+ vtrn.32 d2, d3
+ vtrn.8 d2, d3
+
+ vst1.u32 {d2[0]}, [r2,:32], r3
+ vst1.u32 {d3[0]}, [r2,:32], r3
+ vst1.u32 {d2[1]}, [r2,:32], r3
+ vst1.u32 {d3[1]}, [r2,:32], r4
+
+ vmov q8, q9
+ vmov d20, d23
+ vmov q11, q12
+ vmov q9, q13
+
+ subs r6, r6, #4 @ w -= 4
+ bgt loop_horiz
+
+ @ outer loop
+ mov r6, r10 @ restore w counter
+ add r0, r0, r9 @ src += src_stride * 4 - w
+ add r2, r2, r12 @ dst += dst_stride * 4 - w
+ subs r7, r7, #4 @ h -= 4
+ bgt loop_horiz_v
+
+ pop {r4-r10, pc}
+
+ .size vp9_convolve8_horiz_neon, .-vp9_convolve8_horiz_neon @ ENDP
+
+_vp9_convolve8_vert_neon:
+ vp9_convolve8_vert_neon: @ PROC
+ ldr r12, [sp, #12]
+ cmp r12, #16
+ bne vp9_convolve8_vert_c
+
+ push {r4-r8, lr}
+
+ @ adjust for taps
+ sub r0, r0, r1
+ sub r0, r0, r1, lsl #1
+
+ ldr r4, [sp, #32] @ filter_y
+ ldr r6, [sp, #40] @ w
+ ldr lr, [sp, #44] @ h
+
+ vld1.s16 {q0}, [r4] @ filter_y
+
+ lsl r1, r1, #1
+ lsl r3, r3, #1
+
+loop_vert_h:
+ mov r4, r0
+ add r7, r0, r1, asr #1
+ mov r5, r2
+ add r8, r2, r3, asr #1
+ mov r12, lr @ h loop counter
+
+ vld1.u32 {d16[0]}, [r4], r1
+ vld1.u32 {d16[1]}, [r7], r1
+ vld1.u32 {d18[0]}, [r4], r1
+ vld1.u32 {d18[1]}, [r7], r1
+ vld1.u32 {d20[0]}, [r4], r1
+ vld1.u32 {d20[1]}, [r7], r1
+ vld1.u32 {d22[0]}, [r4], r1
+
+ vmovl.u8 q8, d16
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+
+loop_vert:
+ @ always process a 4x4 block at a time
+ vld1.u32 {d24[0]}, [r7], r1
+ vld1.u32 {d26[0]}, [r4], r1
+ vld1.u32 {d26[1]}, [r7], r1
+ vld1.u32 {d24[1]}, [r4], r1
+
+ @ extract to s16
+ vmovl.u8 q12, d24
+ vmovl.u8 q13, d26
+
+ pld [r5]
+ pld [r8]
+
+ @ src[] * filter_y
+ MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24
+
+ pld [r5, r3]
+ pld [r8, r3]
+
+ MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26
+
+ pld [r7]
+ pld [r4]
+
+ MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+ pld [r7, r1]
+ pld [r4, r1]
+
+ MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25
+
+ @ += 64 >> 7
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d4, q14, #7
+ vqrshrun.s32 d5, q15, #7
+
+ @ saturate
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d3, q2
+
+ vst1.u32 {d2[0]}, [r5,:32], r3
+ vst1.u32 {d2[1]}, [r8,:32], r3
+ vst1.u32 {d3[0]}, [r5,:32], r3
+ vst1.u32 {d3[1]}, [r8,:32], r3
+
+ vmov q8, q10
+ vmov d18, d22
+ vmov d19, d24
+ vmov q10, q13
+ vmov d22, d25
+
+ subs r12, r12, #4 @ h -= 4
+ bgt loop_vert
+
+ @ outer loop
+ add r0, r0, #4
+ add r2, r2, #4
+ subs r6, r6, #4 @ w -= 4
+ bgt loop_vert_h
+
+ pop {r4-r8, pc}
+
+ .size vp9_convolve8_vert_neon, .-vp9_convolve8_vert_neon @ ENDP
+ .section .note.GNU-stack,"",%progbits
diff --git a/cpu_ref/convolve/convolve_avg_neon.s b/cpu_ref/convolve/convolve_avg_neon.s
new file mode 100644
index 0000000..41e79f1
--- /dev/null
+++ b/cpu_ref/convolve/convolve_avg_neon.s
@@ -0,0 +1,135 @@
+@ This file was created from a .asm file
+@ using the ads2gas.pl script.
+ .equ DO1STROUNDING, 0
+@
+@ Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS. All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+@ Copyright (c) 2014 The Android Open Source Project
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+
+ .global vp9_convolve_avg_neon
+ .type vp9_convolve_avg_neon, function
+ .arm
+ .eabi_attribute 24, 1 @Tag_ABI_align_needed
+ .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+
+.text
+.p2align 2
+
+_vp9_convolve_avg_neon:
+ vp9_convolve_avg_neon: @ PROC
+ push {r4-r6, lr}
+ ldrd r4, r5, [sp, #32]
+ mov r6, r2
+
+ cmp r4, #32
+ bgt avg64
+ beq avg32
+ cmp r4, #8
+ bgt avg16
+ beq avg8
+ b avg4
+
+avg64:
+ sub lr, r1, #32
+ sub r4, r3, #32
+avg64_h:
+ pld [r0, r1, lsl #1]
+ vld1.8 {q0-q1}, [r0]!
+ vld1.8 {q2-q3}, [r0], lr
+ pld [r2, r3]
+ vld1.8 {q8-q9}, [r6,:128]!
+ vld1.8 {q10-q11}, [r6,:128], r4
+ vrhadd.u8 q0, q0, q8
+ vrhadd.u8 q1, q1, q9
+ vrhadd.u8 q2, q2, q10
+ vrhadd.u8 q3, q3, q11
+ vst1.8 {q0-q1}, [r2,:128]!
+ vst1.8 {q2-q3}, [r2,:128], r4
+ subs r5, r5, #1
+ bgt avg64_h
+ pop {r4-r6, pc}
+
+avg32:
+ vld1.8 {q0-q1}, [r0], r1
+ vld1.8 {q2-q3}, [r0], r1
+ vld1.8 {q8-q9}, [r6,:128], r3
+ vld1.8 {q10-q11}, [r6,:128], r3
+ pld [r0]
+ vrhadd.u8 q0, q0, q8
+ pld [r0, r1]
+ vrhadd.u8 q1, q1, q9
+ pld [r6]
+ vrhadd.u8 q2, q2, q10
+ pld [r6, r3]
+ vrhadd.u8 q3, q3, q11
+ vst1.8 {q0-q1}, [r2,:128], r3
+ vst1.8 {q2-q3}, [r2,:128], r3
+ subs r5, r5, #2
+ bgt avg32
+ pop {r4-r6, pc}
+
+avg16:
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q2}, [r6,:128], r3
+ vld1.8 {q3}, [r6,:128], r3
+ pld [r0]
+ pld [r0, r1]
+ vrhadd.u8 q0, q0, q2
+ pld [r6]
+ pld [r6, r3]
+ vrhadd.u8 q1, q1, q3
+ vst1.8 {q0}, [r2,:128], r3
+ vst1.8 {q1}, [r2,:128], r3
+ subs r5, r5, #2
+ bgt avg16
+ pop {r4-r6, pc}
+
+avg8:
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d1}, [r0], r1
+ vld1.8 {d2}, [r6,:64], r3
+ vld1.8 {d3}, [r6,:64], r3
+ pld [r0]
+ pld [r0, r1]
+ vrhadd.u8 q0, q0, q1
+ pld [r6]
+ pld [r6, r3]
+ vst1.8 {d0}, [r2,:64], r3
+ vst1.8 {d1}, [r2,:64], r3
+ subs r5, r5, #2
+ bgt avg8
+ pop {r4-r6, pc}
+
+avg4:
+ vld1.32 {d0[0]}, [r0], r1
+ vld1.32 {d0[1]}, [r0], r1
+ vld1.32 {d2[0]}, [r6,:32], r3
+ vld1.32 {d2[1]}, [r6,:32], r3
+ vrhadd.u8 d0, d0, d2
+ vst1.32 {d0[0]}, [r2,:32], r3
+ vst1.32 {d0[1]}, [r2,:32], r3
+ subs r5, r5, #2
+ bgt avg4
+ pop {r4-r6, pc}
+ .size vp9_convolve_avg_neon, .-vp9_convolve_avg_neon @ ENDP
+
+ .section .note.GNU-stack,"",%progbits
diff --git a/cpu_ref/convolve/convolve_copy_neon.s b/cpu_ref/convolve/convolve_copy_neon.s
new file mode 100644
index 0000000..60ada14
--- /dev/null
+++ b/cpu_ref/convolve/convolve_copy_neon.s
Binary files differ
diff --git a/cpu_ref/convolve/convolve_neon.c b/cpu_ref/convolve/convolve_neon.c
new file mode 100644
index 0000000..3d4bf30
--- /dev/null
+++ b/cpu_ref/convolve/convolve_neon.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "vp9_common.h"
+#include "vp9_filter.h"
+#include <string.h>
+#include <stdio.h>
+
+extern void vp9_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+extern void vp9_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+extern void vp9_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+extern void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+extern void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ /* 8-tap 2D convolution as two separable passes (horiz into temp, then vert).
+ * Constraints w <= 64, h <= 64, taps == 8 bound the scratch to 64 cols x
+ * (64 + 7 = 71 rows, rounded up to 72 so the row count stays divisible by 4). */
+ DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
+
+ // Account for the vertical phase needing 3 lines prior and 4 lines post
+ int intermediate_height = h + 7;
+
+ if (x_step_q4 != 16 || y_step_q4 != 16)  // NEON kernels only handle the unscaled (step == 16) case
+ return vp9_convolve8_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+
+ /* Filter starting 3 lines back. The neon implementation will ignore the
+ * given height and filter a multiple of 4 lines. Since this goes in to
+ * the temp buffer which has lots of extra room and is subsequently discarded
+ * this is safe if somewhat less than ideal.
+ */
+ vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+ temp, 64, filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, intermediate_height);
+
+ /* Step into the temp buffer 3 lines to get the actual frame data */
+ vp9_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {  // Like vp9_convolve8_neon, but the final pass averages into dst.
+ DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);  // 64 cols x 72 rows scratch for the horizontal pass
+ int intermediate_height = h + 7;  // vertical phase needs 3 rows before and 4 after
+
+ if (x_step_q4 != 16 || y_step_q4 != 16)  // only the unscaled (step == 16) case is NEON-accelerated
+ return vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+
+ /* This implementation has the same issues as above. In addition, we only want
+ * to average the values after both passes.
+ */
+ vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, intermediate_height);
+ vp9_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+}
diff --git a/cpu_ref/convolve/vp9_common.h b/cpu_ref/convolve/vp9_common.h
new file mode 100644
index 0000000..73a1021
--- /dev/null
+++ b/cpu_ref/convolve/vp9_common.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef VP9_COMMON_VP9_COMMON_H_
+#define VP9_COMMON_VP9_COMMON_H_
+#include <stdint.h>
+
+#define DECLARE_ALIGNED_ARRAY(a,typ,val,n)\
+ typ val##_[(n)+(a)/sizeof(typ)+1];\
+ typ *val = (typ*)((((intptr_t)val##_)+(a)-1)&((intptr_t)-(a)))  // stack array of n typ's whose start is rounded up to an a-byte boundary
+/* Interface header for common constant data structures and lookup tables */
+
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
+#define ROUND_POWER_OF_TWO(value, n) \
+ (((value) + (1 << ((n) - 1))) >> (n))  // divide by 2^n with round-to-nearest
+
+#define ALIGN_POWER_OF_TWO(value, n) \
+ (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))  // round value up to a multiple of 2^n
+
+// Only need this for fixed-size arrays, for structs just assign.
+#define vp9_copy(dest, src) { \
+ memcpy(dest, src, sizeof(src)); \
+ }
+
+// Use this for variably-sized arrays.
+#define vp9_copy_array(dest, src, n) { \
+ memcpy(dest, src, n * sizeof(*src)); \
+ }
+
+#define vp9_zero(dest) memset(&dest, 0, sizeof(dest))  // zero an object or fixed-size array given by name
+#define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest))  // zero n elements through a pointer
+
+static inline uint8_t clip_pixel(int val) {  // Clamp an int to the valid 8-bit pixel range [0, 255].
+ return (val > 255) ? 255u : (val < 0) ? 0u : val;
+}
+
+static inline int clamp(int value, int low, int high) {  // Clamp an int into [low, high].
+ return value < low ? low : (value > high ? high : value);
+}
+
+static inline double fclamp(double value, double low, double high) {  // Double-precision clamp into [low, high].
+ return value < low ? low : (value > high ? high : value);
+}
+
+#define VP9_SYNC_CODE_0 0x49
+#define VP9_SYNC_CODE_1 0x83
+#define VP9_SYNC_CODE_2 0x42
+
+#define VP9_FRAME_MARKER 0x2
+
+
+#endif // VP9_COMMON_VP9_COMMON_H_
diff --git a/cpu_ref/convolve/vp9_filter.h b/cpu_ref/convolve/vp9_filter.h
new file mode 100644
index 0000000..754578d
--- /dev/null
+++ b/cpu_ref/convolve/vp9_filter.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef VP9_COMMON_VP9_FILTER_H_
+#define VP9_COMMON_VP9_FILTER_H_
+
+#define FILTER_BITS 7
+
+#define SUBPEL_BITS 4
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
+#define SUBPEL_TAPS 8
+
+typedef enum {  // Interpolation filter families; kernels are fetched via vp9_get_filter_kernel() below.
+ EIGHTTAP = 0,
+ EIGHTTAP_SMOOTH = 1,
+ EIGHTTAP_SHARP = 2,
+ BILINEAR = 3,
+ SWITCHABLE = 4 /* should be the last one */
+} INTERPOLATION_TYPE;
+
+typedef int16_t subpel_kernel[SUBPEL_TAPS];
+
+struct subpix_fn_table {
+ const subpel_kernel *filter_x;
+ const subpel_kernel *filter_y;
+};
+
+const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type);
+extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS];
+extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS];
+extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS];
+extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS];
+
+// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear
+// filter kernel as a 2 tap filter.
+#define BILINEAR_FILTERS_2TAP(x) \
+ (vp9_bilinear_filters[(x)] + SUBPEL_TAPS/2 - 1)
+
+#endif // VP9_COMMON_VP9_FILTER_H_
diff --git a/cpu_ref/rsCpuConvolve.h b/cpu_ref/rsCpuConvolve.h
new file mode 100644
index 0000000..d7d2d16
--- /dev/null
+++ b/cpu_ref/rsCpuConvolve.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_CPU_CONVOLVE_NEON_H
+#define RSD_CPU_CONVOLVE_NEON_H
+
+#include <stdint.h>
+
+extern "C" {
+#if defined(ARCH_ARM_HAVE_VFP)
+void vp9_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+#else
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+#endif
+}
+#endif
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 7475ccb..406b5c2 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -506,6 +506,8 @@
extern RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_InterPred(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx,
const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
@@ -531,6 +533,9 @@
case RS_SCRIPT_INTRINSIC_ID_3DLUT:
i = rsdIntrinsic_3DLUT(this, s, e);
break;
+ case RS_SCRIPT_INTRINSIC_ID_INTER_PRED:
+ i = rsdIntrinsic_InterPred(this, s, e);
+ break;
case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
i = rsdIntrinsic_Convolve3x3(this, s, e);
break;
diff --git a/cpu_ref/rsCpuIntrinsicInterPred.cpp b/cpu_ref/rsCpuIntrinsicInterPred.cpp
new file mode 100644
index 0000000..20e0f2e
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicInterPred.cpp
@@ -0,0 +1,173 @@
+#include "rsCpuIntrinsicInterPred.h"
+
+void RsdCpuScriptIntrinsicInterPred::setGlobalObj(uint32_t slot,
+ ObjectBase *data) {  // Bind input allocations: slot 0 = reference frame memory, slot 1 = packed parameters.
+ Allocation *alloc = static_cast<Allocation *>(data);
+ if (slot == 0) mRef = (uint8_t *)alloc->mHal.state.userProvidedPtr;  // caller-owned frame buffer
+ if (slot == 1) mParam = (uint8_t *)alloc->mHal.state.userProvidedPtr;  // caller-owned INTER_PRED_PARAM storage
+}
+
+void RsdCpuScriptIntrinsicInterPred::setGlobalVar(uint32_t slot,
+ const void *data,
+ size_t dataLength) {  // Unpacks the 3-int payload sent by setParamCount(); slot and dataLength are not validated here.
+ mFriParamCount = ((int32_t *)data)[0];  // entries in the first parameter array
+ mSecParamCount = ((int32_t *)data)[1];  // entries in the second parameter array
+ mParamOffset = ((int32_t *)data)[2];  // byte offset of the second array within mParam
+}
+
+void RsdCpuScriptIntrinsicInterPred::kernel(const RsForEachStubParamStruct *p,
+ uint32_t xstart, uint32_t xend,
+ uint32_t instep, uint32_t outstep) {  // Grid args are unused; work is driven entirely by the packed parameter arrays.
+ RsdCpuScriptIntrinsicInterPred *cp = (RsdCpuScriptIntrinsicInterPred *)p->usr;
+ cp->mCount++;  // invocation counter (bookkeeping only)
+ const int vp9_convolve_mode[2][2] = {{24, 16}, {8, 0}};  // bank offset by [x_step==16][y_step==16]: both 16 -> 0, else 8/16/24
+ uint8_t *ref_base = cp->mRef;  // all src/dst offsets below are relative to this base
+ INTER_PRED_PARAM *fri_param = (INTER_PRED_PARAM *)cp->mParam;
+ INTER_PRED_PARAM *sec_param = (INTER_PRED_PARAM *)(cp->mParam + cp->mParamOffset);  // second array starts mParamOffset bytes in
+ int32_t fri_count = cp->mFriParamCount;
+ int32_t sec_count = cp->mSecParamCount;
+ int mode_num;
+ uint8_t *src;
+ uint8_t *dst;
+ const int16_t *filter_x;
+ const int16_t *filter_y;
+ for (int i = 0; i < fri_count; i++) {
+
+ mode_num = vp9_convolve_mode[(fri_param[i].x_step_q4 == 16)]
+ [(fri_param[i].y_step_q4 == 16)];
+ src = ref_base + fri_param[i].src_mv;
+ dst = ref_base + fri_param[i].dst_mv;
+
+ filter_x = inter_pred_filters + fri_param[i].filter_x_mv;
+ filter_y = inter_pred_filters + fri_param[i].filter_y_mv;
+
+ cp->mSwitchConvolve[fri_param[i].pred_mode + mode_num](  // table index = per-block mode + scaling bank offset
+ src, fri_param[i].src_stride,
+ dst, fri_param[i].dst_stride,
+ filter_x, fri_param[i].x_step_q4,
+ filter_y, fri_param[i].y_step_q4,
+ fri_param[i].w, fri_param[i].h
+ );
+ }
+
+ for (int i = 0; i < sec_count; i++) {  // second pass: predictions blended into the first-pass output
+ mode_num = vp9_convolve_mode[(sec_param[i].x_step_q4 == 16)]
+ [(sec_param[i].y_step_q4 == 16)];
+ src = ref_base + sec_param[i].src_mv;
+ dst = ref_base + sec_param[i].dst_mv;
+
+ filter_x = inter_pred_filters + sec_param[i].filter_x_mv;
+ filter_y = inter_pred_filters + sec_param[i].filter_y_mv;
+
+ cp->mSwitchConvolve[sec_param[i].pred_mode + mode_num + 1](  // +1 selects the adjacent averaging variant of the kernel
+ src, sec_param[i].src_stride,
+ dst, sec_param[i].dst_stride,
+ filter_x, sec_param[i].x_step_q4,
+ filter_y, sec_param[i].y_step_q4,
+ sec_param[i].w, sec_param[i].h
+ );
+ }
+
+}
+
+RsdCpuScriptIntrinsicInterPred::RsdCpuScriptIntrinsicInterPred(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e)
+ : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_INTER_PRED) {
+ mRootPtr = &kernel;  // forEach entry point
+ mCount = 0;
+ mParamOffset = 0;
+ mFriParamCount = 0;
+ mSecParamCount = 0;
+ mRef = NULL;
+ mParam = NULL;
+
+#if defined(ARCH_ARM_HAVE_VFP)  // NEON-accelerated dispatch table
+ mSwitchConvolve[0] = vp9_convolve_copy_neon;  // bank +0: even index = plain, odd = averaging counterpart
+ mSwitchConvolve[1] = vp9_convolve_avg_neon;
+ mSwitchConvolve[2] = vp9_convolve8_vert_neon;
+ mSwitchConvolve[3] = vp9_convolve8_avg_vert_neon;
+ mSwitchConvolve[4] = vp9_convolve8_horiz_neon;
+ mSwitchConvolve[5] = vp9_convolve8_avg_horiz_neon;
+ mSwitchConvolve[6] = vp9_convolve8_neon;
+ mSwitchConvolve[7] = vp9_convolve8_avg_neon;
+
+ mSwitchConvolve[8] = vp9_convolve8_vert_neon;  // bank +8 (see vp9_convolve_mode in kernel)
+ mSwitchConvolve[9] = vp9_convolve8_avg_vert_neon;
+ mSwitchConvolve[10] = vp9_convolve8_vert_neon;
+ mSwitchConvolve[11] = vp9_convolve8_avg_vert_neon;
+ mSwitchConvolve[12] = vp9_convolve8_neon;
+ mSwitchConvolve[13] = vp9_convolve8_avg_neon;
+ mSwitchConvolve[14] = vp9_convolve8_neon;
+ mSwitchConvolve[15] = vp9_convolve8_avg_neon;
+
+ mSwitchConvolve[16] = vp9_convolve8_horiz_neon;  // bank +16
+ mSwitchConvolve[17] = vp9_convolve8_avg_horiz_neon;
+ mSwitchConvolve[18] = vp9_convolve8_neon;
+ mSwitchConvolve[19] = vp9_convolve8_avg_neon;
+ mSwitchConvolve[20] = vp9_convolve8_horiz_neon;
+ mSwitchConvolve[21] = vp9_convolve8_avg_horiz_neon;
+ mSwitchConvolve[22] = vp9_convolve8_neon;
+ mSwitchConvolve[23] = vp9_convolve8_avg_neon;
+
+ mSwitchConvolve[24] = vp9_convolve8_neon;  // bank +24: full 2D filter for every entry
+ mSwitchConvolve[25] = vp9_convolve8_avg_neon;
+ mSwitchConvolve[26] = vp9_convolve8_neon;
+ mSwitchConvolve[27] = vp9_convolve8_avg_neon;
+ mSwitchConvolve[28] = vp9_convolve8_neon;
+ mSwitchConvolve[29] = vp9_convolve8_avg_neon;
+ mSwitchConvolve[30] = vp9_convolve8_neon;
+ mSwitchConvolve[31] = vp9_convolve8_avg_neon;
+#else
+ mSwitchConvolve[0] = vp9_convolve_copy_c;  // portable C fallbacks; same bank layout as the NEON table
+ mSwitchConvolve[1] = vp9_convolve_avg_c;
+ mSwitchConvolve[2] = vp9_convolve8_vert_c;
+ mSwitchConvolve[3] = vp9_convolve8_avg_vert_c;
+ mSwitchConvolve[4] = vp9_convolve8_horiz_c;
+ mSwitchConvolve[5] = vp9_convolve8_avg_horiz_c;
+ mSwitchConvolve[6] = vp9_convolve8_c;
+ mSwitchConvolve[7] = vp9_convolve8_avg_c;
+
+ mSwitchConvolve[8] = vp9_convolve8_vert_c;
+ mSwitchConvolve[9] = vp9_convolve8_avg_vert_c;
+ mSwitchConvolve[10] = vp9_convolve8_vert_c;
+ mSwitchConvolve[11] = vp9_convolve8_avg_vert_c;
+ mSwitchConvolve[12] = vp9_convolve8_c;
+ mSwitchConvolve[13] = vp9_convolve8_avg_c;
+ mSwitchConvolve[14] = vp9_convolve8_c;
+ mSwitchConvolve[15] = vp9_convolve8_avg_c;
+
+ mSwitchConvolve[16] = vp9_convolve8_horiz_c;
+ mSwitchConvolve[17] = vp9_convolve8_avg_horiz_c;
+ mSwitchConvolve[18] = vp9_convolve8_c;
+ mSwitchConvolve[19] = vp9_convolve8_avg_c;
+ mSwitchConvolve[20] = vp9_convolve8_horiz_c;
+ mSwitchConvolve[21] = vp9_convolve8_avg_horiz_c;
+ mSwitchConvolve[22] = vp9_convolve8_c;
+ mSwitchConvolve[23] = vp9_convolve8_avg_c;
+
+ mSwitchConvolve[24] = vp9_convolve8_c;
+ mSwitchConvolve[25] = vp9_convolve8_avg_c;
+ mSwitchConvolve[26] = vp9_convolve8_c;
+ mSwitchConvolve[27] = vp9_convolve8_avg_c;
+ mSwitchConvolve[28] = vp9_convolve8_c;
+ mSwitchConvolve[29] = vp9_convolve8_avg_c;
+ mSwitchConvolve[30] = vp9_convolve8_c;
+ mSwitchConvolve[31] = vp9_convolve8_avg_c;
+#endif
+}
+
+RsdCpuScriptIntrinsicInterPred::~RsdCpuScriptIntrinsicInterPred() {  // Nothing owned here: mRef/mParam point at caller-managed allocations.
+}
+
+void RsdCpuScriptIntrinsicInterPred::populateScript(Script *s) {
+ s->mHal.info.exportedVariableCount = 3;  // slots: 0 = ref alloc, 1 = param alloc, 2 = packed counts/offset ints
+}
+
+void RsdCpuScriptIntrinsicInterPred::invokeFreeChildren() {  // No child resources to release.
+}
+
+
+RsdCpuScriptImpl * rsdIntrinsic_InterPred(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e) {  // Factory used by rsCpuCore's intrinsic-ID switch.
+ return new RsdCpuScriptIntrinsicInterPred(ctx, s, e);
+}
diff --git a/cpu_ref/rsCpuIntrinsicInterPred.h b/cpu_ref/rsCpuIntrinsicInterPred.h
new file mode 100644
index 0000000..552f4eb
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicInterPred.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_CPU_SCRIPT_INTRINSIC_INTER_PRED_H
+#define RSD_CPU_SCRIPT_INTRINSIC_INTER_PRED_H
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+#include "rsCpuConvolve.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+typedef struct inter_pred_param {
+ int pred_mode;  // base index within an 8-entry bank of the convolve dispatch table
+
+ int src_mv;  // byte offset of the source block from the shared frame base (mRef)
+ int src_stride;  // source row stride, in uint8_t pixels
+ int dst_mv;  // byte offset of the destination block from the shared frame base
+ int dst_stride;  // destination row stride, in uint8_t pixels
+
+ int filter_x_mv;  // element offset into inter_pred_filters for the horizontal kernel
+ int x_step_q4;  // horizontal subpel step, Q4 fixed point; 16 = full pel (unscaled)
+ int filter_y_mv;  // element offset into inter_pred_filters for the vertical kernel
+ int y_step_q4;  // vertical subpel step, Q4 fixed point; 16 = full pel (unscaled)
+
+ int w;  // block width in pixels
+ int h;  // block height in pixels
+}INTER_PRED_PARAM;
+
+static const int16_t inter_pred_filters[512] = {  // 4 banks x SUBPEL_SHIFTS(16) phases x SUBPEL_TAPS(8); filter_*_mv indexes this flat table
+ 0, 0, 0, 128, 0, 0, 0, 0, 0, 1, -5, 126, 8, -3, 1, 0,
+ -1, 3, -10, 122, 18, -6, 2, 0, -1, 4, -13, 118, 27, -9, 3, -1,
+ -1, 4, -16, 112, 37, -11, 4, -1, -1, 5, -18, 105, 48, -14, 4, -1,
+ -1, 5, -19, 97, 58, -16, 5, -1, -1, 6, -19, 88, 68, -18, 5, -1,
+ -1, 6, -19, 78, 78, -19, 6, -1, -1, 5, -18, 68, 88, -19, 6, -1,
+ -1, 5, -16, 58, 97, -19, 5, -1, -1, 4, -14, 48, 105, -18, 5, -1,
+ -1, 4, -11, 37, 112, -16, 4, -1, -1, 3, -9, 27, 118, -13, 4, -1,
+ 0, 2, -6, 18, 122, -10, 3, -1, 0, 1, -3, 8, 126, -5, 1, 0,
+ 0, 0, 0, 128, 0, 0, 0, 0, -3, -1, 32, 64, 38, 1, -3, 0,
+ -2, -2, 29, 63, 41, 2, -3, 0, -2, -2, 26, 63, 43, 4, -4, 0,
+ -2, -3, 24, 62, 46, 5, -4, 0, -2, -3, 21, 60, 49, 7, -4, 0,
+ -1, -4, 18, 59, 51, 9, -4, 0, -1, -4, 16, 57, 53, 12, -4, -1,
+ -1, -4, 14, 55, 55, 14, -4, -1, -1, -4, 12, 53, 57, 16, -4, -1,
+ 0, -4, 9, 51, 59, 18, -4, -1, 0, -4, 7, 49, 60, 21, -3, -2,
+ 0, -4, 5, 46, 62, 24, -3, -2, 0, -4, 4, 43, 63, 26, -2, -2,
+ 0, -3, 2, 41, 63, 29, -2, -2, 0, -3, 1, 38, 64, 32, -1, -3,
+ 0, 0, 0, 128, 0, 0, 0, 0, -1, 3, -7, 127, 8, -3, 1, 0,
+ -2, 5, -13, 125, 17, -6, 3, -1, -3, 7, -17, 121, 27, -10, 5, -2,
+ -4, 9, -20, 115, 37, -13, 6, -2, -4, 10, -23, 108, 48, -16, 8, -3,
+ -4, 10, -24, 100, 59, -19, 9, -3, -4, 11, -24, 90, 70, -21, 10, -4,
+ -4, 11, -23, 80, 80, -23, 11, -4, -4, 10, -21, 70, 90, -24, 11, -4,
+ -3, 9, -19, 59, 100, -24, 10, -4, -3, 8, -16, 48, 108, -23, 10, -4,
+ -2, 6, -13, 37, 115, -20, 9, -4, -2, 5, -10, 27, 121, -17, 7, -3,
+ -1, 3, -6, 17, 125, -13, 5, -2, 0, 1, -3, 8, 127, -7, 3, -1,
+ 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 120, 8, 0, 0, 0,
+ 0, 0, 0, 112, 16, 0, 0, 0, 0, 0, 0, 104, 24, 0, 0, 0,
+ 0, 0, 0, 96, 32, 0, 0, 0, 0, 0, 0, 88, 40, 0, 0, 0,
+ 0, 0, 0, 80, 48, 0, 0, 0, 0, 0, 0, 72, 56, 0, 0, 0,
+ 0, 0, 0, 64, 64, 0, 0, 0, 0, 0, 0, 56, 72, 0, 0, 0,
+ 0, 0, 0, 48, 80, 0, 0, 0, 0, 0, 0, 40, 88, 0, 0, 0,
+ 0, 0, 0, 32, 96, 0, 0, 0, 0, 0, 0, 24, 104, 0, 0, 0,
+ 0, 0, 0, 16, 112, 0, 0, 0, 0, 0, 0, 8, 120, 0, 0, 0
+};
+
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,  // shared signature of all vp9_convolve* kernels
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+
+class RsdCpuScriptIntrinsicInterPred: public RsdCpuScriptIntrinsic {  // CPU implementation of the VP9 inter-prediction intrinsic.
+public:
+ virtual void populateScript(Script *);
+ virtual void invokeFreeChildren();
+
+ virtual void setGlobalObj(uint32_t slot, ObjectBase *data);  // slot 0: reference frame alloc, slot 1: parameter alloc
+ virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);  // packed {fri, sec, offset} ints
+ virtual ~RsdCpuScriptIntrinsicInterPred();
+ RsdCpuScriptIntrinsicInterPred(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
+
+protected:
+ uint8_t *mRef;  // base of the caller-provided reference/destination frame memory
+ uint8_t *mParam;  // packed INTER_PRED_PARAM records (two arrays; second at mParamOffset)
+ int mFriParamCount;  // entries in the first (plain prediction) array
+ int mSecParamCount;  // entries in the second (averaging) array
+ int mParamOffset;  // byte offset of the second array within mParam
+ int mCount;  // number of kernel invocations (bookkeeping only)
+ convolve_fn_t mSwitchConvolve[32];  // dispatch table: four 8-entry banks; even = plain, odd = averaging
+ static void kernel(const RsForEachStubParamStruct *p,
+ uint32_t xstart, uint32_t xend,
+ uint32_t instep, uint32_t outstep);
+};
+
+}
+}
+#endif
+
diff --git a/rsDefines.h b/rsDefines.h
index 741f67b..fbc63cd 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -364,7 +364,8 @@
RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB = 6,
RS_SCRIPT_INTRINSIC_ID_BLEND = 7,
RS_SCRIPT_INTRINSIC_ID_3DLUT = 8,
- RS_SCRIPT_INTRINSIC_ID_HISTOGRAM = 9
+ RS_SCRIPT_INTRINSIC_ID_HISTOGRAM = 9,
+ RS_SCRIPT_INTRINSIC_ID_INTER_PRED = 10
};
typedef struct {