Add VP9 inter-frame prediction intrinsic

Change-Id: If8985a6200fb6d34083eff711ccdf2f1b3c374e6
diff --git a/cpp/ScriptIntrinsics.cpp b/cpp/ScriptIntrinsics.cpp
index c5013b6..f9a1d97 100644
--- a/cpp/ScriptIntrinsics.cpp
+++ b/cpp/ScriptIntrinsics.cpp
@@ -66,6 +66,52 @@
     Script::setVar(0, lut);
 }
 
+sp<ScriptIntrinsicVP9InterPred> ScriptIntrinsicVP9InterPred::create(sp<RS> rs, sp<const Element> e) {
+    if (e->isCompatible(Element::U8(rs)) == false) {
+        rs->throwError(RS_ERROR_INVALID_ELEMENT, "Element not supported for intrinsic");
+        return NULL;
+    }
+    return new ScriptIntrinsicVP9InterPred(rs, e);
+}
+
+ScriptIntrinsicVP9InterPred::ScriptIntrinsicVP9InterPred(sp<RS> rs, sp<const Element> e)
+    : ScriptIntrinsic(rs, RS_SCRIPT_INTRINSIC_ID_INTER_PRED, e) {
+}
+
+void ScriptIntrinsicVP9InterPred::forEach(sp<Allocation> asize) {
+    if (asize->getType()->getElement()->isCompatible(mElement) == false) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "InterPred forEach element mismatch");
+        return;
+    }
+    Script::forEach(0, asize, NULL, NULL, 0);
+}
+
+void ScriptIntrinsicVP9InterPred::setRef(sp<Allocation> ref) {
+    sp<const Type> t = ref->getType();
+    if (!t->getElement()->isCompatible(mElement)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "setRef element does not match");
+        return;
+    }
+    Script::setVar(0, ref);
+}
+
+void ScriptIntrinsicVP9InterPred::setParam(sp<Allocation> param) {
+    sp<const Type> t = param->getType();
+    if (!t->getElement()->isCompatible(mElement)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "setParam element does not match");
+        return;
+    }
+    Script::setVar(1, param);
+}
+
+void ScriptIntrinsicVP9InterPred::setParamCount(int fri, int sec, int offset) {
+    FieldPacker fp(12);
+    fp.add(fri);
+    fp.add(sec);
+    fp.add(offset);
+    Script::setVar(2, fp.getData(), fp.getLength());
+}
+
 sp<ScriptIntrinsicBlend> ScriptIntrinsicBlend::create(sp<RS> rs, sp<const Element> e) {
     if (e->isCompatible(Element::U8_4(rs)) == false) {
         rs->throwError(RS_ERROR_INVALID_ELEMENT, "Element not supported for intrinsic");
diff --git a/cpp/rsCppStructs.h b/cpp/rsCppStructs.h
index 6c14e8c..805f072 100644
--- a/cpp/rsCppStructs.h
+++ b/cpp/rsCppStructs.h
@@ -1435,6 +1435,20 @@
      */
     void setLUT(sp<Allocation> lut);
 };
+/**
+ * Intrinsic for VP9InterPrediction
+ */
+class ScriptIntrinsicVP9InterPred : public ScriptIntrinsic {
+ private:
+    ScriptIntrinsicVP9InterPred(sp<RS> rs, sp<const Element> e);
+ public:
+    static sp<ScriptIntrinsicVP9InterPred> create(sp<RS> rs, sp<const Element> e);
+
+    void forEach(sp<Allocation> asize);
+    void setRef(sp<Allocation> ref);
+    void setParamCount(int fri, int sec, int offset);
+    void setParam(sp<Allocation> param);
+};
 
 /**
  * Intrinsic kernel for blending two Allocations.
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index df9ac09..0199eee 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -29,8 +29,10 @@
 	rsCpuIntrinsicConvolve3x3.cpp \
 	rsCpuIntrinsicConvolve5x5.cpp \
 	rsCpuIntrinsicHistogram.cpp \
+	rsCpuIntrinsicInterPred.cpp \
 	rsCpuIntrinsicLUT.cpp \
-	rsCpuIntrinsicYuvToRGB.cpp
+	rsCpuIntrinsicYuvToRGB.cpp \
+	convolve/convolve.c
 
 LOCAL_CFLAGS_arm64 += -DARCH_ARM_HAVE_NEON
 LOCAL_SRC_FILES_arm64 += \
@@ -50,6 +52,11 @@
     rsCpuIntrinsics_neon_Blend.S \
     rsCpuIntrinsics_neon_Blur.S \
-    rsCpuIntrinsics_neon_YuvToRGB.S
+    rsCpuIntrinsics_neon_YuvToRGB.S \
+    convolve/convolve_copy_neon.s \
+    convolve/convolve_avg_neon.s \
+    convolve/convolve8_neon.s \
+    convolve/convolve8_avg_neon.s \
+    convolve/convolve_neon.c
     LOCAL_ASFLAGS_arm := -mfpu=neon
 endif
 
diff --git a/cpu_ref/convolve/convolve.c b/cpu_ref/convolve/convolve.c
new file mode 100644
index 0000000..c85db92
--- /dev/null
+++ b/cpu_ref/convolve/convolve.c
@@ -0,0 +1,257 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "vp9_common.h"
+#include "vp9_filter.h"
+#include <string.h>
+#include <stdio.h>
+
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const subpel_kernel *x_filters,
+                           int x0_q4, int x_step_q4, int w, int h) {
+    int x, y;
+    src -= SUBPEL_TAPS / 2 - 1;
+    for (y = 0; y < h; ++y) {
+        int x_q4 = x0_q4;
+        for (x = 0; x < w; ++x) {
+            const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+            const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+            int k, sum = 0;
+            for (k = 0; k < SUBPEL_TAPS; ++k)
+                sum += src_x[k] * x_filter[k];
+            dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+            x_q4 += x_step_q4;
+        }
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const subpel_kernel *x_filters,
+                               int x0_q4, int x_step_q4, int w, int h) {
+    int x, y;
+    src -= SUBPEL_TAPS / 2 - 1;
+    for (y = 0; y < h; ++y) {
+        int x_q4 = x0_q4;
+        for (x = 0; x < w; ++x) {
+            const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+            const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+            int k, sum = 0;
+            for (k = 0; k < SUBPEL_TAPS; ++k)
+                sum += src_x[k] * x_filter[k];
+            dst[x] = ROUND_POWER_OF_TWO(dst[x] +
+                                        clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+            x_q4 += x_step_q4;
+        }
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const subpel_kernel *y_filters,
+                          int y0_q4, int y_step_q4, int w, int h) {
+    int x, y;
+    src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+    for (x = 0; x < w; ++x) {
+        int y_q4 = y0_q4;
+        for (y = 0; y < h; ++y) {
+            const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+            const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+            int k, sum = 0;
+            for (k = 0; k < SUBPEL_TAPS; ++k)
+                sum += src_y[k * src_stride] * y_filter[k];
+            dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+            y_q4 += y_step_q4;
+        }
+        ++src;
+        ++dst;
+    }
+}
+
+static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const subpel_kernel *y_filters,
+                              int y0_q4, int y_step_q4, int w, int h) {
+    int x, y;
+    src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+    for (x = 0; x < w; ++x) {
+        int y_q4 = y0_q4;
+        for (y = 0; y < h; ++y) {
+            const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+            const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+            int k, sum = 0;
+            for (k = 0; k < SUBPEL_TAPS; ++k)
+                sum += src_y[k * src_stride] * y_filter[k];
+            dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
+                                  clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+            y_q4 += y_step_q4;
+        }
+        ++src;
+        ++dst;
+    }
+}
+
+static void convolve(const uint8_t *src, ptrdiff_t src_stride,
+                     uint8_t *dst, ptrdiff_t dst_stride,
+                     const subpel_kernel *const x_filters,
+                     int x0_q4, int x_step_q4,
+                     const subpel_kernel *const y_filters,
+                     int y0_q4, int y_step_q4,
+                     int w, int h) {
+    // Fixed size intermediate buffer places limits on parameters.
+    // Maximum intermediate_height is 324, for y_step_q4 == 80,
+    // h == 64, taps == 8.
+    // y_step_q4 of 80 allows for 1/10 scale for 5 layer svc
+    uint8_t temp[64 * 324];
+    int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS;
+
+    if (intermediate_height < h)
+        intermediate_height = h;
+
+    convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+                   x_filters, x0_q4, x_step_q4, w, intermediate_height);
+    convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+                  y_filters, y0_q4, y_step_q4, w, h);
+}
+
+static const subpel_kernel *get_filter_base(const int16_t *filter) {
+    // NOTE: This assumes that the filter table is 256-byte aligned.
+    // TODO(agrange) Modify to make independent of table alignment.
+    return (const subpel_kernel *)(filter);
+}
+
+static int get_filter_offset(const int16_t *f, const subpel_kernel *base) {
+    return 0;
+}
+
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x, int x_step_q4,
+                           const int16_t *filter_y, int y_step_q4,
+                           int w, int h) {
+    const subpel_kernel *const filters_x = get_filter_base(filter_x);
+    const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+    convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
+                   x0_q4, x_step_q4, w, h);
+}
+
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h) {
+    const subpel_kernel *const filters_x = get_filter_base(filter_x);
+    const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+    convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
+                       x0_q4, x_step_q4, w, h);
+}
+
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const int16_t *filter_x, int x_step_q4,
+                          const int16_t *filter_y, int y_step_q4,
+                          int w, int h) {
+    const subpel_kernel *const filters_y = get_filter_base(filter_y);
+    const int y0_q4 = get_filter_offset(filter_y, filters_y);
+    convolve_vert(src, src_stride, dst, dst_stride, filters_y,
+                  y0_q4, y_step_q4, w, h);
+}
+
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h) {
+    const subpel_kernel *const filters_y = get_filter_base(filter_y);
+    const int y0_q4 = get_filter_offset(filter_y, filters_y);
+    convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
+                      y0_q4, y_step_q4, w, h);
+}
+
+void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+                     uint8_t *dst, ptrdiff_t dst_stride,
+                     const int16_t *filter_x, int x_step_q4,
+                     const int16_t *filter_y, int y_step_q4,
+                     int w, int h) {
+    const subpel_kernel *const filters_x = get_filter_base(filter_x);
+    const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+    const subpel_kernel *const filters_y = get_filter_base(filter_y);
+    const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+    convolve(src, src_stride, dst, dst_stride,
+             filters_x, x0_q4, x_step_q4,
+             filters_y, y0_q4, y_step_q4, w, h);
+}
+
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int filter_x_stride,
+                        const int16_t *filter_y, int filter_y_stride,
+                        int w, int h) {
+    int x, y;
+    for (y = 0; y < h; ++y) {
+        for (x = 0; x < w; ++x)
+            dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {
+    /* Fixed size intermediate buffer places limits on parameters. */
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 64 * 64);
+
+    vp9_convolve8_c(src, src_stride, temp, 64,
+                    filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+    vp9_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+}
+
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x, int filter_x_stride,
+                         const int16_t *filter_y, int filter_y_stride,
+                         int w, int h) {
+    int r;
+
+    for (r = h; r > 0; --r) {
+        memcpy(dst, src, w);
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
diff --git a/cpu_ref/convolve/convolve8_avg_neon.s b/cpu_ref/convolve/convolve8_avg_neon.s
new file mode 100644
index 0000000..7821446
--- /dev/null
+++ b/cpu_ref/convolve/convolve8_avg_neon.s
@@ -0,0 +1,323 @@
+@ This file was created from a .asm file
+@  using the ads2gas.pl script.
+  .equ DO1STROUNDING, 0
+@
+@  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+@
+@  Use of this source code is governed by a BSD-style license
+@  that can be found in the LICENSE file in the root of the source
+@  tree. An additional intellectual property rights grant can be found
+@  in the file PATENTS.  All contributing project authors may
+@  be found in the AUTHORS file in the root of the source tree.
+@
+@  Copyright (c) 2014 The Android Open Source Project
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+
+
+
+    @ These functions are only valid when:
+    @ x_step_q4 == 16
+    @ w%4 == 0
+    @ h%4 == 0
+    @ taps == 8
+    @ VP9_FILTER_WEIGHT == 128
+    @ VP9_FILTER_SHIFT == 7
+
+    .global vp9_convolve8_avg_horiz_neon
+  .type vp9_convolve8_avg_horiz_neon, function
+    .global vp9_convolve8_avg_vert_neon
+  .type vp9_convolve8_avg_vert_neon, function
+    .global vp9_convolve8_avg_horiz_c
+    .global vp9_convolve8_avg_vert_c
+   .arm
+   .eabi_attribute 24, 1 @Tag_ABI_align_needed
+   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+
+.text
+.p2align 2
+
+    @ Multiply and accumulate by q0
+.macro    MULTIPLY_BY_Q0 dst, src0, src1, src2, src3, src4, src5, src6, src7
+    vmull.s16 \dst, \src0, d0[0]
+    vmlal.s16 \dst, \src1, d0[1]
+    vmlal.s16 \dst, \src2, d0[2]
+    vmlal.s16 \dst, \src3, d0[3]
+    vmlal.s16 \dst, \src4, d1[0]
+    vmlal.s16 \dst, \src5, d1[1]
+    vmlal.s16 \dst, \src6, d1[2]
+    vmlal.s16 \dst, \src7, d1[3]
+    .endm
+
+@ r0    const uint8_t *src
+@ r1    int src_stride
+@ r2    uint8_t *dst
+@ r3    int dst_stride
+@ sp[]const int16_t *filter_x
+@ sp[]int x_step_q4
+@ sp[]const int16_t *filter_y ; unused
+@ sp[]int y_step_q4           ; unused
+@ sp[]int w
+@ sp[]int h
+
+_vp9_convolve8_avg_horiz_neon:
+  vp9_convolve8_avg_horiz_neon: @ PROC
+    ldr             r12, [sp, #4]           @ x_step_q4
+    cmp             r12, #16
+    bne             vp9_convolve8_avg_horiz_c
+
+    push            {r4-r10, lr}
+
+    sub             r0, r0, #3              @ adjust for taps
+
+    ldr             r5, [sp, #32]           @ filter_x
+    ldr             r6, [sp, #48]           @ w
+    ldr             r7, [sp, #52]           @ h
+
+    vld1.s16        {q0}, [r5]              @ filter_x
+
+    sub             r8, r1, r1, lsl #2      @ -src_stride * 3
+    add             r8, r8, #4              @ -src_stride * 3 + 4
+
+    sub             r4, r3, r3, lsl #2      @ -dst_stride * 3
+    add             r4, r4, #4              @ -dst_stride * 3 + 4
+
+    rsb             r9, r6, r1, lsl #2      @ reset src for outer loop
+    sub             r9, r9, #7
+    rsb             r12, r6, r3, lsl #2     @ reset dst for outer loop
+
+    mov             r10, r6                 @ w loop counter
+
+loop_horiz_v:
+    vld1.8          {d24}, [r0], r1
+    vld1.8          {d25}, [r0], r1
+    vld1.8          {d26}, [r0], r1
+    vld1.8          {d27}, [r0], r8
+
+    vtrn.16         q12, q13
+    vtrn.8          d24, d25
+    vtrn.8          d26, d27
+
+    pld             [r0, r1, lsl #2]
+
+    vmovl.u8        q8, d24
+    vmovl.u8        q9, d25
+    vmovl.u8        q10, d26
+    vmovl.u8        q11, d27
+
+    @ save a few instructions in the inner loop
+    vswp            d17, d18
+    vmov            d23, d21
+
+    add             r0, r0, #3
+
+loop_horiz:
+    add             r5, r0, #64
+
+    vld1.32         {d28[]}, [r0], r1
+    vld1.32         {d29[]}, [r0], r1
+    vld1.32         {d31[]}, [r0], r1
+    vld1.32         {d30[]}, [r0], r8
+
+    pld             [r5]
+
+    vtrn.16         d28, d31
+    vtrn.16         d29, d30
+    vtrn.8          d28, d29
+    vtrn.8          d31, d30
+
+    pld             [r5, r1]
+
+    @ extract to s16
+    vtrn.32         q14, q15
+    vmovl.u8        q12, d28
+    vmovl.u8        q13, d29
+
+    pld             [r5, r1, lsl #1]
+
+    @ slightly out of order load to match the existing data
+    vld1.u32        {d6[0]}, [r2], r3
+    vld1.u32        {d7[0]}, [r2], r3
+    vld1.u32        {d6[1]}, [r2], r3
+    vld1.u32        {d7[1]}, [r2], r3
+
+    sub             r2, r2, r3, lsl #2      @ reset for store
+
+    @ src[] * filter_x
+    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
+    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
+    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
+    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+    pld             [r5, -r8]
+
+    @ += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    @ saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    @ transpose
+    vtrn.16         d2, d3
+    vtrn.32         d2, d3
+    vtrn.8          d2, d3
+
+    @ average the new value and the dst value
+    vrhadd.u8       q1, q1, q3
+
+    vst1.u32        {d2[0]}, [r2,:32], r3
+    vst1.u32        {d3[0]}, [r2,:32], r3
+    vst1.u32        {d2[1]}, [r2,:32], r3
+    vst1.u32        {d3[1]}, [r2,:32], r4
+
+    vmov            q8,  q9
+    vmov            d20, d23
+    vmov            q11, q12
+    vmov            q9,  q13
+
+    subs            r6, r6, #4              @ w -= 4
+    bgt             loop_horiz
+
+    @ outer loop
+    mov             r6, r10                 @ restore w counter
+    add             r0, r0, r9              @ src += src_stride * 4 - w
+    add             r2, r2, r12             @ dst += dst_stride * 4 - w
+    subs            r7, r7, #4              @ h -= 4
+    bgt loop_horiz_v
+
+    pop             {r4-r10, pc}
+
+  .size vp9_convolve8_avg_horiz_neon, .-vp9_convolve8_avg_horiz_neon    @ ENDP
+
+_vp9_convolve8_avg_vert_neon:
+  vp9_convolve8_avg_vert_neon: @ PROC
+    ldr             r12, [sp, #12]
+    cmp             r12, #16
+    bne             vp9_convolve8_avg_vert_c
+
+    push            {r4-r8, lr}
+
+    @ adjust for taps
+    sub             r0, r0, r1
+    sub             r0, r0, r1, lsl #1
+
+    ldr             r4, [sp, #32]           @ filter_y
+    ldr             r6, [sp, #40]           @ w
+    ldr             lr, [sp, #44]           @ h
+
+    vld1.s16        {q0}, [r4]              @ filter_y
+
+    lsl             r1, r1, #1
+    lsl             r3, r3, #1
+
+loop_vert_h:
+    mov             r4, r0
+    add             r7, r0, r1, asr #1
+    mov             r5, r2
+    add             r8, r2, r3, asr #1
+    mov             r12, lr                 @ h loop counter
+
+    vld1.u32        {d16[0]}, [r4], r1
+    vld1.u32        {d16[1]}, [r7], r1
+    vld1.u32        {d18[0]}, [r4], r1
+    vld1.u32        {d18[1]}, [r7], r1
+    vld1.u32        {d20[0]}, [r4], r1
+    vld1.u32        {d20[1]}, [r7], r1
+    vld1.u32        {d22[0]}, [r4], r1
+
+    vmovl.u8        q8, d16
+    vmovl.u8        q9, d18
+    vmovl.u8        q10, d20
+    vmovl.u8        q11, d22
+
+loop_vert:
+    @ always process a 4x4 block at a time
+    vld1.u32        {d24[0]}, [r7], r1
+    vld1.u32        {d26[0]}, [r4], r1
+    vld1.u32        {d26[1]}, [r7], r1
+    vld1.u32        {d24[1]}, [r4], r1
+
+    @ extract to s16
+    vmovl.u8        q12, d24
+    vmovl.u8        q13, d26
+
+    vld1.u32        {d6[0]}, [r5,:32], r3
+    vld1.u32        {d6[1]}, [r8,:32], r3
+    vld1.u32        {d7[0]}, [r5,:32], r3
+    vld1.u32        {d7[1]}, [r8,:32], r3
+
+    pld             [r7]
+    pld             [r4]
+
+    @ src[] * filter_y
+    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
+
+    pld             [r7, r1]
+    pld             [r4, r1]
+
+    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
+
+    pld             [r5]
+    pld             [r8]
+
+    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+    pld             [r5, r3]
+    pld             [r8, r3]
+
+    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
+
+    @ += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    @ saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    @ average the new value and the dst value
+    vrhadd.u8       q1, q1, q3
+
+    sub             r5, r5, r3, lsl #1      @ reset for store
+    sub             r8, r8, r3, lsl #1
+
+    vst1.u32        {d2[0]}, [r5,:32], r3
+    vst1.u32        {d2[1]}, [r8,:32], r3
+    vst1.u32        {d3[0]}, [r5,:32], r3
+    vst1.u32        {d3[1]}, [r8,:32], r3
+
+    vmov            q8, q10
+    vmov            d18, d22
+    vmov            d19, d24
+    vmov            q10, q13
+    vmov            d22, d25
+
+    subs            r12, r12, #4            @ h -= 4
+    bgt             loop_vert
+
+    @ outer loop
+    add             r0, r0, #4
+    add             r2, r2, #4
+    subs            r6, r6, #4              @ w -= 4
+    bgt             loop_vert_h
+
+    pop             {r4-r8, pc}
+
+  .size vp9_convolve8_avg_vert_neon, .-vp9_convolve8_avg_vert_neon    @ ENDP
+  .section  .note.GNU-stack,"",%progbits
diff --git a/cpu_ref/convolve/convolve8_neon.s b/cpu_ref/convolve/convolve8_neon.s
new file mode 100644
index 0000000..0bc15d9
--- /dev/null
+++ b/cpu_ref/convolve/convolve8_neon.s
@@ -0,0 +1,300 @@
+@ This file was created from a .asm file
+@  using the ads2gas.pl script.
+    .equ DO1STROUNDING, 0
+@
+@  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+@
+@  Use of this source code is governed by a BSD-style license
+@  that can be found in the LICENSE file in the root of the source
+@  tree. An additional intellectual property rights grant can be found
+@  in the file PATENTS.  All contributing project authors may
+@  be found in the AUTHORS file in the root of the source tree.
+@
+@  Copyright (c) 2014 The Android Open Source Project
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+
+
+    @ These functions are only valid when:
+    @ x_step_q4 == 16
+    @ w%4 == 0
+    @ h%4 == 0
+    @ taps == 8
+    @ VP9_FILTER_WEIGHT == 128
+    @ VP9_FILTER_SHIFT == 7
+
+    .global vp9_convolve8_horiz_neon
+    .type vp9_convolve8_horiz_neon, function
+    .global vp9_convolve8_vert_neon
+    .type vp9_convolve8_vert_neon, function
+    .global vp9_convolve8_horiz_c
+    .global vp9_convolve8_vert_c
+   .arm
+   .eabi_attribute 24, 1 @Tag_ABI_align_needed
+   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+
+.text
+.p2align 2
+
+    @ Multiply and accumulate by q0
+.macro    MULTIPLY_BY_Q0 dst, src0, src1, src2, src3, src4, src5, src6, src7
+    vmull.s16 \dst, \src0, d0[0]
+    vmlal.s16 \dst, \src1, d0[1]
+    vmlal.s16 \dst, \src2, d0[2]
+    vmlal.s16 \dst, \src3, d0[3]
+    vmlal.s16 \dst, \src4, d1[0]
+    vmlal.s16 \dst, \src5, d1[1]
+    vmlal.s16 \dst, \src6, d1[2]
+    vmlal.s16 \dst, \src7, d1[3]
+    .endm
+
+@ r0    const uint8_t *src
+@ r1    int src_stride
+@ r2    uint8_t *dst
+@ r3    int dst_stride
+@ sp[]const int16_t *filter_x
+@ sp[]int x_step_q4
+@ sp[]const int16_t *filter_y ; unused
+@ sp[]int y_step_q4           ; unused
+@ sp[]int w
+@ sp[]int h
+
+_vp9_convolve8_horiz_neon:
+    vp9_convolve8_horiz_neon: @ PROC
+    ldr             r12, [sp, #4]           @ x_step_q4
+    cmp             r12, #16
+    bne             vp9_convolve8_horiz_c
+
+    push            {r4-r10, lr}
+
+    sub             r0, r0, #3              @ adjust for taps
+
+    ldr             r5, [sp, #32]           @ filter_x
+    ldr             r6, [sp, #48]           @ w
+    ldr             r7, [sp, #52]           @ h
+
+    vld1.s16        {q0}, [r5]              @ filter_x
+
+    sub             r8, r1, r1, lsl #2      @ -src_stride * 3
+    add             r8, r8, #4              @ -src_stride * 3 + 4
+
+    sub             r4, r3, r3, lsl #2      @ -dst_stride * 3
+    add             r4, r4, #4              @ -dst_stride * 3 + 4
+
+    rsb             r9, r6, r1, lsl #2      @ reset src for outer loop
+    sub             r9, r9, #7
+    rsb             r12, r6, r3, lsl #2     @ reset dst for outer loop
+
+    mov             r10, r6                 @ w loop counter
+
+loop_horiz_v:
+    vld1.8          {d24}, [r0], r1
+    vld1.8          {d25}, [r0], r1
+    vld1.8          {d26}, [r0], r1
+    vld1.8          {d27}, [r0], r8
+
+    vtrn.16         q12, q13
+    vtrn.8          d24, d25
+    vtrn.8          d26, d27
+
+    pld             [r0, r1, lsl #2]
+
+    vmovl.u8        q8, d24
+    vmovl.u8        q9, d25
+    vmovl.u8        q10, d26
+    vmovl.u8        q11, d27
+
+    @ save a few instructions in the inner loop
+    vswp            d17, d18
+    vmov            d23, d21
+
+    add             r0, r0, #3
+
+loop_horiz:
+    add             r5, r0, #64
+
+    vld1.32         {d28[]}, [r0], r1
+    vld1.32         {d29[]}, [r0], r1
+    vld1.32         {d31[]}, [r0], r1
+    vld1.32         {d30[]}, [r0], r8
+
+    pld             [r5]
+
+    vtrn.16         d28, d31
+    vtrn.16         d29, d30
+    vtrn.8          d28, d29
+    vtrn.8          d31, d30
+
+    pld             [r5, r1]
+
+    @ extract to s16
+    vtrn.32         q14, q15
+    vmovl.u8        q12, d28
+    vmovl.u8        q13, d29
+
+    pld             [r5, r1, lsl #1]
+
+    @ src[] * filter_x
+    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
+    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
+    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
+    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+    pld             [r5, -r8]
+
+    @ += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    @ saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    @ transpose
+    vtrn.16         d2, d3
+    vtrn.32         d2, d3
+    vtrn.8          d2, d3
+
+    vst1.u32        {d2[0]}, [r2,:32], r3
+    vst1.u32        {d3[0]}, [r2,:32], r3
+    vst1.u32        {d2[1]}, [r2,:32], r3
+    vst1.u32        {d3[1]}, [r2,:32], r4
+
+    vmov            q8,  q9
+    vmov            d20, d23
+    vmov            q11, q12
+    vmov            q9,  q13
+
+    subs            r6, r6, #4              @ w -= 4
+    bgt             loop_horiz
+
+    @ outer loop
+    mov             r6, r10                 @ restore w counter
+    add             r0, r0, r9              @ src += src_stride * 4 - w
+    add             r2, r2, r12             @ dst += dst_stride * 4 - w
+    subs            r7, r7, #4              @ h -= 4
+    bgt loop_horiz_v
+
+    pop             {r4-r10, pc}
+
+    .size vp9_convolve8_horiz_neon, .-vp9_convolve8_horiz_neon    @ ENDP
+
+_vp9_convolve8_vert_neon:
+    vp9_convolve8_vert_neon: @ PROC
+    ldr             r12, [sp, #12]
+    cmp             r12, #16
+    bne             vp9_convolve8_vert_c
+
+    push            {r4-r8, lr}
+
+    @ adjust for taps
+    sub             r0, r0, r1
+    sub             r0, r0, r1, lsl #1
+
+    ldr             r4, [sp, #32]           @ filter_y
+    ldr             r6, [sp, #40]           @ w
+    ldr             lr, [sp, #44]           @ h
+
+    vld1.s16        {q0}, [r4]              @ filter_y
+
+    lsl             r1, r1, #1
+    lsl             r3, r3, #1
+
+loop_vert_h:
+    mov             r4, r0
+    add             r7, r0, r1, asr #1
+    mov             r5, r2
+    add             r8, r2, r3, asr #1
+    mov             r12, lr                 @ h loop counter
+
+    vld1.u32        {d16[0]}, [r4], r1
+    vld1.u32        {d16[1]}, [r7], r1
+    vld1.u32        {d18[0]}, [r4], r1
+    vld1.u32        {d18[1]}, [r7], r1
+    vld1.u32        {d20[0]}, [r4], r1
+    vld1.u32        {d20[1]}, [r7], r1
+    vld1.u32        {d22[0]}, [r4], r1
+
+    vmovl.u8        q8, d16
+    vmovl.u8        q9, d18
+    vmovl.u8        q10, d20
+    vmovl.u8        q11, d22
+
+loop_vert:
+    @ always process a 4x4 block at a time
+    vld1.u32        {d24[0]}, [r7], r1
+    vld1.u32        {d26[0]}, [r4], r1
+    vld1.u32        {d26[1]}, [r7], r1
+    vld1.u32        {d24[1]}, [r4], r1
+
+    @ extract to s16
+    vmovl.u8        q12, d24
+    vmovl.u8        q13, d26
+
+    pld             [r5]
+    pld             [r8]
+
+    @ src[] * filter_y
+    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
+
+    pld             [r5, r3]
+    pld             [r8, r3]
+
+    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
+
+    pld             [r7]
+    pld             [r4]
+
+    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+    pld             [r7, r1]
+    pld             [r4, r1]
+
+    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
+
+    @ += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    @ saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    vst1.u32        {d2[0]}, [r5,:32], r3
+    vst1.u32        {d2[1]}, [r8,:32], r3
+    vst1.u32        {d3[0]}, [r5,:32], r3
+    vst1.u32        {d3[1]}, [r8,:32], r3
+
+    vmov            q8, q10
+    vmov            d18, d22
+    vmov            d19, d24
+    vmov            q10, q13
+    vmov            d22, d25
+
+    subs            r12, r12, #4            @ h -= 4
+    bgt             loop_vert
+
+    @ outer loop
+    add             r0, r0, #4
+    add             r2, r2, #4
+    subs            r6, r6, #4              @ w -= 4
+    bgt             loop_vert_h
+
+    pop             {r4-r8, pc}
+
+    .size vp9_convolve8_vert_neon, .-vp9_convolve8_vert_neon    @ ENDP
+    .section    .note.GNU-stack,"",%progbits
diff --git a/cpu_ref/convolve/convolve_avg_neon.s b/cpu_ref/convolve/convolve_avg_neon.s
new file mode 100644
index 0000000..41e79f1
--- /dev/null
+++ b/cpu_ref/convolve/convolve_avg_neon.s
@@ -0,0 +1,135 @@
+@ This file was created from a .asm file
+@  using the ads2gas.pl script.
+    .equ DO1STROUNDING, 0
+@
+@  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+@
+@  Use of this source code is governed by a BSD-style license
+@  that can be found in the LICENSE file in the root of the source
+@  tree. An additional intellectual property rights grant can be found
+@  in the file PATENTS.  All contributing project authors may
+@  be found in the AUTHORS file in the root of the source tree.
+@
+@  Copyright (c) 2014 The Android Open Source Project
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+
+    .global vp9_convolve_avg_neon
+    .type vp9_convolve_avg_neon, function
+   .arm
+   .eabi_attribute 24, 1 @Tag_ABI_align_needed
+   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+
+.text
+.p2align 2
+
+_vp9_convolve_avg_neon:
+    vp9_convolve_avg_neon: @ PROC
+    push                {r4-r6, lr}
+    ldrd                r4, r5, [sp, #32]
+    mov                 r6, r2
+
+    cmp                 r4, #32
+    bgt                 avg64
+    beq                 avg32
+    cmp                 r4, #8
+    bgt                 avg16
+    beq                 avg8
+    b                   avg4
+
+avg64:
+    sub                 lr, r1, #32
+    sub                 r4, r3, #32
+avg64_h:
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0-q1}, [r0]!
+    vld1.8              {q2-q3}, [r0], lr
+    pld                 [r2, r3]
+    vld1.8              {q8-q9},   [r6,:128]!
+    vld1.8              {q10-q11}, [r6,:128], r4
+    vrhadd.u8           q0, q0, q8
+    vrhadd.u8           q1, q1, q9
+    vrhadd.u8           q2, q2, q10
+    vrhadd.u8           q3, q3, q11
+    vst1.8              {q0-q1}, [r2,:128]!
+    vst1.8              {q2-q3}, [r2,:128], r4
+    subs                r5, r5, #1
+    bgt                 avg64_h
+    pop                 {r4-r6, pc}
+
+avg32:
+    vld1.8              {q0-q1}, [r0], r1
+    vld1.8              {q2-q3}, [r0], r1
+    vld1.8              {q8-q9},   [r6,:128], r3
+    vld1.8              {q10-q11}, [r6,:128], r3
+    pld                 [r0]
+    vrhadd.u8           q0, q0, q8
+    pld                 [r0, r1]
+    vrhadd.u8           q1, q1, q9
+    pld                 [r6]
+    vrhadd.u8           q2, q2, q10
+    pld                 [r6, r3]
+    vrhadd.u8           q3, q3, q11
+    vst1.8              {q0-q1}, [r2,:128], r3
+    vst1.8              {q2-q3}, [r2,:128], r3
+    subs                r5, r5, #2
+    bgt                 avg32
+    pop                 {r4-r6, pc}
+
+avg16:
+    vld1.8              {q0}, [r0], r1
+    vld1.8              {q1}, [r0], r1
+    vld1.8              {q2}, [r6,:128], r3
+    vld1.8              {q3}, [r6,:128], r3
+    pld                 [r0]
+    pld                 [r0, r1]
+    vrhadd.u8           q0, q0, q2
+    pld                 [r6]
+    pld                 [r6, r3]
+    vrhadd.u8           q1, q1, q3
+    vst1.8              {q0}, [r2,:128], r3
+    vst1.8              {q1}, [r2,:128], r3
+    subs                r5, r5, #2
+    bgt                 avg16
+    pop                 {r4-r6, pc}
+
+avg8:
+    vld1.8              {d0}, [r0], r1
+    vld1.8              {d1}, [r0], r1
+    vld1.8              {d2}, [r6,:64], r3
+    vld1.8              {d3}, [r6,:64], r3
+    pld                 [r0]
+    pld                 [r0, r1]
+    vrhadd.u8           q0, q0, q1
+    pld                 [r6]
+    pld                 [r6, r3]
+    vst1.8              {d0}, [r2,:64], r3
+    vst1.8              {d1}, [r2,:64], r3
+    subs                r5, r5, #2
+    bgt                 avg8
+    pop                 {r4-r6, pc}
+
+avg4:
+    vld1.32             {d0[0]}, [r0], r1
+    vld1.32             {d0[1]}, [r0], r1
+    vld1.32             {d2[0]}, [r6,:32], r3
+    vld1.32             {d2[1]}, [r6,:32], r3
+    vrhadd.u8           d0, d0, d2
+    vst1.32             {d0[0]}, [r2,:32], r3
+    vst1.32             {d0[1]}, [r2,:32], r3
+    subs                r5, r5, #2
+    bgt                 avg4
+    pop                 {r4-r6, pc}
+    .size vp9_convolve_avg_neon, .-vp9_convolve_avg_neon    @ ENDP
+
+    .section	.note.GNU-stack,"",%progbits
diff --git a/cpu_ref/convolve/convolve_copy_neon.s b/cpu_ref/convolve/convolve_copy_neon.s
new file mode 100644
index 0000000..60ada14
--- /dev/null
+++ b/cpu_ref/convolve/convolve_copy_neon.s
Binary files differ
diff --git a/cpu_ref/convolve/convolve_neon.c b/cpu_ref/convolve/convolve_neon.c
new file mode 100644
index 0000000..3d4bf30
--- /dev/null
+++ b/cpu_ref/convolve/convolve_neon.c
@@ -0,0 +1,116 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "vp9_common.h"
+#include "vp9_filter.h"
+#include <string.h>
+#include <stdio.h>
+
+extern void vp9_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h);
+
+extern void vp9_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const int16_t *filter_x, int x_step_q4,
+                                    const int16_t *filter_y, int y_step_q4,
+                                    int w, int h);
+
+extern void vp9_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                        uint8_t *dst, ptrdiff_t dst_stride,
+                                        const int16_t *filter_x, int x_step_q4,
+                                        const int16_t *filter_y, int y_step_q4,
+                                        int w, int h);
+extern void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const int16_t *filter_x, int x_step_q4,
+                                const int16_t *filter_y, int y_step_q4,
+                                int w, int h);
+
+extern void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h);
+
+void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int x_step_q4,
+                        const int16_t *filter_y, int y_step_q4,
+                        int w, int h) {
+    /* Given our constraints: w <= 64, h <= 64, taps == 8 the intermediate
+     * buffer needs 64 columns by at most 64 + 7 rows, padded here to 64 * 72.
+     */
+    DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
+
+    // Account for the vertical phase needing 3 lines prior and 4 lines post
+    int intermediate_height = h + 7;
+
+    if (x_step_q4 != 16 || y_step_q4 != 16)
+        return vp9_convolve8_c(src, src_stride,
+                               dst, dst_stride,
+                               filter_x, x_step_q4,
+                               filter_y, y_step_q4,
+                               w, h);
+
+    /* Filter starting 3 lines back. The neon implementation will ignore the
+     * given height and filter a multiple of 4 lines. Since this goes into
+     * the temp buffer, which has lots of extra room and is subsequently
+     * discarded, this is safe if somewhat less than ideal.
+     */
+    vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+                             temp, 64, filter_x, x_step_q4,
+                             filter_y, y_step_q4,
+                             w, intermediate_height);
+
+    /* Step into the temp buffer 3 lines to get the actual frame data */
+    vp9_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
+                            x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h) {
+    DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
+    int intermediate_height = h + 7;
+
+    if (x_step_q4 != 16 || y_step_q4 != 16)
+        return vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                                   filter_x, x_step_q4, filter_y, y_step_q4,
+                                   w, h);
+
+    /* This implementation has the same issues as above. In addition, we only want
+     * to average the values after both passes.
+     */
+    vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64,
+                             filter_x, x_step_q4, filter_y, y_step_q4,
+                             w, intermediate_height);
+    vp9_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride,
+                                filter_x, x_step_q4, filter_y, y_step_q4,
+                                w, h);
+}
diff --git a/cpu_ref/convolve/vp9_common.h b/cpu_ref/convolve/vp9_common.h
new file mode 100644
index 0000000..73a1021
--- /dev/null
+++ b/cpu_ref/convolve/vp9_common.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef VP9_COMMON_VP9_COMMON_H_
+#define VP9_COMMON_VP9_COMMON_H_
+#include <stdint.h>
+
+#define DECLARE_ALIGNED_ARRAY(a,typ,val,n)\
+  typ val##_[(n)+(a)/sizeof(typ)+1];\
+  typ *val = (typ*)((((intptr_t)val##_)+(a)-1)&((intptr_t)-(a)))
+/* Interface header for common constant data structures and lookup tables */
+
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
+#define ROUND_POWER_OF_TWO(value, n) \
+    (((value) + (1 << ((n) - 1))) >> (n))
+
+#define ALIGN_POWER_OF_TWO(value, n) \
+    (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
+
+// Only need this for fixed-size arrays, for structs just assign.
+#define vp9_copy(dest, src) {            \
+    memcpy(dest, src, sizeof(src));  \
+  }
+
+// Use this for variably-sized arrays.
+#define vp9_copy_array(dest, src, n) {       \
+    memcpy(dest, src, n * sizeof(*src)); \
+  }
+
+#define vp9_zero(dest) memset(&dest, 0, sizeof(dest))
+#define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest))
+
+static inline uint8_t clip_pixel(int val) {
+  return (val > 255) ? 255u : (val < 0) ? 0u : val;
+}
+
+static inline int clamp(int value, int low, int high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+static inline double fclamp(double value, double low, double high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+#define VP9_SYNC_CODE_0 0x49
+#define VP9_SYNC_CODE_1 0x83
+#define VP9_SYNC_CODE_2 0x42
+
+#define VP9_FRAME_MARKER 0x2
+
+
+#endif  // VP9_COMMON_VP9_COMMON_H_
diff --git a/cpu_ref/convolve/vp9_filter.h b/cpu_ref/convolve/vp9_filter.h
new file mode 100644
index 0000000..754578d
--- /dev/null
+++ b/cpu_ref/convolve/vp9_filter.h
@@ -0,0 +1,60 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef VP9_COMMON_VP9_FILTER_H_
+#define VP9_COMMON_VP9_FILTER_H_
+
+#define FILTER_BITS 7
+
+#define SUBPEL_BITS 4
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
+#define SUBPEL_TAPS 8
+
+typedef enum {
+    EIGHTTAP = 0,
+    EIGHTTAP_SMOOTH = 1,
+    EIGHTTAP_SHARP = 2,
+    BILINEAR = 3,
+    SWITCHABLE = 4  /* should be the last one */
+} INTERPOLATION_TYPE;
+
+typedef int16_t subpel_kernel[SUBPEL_TAPS];
+
+struct subpix_fn_table {
+    const subpel_kernel *filter_x;
+    const subpel_kernel *filter_y;
+};
+
+const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type);
+extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS];
+extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS];
+extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS];
+extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS];
+
+// The BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear
+// filter kernel as a 2 tap filter.
+#define BILINEAR_FILTERS_2TAP(x) \
+    (vp9_bilinear_filters[(x)] + SUBPEL_TAPS/2 - 1)
+
+#endif  // VP9_COMMON_VP9_FILTER_H_
diff --git a/cpu_ref/rsCpuConvolve.h b/cpu_ref/rsCpuConvolve.h
new file mode 100644
index 0000000..d7d2d16
--- /dev/null
+++ b/cpu_ref/rsCpuConvolve.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_CPU_CONVOLVE_NEON_H
+#define RSD_CPU_CONVOLVE_NEON_H
+
+#include <stdint.h>
+
+extern "C" {
+#if defined(ARCH_ARM_HAVE_VFP)
+void vp9_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h);
+
+void vp9_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x, int x_step_q4,
+                           const int16_t *filter_y, int y_step_q4,
+                           int w, int h);
+
+void vp9_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x, int x_step_q4,
+                             const int16_t *filter_y, int y_step_q4,
+                             int w, int h);
+
+void vp9_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const int16_t *filter_x, int x_step_q4,
+                                 const int16_t *filter_y, int y_step_q4,
+                                 int w, int h);
+
+void vp9_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h);
+
+void vp9_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4,
+                                  int w, int h);
+void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int x_step_q4,
+                        const int16_t *filter_y, int y_step_q4,
+                        int w, int h);
+
+void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h);
+
+#else
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4,
+                         int w, int h);
+
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int x_step_q4,
+                        const int16_t *filter_y, int y_step_q4,
+                        int w, int h);
+
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const int16_t *filter_x, int x_step_q4,
+                          const int16_t *filter_y, int y_step_q4,
+                          int w, int h);
+
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h);
+
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x, int x_step_q4,
+                           const int16_t *filter_y, int y_step_q4,
+                           int w, int h);
+
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h);
+
+void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+                     uint8_t *dst, ptrdiff_t dst_stride,
+                     const int16_t *filter_x, int x_step_q4,
+                     const int16_t *filter_y, int y_step_q4,
+                     int w, int h);
+
+void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4,
+                         int w, int h);
+#endif
+}
+#endif
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 7475ccb..406b5c2 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -506,6 +506,8 @@
 
 extern RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
                                              const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_InterPred(RsdCpuReferenceImpl *ctx,
+                                                 const Script *s, const Element *e);
 extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx,
                                                    const Script *s, const Element *e);
 extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
@@ -531,6 +533,9 @@
     case RS_SCRIPT_INTRINSIC_ID_3DLUT:
         i = rsdIntrinsic_3DLUT(this, s, e);
         break;
+    case RS_SCRIPT_INTRINSIC_ID_INTER_PRED:
+        i = rsdIntrinsic_InterPred(this, s, e);
+        break;
     case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
         i = rsdIntrinsic_Convolve3x3(this, s, e);
         break;
diff --git a/cpu_ref/rsCpuIntrinsicInterPred.cpp b/cpu_ref/rsCpuIntrinsicInterPred.cpp
new file mode 100644
index 0000000..20e0f2e
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicInterPred.cpp
@@ -0,0 +1,173 @@
+#include "rsCpuIntrinsicInterPred.h"
+
+void RsdCpuScriptIntrinsicInterPred::setGlobalObj(uint32_t slot,
+                                                  ObjectBase *data) {
+    Allocation *alloc = static_cast<Allocation *>(data);
+    if (slot == 0) mRef = (uint8_t *)alloc->mHal.state.userProvidedPtr;
+    if (slot == 1) mParam = (uint8_t *)alloc->mHal.state.userProvidedPtr;
+}
+
+void RsdCpuScriptIntrinsicInterPred::setGlobalVar(uint32_t slot,
+                                                  const void *data,
+                                                  size_t dataLength) {
+    mFriParamCount = ((int32_t *)data)[0];
+    mSecParamCount = ((int32_t *)data)[1];
+    mParamOffset   = ((int32_t *)data)[2];
+}
+
+void RsdCpuScriptIntrinsicInterPred::kernel(const RsForEachStubParamStruct *p,
+                                            uint32_t xstart, uint32_t xend,
+                                            uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicInterPred *cp = (RsdCpuScriptIntrinsicInterPred *)p->usr;
+    cp->mCount++;
+    const int vp9_convolve_mode[2][2] = {{24, 16}, {8, 0}};
+    uint8_t *ref_base = cp->mRef;
+    INTER_PRED_PARAM *fri_param = (INTER_PRED_PARAM *)cp->mParam;
+    INTER_PRED_PARAM *sec_param = (INTER_PRED_PARAM *)(cp->mParam + cp->mParamOffset);
+    int32_t fri_count = cp->mFriParamCount;
+    int32_t sec_count = cp->mSecParamCount;
+    int mode_num;
+    uint8_t *src;
+    uint8_t *dst;
+    const int16_t *filter_x;
+    const int16_t *filter_y;
+    for (int i = 0; i < fri_count; i++) {
+
+        mode_num = vp9_convolve_mode[(fri_param[i].x_step_q4 == 16)]
+                                    [(fri_param[i].y_step_q4 == 16)];
+        src = ref_base + fri_param[i].src_mv;
+        dst = ref_base + fri_param[i].dst_mv;
+
+        filter_x = inter_pred_filters + fri_param[i].filter_x_mv;
+        filter_y = inter_pred_filters + fri_param[i].filter_y_mv;
+
+        cp->mSwitchConvolve[fri_param[i].pred_mode + mode_num](
+            src, fri_param[i].src_stride,
+            dst, fri_param[i].dst_stride,
+            filter_x, fri_param[i].x_step_q4,
+            filter_y, fri_param[i].y_step_q4,
+            fri_param[i].w, fri_param[i].h
+        );
+    }
+
+    for (int i = 0; i < sec_count; i++) {
+        mode_num = vp9_convolve_mode[(sec_param[i].x_step_q4 == 16)]
+                                    [(sec_param[i].y_step_q4 == 16)];
+        src = ref_base + sec_param[i].src_mv;
+        dst = ref_base + sec_param[i].dst_mv;
+
+        filter_x = inter_pred_filters + sec_param[i].filter_x_mv;
+        filter_y = inter_pred_filters + sec_param[i].filter_y_mv;
+
+        cp->mSwitchConvolve[sec_param[i].pred_mode + mode_num + 1](
+            src, sec_param[i].src_stride,
+            dst, sec_param[i].dst_stride,
+            filter_x, sec_param[i].x_step_q4,
+            filter_y, sec_param[i].y_step_q4,
+            sec_param[i].w, sec_param[i].h
+        );
+    }
+
+}
+
+RsdCpuScriptIntrinsicInterPred::RsdCpuScriptIntrinsicInterPred(RsdCpuReferenceImpl *ctx,
+                                                               const Script *s, const Element *e)
+            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_INTER_PRED) {
+    mRootPtr = &kernel;
+    mCount = 0;
+    mParamOffset = 0;
+    mFriParamCount = 0;
+    mSecParamCount = 0;
+    mRef = NULL;
+    mParam = NULL;
+
+#if defined(ARCH_ARM_HAVE_VFP)
+    mSwitchConvolve[0] = vp9_convolve_copy_neon;
+    mSwitchConvolve[1] = vp9_convolve_avg_neon;
+    mSwitchConvolve[2] = vp9_convolve8_vert_neon;
+    mSwitchConvolve[3] = vp9_convolve8_avg_vert_neon;
+    mSwitchConvolve[4] = vp9_convolve8_horiz_neon;
+    mSwitchConvolve[5] = vp9_convolve8_avg_horiz_neon;
+    mSwitchConvolve[6] = vp9_convolve8_neon;
+    mSwitchConvolve[7] = vp9_convolve8_avg_neon;
+
+    mSwitchConvolve[8] = vp9_convolve8_vert_neon;
+    mSwitchConvolve[9] = vp9_convolve8_avg_vert_neon;
+    mSwitchConvolve[10] = vp9_convolve8_vert_neon;
+    mSwitchConvolve[11] = vp9_convolve8_avg_vert_neon;
+    mSwitchConvolve[12] = vp9_convolve8_neon;
+    mSwitchConvolve[13] = vp9_convolve8_avg_neon;
+    mSwitchConvolve[14] = vp9_convolve8_neon;
+    mSwitchConvolve[15] = vp9_convolve8_avg_neon;
+
+    mSwitchConvolve[16] = vp9_convolve8_horiz_neon;
+    mSwitchConvolve[17] = vp9_convolve8_avg_horiz_neon;
+    mSwitchConvolve[18] = vp9_convolve8_neon;
+    mSwitchConvolve[19] = vp9_convolve8_avg_neon;
+    mSwitchConvolve[20] = vp9_convolve8_horiz_neon;
+    mSwitchConvolve[21] = vp9_convolve8_avg_horiz_neon;
+    mSwitchConvolve[22] = vp9_convolve8_neon;
+    mSwitchConvolve[23] = vp9_convolve8_avg_neon;
+
+    mSwitchConvolve[24] = vp9_convolve8_neon;
+    mSwitchConvolve[25] = vp9_convolve8_avg_neon;
+    mSwitchConvolve[26] = vp9_convolve8_neon;
+    mSwitchConvolve[27] = vp9_convolve8_avg_neon;
+    mSwitchConvolve[28] = vp9_convolve8_neon;
+    mSwitchConvolve[29] = vp9_convolve8_avg_neon;
+    mSwitchConvolve[30] = vp9_convolve8_neon;
+    mSwitchConvolve[31] = vp9_convolve8_avg_neon;
+#else
+    mSwitchConvolve[0] = vp9_convolve_copy_c;
+    mSwitchConvolve[1] = vp9_convolve_avg_c;
+    mSwitchConvolve[2] = vp9_convolve8_vert_c;
+    mSwitchConvolve[3] = vp9_convolve8_avg_vert_c;
+    mSwitchConvolve[4] = vp9_convolve8_horiz_c;
+    mSwitchConvolve[5] = vp9_convolve8_avg_horiz_c;
+    mSwitchConvolve[6] = vp9_convolve8_c;
+    mSwitchConvolve[7] = vp9_convolve8_avg_c;
+
+    mSwitchConvolve[8] = vp9_convolve8_vert_c;
+    mSwitchConvolve[9] = vp9_convolve8_avg_vert_c;
+    mSwitchConvolve[10] = vp9_convolve8_vert_c;
+    mSwitchConvolve[11] = vp9_convolve8_avg_vert_c;
+    mSwitchConvolve[12] = vp9_convolve8_c;
+    mSwitchConvolve[13] = vp9_convolve8_avg_c;
+    mSwitchConvolve[14] = vp9_convolve8_c;
+    mSwitchConvolve[15] = vp9_convolve8_avg_c;
+
+    mSwitchConvolve[16] = vp9_convolve8_horiz_c;
+    mSwitchConvolve[17] = vp9_convolve8_avg_horiz_c;
+    mSwitchConvolve[18] = vp9_convolve8_c;
+    mSwitchConvolve[19] = vp9_convolve8_avg_c;
+    mSwitchConvolve[20] = vp9_convolve8_horiz_c;
+    mSwitchConvolve[21] = vp9_convolve8_avg_horiz_c;
+    mSwitchConvolve[22] = vp9_convolve8_c;
+    mSwitchConvolve[23] = vp9_convolve8_avg_c;
+
+    mSwitchConvolve[24] = vp9_convolve8_c;
+    mSwitchConvolve[25] = vp9_convolve8_avg_c;
+    mSwitchConvolve[26] = vp9_convolve8_c;
+    mSwitchConvolve[27] = vp9_convolve8_avg_c;
+    mSwitchConvolve[28] = vp9_convolve8_c;
+    mSwitchConvolve[29] = vp9_convolve8_avg_c;
+    mSwitchConvolve[30] = vp9_convolve8_c;
+    mSwitchConvolve[31] = vp9_convolve8_avg_c;
+#endif
+}
+
+RsdCpuScriptIntrinsicInterPred::~RsdCpuScriptIntrinsicInterPred() {
+}
+
+void RsdCpuScriptIntrinsicInterPred::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 3;
+}
+
+void RsdCpuScriptIntrinsicInterPred::invokeFreeChildren() {
+}
+
+
+RsdCpuScriptImpl * rsdIntrinsic_InterPred(RsdCpuReferenceImpl *ctx,
+                                          const Script *s, const Element *e) {
+    return new RsdCpuScriptIntrinsicInterPred(ctx, s, e);
+}
diff --git a/cpu_ref/rsCpuIntrinsicInterPred.h b/cpu_ref/rsCpuIntrinsicInterPred.h
new file mode 100644
index 0000000..552f4eb
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicInterPred.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_CPU_SCRIPT_INTRINSIC_INTER_PRED_H
+#define RSD_CPU_SCRIPT_INTRINSIC_INTER_PRED_H
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+#include "rsCpuConvolve.h"
+
+using namespace android;  // NOTE(review): using-directives in a header leak into every includer; the declarations below are already wrapped in these namespaces -- consider removing
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+typedef struct inter_pred_param {  // one prediction job; layout must match the 12-int records packed into the 'param' allocation by the caller
+    int pred_mode;     // presumably indexes mSwitchConvolve[0..31] -- confirm against kernel()
+
+    int src_mv;        // NOTE(review): looks like an offset into the ref buffer -- confirm
+    int src_stride;    // source row stride
+    int dst_mv;        // NOTE(review): looks like an offset into the output buffer -- confirm
+    int dst_stride;    // destination row stride
+
+    int filter_x_mv;   // offset selecting the horizontal filter phase in inter_pred_filters
+    int x_step_q4;     // horizontal subpel step in Q4 fixed point (16 == one full pel)
+    int filter_y_mv;   // offset selecting the vertical filter phase
+    int y_step_q4;     // vertical subpel step in Q4 fixed point
+
+    int w;             // block width in pixels
+    int h;             // block height in pixels
+} INTER_PRED_PARAM;
+
+static const int16_t inter_pred_filters[512] = {  // 4 banks x 16 phases x 8 taps of VP9 subpel coefficients (each 8-tap row sums to 128); NOTE(review): 'static' in a header duplicates this table in every including TU -- consider defining it in the .cpp
+    0, 0, 0, 128, 0, 0, 0, 0, 0, 1, -5, 126, 8, -3, 1, 0,
+    -1, 3, -10, 122, 18, -6, 2, 0, -1, 4, -13, 118, 27, -9, 3, -1,
+    -1, 4, -16, 112, 37, -11, 4, -1, -1, 5, -18, 105, 48, -14, 4, -1,
+    -1, 5, -19, 97, 58, -16, 5, -1, -1, 6, -19, 88, 68, -18, 5, -1,
+    -1, 6, -19, 78, 78, -19, 6, -1, -1, 5, -18, 68, 88, -19, 6, -1,
+    -1, 5, -16, 58, 97, -19, 5, -1, -1, 4, -14, 48, 105, -18, 5, -1,
+    -1, 4, -11, 37, 112, -16, 4, -1, -1, 3, -9, 27, 118, -13, 4, -1,
+    0, 2, -6, 18, 122, -10, 3, -1, 0, 1, -3, 8, 126, -5, 1, 0,
+    0, 0, 0, 128, 0, 0, 0, 0, -3, -1, 32, 64, 38, 1, -3, 0,
+    -2, -2, 29, 63, 41, 2, -3, 0, -2, -2, 26, 63, 43, 4, -4, 0,
+    -2, -3, 24, 62, 46, 5, -4, 0, -2, -3, 21, 60, 49, 7, -4, 0,
+    -1, -4, 18, 59, 51, 9, -4, 0, -1, -4, 16, 57, 53, 12, -4, -1,
+    -1, -4, 14, 55, 55, 14, -4, -1, -1, -4, 12, 53, 57, 16, -4, -1,
+    0, -4, 9, 51, 59, 18, -4, -1, 0, -4, 7, 49, 60, 21, -3, -2,
+    0, -4, 5, 46, 62, 24, -3, -2, 0, -4, 4, 43, 63, 26, -2, -2,
+    0, -3, 2, 41, 63, 29, -2, -2, 0, -3, 1, 38, 64, 32, -1, -3,
+    0, 0, 0, 128, 0, 0, 0, 0, -1, 3, -7, 127, 8, -3, 1, 0,
+    -2, 5, -13, 125, 17, -6, 3, -1, -3, 7, -17, 121, 27, -10, 5, -2,
+    -4, 9, -20, 115, 37, -13, 6, -2, -4, 10, -23, 108, 48, -16, 8, -3,
+    -4, 10, -24, 100, 59, -19, 9, -3, -4, 11, -24, 90, 70, -21, 10, -4,
+    -4, 11, -23, 80, 80, -23, 11, -4, -4, 10, -21, 70, 90, -24, 11, -4,
+    -3, 9, -19, 59, 100, -24, 10, -4, -3, 8, -16, 48, 108, -23, 10, -4,
+    -2, 6, -13, 37, 115, -20, 9, -4, -2, 5, -10, 27, 121, -17, 7, -3,
+    -1, 3, -6, 17, 125, -13, 5, -2, 0, 1, -3, 8, 127, -7, 3, -1,
+    0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 120, 8, 0, 0, 0,
+    0, 0, 0, 112, 16, 0, 0, 0, 0, 0, 0, 104, 24, 0, 0, 0,
+    0, 0, 0, 96, 32, 0, 0, 0, 0, 0, 0, 88, 40, 0, 0, 0,
+    0, 0, 0, 80, 48, 0, 0, 0, 0, 0, 0, 72, 56, 0, 0, 0,
+    0, 0, 0, 64, 64, 0, 0, 0, 0, 0, 0, 56, 72, 0, 0, 0,
+    0, 0, 0, 48, 80, 0, 0, 0, 0, 0, 0, 40, 88, 0, 0, 0,
+    0, 0, 0, 32, 96, 0, 0, 0, 0, 0, 0, 24, 104, 0, 0, 0,
+    0, 0, 0, 16, 112, 0, 0, 0, 0, 0, 0, 8, 120, 0, 0, 0
+};
+
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,  // matches the libvpx vp9_convolve8_* C signature
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h);
+
+
+class RsdCpuScriptIntrinsicInterPred: public RsdCpuScriptIntrinsic {  // CPU reference implementation of the VP9 inter-frame prediction intrinsic
+public:
+    virtual void populateScript(Script *);
+    virtual void invokeFreeChildren();
+
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);  // receives the ref (slot 0) and param (slot 1) allocations (see ScriptIntrinsicVP9InterPred::setRef/setParam)
+    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);  // receives the packed {fri, sec, offset} counts from setParamCount (slot 2)
+    virtual ~RsdCpuScriptIntrinsicInterPred();
+    RsdCpuScriptIntrinsicInterPred(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
+
+protected:
+    uint8_t *mRef;    // base of the reference-frame buffer -- presumably non-owning, confirm
+    uint8_t *mParam;  // base of the INTER_PRED_PARAM record buffer -- presumably non-owning, confirm
+    int mFriParamCount;  // first parameter-record count (from setParamCount)
+    int mSecParamCount;  // second parameter-record count (from setParamCount)
+    int mParamOffset;    // offset separating the two record groups -- confirm against kernel()
+    int mCount;
+    convolve_fn_t mSwitchConvolve[32];  // dispatch table of vp9_convolve8_* variants, filled in the constructor
+    static void kernel(const RsForEachStubParamStruct *p,  // forEach worker invoked by the CPU driver
+                       uint32_t xstart, uint32_t xend,
+                       uint32_t instep, uint32_t outstep);
+};
+
+}  // namespace renderscript
+}  // namespace android
+#endif  // RSD_CPU_SCRIPT_INTRINSIC_INTER_PRED_H
+
diff --git a/rsDefines.h b/rsDefines.h
index 741f67b..fbc63cd 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -364,7 +364,8 @@
     RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB = 6,
     RS_SCRIPT_INTRINSIC_ID_BLEND = 7,
     RS_SCRIPT_INTRINSIC_ID_3DLUT = 8,
-    RS_SCRIPT_INTRINSIC_ID_HISTOGRAM = 9
+    RS_SCRIPT_INTRINSIC_ID_HISTOGRAM = 9,
+    RS_SCRIPT_INTRINSIC_ID_INTER_PRED = 10  // VP9 inter-frame prediction (ScriptIntrinsicVP9InterPred)
 };
 
 typedef struct {