Merge "Revert "Add VP9 inter-frame prediction intrinsic""
diff --git a/cpp/ScriptIntrinsics.cpp b/cpp/ScriptIntrinsics.cpp
index 34b2162..ba1e5f4 100644
--- a/cpp/ScriptIntrinsics.cpp
+++ b/cpp/ScriptIntrinsics.cpp
@@ -66,52 +66,6 @@
     Script::setVar(0, lut);
 }
 
-sp<ScriptIntrinsicVP9InterPred> ScriptIntrinsicVP9InterPred::create(sp<RS> rs, sp<const Element> e) {
-    if (e->isCompatible(Element::U8(rs)) == false) {
-        rs->throwError(RS_ERROR_INVALID_ELEMENT, "Element not supported for intrinsic");
-        return NULL;
-    }
-    return new ScriptIntrinsicVP9InterPred(rs, e);
-}
-
-ScriptIntrinsicVP9InterPred::ScriptIntrinsicVP9InterPred(sp<RS> rs, sp<const Element> e)
-    : ScriptIntrinsic(rs, RS_SCRIPT_INTRINSIC_ID_INTER_PRED, e) {
-}
-
-void ScriptIntrinsicVP9InterPred::forEach(sp<Allocation> asize) {
-    if (asize->getType()->getElement()->isCompatible(mElement) == false) {
-        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "InterPred forEach element mismatch");
-        return;
-    }
-    Script::forEach(0, asize, NULL, NULL, 0);
-}
-
-void ScriptIntrinsicVP9InterPred::setRef(sp<Allocation> ref) {
-    sp<const Type> t = ref->getType();
-    if (!t->getElement()->isCompatible(mElement)) {
-        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "setRef element does not match");
-        return;
-    }
-    Script::setVar(0, ref);
-}
-
-void ScriptIntrinsicVP9InterPred::setParam(sp<Allocation> param) {
-    sp<const Type> t = param->getType();
-    if (!t->getElement()->isCompatible(mElement)) {
-        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "setFriParam element does not match");
-        return;
-    }
-    Script::setVar(1, param);
-}
-
-void ScriptIntrinsicVP9InterPred::setParamCount(int fri, int sec, int offset) {
-    FieldPacker fp(12);
-    fp.add(fri);
-    fp.add(sec);
-    fp.add(offset);
-    Script::setVar(2, fp.getData(), fp.getLength());
-}
-
 sp<ScriptIntrinsicBlend> ScriptIntrinsicBlend::create(sp<RS> rs, sp<const Element> e) {
     if (e->isCompatible(Element::U8_4(rs)) == false) {
         rs->throwError(RS_ERROR_INVALID_ELEMENT, "Element not supported for intrinsic");
diff --git a/cpp/rsCppStructs.h b/cpp/rsCppStructs.h
index 8268b61..8b07bf0 100644
--- a/cpp/rsCppStructs.h
+++ b/cpp/rsCppStructs.h
@@ -1435,20 +1435,6 @@
      */
     void setLUT(sp<Allocation> lut);
 };
-/**
- * Intrinsic for VP9InterPrediction
- */
-class ScriptIntrinsicVP9InterPred : public ScriptIntrinsic {
- private:
-    ScriptIntrinsicVP9InterPred(sp<RS> rs, sp<const Element> e);
- public:
-    static sp<ScriptIntrinsicVP9InterPred> create(sp<RS> rs, sp<const Element> e);
-
-    void forEach(sp<Allocation> asize);
-    void setRef(sp<Allocation> ref);
-    void setParamCount(int fri, int sec, int offset);
-    void setParam(sp<Allocation> param);
-};
 
 /**
  * Intrinsic kernel for blending two Allocations.
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index f87ac6e..96d8b07 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -29,12 +29,10 @@
 	rsCpuIntrinsicConvolve3x3.cpp \
 	rsCpuIntrinsicConvolve5x5.cpp \
 	rsCpuIntrinsicHistogram.cpp \
-	rsCpuIntrinsicInterPred.cpp \
 	rsCpuIntrinsicLoopFilter.cpp \
 	rsCpuIntrinsicYuvToRGB.cpp \
 	rsCpuIntrinsicResize.cpp \
-	rsCpuIntrinsicLUT.cpp \
-	convolve/convolve.c
+	rsCpuIntrinsicLUT.cpp
 
 LOCAL_CFLAGS_arm64 += -DARCH_ARM_HAVE_NEON
 LOCAL_CFLAGS_64 += -DFAKE_ARM64_BUILD
@@ -61,11 +59,6 @@
     rsCpuIntrinsics_neon_Convolve.S \
     rsCpuIntrinsics_neon_ColorMatrix.S \
     rsCpuIntrinsics_neon_YuvToRGB.S \
-    convolve/convolve_copy_neon.s \
-    convolve/convolve_avg_neon.s \
-    convolve/convolve8_neon.s \
-    convolve/convolve8_avg_neon.s \
-    convolve/convolve_neon.c\
     vp9_loopfilter_16_neon.S \
     vp9_loopfilter_neon.S \
     vp9_mb_lpf_neon.S
diff --git a/cpu_ref/convolve/convolve.c b/cpu_ref/convolve/convolve.c
deleted file mode 100644
index c85db92..0000000
--- a/cpu_ref/convolve/convolve.c
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- *
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "vp9_common.h"
-#include "vp9_filter.h"
-#include <string.h>
-#include <stdio.h>
-
-static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const subpel_kernel *x_filters,
-                           int x0_q4, int x_step_q4, int w, int h) {
-    int x, y;
-    src -= SUBPEL_TAPS / 2 - 1;
-    for (y = 0; y < h; ++y) {
-        int x_q4 = x0_q4;
-        for (x = 0; x < w; ++x) {
-            const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-            const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-            int k, sum = 0;
-            for (k = 0; k < SUBPEL_TAPS; ++k)
-                sum += src_x[k] * x_filter[k];
-            dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-            x_q4 += x_step_q4;
-        }
-        src += src_stride;
-        dst += dst_stride;
-    }
-}
-
-static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const subpel_kernel *x_filters,
-                               int x0_q4, int x_step_q4, int w, int h) {
-    int x, y;
-    src -= SUBPEL_TAPS / 2 - 1;
-    for (y = 0; y < h; ++y) {
-        int x_q4 = x0_q4;
-        for (x = 0; x < w; ++x) {
-            const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-            const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-            int k, sum = 0;
-            for (k = 0; k < SUBPEL_TAPS; ++k)
-                sum += src_x[k] * x_filter[k];
-            dst[x] = ROUND_POWER_OF_TWO(dst[x] +
-                                        clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
-            x_q4 += x_step_q4;
-        }
-        src += src_stride;
-        dst += dst_stride;
-    }
-}
-
-static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
-                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const subpel_kernel *y_filters,
-                          int y0_q4, int y_step_q4, int w, int h) {
-    int x, y;
-    src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-    for (x = 0; x < w; ++x) {
-        int y_q4 = y0_q4;
-        for (y = 0; y < h; ++y) {
-            const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-            const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-            int k, sum = 0;
-            for (k = 0; k < SUBPEL_TAPS; ++k)
-                sum += src_y[k * src_stride] * y_filter[k];
-            dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-            y_q4 += y_step_q4;
-        }
-        ++src;
-        ++dst;
-    }
-}
-
-static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const subpel_kernel *y_filters,
-                              int y0_q4, int y_step_q4, int w, int h) {
-    int x, y;
-    src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-    for (x = 0; x < w; ++x) {
-        int y_q4 = y0_q4;
-        for (y = 0; y < h; ++y) {
-            const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-            const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-            int k, sum = 0;
-            for (k = 0; k < SUBPEL_TAPS; ++k)
-                sum += src_y[k * src_stride] * y_filter[k];
-            dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
-                                  clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
-            y_q4 += y_step_q4;
-        }
-        ++src;
-        ++dst;
-    }
-}
-
-static void convolve(const uint8_t *src, ptrdiff_t src_stride,
-                     uint8_t *dst, ptrdiff_t dst_stride,
-                     const subpel_kernel *const x_filters,
-                     int x0_q4, int x_step_q4,
-                     const subpel_kernel *const y_filters,
-                     int y0_q4, int y_step_q4,
-                     int w, int h) {
-    // Fixed size intermediate buffer places limits on parameters.
-    // Maximum intermediate_height is 324, for y_step_q4 == 80,
-    // h == 64, taps == 8.
-    // y_step_q4 of 80 allows for 1/10 scale for 5 layer svc
-    uint8_t temp[64 * 324];
-    int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS;
-
-    if (intermediate_height < h)
-        intermediate_height = h;
-
-    convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
-                   x_filters, x0_q4, x_step_q4, w, intermediate_height);
-    convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
-                  y_filters, y0_q4, y_step_q4, w, h);
-}
-
-static const subpel_kernel *get_filter_base(const int16_t *filter) {
-    // NOTE: This assumes that the filter table is 256-byte aligned.
-    // TODO(agrange) Modify to make independent of table alignment.
-    return (const subpel_kernel *)(filter);
-}
-
-static int get_filter_offset(const int16_t *f, const subpel_kernel *base) {
-    return 0;
-}
-
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4,
-                           int w, int h) {
-    const subpel_kernel *const filters_x = get_filter_base(filter_x);
-    const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-    convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
-                   x0_q4, x_step_q4, w, h);
-}
-
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-    const subpel_kernel *const filters_x = get_filter_base(filter_x);
-    const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-    convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
-                       x0_q4, x_step_q4, w, h);
-}
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const int16_t *filter_x, int x_step_q4,
-                          const int16_t *filter_y, int y_step_q4,
-                          int w, int h) {
-    const subpel_kernel *const filters_y = get_filter_base(filter_y);
-    const int y0_q4 = get_filter_offset(filter_y, filters_y);
-    convolve_vert(src, src_stride, dst, dst_stride, filters_y,
-                  y0_q4, y_step_q4, w, h);
-}
-
-void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-    const subpel_kernel *const filters_y = get_filter_base(filter_y);
-    const int y0_q4 = get_filter_offset(filter_y, filters_y);
-    convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
-                      y0_q4, y_step_q4, w, h);
-}
-
-void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
-                     uint8_t *dst, ptrdiff_t dst_stride,
-                     const int16_t *filter_x, int x_step_q4,
-                     const int16_t *filter_y, int y_step_q4,
-                     int w, int h) {
-    const subpel_kernel *const filters_x = get_filter_base(filter_x);
-    const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-    const subpel_kernel *const filters_y = get_filter_base(filter_y);
-    const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-    convolve(src, src_stride, dst, dst_stride,
-             filters_x, x0_q4, x_step_q4,
-             filters_y, y0_q4, y_step_q4, w, h);
-}
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
-                        uint8_t *dst, ptrdiff_t dst_stride,
-                        const int16_t *filter_x, int filter_x_stride,
-                        const int16_t *filter_y, int filter_y_stride,
-                        int w, int h) {
-    int x, y;
-    for (y = 0; y < h; ++y) {
-        for (x = 0; x < w; ++x)
-            dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
-
-        src += src_stride;
-        dst += dst_stride;
-    }
-}
-
-void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-    /* Fixed size intermediate buffer places limits on parameters. */
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 64 * 64);
-
-    vp9_convolve8_c(src, src_stride, temp, 64,
-                    filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-    vp9_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
-}
-
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter_x, int filter_x_stride,
-                         const int16_t *filter_y, int filter_y_stride,
-                         int w, int h) {
-    int r;
-
-    for (r = h; r > 0; --r) {
-        memcpy(dst, src, w);
-        src += src_stride;
-        dst += dst_stride;
-    }
-}
diff --git a/cpu_ref/convolve/convolve8_avg_neon.s b/cpu_ref/convolve/convolve8_avg_neon.s
deleted file mode 100644
index 7821446..0000000
--- a/cpu_ref/convolve/convolve8_avg_neon.s
+++ /dev/null
@@ -1,323 +0,0 @@
-@ This file was created from a .asm file
-@  using the ads2gas.pl script.
-  .equ DO1STROUNDING, 0
-@
-@  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-@
-@  Use of this source code is governed by a BSD-style license
-@  that can be found in the LICENSE file in the root of the source
-@  tree. An additional intellectual property rights grant can be found
-@  in the file PATENTS.  All contributing project authors may
-@  be found in the AUTHORS file in the root of the source tree.
-@
-@  Copyright (c) 2014 The Android Open Source Project
-@
-@  Licensed under the Apache License, Version 2.0 (the "License");
-@  you may not use this file except in compliance with the License.
-@  You may obtain a copy of the License at
-@
-@      http://www.apache.org/licenses/LICENSE-2.0
-@
-@  Unless required by applicable law or agreed to in writing, software
-@  distributed under the License is distributed on an "AS IS" BASIS,
-@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@  See the License for the specific language governing permissions and
-@  limitations under the License.
-
-
-
-    @ These functions are only valid when:
-    @ x_step_q4 == 16
-    @ w%4 == 0
-    @ h%4 == 0
-    @ taps == 8
-    @ VP9_FILTER_WEIGHT == 128
-    @ VP9_FILTER_SHIFT == 7
-
-    .global vp9_convolve8_avg_horiz_neon
-  .type vp9_convolve8_avg_horiz_neon, function
-    .global vp9_convolve8_avg_vert_neon
-  .type vp9_convolve8_avg_vert_neon, function
-    .global vp9_convolve8_avg_horiz_c
-    .global vp9_convolve8_avg_vert_c
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
-
-.text
-.p2align 2
-
-    @ Multiply and accumulate by q0
-.macro    MULTIPLY_BY_Q0 dst, src0, src1, src2, src3, src4, src5, src6, src7
-    vmull.s16 \dst, \src0, d0[0]
-    vmlal.s16 \dst, \src1, d0[1]
-    vmlal.s16 \dst, \src2, d0[2]
-    vmlal.s16 \dst, \src3, d0[3]
-    vmlal.s16 \dst, \src4, d1[0]
-    vmlal.s16 \dst, \src5, d1[1]
-    vmlal.s16 \dst, \src6, d1[2]
-    vmlal.s16 \dst, \src7, d1[3]
-    .endm
-
-@ r0    const uint8_t *src
-@ r1    int src_stride
-@ r2    uint8_t *dst
-@ r3    int dst_stride
-@ sp[]const int16_t *filter_x
-@ sp[]int x_step_q4
-@ sp[]const int16_t *filter_y ; unused
-@ sp[]int y_step_q4           ; unused
-@ sp[]int w
-@ sp[]int h
-
-_vp9_convolve8_avg_horiz_neon:
-  vp9_convolve8_avg_horiz_neon: @ PROC
-    ldr             r12, [sp, #4]           @ x_step_q4
-    cmp             r12, #16
-    bne             vp9_convolve8_avg_horiz_c
-
-    push            {r4-r10, lr}
-
-    sub             r0, r0, #3              @ adjust for taps
-
-    ldr             r5, [sp, #32]           @ filter_x
-    ldr             r6, [sp, #48]           @ w
-    ldr             r7, [sp, #52]           @ h
-
-    vld1.s16        {q0}, [r5]              @ filter_x
-
-    sub             r8, r1, r1, lsl #2      @ -src_stride * 3
-    add             r8, r8, #4              @ -src_stride * 3 + 4
-
-    sub             r4, r3, r3, lsl #2      @ -dst_stride * 3
-    add             r4, r4, #4              @ -dst_stride * 3 + 4
-
-    rsb             r9, r6, r1, lsl #2      @ reset src for outer loop
-    sub             r9, r9, #7
-    rsb             r12, r6, r3, lsl #2     @ reset dst for outer loop
-
-    mov             r10, r6                 @ w loop counter
-
-loop_horiz_v:
-    vld1.8          {d24}, [r0], r1
-    vld1.8          {d25}, [r0], r1
-    vld1.8          {d26}, [r0], r1
-    vld1.8          {d27}, [r0], r8
-
-    vtrn.16         q12, q13
-    vtrn.8          d24, d25
-    vtrn.8          d26, d27
-
-    pld             [r0, r1, lsl #2]
-
-    vmovl.u8        q8, d24
-    vmovl.u8        q9, d25
-    vmovl.u8        q10, d26
-    vmovl.u8        q11, d27
-
-    @ save a few instructions in the inner loop
-    vswp            d17, d18
-    vmov            d23, d21
-
-    add             r0, r0, #3
-
-loop_horiz:
-    add             r5, r0, #64
-
-    vld1.32         {d28[]}, [r0], r1
-    vld1.32         {d29[]}, [r0], r1
-    vld1.32         {d31[]}, [r0], r1
-    vld1.32         {d30[]}, [r0], r8
-
-    pld             [r5]
-
-    vtrn.16         d28, d31
-    vtrn.16         d29, d30
-    vtrn.8          d28, d29
-    vtrn.8          d31, d30
-
-    pld             [r5, r1]
-
-    @ extract to s16
-    vtrn.32         q14, q15
-    vmovl.u8        q12, d28
-    vmovl.u8        q13, d29
-
-    pld             [r5, r1, lsl #1]
-
-    @ slightly out of order load to match the existing data
-    vld1.u32        {d6[0]}, [r2], r3
-    vld1.u32        {d7[0]}, [r2], r3
-    vld1.u32        {d6[1]}, [r2], r3
-    vld1.u32        {d7[1]}, [r2], r3
-
-    sub             r2, r2, r3, lsl #2      @ reset for store
-
-    @ src[] * filter_x
-    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
-    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
-    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
-    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
-
-    pld             [r5, -r8]
-
-    @ += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    @ saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    @ transpose
-    vtrn.16         d2, d3
-    vtrn.32         d2, d3
-    vtrn.8          d2, d3
-
-    @ average the new value and the dst value
-    vrhadd.u8       q1, q1, q3
-
-    vst1.u32        {d2[0]}, [r2,:32], r3
-    vst1.u32        {d3[0]}, [r2,:32], r3
-    vst1.u32        {d2[1]}, [r2,:32], r3
-    vst1.u32        {d3[1]}, [r2,:32], r4
-
-    vmov            q8,  q9
-    vmov            d20, d23
-    vmov            q11, q12
-    vmov            q9,  q13
-
-    subs            r6, r6, #4              @ w -= 4
-    bgt             loop_horiz
-
-    @ outer loop
-    mov             r6, r10                 @ restore w counter
-    add             r0, r0, r9              @ src += src_stride * 4 - w
-    add             r2, r2, r12             @ dst += dst_stride * 4 - w
-    subs            r7, r7, #4              @ h -= 4
-    bgt loop_horiz_v
-
-    pop             {r4-r10, pc}
-
-  .size vp9_convolve8_avg_horiz_neon, .-vp9_convolve8_avg_horiz_neon    @ ENDP
-
-_vp9_convolve8_avg_vert_neon:
-  vp9_convolve8_avg_vert_neon: @ PROC
-    ldr             r12, [sp, #12]
-    cmp             r12, #16
-    bne             vp9_convolve8_avg_vert_c
-
-    push            {r4-r8, lr}
-
-    @ adjust for taps
-    sub             r0, r0, r1
-    sub             r0, r0, r1, lsl #1
-
-    ldr             r4, [sp, #32]           @ filter_y
-    ldr             r6, [sp, #40]           @ w
-    ldr             lr, [sp, #44]           @ h
-
-    vld1.s16        {q0}, [r4]              @ filter_y
-
-    lsl             r1, r1, #1
-    lsl             r3, r3, #1
-
-loop_vert_h:
-    mov             r4, r0
-    add             r7, r0, r1, asr #1
-    mov             r5, r2
-    add             r8, r2, r3, asr #1
-    mov             r12, lr                 @ h loop counter
-
-    vld1.u32        {d16[0]}, [r4], r1
-    vld1.u32        {d16[1]}, [r7], r1
-    vld1.u32        {d18[0]}, [r4], r1
-    vld1.u32        {d18[1]}, [r7], r1
-    vld1.u32        {d20[0]}, [r4], r1
-    vld1.u32        {d20[1]}, [r7], r1
-    vld1.u32        {d22[0]}, [r4], r1
-
-    vmovl.u8        q8, d16
-    vmovl.u8        q9, d18
-    vmovl.u8        q10, d20
-    vmovl.u8        q11, d22
-
-loop_vert:
-    @ always process a 4x4 block at a time
-    vld1.u32        {d24[0]}, [r7], r1
-    vld1.u32        {d26[0]}, [r4], r1
-    vld1.u32        {d26[1]}, [r7], r1
-    vld1.u32        {d24[1]}, [r4], r1
-
-    @ extract to s16
-    vmovl.u8        q12, d24
-    vmovl.u8        q13, d26
-
-    vld1.u32        {d6[0]}, [r5,:32], r3
-    vld1.u32        {d6[1]}, [r8,:32], r3
-    vld1.u32        {d7[0]}, [r5,:32], r3
-    vld1.u32        {d7[1]}, [r8,:32], r3
-
-    pld             [r7]
-    pld             [r4]
-
-    @ src[] * filter_y
-    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
-
-    pld             [r7, r1]
-    pld             [r4, r1]
-
-    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
-
-    pld             [r5]
-    pld             [r8]
-
-    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
-
-    pld             [r5, r3]
-    pld             [r8, r3]
-
-    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
-
-    @ += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    @ saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    @ average the new value and the dst value
-    vrhadd.u8       q1, q1, q3
-
-    sub             r5, r5, r3, lsl #1      @ reset for store
-    sub             r8, r8, r3, lsl #1
-
-    vst1.u32        {d2[0]}, [r5,:32], r3
-    vst1.u32        {d2[1]}, [r8,:32], r3
-    vst1.u32        {d3[0]}, [r5,:32], r3
-    vst1.u32        {d3[1]}, [r8,:32], r3
-
-    vmov            q8, q10
-    vmov            d18, d22
-    vmov            d19, d24
-    vmov            q10, q13
-    vmov            d22, d25
-
-    subs            r12, r12, #4            @ h -= 4
-    bgt             loop_vert
-
-    @ outer loop
-    add             r0, r0, #4
-    add             r2, r2, #4
-    subs            r6, r6, #4              @ w -= 4
-    bgt             loop_vert_h
-
-    pop             {r4-r8, pc}
-
-  .size vp9_convolve8_avg_vert_neon, .-vp9_convolve8_avg_vert_neon    @ ENDP
-  .section  .note.GNU-stack,"",%progbits
diff --git a/cpu_ref/convolve/convolve8_neon.s b/cpu_ref/convolve/convolve8_neon.s
deleted file mode 100644
index 0bc15d9..0000000
--- a/cpu_ref/convolve/convolve8_neon.s
+++ /dev/null
@@ -1,300 +0,0 @@
-@ This file was created from a .asm file
-@  using the ads2gas.pl script.
-    .equ DO1STROUNDING, 0
-@
-@  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-@
-@  Use of this source code is governed by a BSD-style license
-@  that can be found in the LICENSE file in the root of the source
-@  tree. An additional intellectual property rights grant can be found
-@  in the file PATENTS.  All contributing project authors may
-@  be found in the AUTHORS file in the root of the source tree.
-@
-@  Copyright (c) 2014 The Android Open Source Project
-@
-@  Licensed under the Apache License, Version 2.0 (the "License");
-@  you may not use this file except in compliance with the License.
-@  You may obtain a copy of the License at
-@
-@      http://www.apache.org/licenses/LICENSE-2.0
-@
-@  Unless required by applicable law or agreed to in writing, software
-@  distributed under the License is distributed on an "AS IS" BASIS,
-@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@  See the License for the specific language governing permissions and
-@  limitations under the License.
-
-
-    @ These functions are only valid when:
-    @ x_step_q4 == 16
-    @ w%4 == 0
-    @ h%4 == 0
-    @ taps == 8
-    @ VP9_FILTER_WEIGHT == 128
-    @ VP9_FILTER_SHIFT == 7
-
-    .global vp9_convolve8_horiz_neon
-    .type vp9_convolve8_horiz_neon, function
-    .global vp9_convolve8_vert_neon
-    .type vp9_convolve8_vert_neon, function
-    .global vp9_convolve8_horiz_c
-    .global vp9_convolve8_vert_c
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
-
-.text
-.p2align 2
-
-    @ Multiply and accumulate by q0
-.macro    MULTIPLY_BY_Q0 dst, src0, src1, src2, src3, src4, src5, src6, src7
-    vmull.s16 \dst, \src0, d0[0]
-    vmlal.s16 \dst, \src1, d0[1]
-    vmlal.s16 \dst, \src2, d0[2]
-    vmlal.s16 \dst, \src3, d0[3]
-    vmlal.s16 \dst, \src4, d1[0]
-    vmlal.s16 \dst, \src5, d1[1]
-    vmlal.s16 \dst, \src6, d1[2]
-    vmlal.s16 \dst, \src7, d1[3]
-    .endm
-
-@ r0    const uint8_t *src
-@ r1    int src_stride
-@ r2    uint8_t *dst
-@ r3    int dst_stride
-@ sp[]const int16_t *filter_x
-@ sp[]int x_step_q4
-@ sp[]const int16_t *filter_y ; unused
-@ sp[]int y_step_q4           ; unused
-@ sp[]int w
-@ sp[]int h
-
-_vp9_convolve8_horiz_neon:
-    vp9_convolve8_horiz_neon: @ PROC
-    ldr             r12, [sp, #4]           @ x_step_q4
-    cmp             r12, #16
-    bne             vp9_convolve8_horiz_c
-
-    push            {r4-r10, lr}
-
-    sub             r0, r0, #3              @ adjust for taps
-
-    ldr             r5, [sp, #32]           @ filter_x
-    ldr             r6, [sp, #48]           @ w
-    ldr             r7, [sp, #52]           @ h
-
-    vld1.s16        {q0}, [r5]              @ filter_x
-
-    sub             r8, r1, r1, lsl #2      @ -src_stride * 3
-    add             r8, r8, #4              @ -src_stride * 3 + 4
-
-    sub             r4, r3, r3, lsl #2      @ -dst_stride * 3
-    add             r4, r4, #4              @ -dst_stride * 3 + 4
-
-    rsb             r9, r6, r1, lsl #2      @ reset src for outer loop
-    sub             r9, r9, #7
-    rsb             r12, r6, r3, lsl #2     @ reset dst for outer loop
-
-    mov             r10, r6                 @ w loop counter
-
-loop_horiz_v:
-    vld1.8          {d24}, [r0], r1
-    vld1.8          {d25}, [r0], r1
-    vld1.8          {d26}, [r0], r1
-    vld1.8          {d27}, [r0], r8
-
-    vtrn.16         q12, q13
-    vtrn.8          d24, d25
-    vtrn.8          d26, d27
-
-    pld             [r0, r1, lsl #2]
-
-    vmovl.u8        q8, d24
-    vmovl.u8        q9, d25
-    vmovl.u8        q10, d26
-    vmovl.u8        q11, d27
-
-    @ save a few instructions in the inner loop
-    vswp            d17, d18
-    vmov            d23, d21
-
-    add             r0, r0, #3
-
-loop_horiz:
-    add             r5, r0, #64
-
-    vld1.32         {d28[]}, [r0], r1
-    vld1.32         {d29[]}, [r0], r1
-    vld1.32         {d31[]}, [r0], r1
-    vld1.32         {d30[]}, [r0], r8
-
-    pld             [r5]
-
-    vtrn.16         d28, d31
-    vtrn.16         d29, d30
-    vtrn.8          d28, d29
-    vtrn.8          d31, d30
-
-    pld             [r5, r1]
-
-    @ extract to s16
-    vtrn.32         q14, q15
-    vmovl.u8        q12, d28
-    vmovl.u8        q13, d29
-
-    pld             [r5, r1, lsl #1]
-
-    @ src[] * filter_x
-    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
-    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
-    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
-    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
-
-    pld             [r5, -r8]
-
-    @ += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    @ saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    @ transpose
-    vtrn.16         d2, d3
-    vtrn.32         d2, d3
-    vtrn.8          d2, d3
-
-    vst1.u32        {d2[0]}, [r2,:32], r3
-    vst1.u32        {d3[0]}, [r2,:32], r3
-    vst1.u32        {d2[1]}, [r2,:32], r3
-    vst1.u32        {d3[1]}, [r2,:32], r4
-
-    vmov            q8,  q9
-    vmov            d20, d23
-    vmov            q11, q12
-    vmov            q9,  q13
-
-    subs            r6, r6, #4              @ w -= 4
-    bgt             loop_horiz
-
-    @ outer loop
-    mov             r6, r10                 @ restore w counter
-    add             r0, r0, r9              @ src += src_stride * 4 - w
-    add             r2, r2, r12             @ dst += dst_stride * 4 - w
-    subs            r7, r7, #4              @ h -= 4
-    bgt loop_horiz_v
-
-    pop             {r4-r10, pc}
-
-    .size vp9_convolve8_horiz_neon, .-vp9_convolve8_horiz_neon    @ ENDP
-
-_vp9_convolve8_vert_neon:
-    vp9_convolve8_vert_neon: @ PROC
-    ldr             r12, [sp, #12]
-    cmp             r12, #16
-    bne             vp9_convolve8_vert_c
-
-    push            {r4-r8, lr}
-
-    @ adjust for taps
-    sub             r0, r0, r1
-    sub             r0, r0, r1, lsl #1
-
-    ldr             r4, [sp, #32]           @ filter_y
-    ldr             r6, [sp, #40]           @ w
-    ldr             lr, [sp, #44]           @ h
-
-    vld1.s16        {q0}, [r4]              @ filter_y
-
-    lsl             r1, r1, #1
-    lsl             r3, r3, #1
-
-loop_vert_h:
-    mov             r4, r0
-    add             r7, r0, r1, asr #1
-    mov             r5, r2
-    add             r8, r2, r3, asr #1
-    mov             r12, lr                 @ h loop counter
-
-    vld1.u32        {d16[0]}, [r4], r1
-    vld1.u32        {d16[1]}, [r7], r1
-    vld1.u32        {d18[0]}, [r4], r1
-    vld1.u32        {d18[1]}, [r7], r1
-    vld1.u32        {d20[0]}, [r4], r1
-    vld1.u32        {d20[1]}, [r7], r1
-    vld1.u32        {d22[0]}, [r4], r1
-
-    vmovl.u8        q8, d16
-    vmovl.u8        q9, d18
-    vmovl.u8        q10, d20
-    vmovl.u8        q11, d22
-
-loop_vert:
-    @ always process a 4x4 block at a time
-    vld1.u32        {d24[0]}, [r7], r1
-    vld1.u32        {d26[0]}, [r4], r1
-    vld1.u32        {d26[1]}, [r7], r1
-    vld1.u32        {d24[1]}, [r4], r1
-
-    @ extract to s16
-    vmovl.u8        q12, d24
-    vmovl.u8        q13, d26
-
-    pld             [r5]
-    pld             [r8]
-
-    @ src[] * filter_y
-    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
-
-    pld             [r5, r3]
-    pld             [r8, r3]
-
-    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
-
-    pld             [r7]
-    pld             [r4]
-
-    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
-
-    pld             [r7, r1]
-    pld             [r4, r1]
-
-    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
-
-    @ += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    @ saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    vst1.u32        {d2[0]}, [r5,:32], r3
-    vst1.u32        {d2[1]}, [r8,:32], r3
-    vst1.u32        {d3[0]}, [r5,:32], r3
-    vst1.u32        {d3[1]}, [r8,:32], r3
-
-    vmov            q8, q10
-    vmov            d18, d22
-    vmov            d19, d24
-    vmov            q10, q13
-    vmov            d22, d25
-
-    subs            r12, r12, #4            @ h -= 4
-    bgt             loop_vert
-
-    @ outer loop
-    add             r0, r0, #4
-    add             r2, r2, #4
-    subs            r6, r6, #4              @ w -= 4
-    bgt             loop_vert_h
-
-    pop             {r4-r8, pc}
-
-    .size vp9_convolve8_vert_neon, .-vp9_convolve8_vert_neon    @ ENDP
-    .section    .note.GNU-stack,"",%progbits
diff --git a/cpu_ref/convolve/convolve_avg_neon.s b/cpu_ref/convolve/convolve_avg_neon.s
deleted file mode 100644
index 41e79f1..0000000
--- a/cpu_ref/convolve/convolve_avg_neon.s
+++ /dev/null
@@ -1,135 +0,0 @@
-@ This file was created from a .asm file
-@  using the ads2gas.pl script.
-    .equ DO1STROUNDING, 0
-@
-@  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-@
-@  Use of this source code is governed by a BSD-style license
-@  that can be found in the LICENSE file in the root of the source
-@  tree. An additional intellectual property rights grant can be found
-@  in the file PATENTS.  All contributing project authors may
-@  be found in the AUTHORS file in the root of the source tree.
-@
-@  Copyright (c) 2014 The Android Open Source Project
-@
-@  Licensed under the Apache License, Version 2.0 (the "License");
-@  you may not use this file except in compliance with the License.
-@  You may obtain a copy of the License at
-@
-@      http://www.apache.org/licenses/LICENSE-2.0
-@
-@  Unless required by applicable law or agreed to in writing, software
-@  distributed under the License is distributed on an "AS IS" BASIS,
-@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@  See the License for the specific language governing permissions and
-@  limitations under the License.
-
-    .global vp9_convolve_avg_neon
-    .type vp9_convolve_avg_neon, function
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
-
-.text
-.p2align 2
-
-_vp9_convolve_avg_neon:
-    vp9_convolve_avg_neon: @ PROC
-    push                {r4-r6, lr}
-    ldrd                r4, r5, [sp, #32]
-    mov                 r6, r2
-
-    cmp                 r4, #32
-    bgt                 avg64
-    beq                 avg32
-    cmp                 r4, #8
-    bgt                 avg16
-    beq                 avg8
-    b                   avg4
-
-avg64:
-    sub                 lr, r1, #32
-    sub                 r4, r3, #32
-avg64_h:
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q0-q1}, [r0]!
-    vld1.8              {q2-q3}, [r0], lr
-    pld                 [r2, r3]
-    vld1.8              {q8-q9},   [r6,:128]!
-    vld1.8              {q10-q11}, [r6,:128], r4
-    vrhadd.u8           q0, q0, q8
-    vrhadd.u8           q1, q1, q9
-    vrhadd.u8           q2, q2, q10
-    vrhadd.u8           q3, q3, q11
-    vst1.8              {q0-q1}, [r2,:128]!
-    vst1.8              {q2-q3}, [r2,:128], r4
-    subs                r5, r5, #1
-    bgt                 avg64_h
-    pop                 {r4-r6, pc}
-
-avg32:
-    vld1.8              {q0-q1}, [r0], r1
-    vld1.8              {q2-q3}, [r0], r1
-    vld1.8              {q8-q9},   [r6,:128], r3
-    vld1.8              {q10-q11}, [r6,:128], r3
-    pld                 [r0]
-    vrhadd.u8           q0, q0, q8
-    pld                 [r0, r1]
-    vrhadd.u8           q1, q1, q9
-    pld                 [r6]
-    vrhadd.u8           q2, q2, q10
-    pld                 [r6, r3]
-    vrhadd.u8           q3, q3, q11
-    vst1.8              {q0-q1}, [r2,:128], r3
-    vst1.8              {q2-q3}, [r2,:128], r3
-    subs                r5, r5, #2
-    bgt                 avg32
-    pop                 {r4-r6, pc}
-
-avg16:
-    vld1.8              {q0}, [r0], r1
-    vld1.8              {q1}, [r0], r1
-    vld1.8              {q2}, [r6,:128], r3
-    vld1.8              {q3}, [r6,:128], r3
-    pld                 [r0]
-    pld                 [r0, r1]
-    vrhadd.u8           q0, q0, q2
-    pld                 [r6]
-    pld                 [r6, r3]
-    vrhadd.u8           q1, q1, q3
-    vst1.8              {q0}, [r2,:128], r3
-    vst1.8              {q1}, [r2,:128], r3
-    subs                r5, r5, #2
-    bgt                 avg16
-    pop                 {r4-r6, pc}
-
-avg8:
-    vld1.8              {d0}, [r0], r1
-    vld1.8              {d1}, [r0], r1
-    vld1.8              {d2}, [r6,:64], r3
-    vld1.8              {d3}, [r6,:64], r3
-    pld                 [r0]
-    pld                 [r0, r1]
-    vrhadd.u8           q0, q0, q1
-    pld                 [r6]
-    pld                 [r6, r3]
-    vst1.8              {d0}, [r2,:64], r3
-    vst1.8              {d1}, [r2,:64], r3
-    subs                r5, r5, #2
-    bgt                 avg8
-    pop                 {r4-r6, pc}
-
-avg4:
-    vld1.32             {d0[0]}, [r0], r1
-    vld1.32             {d0[1]}, [r0], r1
-    vld1.32             {d2[0]}, [r6,:32], r3
-    vld1.32             {d2[1]}, [r6,:32], r3
-    vrhadd.u8           d0, d0, d2
-    vst1.32             {d0[0]}, [r2,:32], r3
-    vst1.32             {d0[1]}, [r2,:32], r3
-    subs                r5, r5, #2
-    bgt                 avg4
-    pop                 {r4-r6, pc}
-    .size vp9_convolve_avg_neon, .-vp9_convolve_avg_neon    @ ENDP
-
-    .section	.note.GNU-stack,"",%progbits
diff --git a/cpu_ref/convolve/convolve_copy_neon.s b/cpu_ref/convolve/convolve_copy_neon.s
deleted file mode 100644
index 60ada14..0000000
--- a/cpu_ref/convolve/convolve_copy_neon.s
+++ /dev/null
Binary files differ
diff --git a/cpu_ref/convolve/convolve_neon.c b/cpu_ref/convolve/convolve_neon.c
deleted file mode 100644
index 3d4bf30..0000000
--- a/cpu_ref/convolve/convolve_neon.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- *
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "vp9_common.h"
-#include "vp9_filter.h"
-#include <string.h>
-#include <stdio.h>
-
-extern void vp9_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-                                     uint8_t *dst, ptrdiff_t dst_stride,
-                                     const int16_t *filter_x, int x_step_q4,
-                                     const int16_t *filter_y, int y_step_q4,
-                                     int w, int h);
-
-extern void vp9_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const int16_t *filter_x, int x_step_q4,
-                                    const int16_t *filter_y, int y_step_q4,
-                                    int w, int h);
-
-extern void vp9_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-                                        uint8_t *dst, ptrdiff_t dst_stride,
-                                        const int16_t *filter_x, int x_step_q4,
-                                        const int16_t *filter_y, int y_step_q4,
-                                        int w, int h);
-extern void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
-                                int w, int h);
-
-extern void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4,
-                            int w, int h);
-
-void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
-                        uint8_t *dst, ptrdiff_t dst_stride,
-                        const int16_t *filter_x, int x_step_q4,
-                        const int16_t *filter_y, int y_step_q4,
-                        int w, int h) {
-    /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
-     * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
-     */
-    DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
-
-    // Account for the vertical phase needing 3 lines prior and 4 lines post
-    int intermediate_height = h + 7;
-
-    if (x_step_q4 != 16 || y_step_q4 != 16)
-        return vp9_convolve8_c(src, src_stride,
-                               dst, dst_stride,
-                               filter_x, x_step_q4,
-                               filter_y, y_step_q4,
-                               w, h);
-
-    /* Filter starting 3 lines back. The neon implementation will ignore the
-     * given height and filter a multiple of 4 lines. Since this goes in to
-     * the temp buffer which has lots of extra room and is subsequently discarded
-     * this is safe if somewhat less than ideal.
-     */
-    vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
-                             temp, 64, filter_x, x_step_q4,
-                             filter_y, y_step_q4,
-                             w, intermediate_height);
-
-    /* Step into the temp buffer 3 lines to get the actual frame data */
-    vp9_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
-}
-
-void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4,
-                            int w, int h) {
-    DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
-    int intermediate_height = h + 7;
-
-    if (x_step_q4 != 16 || y_step_q4 != 16)
-        return vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
-                                   filter_x, x_step_q4, filter_y, y_step_q4,
-                                   w, h);
-
-    /* This implementation has the same issues as above. In addition, we only want
-     * to average the values after both passes.
-     */
-    vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64,
-                             filter_x, x_step_q4, filter_y, y_step_q4,
-                             w, intermediate_height);
-    vp9_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride,
-                                filter_x, x_step_q4, filter_y, y_step_q4,
-                                w, h);
-}
diff --git a/cpu_ref/convolve/vp9_common.h b/cpu_ref/convolve/vp9_common.h
deleted file mode 100644
index 73a1021..0000000
--- a/cpu_ref/convolve/vp9_common.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- *
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef VP9_COMMON_VP9_COMMON_H_
-#define VP9_COMMON_VP9_COMMON_H_
-#include <stdint.h>
-
-#define DECLARE_ALIGNED_ARRAY(a,typ,val,n)\
-  typ val##_[(n)+(a)/sizeof(typ)+1];\
-  typ *val = (typ*)((((intptr_t)val##_)+(a)-1)&((intptr_t)-(a)))
-/* Interface header for common constant data structures and lookup tables */
-
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-
-#define ROUND_POWER_OF_TWO(value, n) \
-    (((value) + (1 << ((n) - 1))) >> (n))
-
-#define ALIGN_POWER_OF_TWO(value, n) \
-    (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
-
-// Only need this for fixed-size arrays, for structs just assign.
-#define vp9_copy(dest, src) {            \
-    memcpy(dest, src, sizeof(src));  \
-  }
-
-// Use this for variably-sized arrays.
-#define vp9_copy_array(dest, src, n) {       \
-    memcpy(dest, src, n * sizeof(*src)); \
-  }
-
-#define vp9_zero(dest) memset(&dest, 0, sizeof(dest))
-#define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest))
-
-static inline uint8_t clip_pixel(int val) {
-  return (val > 255) ? 255u : (val < 0) ? 0u : val;
-}
-
-static inline int clamp(int value, int low, int high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
-static inline double fclamp(double value, double low, double high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
-#define VP9_SYNC_CODE_0 0x49
-#define VP9_SYNC_CODE_1 0x83
-#define VP9_SYNC_CODE_2 0x42
-
-#define VP9_FRAME_MARKER 0x2
-
-
-#endif  // VP9_COMMON_VP9_COMMON_H_
diff --git a/cpu_ref/convolve/vp9_filter.h b/cpu_ref/convolve/vp9_filter.h
deleted file mode 100644
index 754578d..0000000
--- a/cpu_ref/convolve/vp9_filter.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- *
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef VP9_COMMON_VP9_FILTER_H_
-#define VP9_COMMON_VP9_FILTER_H_
-
-#define FILTER_BITS 7
-
-#define SUBPEL_BITS 4
-#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
-#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
-#define SUBPEL_TAPS 8
-
-typedef enum {
-    EIGHTTAP = 0,
-    EIGHTTAP_SMOOTH = 1,
-    EIGHTTAP_SHARP = 2,
-    BILINEAR = 3,
-    SWITCHABLE = 4  /* should be the last one */
-} INTERPOLATION_TYPE;
-
-typedef int16_t subpel_kernel[SUBPEL_TAPS];
-
-struct subpix_fn_table {
-    const subpel_kernel *filter_x;
-    const subpel_kernel *filter_y;
-};
-
-const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type);
-extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS];
-extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS];
-extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS];
-extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS];
-
-// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear
-// filter kernel as a 2 tap filter.
-#define BILINEAR_FILTERS_2TAP(x) \
-    (vp9_bilinear_filters[(x)] + SUBPEL_TAPS/2 - 1)
-
-#endif  // VP9_COMMON_VP9_FILTER_H_
diff --git a/cpu_ref/rsCpuConvolve.h b/cpu_ref/rsCpuConvolve.h
deleted file mode 100644
index d7d2d16..0000000
--- a/cpu_ref/rsCpuConvolve.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef RSD_CPU_CONVOLVE_NEON_H
-#define RSD_CPU_CONVOLVE_NEON_H
-
-#include <stdint.h>
-
-extern "C" {
-#if defined(ARCH_ARM_HAVE_VFP)
-void vp9_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4,
-                            int w, int h);
-
-void vp9_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4,
-                           int w, int h);
-
-void vp9_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4,
-                             int w, int h);
-
-void vp9_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x, int x_step_q4,
-                                 const int16_t *filter_y, int y_step_q4,
-                                 int w, int h);
-
-void vp9_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h);
-
-void vp9_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4,
-                                  int w, int h);
-void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
-                        uint8_t *dst, ptrdiff_t dst_stride,
-                        const int16_t *filter_x, int x_step_q4,
-                        const int16_t *filter_y, int y_step_q4,
-                        int w, int h);
-
-void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4,
-                            int w, int h);
-
-#else
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h);
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
-                        uint8_t *dst, ptrdiff_t dst_stride,
-                        const int16_t *filter_x, int x_step_q4,
-                        const int16_t *filter_y, int y_step_q4,
-                        int w, int h);
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const int16_t *filter_x, int x_step_q4,
-                          const int16_t *filter_y, int y_step_q4,
-                          int w, int h);
-
-void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h);
-
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4,
-                           int w, int h);
-
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h);
-
-void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
-                     uint8_t *dst, ptrdiff_t dst_stride,
-                     const int16_t *filter_x, int x_step_q4,
-                     const int16_t *filter_y, int y_step_q4,
-                     int w, int h);
-
-void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h);
-#endif
-}
-#endif
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 277836a..f2ce358 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -513,8 +513,6 @@
 
 extern RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
                                              const Script *s, const Element *e);
-extern RsdCpuScriptImpl * rsdIntrinsic_InterPred(RsdCpuReferenceImpl *ctx,
-                                                 const Script *s, const Element *e);
 extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx,
                                                    const Script *s, const Element *e);
 extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
@@ -544,11 +542,6 @@
     case RS_SCRIPT_INTRINSIC_ID_3DLUT:
         i = rsdIntrinsic_3DLUT(this, s, e);
         break;
-#ifndef RS_COMPATIBILITY_LIB
-    case RS_SCRIPT_INTRINSIC_ID_INTER_PRED:
-        i = rsdIntrinsic_InterPred(this, s, e);
-        break;
-#endif
     case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
         i = rsdIntrinsic_Convolve3x3(this, s, e);
         break;
diff --git a/cpu_ref/rsCpuIntrinsicInterPred.cpp b/cpu_ref/rsCpuIntrinsicInterPred.cpp
deleted file mode 100644
index 20e0f2e..0000000
--- a/cpu_ref/rsCpuIntrinsicInterPred.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-#include "rsCpuIntrinsicInterPred.h"
-
-void RsdCpuScriptIntrinsicInterPred::setGlobalObj(uint32_t slot,
-                                                  ObjectBase *data) {
-    Allocation *alloc = static_cast<Allocation *>(data);
-    if (slot == 0) mRef = (uint8_t *)alloc->mHal.state.userProvidedPtr;
-    if (slot == 1) mParam = (uint8_t *)alloc->mHal.state.userProvidedPtr;
-}
-
-void RsdCpuScriptIntrinsicInterPred::setGlobalVar(uint32_t slot,
-                                                  const void *data,
-                                                  size_t dataLength) {
-    mFriParamCount = ((int32_t *)data)[0];
-    mSecParamCount = ((int32_t *)data)[1];
-    mParamOffset   = ((int32_t *)data)[2];
-}
-
-void RsdCpuScriptIntrinsicInterPred::kernel(const RsForEachStubParamStruct *p,
-                                            uint32_t xstart, uint32_t xend,
-                                            uint32_t instep, uint32_t outstep) {
-    RsdCpuScriptIntrinsicInterPred *cp = (RsdCpuScriptIntrinsicInterPred *)p->usr;
-    cp->mCount++;
-    const int vp9_convolve_mode[2][2] = {{24, 16}, {8, 0}};
-    uint8_t *ref_base = cp->mRef;
-    INTER_PRED_PARAM *fri_param = (INTER_PRED_PARAM *)cp->mParam;
-    INTER_PRED_PARAM *sec_param = (INTER_PRED_PARAM *)(cp->mParam + cp->mParamOffset);
-    int32_t fri_count = cp->mFriParamCount;
-    int32_t sec_count = cp->mSecParamCount;
-    int mode_num;
-    uint8_t *src;
-    uint8_t *dst;
-    const int16_t *filter_x;
-    const int16_t *filter_y;
-    for (int i = 0; i < fri_count; i++) {
-
-        mode_num = vp9_convolve_mode[(fri_param[i].x_step_q4 == 16)]
-                                    [(fri_param[i].y_step_q4 == 16)];
-        src = ref_base + fri_param[i].src_mv;
-        dst = ref_base + fri_param[i].dst_mv;
-
-        filter_x = inter_pred_filters + fri_param[i].filter_x_mv;
-        filter_y = inter_pred_filters + fri_param[i].filter_y_mv;
-
-        cp->mSwitchConvolve[fri_param[i].pred_mode + mode_num](
-            src, fri_param[i].src_stride,
-            dst, fri_param[i].dst_stride,
-            filter_x, fri_param[i].x_step_q4,
-            filter_y, fri_param[i].y_step_q4,
-            fri_param[i].w, fri_param[i].h
-        );
-    }
-
-    for (int i = 0; i < sec_count; i++) {
-        mode_num = vp9_convolve_mode[(sec_param[i].x_step_q4 == 16)]
-                                    [(sec_param[i].y_step_q4 == 16)];
-        src = ref_base + sec_param[i].src_mv;
-        dst = ref_base + sec_param[i].dst_mv;
-
-        filter_x = inter_pred_filters + sec_param[i].filter_x_mv;
-        filter_y = inter_pred_filters + sec_param[i].filter_y_mv;
-
-        cp->mSwitchConvolve[sec_param[i].pred_mode + mode_num + 1](
-            src, sec_param[i].src_stride,
-            dst, sec_param[i].dst_stride,
-            filter_x, sec_param[i].x_step_q4,
-            filter_y, sec_param[i].y_step_q4,
-            sec_param[i].w, sec_param[i].h
-        );
-    }
-
-}
-
-RsdCpuScriptIntrinsicInterPred::RsdCpuScriptIntrinsicInterPred(RsdCpuReferenceImpl *ctx,
-                                                               const Script *s, const Element *e)
-            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_INTER_PRED) {
-    mRootPtr = &kernel;
-    mCount = 0;
-    mParamOffset = 0;
-    mFriParamCount = 0;
-    mSecParamCount = 0;
-    mRef = NULL;
-    mParam = NULL;
-
-#if defined(ARCH_ARM_HAVE_VFP)
-    mSwitchConvolve[0] = vp9_convolve_copy_neon;
-    mSwitchConvolve[1] = vp9_convolve_avg_neon;
-    mSwitchConvolve[2] = vp9_convolve8_vert_neon;
-    mSwitchConvolve[3] = vp9_convolve8_avg_vert_neon;
-    mSwitchConvolve[4] = vp9_convolve8_horiz_neon;
-    mSwitchConvolve[5] = vp9_convolve8_avg_horiz_neon;
-    mSwitchConvolve[6] = vp9_convolve8_neon;
-    mSwitchConvolve[7] = vp9_convolve8_avg_neon;
-
-    mSwitchConvolve[8] = vp9_convolve8_vert_neon;
-    mSwitchConvolve[9] = vp9_convolve8_avg_vert_neon;
-    mSwitchConvolve[10] = vp9_convolve8_vert_neon;
-    mSwitchConvolve[11] = vp9_convolve8_avg_vert_neon;
-    mSwitchConvolve[12] = vp9_convolve8_neon;
-    mSwitchConvolve[13] = vp9_convolve8_avg_neon;
-    mSwitchConvolve[14] = vp9_convolve8_neon;
-    mSwitchConvolve[15] = vp9_convolve8_avg_neon;
-
-    mSwitchConvolve[16] = vp9_convolve8_horiz_neon;
-    mSwitchConvolve[17] = vp9_convolve8_avg_horiz_neon;
-    mSwitchConvolve[18] = vp9_convolve8_neon;
-    mSwitchConvolve[19] = vp9_convolve8_avg_neon;
-    mSwitchConvolve[20] = vp9_convolve8_horiz_neon;
-    mSwitchConvolve[21] = vp9_convolve8_avg_horiz_neon;
-    mSwitchConvolve[22] = vp9_convolve8_neon;
-    mSwitchConvolve[23] = vp9_convolve8_avg_neon;
-
-    mSwitchConvolve[24] = vp9_convolve8_neon;
-    mSwitchConvolve[25] = vp9_convolve8_avg_neon;
-    mSwitchConvolve[26] = vp9_convolve8_neon;
-    mSwitchConvolve[27] = vp9_convolve8_avg_neon;
-    mSwitchConvolve[28] = vp9_convolve8_neon;
-    mSwitchConvolve[29] = vp9_convolve8_avg_neon;
-    mSwitchConvolve[30] = vp9_convolve8_neon;
-    mSwitchConvolve[31] = vp9_convolve8_avg_neon;
-#else
-    mSwitchConvolve[0] = vp9_convolve_copy_c;
-    mSwitchConvolve[1] = vp9_convolve_avg_c;
-    mSwitchConvolve[2] = vp9_convolve8_vert_c;
-    mSwitchConvolve[3] = vp9_convolve8_avg_vert_c;
-    mSwitchConvolve[4] = vp9_convolve8_horiz_c;
-    mSwitchConvolve[5] = vp9_convolve8_avg_horiz_c;
-    mSwitchConvolve[6] = vp9_convolve8_c;
-    mSwitchConvolve[7] = vp9_convolve8_avg_c;
-
-    mSwitchConvolve[8] = vp9_convolve8_vert_c;
-    mSwitchConvolve[9] = vp9_convolve8_avg_vert_c;
-    mSwitchConvolve[10] = vp9_convolve8_vert_c;
-    mSwitchConvolve[11] = vp9_convolve8_avg_vert_c;
-    mSwitchConvolve[12] = vp9_convolve8_c;
-    mSwitchConvolve[13] = vp9_convolve8_avg_c;
-    mSwitchConvolve[14] = vp9_convolve8_c;
-    mSwitchConvolve[15] = vp9_convolve8_avg_c;
-
-    mSwitchConvolve[16] = vp9_convolve8_horiz_c;
-    mSwitchConvolve[17] = vp9_convolve8_avg_horiz_c;
-    mSwitchConvolve[18] = vp9_convolve8_c;
-    mSwitchConvolve[19] = vp9_convolve8_avg_c;
-    mSwitchConvolve[20] = vp9_convolve8_horiz_c;
-    mSwitchConvolve[21] = vp9_convolve8_avg_horiz_c;
-    mSwitchConvolve[22] = vp9_convolve8_c;
-    mSwitchConvolve[23] = vp9_convolve8_avg_c;
-
-    mSwitchConvolve[24] = vp9_convolve8_c;
-    mSwitchConvolve[25] = vp9_convolve8_avg_c;
-    mSwitchConvolve[26] = vp9_convolve8_c;
-    mSwitchConvolve[27] = vp9_convolve8_avg_c;
-    mSwitchConvolve[28] = vp9_convolve8_c;
-    mSwitchConvolve[29] = vp9_convolve8_avg_c;
-    mSwitchConvolve[30] = vp9_convolve8_c;
-    mSwitchConvolve[31] = vp9_convolve8_avg_c;
-#endif
-}
-
-RsdCpuScriptIntrinsicInterPred::~RsdCpuScriptIntrinsicInterPred() {
-}
-
-void RsdCpuScriptIntrinsicInterPred::populateScript(Script *s) {
-    s->mHal.info.exportedVariableCount = 3;
-}
-
-void RsdCpuScriptIntrinsicInterPred::invokeFreeChildren() {
-}
-
-
-RsdCpuScriptImpl * rsdIntrinsic_InterPred(RsdCpuReferenceImpl *ctx,
-                                          const Script *s, const Element *e) {
-    return new RsdCpuScriptIntrinsicInterPred(ctx, s, e);
-}
diff --git a/cpu_ref/rsCpuIntrinsicInterPred.h b/cpu_ref/rsCpuIntrinsicInterPred.h
deleted file mode 100644
index 552f4eb..0000000
--- a/cpu_ref/rsCpuIntrinsicInterPred.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef RSD_CPU_SCRIPT_INTRINSIC_INTER_PRED_H
-#define RSD_CPU_SCRIPT_INTRINSIC_INTER_PRED_H
-
-#include "rsCpuIntrinsic.h"
-#include "rsCpuIntrinsicInlines.h"
-#include "rsCpuConvolve.h"
-
-using namespace android;
-using namespace android::renderscript;
-
-namespace android {
-namespace renderscript {
-
-typedef struct inter_pred_param {
-    int pred_mode;
-
-    int src_mv;
-    int src_stride;
-    int dst_mv;
-    int dst_stride;
-
-    int filter_x_mv;
-    int x_step_q4;
-    int filter_y_mv;
-    int y_step_q4;
-
-    int w;
-    int h;
-}INTER_PRED_PARAM;
-
-static const int16_t inter_pred_filters[512] = {
-    0, 0, 0, 128, 0, 0, 0, 0, 0, 1, -5, 126, 8, -3, 1, 0,
-    -1, 3, -10, 122, 18, -6, 2, 0, -1, 4, -13, 118, 27, -9, 3, -1,
-    -1, 4, -16, 112, 37, -11, 4, -1, -1, 5, -18, 105, 48, -14, 4, -1,
-    -1, 5, -19, 97, 58, -16, 5, -1, -1, 6, -19, 88, 68, -18, 5, -1,
-    -1, 6, -19, 78, 78, -19, 6, -1, -1, 5, -18, 68, 88, -19, 6, -1,
-    -1, 5, -16, 58, 97, -19, 5, -1, -1, 4, -14, 48, 105, -18, 5, -1,
-    -1, 4, -11, 37, 112, -16, 4, -1, -1, 3, -9, 27, 118, -13, 4, -1,
-    0, 2, -6, 18, 122, -10, 3, -1, 0, 1, -3, 8, 126, -5, 1, 0,
-    0, 0, 0, 128, 0, 0, 0, 0, -3, -1, 32, 64, 38, 1, -3, 0,
-    -2, -2, 29, 63, 41, 2, -3, 0, -2, -2, 26, 63, 43, 4, -4, 0,
-    -2, -3, 24, 62, 46, 5, -4, 0, -2, -3, 21, 60, 49, 7, -4, 0,
-    -1, -4, 18, 59, 51, 9, -4, 0, -1, -4, 16, 57, 53, 12, -4, -1,
-    -1, -4, 14, 55, 55, 14, -4, -1, -1, -4, 12, 53, 57, 16, -4, -1,
-    0, -4, 9, 51, 59, 18, -4, -1, 0, -4, 7, 49, 60, 21, -3, -2,
-    0, -4, 5, 46, 62, 24, -3, -2, 0, -4, 4, 43, 63, 26, -2, -2,
-    0, -3, 2, 41, 63, 29, -2, -2, 0, -3, 1, 38, 64, 32, -1, -3,
-    0, 0, 0, 128, 0, 0, 0, 0, -1, 3, -7, 127, 8, -3, 1, 0,
-    -2, 5, -13, 125, 17, -6, 3, -1, -3, 7, -17, 121, 27, -10, 5, -2,
-    -4, 9, -20, 115, 37, -13, 6, -2, -4, 10, -23, 108, 48, -16, 8, -3,
-    -4, 10, -24, 100, 59, -19, 9, -3, -4, 11, -24, 90, 70, -21, 10, -4,
-    -4, 11, -23, 80, 80, -23, 11, -4, -4, 10, -21, 70, 90, -24, 11, -4,
-    -3, 9, -19, 59, 100, -24, 10, -4, -3, 8, -16, 48, 108, -23, 10, -4,
-    -2, 6, -13, 37, 115, -20, 9, -4, -2, 5, -10, 27, 121, -17, 7, -3,
-    -1, 3, -6, 17, 125, -13, 5, -2, 0, 1, -3, 8, 127, -7, 3, -1,
-    0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 120, 8, 0, 0, 0,
-    0, 0, 0, 112, 16, 0, 0, 0, 0, 0, 0, 104, 24, 0, 0, 0,
-    0, 0, 0, 96, 32, 0, 0, 0, 0, 0, 0, 88, 40, 0, 0, 0,
-    0, 0, 0, 80, 48, 0, 0, 0, 0, 0, 0, 72, 56, 0, 0, 0,
-    0, 0, 0, 64, 64, 0, 0, 0, 0, 0, 0, 56, 72, 0, 0, 0,
-    0, 0, 0, 48, 80, 0, 0, 0, 0, 0, 0, 40, 88, 0, 0, 0,
-    0, 0, 0, 32, 96, 0, 0, 0, 0, 0, 0, 24, 104, 0, 0, 0,
-    0, 0, 0, 16, 112, 0, 0, 0, 0, 0, 0, 8, 120, 0, 0, 0
-};
-
-typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h);
-
-
-class RsdCpuScriptIntrinsicInterPred: public RsdCpuScriptIntrinsic {
-public:
-    virtual void populateScript(Script *);
-    virtual void invokeFreeChildren();
-
-    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
-    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
-    virtual ~RsdCpuScriptIntrinsicInterPred();
-    RsdCpuScriptIntrinsicInterPred(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
-
-protected:
-    uint8_t *mRef;
-    uint8_t *mParam;
-    int mFriParamCount;
-    int mSecParamCount;
-    int mParamOffset;
-    int mCount;
-    convolve_fn_t mSwitchConvolve[32];
-    static void kernel(const RsForEachStubParamStruct *p,
-                       uint32_t xstart, uint32_t xend,
-                       uint32_t instep, uint32_t outstep);
-};
-
-}
-}
-#endif
-
diff --git a/rsDefines.h b/rsDefines.h
index 5efb2f1..1731eb7 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -365,7 +365,7 @@
     RS_SCRIPT_INTRINSIC_ID_BLEND = 7,
     RS_SCRIPT_INTRINSIC_ID_3DLUT = 8,
     RS_SCRIPT_INTRINSIC_ID_HISTOGRAM = 9,
-    RS_SCRIPT_INTRINSIC_ID_INTER_PRED= 10,
+    // 10 is unused (formerly RS_SCRIPT_INTRINSIC_ID_INTER_PRED, removed)
     RS_SCRIPT_INTRINSIC_ID_LOOP_FILTER = 11,
     RS_SCRIPT_INTRINSIC_ID_RESIZE = 12
 };