Add image computing oriented builtin functions

The new built-in functions cover inverse DCT, ADST and Walsh transforms.

Change-Id: Ie7044da4a5becf2cc80d066a258211721a5939d4
diff --git a/driver/runtime/Android.mk b/driver/runtime/Android.mk
index ab8e33e..f67e14e 100755
--- a/driver/runtime/Android.mk
+++ b/driver/runtime/Android.mk
@@ -29,7 +29,12 @@
     rs_sampler.c \
     convert.ll \
     allocation.ll \
-    rsClamp.ll
+    rsClamp.ll \
+    rs_idct.c \
+    rs_dct.c \
+    rs_iadst.c  \
+    rs_fadst.c  \
+    rs_walsh.c
 
 clcore_files := \
     $(clcore_base_files) \
diff --git a/driver/runtime/rs_dct.c b/driver/runtime/rs_dct.c
new file mode 100644
index 0000000..8e8dbb0
--- /dev/null
+++ b/driver/runtime/rs_dct.c
@@ -0,0 +1,837 @@
+#include "rs_types.rsh"
+#include "rs_allocation.rsh"
+#include "rs_dct.h"
+
+// Drops DCT_CONST_BITS fractional bits from a fixed-point value, adding half first.
+static int fdct_round_shift(int input) {
+    return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+}
+
+static void fdct8(const int16_t *input, int16_t *output) {
+    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
+    /*needs32*/ int t0, t1, t2, t3;
+    /*canbe16*/ int x0, x1, x2, x3;
+    // 1-D 8-point forward DCT: reads input[0..7] and writes output[0..7].
+    // stage 1: sums (s0..s3) and differences (s4..s7) of mirrored input pairs
+    s0 = input[0] + input[7];
+    s1 = input[1] + input[6];
+    s2 = input[2] + input[5];
+    s3 = input[3] + input[4];
+    s4 = input[3] - input[4];
+    s5 = input[2] - input[5];
+    s6 = input[1] - input[6];
+    s7 = input[0] - input[7];
+
+    // fdct4(step, step): the four sums produce the even-indexed outputs
+    x0 = s0 + s3;
+    x1 = s1 + s2;
+    x2 = s1 - s2;
+    x3 = s0 - s3;
+    t0 = (x0 + x1) * cospi_16_64;
+    t1 = (x0 - x1) * cospi_16_64;
+    t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
+    t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
+    output[0] = fdct_round_shift(t0);
+    output[2] = fdct_round_shift(t2);
+    output[4] = fdct_round_shift(t1);
+    output[6] = fdct_round_shift(t3);
+
+    // Stage 2: t2/t3 are reused to hold the rounded (s6 -/+ s5) * cospi_16_64
+    t0 = (s6 - s5) * cospi_16_64;
+    t1 = (s6 + s5) * cospi_16_64;
+    t2 = fdct_round_shift(t0);
+    t3 = fdct_round_shift(t1);
+
+    // Stage 3: combine the remaining differences (s4, s7) with t2/t3
+    x0 = s4 + t2;
+    x1 = s4 - t2;
+    x2 = s7 - t3;
+    x3 = s7 + t3;
+
+    // Stage 4: the difference terms produce the odd-indexed outputs
+    t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+    t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+    t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+    output[1] = fdct_round_shift(t0);
+    output[3] = fdct_round_shift(t2);
+    output[5] = fdct_round_shift(t1);
+    output[7] = fdct_round_shift(t3);
+}
+
+// Rounding shift used by dct32_1d: adds half, then shifts right by DCT_CONST_BITS.
+static int dct_32_round(int input) {
+    return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+}
+
+// Shifts right by 2 after adding a bias of 1, or 2 when input is negative.
+static int half_round_shift(int input) {
+    return (input + (input < 0 ? 2 : 1)) >> 2;
+}
+
+static void dct32_1d(const int *input, int *output, int round) {
+    int step[32];
+    // Stage 1 of the 32-point 1-D forward DCT (input[0..31] -> output[0..31])
+    step[0] = input[0] + input[(32 - 1)];
+    step[1] = input[1] + input[(32 - 2)];
+    step[2] = input[2] + input[(32 - 3)];
+    step[3] = input[3] + input[(32 - 4)];
+    step[4] = input[4] + input[(32 - 5)];
+    step[5] = input[5] + input[(32 - 6)];
+    step[6] = input[6] + input[(32 - 7)];
+    step[7] = input[7] + input[(32 - 8)];
+    step[8] = input[8] + input[(32 - 9)];
+    step[9] = input[9] + input[(32 - 10)];
+    step[10] = input[10] + input[(32 - 11)];
+    step[11] = input[11] + input[(32 - 12)];
+    step[12] = input[12] + input[(32 - 13)];
+    step[13] = input[13] + input[(32 - 14)];
+    step[14] = input[14] + input[(32 - 15)];
+    step[15] = input[15] + input[(32 - 16)];
+    step[16] = -input[16] + input[(32 - 17)];
+    step[17] = -input[17] + input[(32 - 18)];
+    step[18] = -input[18] + input[(32 - 19)];
+    step[19] = -input[19] + input[(32 - 20)];
+    step[20] = -input[20] + input[(32 - 21)];
+    step[21] = -input[21] + input[(32 - 22)];
+    step[22] = -input[22] + input[(32 - 23)];
+    step[23] = -input[23] + input[(32 - 24)];
+    step[24] = -input[24] + input[(32 - 25)];
+    step[25] = -input[25] + input[(32 - 26)];
+    step[26] = -input[26] + input[(32 - 27)];
+    step[27] = -input[27] + input[(32 - 28)];
+    step[28] = -input[28] + input[(32 - 29)];
+    step[29] = -input[29] + input[(32 - 30)];
+    step[30] = -input[30] + input[(32 - 31)];
+    step[31] = -input[31] + input[(32 - 32)];
+
+    // Stage 2 (from here on output[] doubles as scratch between stages)
+    output[0] = step[0] + step[16 - 1];
+    output[1] = step[1] + step[16 - 2];
+    output[2] = step[2] + step[16 - 3];
+    output[3] = step[3] + step[16 - 4];
+    output[4] = step[4] + step[16 - 5];
+    output[5] = step[5] + step[16 - 6];
+    output[6] = step[6] + step[16 - 7];
+    output[7] = step[7] + step[16 - 8];
+    output[8] = -step[8] + step[16 - 9];
+    output[9] = -step[9] + step[16 - 10];
+    output[10] = -step[10] + step[16 - 11];
+    output[11] = -step[11] + step[16 - 12];
+    output[12] = -step[12] + step[16 - 13];
+    output[13] = -step[13] + step[16 - 14];
+    output[14] = -step[14] + step[16 - 15];
+    output[15] = -step[15] + step[16 - 16];
+
+    output[16] = step[16];
+    output[17] = step[17];
+    output[18] = step[18];
+    output[19] = step[19];
+
+    output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
+    output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
+    output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
+    output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
+
+    output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
+    output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
+    output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
+    output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
+
+    output[28] = step[28];
+    output[29] = step[29];
+    output[30] = step[30];
+    output[31] = step[31];
+
+    // When `round` is non-zero, damp the magnitude by 4 (half_round_shift),
+    // hence the intermediate values are within the range of 16 bits.
+    if (round) {
+        output[0] = half_round_shift(output[0]);
+        output[1] = half_round_shift(output[1]);
+        output[2] = half_round_shift(output[2]);
+        output[3] = half_round_shift(output[3]);
+        output[4] = half_round_shift(output[4]);
+        output[5] = half_round_shift(output[5]);
+        output[6] = half_round_shift(output[6]);
+        output[7] = half_round_shift(output[7]);
+        output[8] = half_round_shift(output[8]);
+        output[9] = half_round_shift(output[9]);
+        output[10] = half_round_shift(output[10]);
+        output[11] = half_round_shift(output[11]);
+        output[12] = half_round_shift(output[12]);
+        output[13] = half_round_shift(output[13]);
+        output[14] = half_round_shift(output[14]);
+        output[15] = half_round_shift(output[15]);
+
+        output[16] = half_round_shift(output[16]);
+        output[17] = half_round_shift(output[17]);
+        output[18] = half_round_shift(output[18]);
+        output[19] = half_round_shift(output[19]);
+        output[20] = half_round_shift(output[20]);
+        output[21] = half_round_shift(output[21]);
+        output[22] = half_round_shift(output[22]);
+        output[23] = half_round_shift(output[23]);
+        output[24] = half_round_shift(output[24]);
+        output[25] = half_round_shift(output[25]);
+        output[26] = half_round_shift(output[26]);
+        output[27] = half_round_shift(output[27]);
+        output[28] = half_round_shift(output[28]);
+        output[29] = half_round_shift(output[29]);
+        output[30] = half_round_shift(output[30]);
+        output[31] = half_round_shift(output[31]);
+    }
+
+    // Stage 3
+    step[0] = output[0] + output[(8 - 1)];
+    step[1] = output[1] + output[(8 - 2)];
+    step[2] = output[2] + output[(8 - 3)];
+    step[3] = output[3] + output[(8 - 4)];
+    step[4] = -output[4] + output[(8 - 5)];
+    step[5] = -output[5] + output[(8 - 6)];
+    step[6] = -output[6] + output[(8 - 7)];
+    step[7] = -output[7] + output[(8 - 8)];
+    step[8] = output[8];
+    step[9] = output[9];
+    step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
+    step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
+    step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
+    step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
+    step[14] = output[14];
+    step[15] = output[15];
+
+    step[16] = output[16] + output[23];
+    step[17] = output[17] + output[22];
+    step[18] = output[18] + output[21];
+    step[19] = output[19] + output[20];
+    step[20] = -output[20] + output[19];
+    step[21] = -output[21] + output[18];
+    step[22] = -output[22] + output[17];
+    step[23] = -output[23] + output[16];
+    step[24] = -output[24] + output[31];
+    step[25] = -output[25] + output[30];
+    step[26] = -output[26] + output[29];
+    step[27] = -output[27] + output[28];
+    step[28] = output[28] + output[27];
+    step[29] = output[29] + output[26];
+    step[30] = output[30] + output[25];
+    step[31] = output[31] + output[24];
+
+    // Stage 4
+    output[0] = step[0] + step[3];
+    output[1] = step[1] + step[2];
+    output[2] = -step[2] + step[1];
+    output[3] = -step[3] + step[0];
+    output[4] = step[4];
+    output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
+    output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
+    output[7] = step[7];
+    output[8] = step[8] + step[11];
+    output[9] = step[9] + step[10];
+    output[10] = -step[10] + step[9];
+    output[11] = -step[11] + step[8];
+    output[12] = -step[12] + step[15];
+    output[13] = -step[13] + step[14];
+    output[14] = step[14] + step[13];
+    output[15] = step[15] + step[12];
+
+    output[16] = step[16];
+    output[17] = step[17];
+    output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
+    output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
+    output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
+    output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
+    output[22] = step[22];
+    output[23] = step[23];
+    output[24] = step[24];
+    output[25] = step[25];
+    output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
+    output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
+    output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
+    output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
+    output[30] = step[30];
+    output[31] = step[31];
+
+    // Stage 5
+    step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
+    step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
+    step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
+    step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
+    step[4] = output[4] + output[5];
+    step[5] = -output[5] + output[4];
+    step[6] = -output[6] + output[7];
+    step[7] = output[7] + output[6];
+    step[8] = output[8];
+    step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
+    step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
+    step[11] = output[11];
+    step[12] = output[12];
+    step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
+    step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
+    step[15] = output[15];
+
+    step[16] = output[16] + output[19];
+    step[17] = output[17] + output[18];
+    step[18] = -output[18] + output[17];
+    step[19] = -output[19] + output[16];
+    step[20] = -output[20] + output[23];
+    step[21] = -output[21] + output[22];
+    step[22] = output[22] + output[21];
+    step[23] = output[23] + output[20];
+    step[24] = output[24] + output[27];
+    step[25] = output[25] + output[26];
+    step[26] = -output[26] + output[25];
+    step[27] = -output[27] + output[24];
+    step[28] = -output[28] + output[31];
+    step[29] = -output[29] + output[30];
+    step[30] = output[30] + output[29];
+    step[31] = output[31] + output[28];
+
+    // Stage 6
+    output[0] = step[0];
+    output[1] = step[1];
+    output[2] = step[2];
+    output[3] = step[3];
+    output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
+    output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
+    output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
+    output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
+    output[8] = step[8] + step[9];
+    output[9] = -step[9] + step[8];
+    output[10] = -step[10] + step[11];
+    output[11] = step[11] + step[10];
+    output[12] = step[12] + step[13];
+    output[13] = -step[13] + step[12];
+    output[14] = -step[14] + step[15];
+    output[15] = step[15] + step[14];
+
+    output[16] = step[16];
+    output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
+    output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
+    output[19] = step[19];
+    output[20] = step[20];
+    output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
+    output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
+    output[23] = step[23];
+    output[24] = step[24];
+    output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
+    output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
+    output[27] = step[27];
+    output[28] = step[28];
+    output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
+    output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
+    output[31] = step[31];
+
+    // Stage 7
+    step[0] = output[0];
+    step[1] = output[1];
+    step[2] = output[2];
+    step[3] = output[3];
+    step[4] = output[4];
+    step[5] = output[5];
+    step[6] = output[6];
+    step[7] = output[7];
+    step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
+    step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
+    step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
+    step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
+    step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
+    step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
+    step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
+    step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
+
+    step[16] = output[16] + output[17];
+    step[17] = -output[17] + output[16];
+    step[18] = -output[18] + output[19];
+    step[19] = output[19] + output[18];
+    step[20] = output[20] + output[21];
+    step[21] = -output[21] + output[20];
+    step[22] = -output[22] + output[23];
+    step[23] = output[23] + output[22];
+    step[24] = output[24] + output[25];
+    step[25] = -output[25] + output[24];
+    step[26] = -output[26] + output[27];
+    step[27] = output[27] + output[26];
+    step[28] = output[28] + output[29];
+    step[29] = -output[29] + output[28];
+    step[30] = -output[30] + output[31];
+    step[31] = output[31] + output[30];
+
+    // Final stage --- output indices are the bit-reversed step indices.
+    output[0]  = step[0];
+    output[16] = step[1];
+    output[8]  = step[2];
+    output[24] = step[3];
+    output[4]  = step[4];
+    output[20] = step[5];
+    output[12] = step[6];
+    output[28] = step[7];
+    output[2]  = step[8];
+    output[18] = step[9];
+    output[10] = step[10];
+    output[26] = step[11];
+    output[6]  = step[12];
+    output[22] = step[13];
+    output[14] = step[14];
+    output[30] = step[15];
+
+    output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
+    output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
+    output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
+    output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
+    output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
+    output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
+    output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
+    output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
+    output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
+    output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
+    output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
+    output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
+    output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
+    output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
+    output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
+    output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
+}
+
+extern void dct4x4(rs_allocation input, rs_allocation output, int xoff, int yoff) {
+    // Forward 4x4 DCT of the 4x4 block of shorts at (xoff, yoff) in `input`;
+    // the result is written to the same position in `output`.
+    // The 2D transform is done with two passes which are pretty similar: the
+    // first transforms the columns and transposes the results, the second
+    // does the same to the transposed data, which transforms the rows and
+    // puts the results back in normal/row positions.
+    int pass;
+    int i, j;
+    // We need an intermediate buffer between passes.
+    int16_t intermediate[4 * 4];
+    int16_t inptr[4 * 4];
+    int16_t outptr[4 * 4];
+    int16_t *in = inptr;
+    int16_t *output_1 = outptr;
+    int16_t *out = intermediate;
+    // Only `input` is loaded; outptr is fully overwritten by the second pass.
+    for (i = 0; i < 4; i++) {
+        for (j = 0; j < 4; j++) {
+            inptr[i + j * 4] = rsGetElementAt_short(input, xoff + i, yoff + j);
+        }
+    }
+
+    // Do the two transform/transpose passes
+    for (pass = 0; pass < 2; ++pass) {
+        /*canbe16*/ int input[4];
+        /*canbe16*/ int step[4];
+        /*needs32*/ int temp1, temp2;
+        int i;
+        for (i = 0; i < 4; ++i) {
+            // Load inputs (stride 4 walks down one column of `in`).
+            if (0 == pass) {
+                input[0] = in[0 * 4] * 16;
+                input[1] = in[1 * 4] * 16;
+                input[2] = in[2 * 4] * 16;
+                input[3] = in[3 * 4] * 16;
+                if (i == 0 && input[0]) {
+                    input[0] += 1;
+                }
+            } else {
+                input[0] = in[0 * 4];
+                input[1] = in[1 * 4];
+                input[2] = in[2 * 4];
+                input[3] = in[3 * 4];
+            }
+            // Transform.
+            step[0] = input[0] + input[3];
+            step[1] = input[1] + input[2];
+            step[2] = input[1] - input[2];
+            step[3] = input[0] - input[3];
+            temp1 = (step[0] + step[1]) * cospi_16_64;
+            temp2 = (step[0] - step[1]) * cospi_16_64;
+            out[0] = fdct_round_shift(temp1);
+            out[2] = fdct_round_shift(temp2);
+            temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+            temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+            out[1] = fdct_round_shift(temp1);
+            out[3] = fdct_round_shift(temp2);
+            // Do next column (which is a transposed row in second/horizontal pass)
+            in++;
+            out += 4;
+        }
+        // Setup in/out for next pass.
+        in = intermediate;
+        out = output_1;
+    }
+
+    for (i = 0; i < 4; i++) {
+        for (j = 0; j < 4; j++) {
+            rsSetElementAt_short(output, (output_1[i + j * 4] + 1) >> 2,
+                    xoff + i, yoff + j);
+        }
+    }
+}
+
+// Forward 8x8 DCT of the block at (xoff, yoff) in `input`, stored at the same position in `output`.
+extern void dct8x8(rs_allocation input, rs_allocation output, int xoff, int yoff) {
+    int i, j;
+    int16_t intermediate[64];
+    int16_t inptr[64];
+    int16_t outptr[64];  // fully written by the row pass, so it is not preloaded
+    int16_t *in = inptr;
+    int16_t *output_1 = outptr;
+    for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+            inptr[i + j * 8] = rsGetElementAt_short(input, xoff + i, yoff + j);
+        }
+    }
+
+    // Transform columns (same butterfly as fdct8, with inputs scaled by 4)
+    {
+        int16_t *output1 = intermediate;
+        /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
+        /*needs32*/ int t0, t1, t2, t3;
+        /*canbe16*/ int x0, x1, x2, x3;
+
+        int i;
+        for (i = 0; i < 8; i++) {
+            // stage 1
+            s0 = (in[0 * 8] + in[7 * 8]) * 4;
+            s1 = (in[1 * 8] + in[6 * 8]) * 4;
+            s2 = (in[2 * 8] + in[5 * 8]) * 4;
+            s3 = (in[3 * 8] + in[4 * 8]) * 4;
+            s4 = (in[3 * 8] - in[4 * 8]) * 4;
+            s5 = (in[2 * 8] - in[5 * 8]) * 4;
+            s6 = (in[1 * 8] - in[6 * 8]) * 4;
+            s7 = (in[0 * 8] - in[7 * 8]) * 4;
+
+            // fdct4(step, step);
+            x0 = s0 + s3;
+            x1 = s1 + s2;
+            x2 = s1 - s2;
+            x3 = s0 - s3;
+            t0 = (x0 + x1) * cospi_16_64;
+            t1 = (x0 - x1) * cospi_16_64;
+            t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
+            t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
+            output1[0 * 8] = fdct_round_shift(t0);
+            output1[2 * 8] = fdct_round_shift(t2);
+            output1[4 * 8] = fdct_round_shift(t1);
+            output1[6 * 8] = fdct_round_shift(t3);
+
+            // Stage 2
+            t0 = (s6 - s5) * cospi_16_64;
+            t1 = (s6 + s5) * cospi_16_64;
+            t2 = fdct_round_shift(t0);
+            t3 = fdct_round_shift(t1);
+
+            // Stage 3
+            x0 = s4 + t2;
+            x1 = s4 - t2;
+            x2 = s7 - t3;
+            x3 = s7 + t3;
+
+            // Stage 4
+            t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+            t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+            t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+            t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+            output1[1 * 8] = fdct_round_shift(t0);
+            output1[3 * 8] = fdct_round_shift(t2);
+            output1[5 * 8] = fdct_round_shift(t1);
+            output1[7 * 8] = fdct_round_shift(t3);
+            in++;
+            output1++;
+        }
+    }
+
+    // Rows: each call writes all 8 entries of one row of output_1
+    for (i = 0; i < 8; ++i) {
+        fdct8(&intermediate[i * 8], &output_1[i * 8]);
+    }
+
+    for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+            rsSetElementAt_short(output, output_1[i + j * 8] / 2, xoff + i, yoff + j);
+        }
+    }
+
+}
+
+extern void dct16x16(rs_allocation input, rs_allocation output, int xoff, int yoff) {
+    // Forward 16x16 DCT of the block of shorts at (xoff, yoff) in `input`; the
+    // result is written to the same position in `output`.
+    // The 2D transform is done with two passes which are pretty similar: the
+    // first transforms the columns and transposes the results, the second
+    // does the same to the transposed data, which transforms the rows and
+    // puts the results back in normal/row positions.
+    int pass;
+    int i, j;
+    // We need an intermediate buffer between passes.
+    int16_t intermediate[256];
+    int16_t *out = intermediate;
+    int16_t inptr[256];
+    int16_t outptr[256];
+    int16_t *in = inptr;
+    int16_t *output_1 = outptr;
+    // Only `input` is loaded; outptr is fully overwritten by the second pass.
+    for (i = 0; i < 16; i++) {
+        for (j = 0; j < 16; j++) {
+            inptr[i + j * 16] = rsGetElementAt_short(input, xoff + i, yoff + j);
+        }
+    }
+
+    // Do the two transform/transpose passes
+    for (pass = 0; pass < 2; ++pass) {
+        /*canbe16*/ int step1[8];
+        /*canbe16*/ int step2[8];
+        /*canbe16*/ int step3[8];
+        /*canbe16*/ int input[8];
+        /*needs32*/ int temp1, temp2;
+        int i;
+        for (i = 0; i < 16; i++) {
+            if (0 == pass) {
+                // Calculate input for the first 8 results.
+                input[0] = (in[0 * 16] + in[15 * 16]) * 4;
+                input[1] = (in[1 * 16] + in[14 * 16]) * 4;
+                input[2] = (in[2 * 16] + in[13 * 16]) * 4;
+                input[3] = (in[3 * 16] + in[12 * 16]) * 4;
+                input[4] = (in[4 * 16] + in[11 * 16]) * 4;
+                input[5] = (in[5 * 16] + in[10 * 16]) * 4;
+                input[6] = (in[6 * 16] + in[ 9 * 16]) * 4;
+                input[7] = (in[7 * 16] + in[ 8 * 16]) * 4;
+                // Calculate input for the next 8 results.
+                step1[0] = (in[7 * 16] - in[ 8 * 16]) * 4;
+                step1[1] = (in[6 * 16] - in[ 9 * 16]) * 4;
+                step1[2] = (in[5 * 16] - in[10 * 16]) * 4;
+                step1[3] = (in[4 * 16] - in[11 * 16]) * 4;
+                step1[4] = (in[3 * 16] - in[12 * 16]) * 4;
+                step1[5] = (in[2 * 16] - in[13 * 16]) * 4;
+                step1[6] = (in[1 * 16] - in[14 * 16]) * 4;
+                step1[7] = (in[0 * 16] - in[15 * 16]) * 4;
+            } else {
+                // Calculate input for the first 8 results ((x + 1) >> 2 of pass 1).
+                input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
+                input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
+                input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
+                input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
+                input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
+                input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
+                input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
+                input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
+                // Calculate input for the next 8 results.
+                step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
+                step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
+                step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
+                step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
+                step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
+                step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
+                step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
+                step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
+            }
+            // Work on the first eight values; fdct8(input, even_results);
+            {
+                /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
+                /*needs32*/ int t0, t1, t2, t3;
+                /*canbe16*/ int x0, x1, x2, x3;
+
+                // stage 1
+                s0 = input[0] + input[7];
+                s1 = input[1] + input[6];
+                s2 = input[2] + input[5];
+                s3 = input[3] + input[4];
+                s4 = input[3] - input[4];
+                s5 = input[2] - input[5];
+                s6 = input[1] - input[6];
+                s7 = input[0] - input[7];
+
+                // fdct4(step, step);
+                x0 = s0 + s3;
+                x1 = s1 + s2;
+                x2 = s1 - s2;
+                x3 = s0 - s3;
+                t0 = (x0 + x1) * cospi_16_64;
+                t1 = (x0 - x1) * cospi_16_64;
+                t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
+                t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+                out[0] = fdct_round_shift(t0);
+                out[4] = fdct_round_shift(t2);
+                out[8] = fdct_round_shift(t1);
+                out[12] = fdct_round_shift(t3);
+
+                // Stage 2
+                t0 = (s6 - s5) * cospi_16_64;
+                t1 = (s6 + s5) * cospi_16_64;
+                t2 = fdct_round_shift(t0);
+                t3 = fdct_round_shift(t1);
+
+                // Stage 3
+                x0 = s4 + t2;
+                x1 = s4 - t2;
+                x2 = s7 - t3;
+                x3 = s7 + t3;
+
+                // Stage 4
+                t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+                t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+                t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+                t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+                out[2] = fdct_round_shift(t0);
+                out[6] = fdct_round_shift(t2);
+                out[10] = fdct_round_shift(t1);
+                out[14] = fdct_round_shift(t3);
+            }
+            // Work on the next eight values; step1 -> odd_results
+            {
+                // step 2
+                temp1 = (step1[5] - step1[2]) * cospi_16_64;
+                temp2 = (step1[4] - step1[3]) * cospi_16_64;
+                step2[2] = fdct_round_shift(temp1);
+                step2[3] = fdct_round_shift(temp2);
+                temp1 = (step1[4] + step1[3]) * cospi_16_64;
+                temp2 = (step1[5] + step1[2]) * cospi_16_64;
+                step2[4] = fdct_round_shift(temp1);
+                step2[5] = fdct_round_shift(temp2);
+                // step 3
+                step3[0] = step1[0] + step2[3];
+                step3[1] = step1[1] + step2[2];
+                step3[2] = step1[1] - step2[2];
+                step3[3] = step1[0] - step2[3];
+                step3[4] = step1[7] - step2[4];
+                step3[5] = step1[6] - step2[5];
+                step3[6] = step1[6] + step2[5];
+                step3[7] = step1[7] + step2[4];
+                // step 4
+                temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
+                temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64;
+                step2[1] = fdct_round_shift(temp1);
+                step2[2] = fdct_round_shift(temp2);
+                temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
+                temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
+                step2[5] = fdct_round_shift(temp1);
+                step2[6] = fdct_round_shift(temp2);
+                // step 5
+                step1[0] = step3[0] + step2[1];
+                step1[1] = step3[0] - step2[1];
+                step1[2] = step3[3] - step2[2];
+                step1[3] = step3[3] + step2[2];
+                step1[4] = step3[4] + step2[5];
+                step1[5] = step3[4] - step2[5];
+                step1[6] = step3[7] - step2[6];
+                step1[7] = step3[7] + step2[6];
+                // step 6
+                temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
+                temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+                out[1] = fdct_round_shift(temp1);
+                out[9] = fdct_round_shift(temp2);
+                temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+                temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
+                out[5] = fdct_round_shift(temp1);
+                out[13] = fdct_round_shift(temp2);
+                temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
+                temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+                out[3] = fdct_round_shift(temp1);
+                out[11] = fdct_round_shift(temp2);
+                temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+                temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
+                out[7] = fdct_round_shift(temp1);
+                out[15] = fdct_round_shift(temp2);
+            }
+            // Do next column (which is a transposed row in second/horizontal pass)
+            in++;
+            out += 16;
+        }
+        // Setup in/out for next pass.
+        in = intermediate;
+        out = output_1;
+    }
+
+    for (i = 0; i < 16; i++) {
+        for (j = 0; j < 16; j++) {
+            rsSetElementAt_short(output, output_1[i + j * 16], xoff + i, yoff + j);
+        }
+    }
+}
+
+// Forward 32x32 DCT of the block at (xoff, yoff) in `input`, stored at the same position in `out`.
+extern void dct32x32(rs_allocation input, rs_allocation out, int xoff, int yoff) {
+    int i, j;
+    int output[32 * 32];
+    int16_t inptr[1024];
+    int16_t outptr[1024];  // fully written by the row pass, so it is not preloaded
+    int16_t *in = inptr;
+    int16_t *output_1 = outptr;
+    for (i = 0; i < 32; i++) {
+        for (j = 0; j < 32; j++) {
+            inptr[i + j * 32] = rsGetElementAt_short(input, xoff + i, yoff + j);
+        }
+    }
+
+    // Columns: inputs are scaled by 4, results shifted back with a bias of 2 when positive
+    for (i = 0; i < 32; ++i) {
+        int temp_in[32], temp_out[32];
+        for (j = 0; j < 32; ++j)
+            temp_in[j] = in[j * 32 + i] * 4;
+        dct32_1d(temp_in, temp_out, 0);
+        for (j = 0; j < 32; ++j)
+            output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+    }
+
+    // Rows: results shifted with a bias of 2 when negative
+    for (i = 0; i < 32; ++i) {
+        int temp_in[32], temp_out[32];
+        for (j = 0; j < 32; ++j)
+            temp_in[j] = output[j + i * 32];
+        dct32_1d(temp_in, temp_out, 0);
+        for (j = 0; j < 32; ++j)
+            output_1[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+    }
+
+    for (i = 0; i < 32; i++) {
+        for (j = 0; j < 32; j++) {
+            rsSetElementAt_short(out, output_1[i + j * 32], xoff + i, yoff + j);
+        }
+    }
+}
+
+// Variant of dct32x32 for the rate-distortion optimization loop: the row pass
+// calls dct32_1d with round = 1 and stores its results without a final shift.
+// Note that although we use dct_32_round in the dct32_1d computation flow,
+// this 2d fdct32x32 is operating within 16 bits precision.
+extern void dct32x32_rd(rs_allocation input, rs_allocation out, int xoff, int yoff) {
+    int i, j;
+    int output[32 * 32];
+    int16_t inptr[1024];
+    int16_t outptr[1024];  // fully written by the row pass, so it is not preloaded
+    int16_t *in = inptr;
+    int16_t *output_1 = outptr;
+    for (i = 0; i < 32; i++) {
+        for (j = 0; j < 32; j++) {
+            inptr[i + j * 32] = rsGetElementAt_short(input, xoff + i, yoff + j);
+        }
+    }
+
+    // Columns
+    for (i = 0; i < 32; ++i) {
+        int temp_in[32], temp_out[32];
+        for (j = 0; j < 32; ++j)
+            temp_in[j] = in[j * 32 + i] * 4;
+        dct32_1d(temp_in, temp_out, 0);
+        for (j = 0; j < 32; ++j)
+            // TODO(cd): see quality impact of only doing
+            //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
+            //           PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
+            output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+    }
+
+    // Rows
+    for (i = 0; i < 32; ++i) {
+        int temp_in[32], temp_out[32];
+        for (j = 0; j < 32; ++j)
+            temp_in[j] = output[j + i * 32];
+        dct32_1d(temp_in, temp_out, 1);
+        for (j = 0; j < 32; ++j)
+            output_1[j + i * 32] = temp_out[j];
+    }
+
+    for (i = 0; i < 32; i++) {
+        for (j = 0; j < 32; j++) {
+            rsSetElementAt_short(out, output_1[i + j * 32], xoff + i, yoff + j);
+        }
+    }
+}
diff --git a/driver/runtime/rs_dct.h b/driver/runtime/rs_dct.h
new file mode 100644
index 0000000..a337b76
--- /dev/null
+++ b/driver/runtime/rs_dct.h
@@ -0,0 +1,42 @@
+#ifndef _RS_DCT_H_
+#define _RS_DCT_H_
+
+#include "rs_dct.rsh"
+
+#define ROUND_POWER_OF_TWO(value, n) \
+    (((value) + (1 << ((n) - 1))) >> (n))  // adds 2^(n-1), then shifts right by n
+// Shift count that fdct_round_shift (rs_dct.c) passes to ROUND_POWER_OF_TWO.
+#define DCT_CONST_BITS 14
+// cospi_N_64 == round(cos(N * pi / 64) * 16384), i.e. scaled by 2^DCT_CONST_BITS.
+static const int cospi_1_64  = 16364;
+static const int cospi_2_64  = 16305;
+static const int cospi_3_64  = 16207;
+static const int cospi_4_64  = 16069;
+static const int cospi_5_64  = 15893;
+static const int cospi_6_64  = 15679;
+static const int cospi_7_64  = 15426;
+static const int cospi_8_64  = 15137;
+static const int cospi_9_64  = 14811;
+static const int cospi_10_64 = 14449;
+static const int cospi_11_64 = 14053;
+static const int cospi_12_64 = 13623;
+static const int cospi_13_64 = 13160;
+static const int cospi_14_64 = 12665;
+static const int cospi_15_64 = 12140;
+static const int cospi_16_64 = 11585;
+static const int cospi_17_64 = 11003;
+static const int cospi_18_64 = 10394;
+static const int cospi_19_64 = 9760;
+static const int cospi_20_64 = 9102;
+static const int cospi_21_64 = 8423;
+static const int cospi_22_64 = 7723;
+static const int cospi_23_64 = 7005;
+static const int cospi_24_64 = 6270;
+static const int cospi_25_64 = 5520;
+static const int cospi_26_64 = 4756;
+static const int cospi_27_64 = 3981;
+static const int cospi_28_64 = 3196;
+static const int cospi_29_64 = 2404;
+static const int cospi_30_64 = 1606;
+static const int cospi_31_64 = 804;
+#endif
diff --git a/driver/runtime/rs_fadst.c b/driver/runtime/rs_fadst.c
new file mode 100644
index 0000000..b3483b7
--- /dev/null
+++ b/driver/runtime/rs_fadst.c
@@ -0,0 +1,289 @@
+#include "rs_fadst.h"
+#include "rs_allocation.rsh"
+
+// 4-point ADST butterfly (the "f" variant; rounds via fdct_round_shift).
+// Reads four shorts from input[xoff .. xoff + 3] and writes the four results
+// to output[xoff .. xoff + 3].
+extern void fadst4(const rs_allocation input, rs_allocation output, int32_t xoff) {
+    int k;
+    // Fetch the four input samples.
+    const int in0 = rsGetElementAt_short(input, xoff);
+    const int in1 = rsGetElementAt_short(input, xoff + 1);
+    const int in2 = rsGetElementAt_short(input, xoff + 2);
+    const int in3 = rsGetElementAt_short(input, xoff + 3);
+
+    // All-zero input: emit zeros and skip the arithmetic.
+    if ((in0 | in1 | in2 | in3) == 0) {
+        for (k = 0; k < 4; ++k) {
+            rsSetElementAt_short(output, 0, xoff + k);
+        }
+        return;
+    }
+
+    // Sine-weighted products of the individual inputs.
+    const int a0 = sinpi_1_9 * in0;
+    const int a1 = sinpi_4_9 * in0;
+    const int a2 = sinpi_2_9 * in1;
+    const int a3 = sinpi_1_9 * in1;
+    const int a4 = sinpi_3_9 * in2;
+    const int a5 = sinpi_4_9 * in3;
+    const int a6 = sinpi_2_9 * in3;
+    const int mix = in0 + in1 - in3;
+
+    // Partial sums feeding the four outputs.
+    const int b0 = a0 + a2 + a5;
+    const int b1 = sinpi_3_9 * mix;
+    const int b2 = a1 - a3 + a6;
+    const int b3 = a4;
+
+    // Combine, then shift down by DCT_CONST_BITS with rounding.
+    rsSetElementAt_short(output, fdct_round_shift(b0 + b3), xoff);
+    rsSetElementAt_short(output, fdct_round_shift(b1), xoff + 1);
+    rsSetElementAt_short(output, fdct_round_shift(b2 - b3), xoff + 2);
+    rsSetElementAt_short(output, fdct_round_shift(b2 - b0 + b3), xoff + 3);
+}
+
+extern void fadst8(const rs_allocation input, rs_allocation output, int32_t xoff) {
+    int s0, s1, s2, s3, s4, s5, s6, s7;
+    int16_t outArr[8];
+    // 8-point ADST over shorts input[xoff .. xoff + 7]; note the permuted load order.
+    int x0 = rsGetElementAt_short(input, xoff + 7);
+    int x1 = rsGetElementAt_short(input, xoff);
+    int x2 = rsGetElementAt_short(input, xoff + 5);
+    int x3 = rsGetElementAt_short(input, xoff + 2);
+    int x4 = rsGetElementAt_short(input, xoff + 3);
+    int x5 = rsGetElementAt_short(input, xoff + 4);
+    int x6 = rsGetElementAt_short(input, xoff + 1);
+    int x7 = rsGetElementAt_short(input, xoff + 6);
+
+    // stage 1: cospi-weighted rotation of each input pair, then rounded sums/differences
+    s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+    s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+    s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+    s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+    s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+    s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+    s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+    s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+
+    x0 = fdct_round_shift(s0 + s4);
+    x1 = fdct_round_shift(s1 + s5);
+    x2 = fdct_round_shift(s2 + s6);
+    x3 = fdct_round_shift(s3 + s7);
+    x4 = fdct_round_shift(s0 - s4);
+    x5 = fdct_round_shift(s1 - s5);
+    x6 = fdct_round_shift(s2 - s6);
+    x7 = fdct_round_shift(s3 - s7);
+
+    // stage 2: x0..x3 pass through (plain add/subtract); x4..x7 rotate by cospi_8/cospi_24
+    s0 = x0;
+    s1 = x1;
+    s2 = x2;
+    s3 = x3;
+    s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
+    s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
+    s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
+    s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
+
+    x0 = s0 + s2;
+    x1 = s1 + s3;
+    x2 = s0 - s2;
+    x3 = s1 - s3;
+    x4 = fdct_round_shift(s4 + s6);
+    x5 = fdct_round_shift(s5 + s7);
+    x6 = fdct_round_shift(s4 - s6);
+    x7 = fdct_round_shift(s5 - s7);
+
+    // stage 3: scale the (2,3) and (6,7) sum/difference pairs by cospi_16_64 and round
+    s2 = cospi_16_64 * (x2 + x3);
+    s3 = cospi_16_64 * (x2 - x3);
+    s6 = cospi_16_64 * (x6 + x7);
+    s7 = cospi_16_64 * (x6 - x7);
+
+    x2 = fdct_round_shift(s2);
+    x3 = fdct_round_shift(s3);
+    x6 = fdct_round_shift(s6);
+    x7 = fdct_round_shift(s7);
+    // Reorder, negating the odd-indexed outputs; the int16_t stores narrow the ints.
+    outArr[0] =   x0;
+    outArr[1] = - x4;
+    outArr[2] =   x6;
+    outArr[3] = - x2;
+    outArr[4] =   x3;
+    outArr[5] = - x7;
+    outArr[6] =   x5;
+    outArr[7] = - x1;
+
+    int i;
+    for (i = 0; i < 8; ++i) {
+        rsSetElementAt_short(output, outArr[i], xoff + i);
+    }
+}
+
+extern void fadst16(const rs_allocation input, rs_allocation output, int32_t xoff) {
+    int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+    int16_t outArr[16];
+    // 16-point ADST over shorts input[xoff .. xoff + 15]; note the permuted load order.
+    int x0 = rsGetElementAt_short(input, xoff + 15);
+    int x1 = rsGetElementAt_short(input, xoff);
+    int x2 = rsGetElementAt_short(input, xoff + 13);
+    int x3 = rsGetElementAt_short(input, xoff + 2);
+    int x4 = rsGetElementAt_short(input, xoff + 11);
+    int x5 = rsGetElementAt_short(input, xoff + 4);
+    int x6 = rsGetElementAt_short(input, xoff + 9);
+    int x7 = rsGetElementAt_short(input, xoff + 6);
+    int x8 = rsGetElementAt_short(input, xoff + 7);
+    int x9 = rsGetElementAt_short(input, xoff + 8);
+    int x10 = rsGetElementAt_short(input, xoff + 5);
+    int x11 = rsGetElementAt_short(input, xoff + 10);
+    int x12 = rsGetElementAt_short(input, xoff + 3);
+    int x13 = rsGetElementAt_short(input, xoff + 12);
+    int x14 = rsGetElementAt_short(input, xoff + 1);
+    int x15 = rsGetElementAt_short(input, xoff + 14);
+
+    // stage 1: cospi-weighted rotation of each input pair, then rounded sums/differences
+    s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+    s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+    s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+    s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+    s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+    s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+    s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+    s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+    s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+    s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+    s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+    s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+    s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+    s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+    s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+    s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+    x0 = fdct_round_shift(s0 + s8);
+    x1 = fdct_round_shift(s1 + s9);
+    x2 = fdct_round_shift(s2 + s10);
+    x3 = fdct_round_shift(s3 + s11);
+    x4 = fdct_round_shift(s4 + s12);
+    x5 = fdct_round_shift(s5 + s13);
+    x6 = fdct_round_shift(s6 + s14);
+    x7 = fdct_round_shift(s7 + s15);
+    x8  = fdct_round_shift(s0 - s8);
+    x9  = fdct_round_shift(s1 - s9);
+    x10 = fdct_round_shift(s2 - s10);
+    x11 = fdct_round_shift(s3 - s11);
+    x12 = fdct_round_shift(s4 - s12);
+    x13 = fdct_round_shift(s5 - s13);
+    x14 = fdct_round_shift(s6 - s14);
+    x15 = fdct_round_shift(s7 - s15);
+
+    // stage 2: x0..x7 pass through (plain add/subtract); x8..x15 are rotated and rounded
+    s0 = x0;
+    s1 = x1;
+    s2 = x2;
+    s3 = x3;
+    s4 = x4;
+    s5 = x5;
+    s6 = x6;
+    s7 = x7;
+    s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
+    s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
+    s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
+    s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
+    s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+    s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
+    s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+    s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
+
+    x0 = s0 + s4;
+    x1 = s1 + s5;
+    x2 = s2 + s6;
+    x3 = s3 + s7;
+    x4 = s0 - s4;
+    x5 = s1 - s5;
+    x6 = s2 - s6;
+    x7 = s3 - s7;
+    x8 = fdct_round_shift(s8 + s12);
+    x9 = fdct_round_shift(s9 + s13);
+    x10 = fdct_round_shift(s10 + s14);
+    x11 = fdct_round_shift(s11 + s15);
+    x12 = fdct_round_shift(s8 - s12);
+    x13 = fdct_round_shift(s9 - s13);
+    x14 = fdct_round_shift(s10 - s14);
+    x15 = fdct_round_shift(s11 - s15);
+
+    // stage 3: x4..x7 and x12..x15 rotate by cospi_8/cospi_24; the rest pass through
+    s0 = x0;
+    s1 = x1;
+    s2 = x2;
+    s3 = x3;
+    s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
+    s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+    s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+    s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
+    s8 = x8;
+    s9 = x9;
+    s10 = x10;
+    s11 = x11;
+    s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
+    s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+    s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+    s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
+
+    x0 = s0 + s2;
+    x1 = s1 + s3;
+    x2 = s0 - s2;
+    x3 = s1 - s3;
+    x4 = fdct_round_shift(s4 + s6);
+    x5 = fdct_round_shift(s5 + s7);
+    x6 = fdct_round_shift(s4 - s6);
+    x7 = fdct_round_shift(s5 - s7);
+    x8 = s8 + s10;
+    x9 = s9 + s11;
+    x10 = s8 - s10;
+    x11 = s9 - s11;
+    x12 = fdct_round_shift(s12 + s14);
+    x13 = fdct_round_shift(s13 + s15);
+    x14 = fdct_round_shift(s12 - s14);
+    x15 = fdct_round_shift(s13 - s15);
+
+    // stage 4: scale pairs (2,3), (6,7), (10,11), (14,15) by +/-cospi_16_64 and round
+    s2 = (- cospi_16_64) * (x2 + x3);
+    s3 = cospi_16_64 * (x2 - x3);
+    s6 = cospi_16_64 * (x6 + x7);
+    s7 = cospi_16_64 * (- x6 + x7);
+    s10 = cospi_16_64 * (x10 + x11);
+    s11 = cospi_16_64 * (- x10 + x11);
+    s14 = (- cospi_16_64) * (x14 + x15);
+    s15 = cospi_16_64 * (x14 - x15);
+
+    x2 = fdct_round_shift(s2);
+    x3 = fdct_round_shift(s3);
+    x6 = fdct_round_shift(s6);
+    x7 = fdct_round_shift(s7);
+    x10 = fdct_round_shift(s10);
+    x11 = fdct_round_shift(s11);
+    x14 = fdct_round_shift(s14);
+    x15 = fdct_round_shift(s15);
+    // Reorder; entries 1, 3, 13 and 15 are negated.  The int16_t stores narrow the ints.
+    outArr[0] = x0;
+    outArr[1] = - x8;
+    outArr[2] = x12;
+    outArr[3] = - x4;
+    outArr[4] = x6;
+    outArr[5] = x14;
+    outArr[6] = x10;
+    outArr[7] = x2;
+    outArr[8] = x3;
+    outArr[9] =  x11;
+    outArr[10] = x15;
+    outArr[11] = x7;
+    outArr[12] = x5;
+    outArr[13] = - x13;
+    outArr[14] = x9;
+    outArr[15] = - x1;
+
+    int i;
+    for (i = 0; i < 16; ++i) {
+        rsSetElementAt_short(output, outArr[i], xoff + i);
+    }
+}
diff --git a/driver/runtime/rs_fadst.h b/driver/runtime/rs_fadst.h
new file mode 100644
index 0000000..edff2ee
--- /dev/null
+++ b/driver/runtime/rs_fadst.h
@@ -0,0 +1,53 @@
+#ifndef _RS_FADST_H_
+#define _RS_FADST_H_
+
+#include "rs_types.rsh"
+
+#define DCT_CONST_BITS 14  // shift count used by fdct_round_shift below
+// cospi_N_64 == round(cos(N * pi / 64) * 16384), i.e. scaled by 2^DCT_CONST_BITS.
+static const int cospi_1_64  = 16364;
+static const int cospi_2_64  = 16305;
+static const int cospi_3_64  = 16207;
+static const int cospi_4_64  = 16069;
+static const int cospi_5_64  = 15893;
+static const int cospi_6_64  = 15679;
+static const int cospi_7_64  = 15426;
+static const int cospi_8_64  = 15137;
+static const int cospi_9_64  = 14811;
+static const int cospi_10_64 = 14449;
+static const int cospi_11_64 = 14053;
+static const int cospi_12_64 = 13623;
+static const int cospi_13_64 = 13160;
+static const int cospi_14_64 = 12665;
+static const int cospi_15_64 = 12140;
+static const int cospi_16_64 = 11585;
+static const int cospi_17_64 = 11003;
+static const int cospi_18_64 = 10394;
+static const int cospi_19_64 = 9760;
+static const int cospi_20_64 = 9102;
+static const int cospi_21_64 = 8423;
+static const int cospi_22_64 = 7723;
+static const int cospi_23_64 = 7005;
+static const int cospi_24_64 = 6270;
+static const int cospi_25_64 = 5520;
+static const int cospi_26_64 = 4756;
+static const int cospi_27_64 = 3981;
+static const int cospi_28_64 = 3196;
+static const int cospi_29_64 = 2404;
+static const int cospi_30_64 = 1606;
+static const int cospi_31_64 = 804;
+// sinpi_k_9 == round(sin(k * pi / 9) * sqrt(2) * 2 / 3 * 16384).
+static const int sinpi_1_9 = 5283;
+static const int sinpi_2_9 = 9929;
+static const int sinpi_3_9 = 13377;
+static const int sinpi_4_9 = 15212;
+// Rounded right shift: adds 2^(n-1) to `value` before shifting it right by n bits.
+#define ROUND_POWER_OF_TWO(value, n) \
+    (((value) + (1 << ((n) - 1))) >> (n))
+
+// Rounded right shift of `input` by DCT_CONST_BITS (half the divisor is added first).
+static int fdct_round_shift(int input) {
+    return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+}
+
+#endif
diff --git a/driver/runtime/rs_iadst.c b/driver/runtime/rs_iadst.c
new file mode 100644
index 0000000..ee1d0cf
--- /dev/null
+++ b/driver/runtime/rs_iadst.c
@@ -0,0 +1,308 @@
+#include "rs_iadst.h"
+#include "rs_allocation.rsh"
+
+// 4-point ADST butterfly (the "i" variant; rounds via dct_const_round_shift).
+// Reads shorts input[xoff .. xoff + 3] and writes output[xoff .. xoff + 3].
+extern void iadst4(const rs_allocation input, rs_allocation output, int32_t xoff) {
+    int k;
+    // Fetch the four coefficients.
+    const int c0 = rsGetElementAt_short(input, xoff);
+    const int c1 = rsGetElementAt_short(input, xoff + 1);
+    const int c2 = rsGetElementAt_short(input, xoff + 2);
+    const int c3 = rsGetElementAt_short(input, xoff + 3);
+
+    // All-zero input: emit zeros and leave early.
+    if ((c0 | c1 | c2 | c3) == 0) {
+        for (k = 0; k < 4; ++k) {
+            rsSetElementAt_short(output, 0, xoff + k);
+        }
+        return;
+    }
+
+    // Sine-weighted products of the individual coefficients.
+    const int p0 = sinpi_1_9 * c0;
+    const int p1 = sinpi_2_9 * c0;
+    const int p2 = sinpi_3_9 * c1;
+    const int p3 = sinpi_4_9 * c2;
+    const int p4 = sinpi_1_9 * c2;
+    const int p5 = sinpi_2_9 * c3;
+    const int p6 = sinpi_4_9 * c3;
+    const int mix = c0 - c2 + c3;
+
+    // Partial sums shared by the outputs.
+    const int q0 = p0 + p3 + p5;
+    const int q1 = p1 - p4 - p6;
+    const int q2 = sinpi_3_9 * mix;
+    const int q3 = p2;
+
+    // Combine, then shift down by DCT_CONST_BITS with rounding.
+    rsSetElementAt_short(output, dct_const_round_shift(q0 + q3), xoff);
+    rsSetElementAt_short(output, dct_const_round_shift(q1 + q3), xoff + 1);
+    rsSetElementAt_short(output, dct_const_round_shift(q2), xoff + 2);
+    rsSetElementAt_short(output, dct_const_round_shift(q0 + q1 - q3), xoff + 3);
+}
+
+extern void iadst8(const rs_allocation input, rs_allocation output, int32_t xoff) {
+    int s0, s1, s2, s3, s4, s5, s6, s7;
+    int16_t outArr[8];
+    // 8-point ADST over shorts input[xoff .. xoff + 7]; note the permuted load order.
+    int x0 = rsGetElementAt_short(input, xoff + 7);
+    int x1 = rsGetElementAt_short(input, xoff);
+    int x2 = rsGetElementAt_short(input, xoff + 5);
+    int x3 = rsGetElementAt_short(input, xoff + 2);
+    int x4 = rsGetElementAt_short(input, xoff + 3);
+    int x5 = rsGetElementAt_short(input, xoff + 4);
+    int x6 = rsGetElementAt_short(input, xoff + 1);
+    int x7 = rsGetElementAt_short(input, xoff + 6);
+    // All-zero input: write eight zeros and return without doing the arithmetic.
+    if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+        rsSetElementAt_short(output, 0, xoff);
+        rsSetElementAt_short(output, 0, xoff + 1);
+        rsSetElementAt_short(output, 0, xoff + 2);
+        rsSetElementAt_short(output, 0, xoff + 3);
+        rsSetElementAt_short(output, 0, xoff + 4);
+        rsSetElementAt_short(output, 0, xoff + 5);
+        rsSetElementAt_short(output, 0, xoff + 6);
+        rsSetElementAt_short(output, 0, xoff + 7);
+        return;
+    }
+
+    // stage 1: cospi-weighted rotation of each input pair, then rounded sums/differences
+    s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+    s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+    s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+    s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+    s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+    s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+    s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+    s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+
+    x0 = dct_const_round_shift(s0 + s4);
+    x1 = dct_const_round_shift(s1 + s5);
+    x2 = dct_const_round_shift(s2 + s6);
+    x3 = dct_const_round_shift(s3 + s7);
+    x4 = dct_const_round_shift(s0 - s4);
+    x5 = dct_const_round_shift(s1 - s5);
+    x6 = dct_const_round_shift(s2 - s6);
+    x7 = dct_const_round_shift(s3 - s7);
+
+    // stage 2: x0..x3 pass through (plain add/subtract); x4..x7 rotate by cospi_8/cospi_24
+    s0 = x0;
+    s1 = x1;
+    s2 = x2;
+    s3 = x3;
+    s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
+    s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
+    s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
+    s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
+
+    x0 = s0 + s2;
+    x1 = s1 + s3;
+    x2 = s0 - s2;
+    x3 = s1 - s3;
+    x4 = dct_const_round_shift(s4 + s6);
+    x5 = dct_const_round_shift(s5 + s7);
+    x6 = dct_const_round_shift(s4 - s6);
+    x7 = dct_const_round_shift(s5 - s7);
+
+    // stage 3: scale the (2,3) and (6,7) sum/difference pairs by cospi_16_64 and round
+    s2 = cospi_16_64 * (x2 + x3);
+    s3 = cospi_16_64 * (x2 - x3);
+    s6 = cospi_16_64 * (x6 + x7);
+    s7 = cospi_16_64 * (x6 - x7);
+
+    x2 = dct_const_round_shift(s2);
+    x3 = dct_const_round_shift(s3);
+    x6 = dct_const_round_shift(s6);
+    x7 = dct_const_round_shift(s7);
+    // Reorder, negating the odd-indexed outputs; the int16_t stores narrow the ints.
+    outArr[0] =  x0;
+    outArr[1] = -x4;
+    outArr[2] =  x6;
+    outArr[3] = -x2;
+    outArr[4] =  x3;
+    outArr[5] = -x7;
+    outArr[6] =  x5;
+    outArr[7] = -x1;
+
+    int i;
+    for (i = 0; i < 8; ++i) {
+        rsSetElementAt_short(output, outArr[i], xoff + i);
+    }
+}
+
+extern void iadst16(const rs_allocation input, rs_allocation output, int32_t xoff) {
+    int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+    int16_t outArr[16];
+    int i;
+    // 16-point ADST over shorts input[xoff .. xoff + 15]; note the permuted load order.
+    int x0 = rsGetElementAt_short(input, xoff + 15);
+    int x1 = rsGetElementAt_short(input, xoff);
+    int x2 = rsGetElementAt_short(input, xoff + 13);
+    int x3 = rsGetElementAt_short(input, xoff + 2);
+    int x4 = rsGetElementAt_short(input, xoff + 11);
+    int x5 = rsGetElementAt_short(input, xoff + 4);
+    int x6 = rsGetElementAt_short(input, xoff + 9);
+    int x7 = rsGetElementAt_short(input, xoff + 6);
+    int x8 = rsGetElementAt_short(input, xoff + 7);
+    int x9 = rsGetElementAt_short(input, xoff + 8);
+    int x10 = rsGetElementAt_short(input, xoff + 5);
+    int x11 = rsGetElementAt_short(input, xoff + 10);
+    int x12 = rsGetElementAt_short(input, xoff + 3);
+    int x13 = rsGetElementAt_short(input, xoff + 12);
+    int x14 = rsGetElementAt_short(input, xoff + 1);
+    int x15 = rsGetElementAt_short(input, xoff + 14);
+    // All-zero input: write sixteen zeros and return without doing the arithmetic.
+    if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+            | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+        for (i = 0; i < 16; ++i) {
+            rsSetElementAt_short(output, 0, xoff + i);
+        }
+        return;
+    }
+
+    // stage 1: cospi-weighted rotation of each input pair, then rounded sums/differences
+    s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+    s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+    s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+    s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+    s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+    s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+    s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+    s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+    s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+    s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+    s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+    s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+    s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+    s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+    s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+    s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+    x0 = dct_const_round_shift(s0 + s8);
+    x1 = dct_const_round_shift(s1 + s9);
+    x2 = dct_const_round_shift(s2 + s10);
+    x3 = dct_const_round_shift(s3 + s11);
+    x4 = dct_const_round_shift(s4 + s12);
+    x5 = dct_const_round_shift(s5 + s13);
+    x6 = dct_const_round_shift(s6 + s14);
+    x7 = dct_const_round_shift(s7 + s15);
+    x8  = dct_const_round_shift(s0 - s8);
+    x9  = dct_const_round_shift(s1 - s9);
+    x10 = dct_const_round_shift(s2 - s10);
+    x11 = dct_const_round_shift(s3 - s11);
+    x12 = dct_const_round_shift(s4 - s12);
+    x13 = dct_const_round_shift(s5 - s13);
+    x14 = dct_const_round_shift(s6 - s14);
+    x15 = dct_const_round_shift(s7 - s15);
+
+    // stage 2: x0..x7 pass through (plain add/subtract); x8..x15 are rotated and rounded
+    s0 = x0;
+    s1 = x1;
+    s2 = x2;
+    s3 = x3;
+    s4 = x4;
+    s5 = x5;
+    s6 = x6;
+    s7 = x7;
+    s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
+    s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
+    s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
+    s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
+    s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+    s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
+    s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+    s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
+
+    x0 = s0 + s4;
+    x1 = s1 + s5;
+    x2 = s2 + s6;
+    x3 = s3 + s7;
+    x4 = s0 - s4;
+    x5 = s1 - s5;
+    x6 = s2 - s6;
+    x7 = s3 - s7;
+    x8 = dct_const_round_shift(s8 + s12);
+    x9 = dct_const_round_shift(s9 + s13);
+    x10 = dct_const_round_shift(s10 + s14);
+    x11 = dct_const_round_shift(s11 + s15);
+    x12 = dct_const_round_shift(s8 - s12);
+    x13 = dct_const_round_shift(s9 - s13);
+    x14 = dct_const_round_shift(s10 - s14);
+    x15 = dct_const_round_shift(s11 - s15);
+
+    // stage 3: x4..x7 and x12..x15 rotate by cospi_8/cospi_24; the rest pass through
+    s0 = x0;
+    s1 = x1;
+    s2 = x2;
+    s3 = x3;
+    s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
+    s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+    s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+    s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
+    s8 = x8;
+    s9 = x9;
+    s10 = x10;
+    s11 = x11;
+    s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
+    s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+    s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+    s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
+
+    x0 = s0 + s2;
+    x1 = s1 + s3;
+    x2 = s0 - s2;
+    x3 = s1 - s3;
+    x4 = dct_const_round_shift(s4 + s6);
+    x5 = dct_const_round_shift(s5 + s7);
+    x6 = dct_const_round_shift(s4 - s6);
+    x7 = dct_const_round_shift(s5 - s7);
+    x8 = s8 + s10;
+    x9 = s9 + s11;
+    x10 = s8 - s10;
+    x11 = s9 - s11;
+    x12 = dct_const_round_shift(s12 + s14);
+    x13 = dct_const_round_shift(s13 + s15);
+    x14 = dct_const_round_shift(s12 - s14);
+    x15 = dct_const_round_shift(s13 - s15);
+
+    // stage 4: scale pairs (2,3), (6,7), (10,11), (14,15) by +/-cospi_16_64 and round
+    s2 = (- cospi_16_64) * (x2 + x3);
+    s3 = cospi_16_64 * (x2 - x3);
+    s6 = cospi_16_64 * (x6 + x7);
+    s7 = cospi_16_64 * (- x6 + x7);
+    s10 = cospi_16_64 * (x10 + x11);
+    s11 = cospi_16_64 * (- x10 + x11);
+    s14 = (- cospi_16_64) * (x14 + x15);
+    s15 = cospi_16_64 * (x14 - x15);
+
+    x2 = dct_const_round_shift(s2);
+    x3 = dct_const_round_shift(s3);
+    x6 = dct_const_round_shift(s6);
+    x7 = dct_const_round_shift(s7);
+    x10 = dct_const_round_shift(s10);
+    x11 = dct_const_round_shift(s11);
+    x14 = dct_const_round_shift(s14);
+    x15 = dct_const_round_shift(s15);
+    // Reorder; entries 1, 3, 13 and 15 are negated.  The int16_t stores narrow the ints.
+    outArr[0] =  x0;
+    outArr[1] = -x8;
+    outArr[2] =  x12;
+    outArr[3] = -x4;
+    outArr[4] =  x6;
+    outArr[5] =  x14;
+    outArr[6] =  x10;
+    outArr[7] =  x2;
+    outArr[8] =  x3;
+    outArr[9] =  x11;
+    outArr[10] =  x15;
+    outArr[11] =  x7;
+    outArr[12] =  x5;
+    outArr[13] = -x13;
+    outArr[14] =  x9;
+    outArr[15] = -x1;
+
+    for (i = 0; i < 16; ++i) {
+        rsSetElementAt_short(output, outArr[i], xoff + i);
+    }
+}
diff --git a/driver/runtime/rs_iadst.h b/driver/runtime/rs_iadst.h
new file mode 100644
index 0000000..3f9fd1c
--- /dev/null
+++ b/driver/runtime/rs_iadst.h
@@ -0,0 +1,53 @@
+#ifndef _RS_IADST_H_
+#define _RS_IADST_H_
+
+#include "rs_types.rsh"
+
+#define DCT_CONST_BITS 14  // shift count used by dct_const_round_shift below
+// cospi_N_64 == round(cos(N * pi / 64) * 16384), i.e. scaled by 2^DCT_CONST_BITS.
+static const int cospi_1_64  = 16364;
+static const int cospi_2_64  = 16305;
+static const int cospi_3_64  = 16207;
+static const int cospi_4_64  = 16069;
+static const int cospi_5_64  = 15893;
+static const int cospi_6_64  = 15679;
+static const int cospi_7_64  = 15426;
+static const int cospi_8_64  = 15137;
+static const int cospi_9_64  = 14811;
+static const int cospi_10_64 = 14449;
+static const int cospi_11_64 = 14053;
+static const int cospi_12_64 = 13623;
+static const int cospi_13_64 = 13160;
+static const int cospi_14_64 = 12665;
+static const int cospi_15_64 = 12140;
+static const int cospi_16_64 = 11585;
+static const int cospi_17_64 = 11003;
+static const int cospi_18_64 = 10394;
+static const int cospi_19_64 = 9760;
+static const int cospi_20_64 = 9102;
+static const int cospi_21_64 = 8423;
+static const int cospi_22_64 = 7723;
+static const int cospi_23_64 = 7005;
+static const int cospi_24_64 = 6270;
+static const int cospi_25_64 = 5520;
+static const int cospi_26_64 = 4756;
+static const int cospi_27_64 = 3981;
+static const int cospi_28_64 = 3196;
+static const int cospi_29_64 = 2404;
+static const int cospi_30_64 = 1606;
+static const int cospi_31_64 = 804;
+// sinpi_k_9 == round(sin(k * pi / 9) * sqrt(2) * 2 / 3 * 16384).
+static const int sinpi_1_9 = 5283;
+static const int sinpi_2_9 = 9929;
+static const int sinpi_3_9 = 13377;
+static const int sinpi_4_9 = 15212;
+// Rounded right shift: adds 2^(n-1) to `value` before shifting it right by n bits.
+#define ROUND_POWER_OF_TWO(value, n) \
+    (((value) + (1 << ((n) - 1))) >> (n))
+
+// Rounded right shift by DCT_CONST_BITS, narrowed through int16_t before returning.
+static int dct_const_round_shift(int input) {
+    return (int16_t)ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+}
+
+#endif
diff --git a/driver/runtime/rs_idct.c b/driver/runtime/rs_idct.c
new file mode 100644
index 0000000..575599f
--- /dev/null
+++ b/driver/runtime/rs_idct.c
@@ -0,0 +1,971 @@
+#include "rs_idct.h"
+#include "rs_allocation.rsh"
+#include <string.h>
+
+// 4-point 1-D inverse DCT.  Every input is consumed before any output is
+// written, so `output` may alias `input`.
+static void idct4_1d(const int16_t *input, int16_t *output) {
+    // stage 1: pair (0, 2) is scaled by cospi_16_64, pair (1, 3) is rotated
+    const int16_t even_sum =
+        dct_const_round_shift((input[0] + input[2]) * cospi_16_64);
+    const int16_t even_diff =
+        dct_const_round_shift((input[0] - input[2]) * cospi_16_64);
+    const int16_t odd_lo =
+        dct_const_round_shift(input[1] * cospi_24_64 - input[3] * cospi_8_64);
+    const int16_t odd_hi =
+        dct_const_round_shift(input[1] * cospi_8_64 + input[3] * cospi_24_64);
+
+    // stage 2: butterfly
+    output[0] = even_sum + odd_hi;
+    output[1] = even_diff + odd_lo;
+    output[2] = even_diff - odd_lo;
+    output[3] = even_sum - odd_hi;
+}
+
+// DC-only 4x4 inverse DCT: derives a single offset from input(xoff, yoff) and
+// adds it, with clipping, to each pixel of the 4x4 tile of `dest`.
+void idct4x4_1(const rs_allocation input, rs_allocation dest, int xoff, int yoff) {
+    const int dc = rsGetElementAt_short(input, xoff, yoff);
+    // Two cospi_16_64 scalings, each narrowed to int16_t, then the final rounding.
+    const int16_t pass1 = dct_const_round_shift(dc * cospi_16_64);
+    const int16_t pass2 = dct_const_round_shift(pass1 * cospi_16_64);
+    const int offset = ROUND_POWER_OF_TWO(pass2, 4);
+    for (int row = 0; row < 4; ++row) {
+        for (int col = 0; col < 4; ++col) {
+            const int x = col + xoff, y = row + yoff;
+            const uint8_t px = clip_pixel(rsGetElementAt_uchar(dest, x, y) + offset);
+            rsSetElementAt_uchar(dest, px, x, y);
+        }
+    }
+}
+
+// Full 4x4 inverse DCT: transforms the 4x4 tile of shorts at (xoff, yoff) in
+// `input` row-wise then column-wise with idct4_1d, and adds the rounded
+// residual, with clipping, to the matching 4x4 tile of uchars in `dest`.
+void idct4x4_16(const rs_allocation input, rs_allocation dest, int xoff, int yoff) {
+    int16_t coeffs[4 * 4];      // input tile, row-major
+    int16_t row_pass[4 * 4];    // result of the row transforms
+    int16_t column[4], column_out[4];
+    int r, c;
+
+    // Load the coefficient tile.
+    for (r = 0; r < 4; ++r) {
+        for (c = 0; c < 4; ++c) {
+            coeffs[c + r * 4] = rsGetElementAt_short(input, c + xoff, r + yoff);
+        }
+    }
+
+    // Rows
+    for (r = 0; r < 4; ++r) {
+        idct4_1d(&coeffs[r * 4], &row_pass[r * 4]);
+    }
+
+    // Columns
+    for (c = 0; c < 4; ++c) {
+        for (r = 0; r < 4; ++r) {
+            column[r] = row_pass[r * 4 + c];
+        }
+        idct4_1d(column, column_out);
+        for (r = 0; r < 4; ++r) {
+            // Add the rounded residual onto the destination pixel.
+            const uint8_t px = clip_pixel(ROUND_POWER_OF_TWO(column_out[r], 4)
+                             + rsGetElementAt_uchar(dest, c + xoff, r + yoff));
+            rsSetElementAt_uchar(dest, px, c + xoff, r + yoff);
+        }
+    }
+}
+
+static void idct8_1d(const int16_t *input, int16_t *output) {
+    int16_t step1[8], step2[8];  // working buffers for the 8-point 1-D inverse DCT
+    int temp1, temp2;
+    // stage 1: even-index inputs fill step1[0..3]; odd ones are rotated into step1[4..7]
+    step1[0] = input[0];
+    step1[2] = input[4];
+    step1[1] = input[2];
+    step1[3] = input[6];
+    temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+    temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+    step1[4] = dct_const_round_shift(temp1);
+    step1[7] = dct_const_round_shift(temp2);
+    temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+    temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+    step1[5] = dct_const_round_shift(temp1);
+    step1[6] = dct_const_round_shift(temp2);
+
+    // stage 2 & stage 3 - even half (idct4_1d run in place on step1[0..3])
+    idct4_1d(step1, step1);
+
+    // stage 2 - odd half: sums and differences of step1[4..7]
+    step2[4] = step1[4] + step1[5];
+    step2[5] = step1[4] - step1[5];
+    step2[6] = -step1[6] + step1[7];
+    step2[7] = step1[6] + step1[7];
+
+    // stage 3 - odd half: the middle pair (5, 6) is scaled by cospi_16_64 and rounded
+    step1[4] = step2[4];
+    temp1 = (step2[6] - step2[5]) * cospi_16_64;
+    temp2 = (step2[5] + step2[6]) * cospi_16_64;
+    step1[5] = dct_const_round_shift(temp1);
+    step1[6] = dct_const_round_shift(temp2);
+    step1[7] = step2[7];
+
+    // stage 4: mirror butterfly, output[k] and output[7 - k] share step1[k] and step1[7 - k]
+    output[0] = step1[0] + step1[7];
+    output[1] = step1[1] + step1[6];
+    output[2] = step1[2] + step1[5];
+    output[3] = step1[3] + step1[4];
+    output[4] = step1[3] - step1[4];
+    output[5] = step1[2] - step1[5];
+    output[6] = step1[1] - step1[6];
+    output[7] = step1[0] - step1[7];
+}
+
+// DC-only 8x8 inverse DCT: derives one offset from input(xoff, yoff) and adds
+// it, with clipping, to every pixel of the 8x8 tile of `dest`.
+void idct8x8_1(const rs_allocation input, rs_allocation dest, int xoff, int yoff) {
+    const int dc = rsGetElementAt_short(input, xoff, yoff);
+    // Two cospi_16_64 scalings, each narrowed to int16_t, then the final rounding.
+    const int16_t pass1 = dct_const_round_shift(dc * cospi_16_64);
+    const int16_t pass2 = dct_const_round_shift(pass1 * cospi_16_64);
+    const int offset = ROUND_POWER_OF_TWO(pass2, 5);
+    for (int row = 0; row < 8; ++row) {
+        for (int col = 0; col < 8; ++col) {
+            const int x = col + xoff, y = row + yoff;
+            const uint8_t px = clip_pixel(rsGetElementAt_uchar(dest, x, y) + offset);
+            rsSetElementAt_uchar(dest, px, x, y);
+        }
+    }
+}
+
+// 8x8 inverse DCT for blocks whose non-zero coefficients all lie in the first
+// four rows: only those rows are loaded and row-transformed; the remaining
+// rows of the intermediate stay zero.  The rounded residual is added, with
+// clipping, to the 8x8 tile of uchars at (xoff, yoff) in `dest`.
+void idct8x8_10(const rs_allocation input, rs_allocation dest, int xoff, int yoff) {
+    int16_t out[8 * 8] = { 0 };
+    int16_t in[4 * 8];  // rows 4..7 are never consumed, so they are not fetched
+    int16_t temp_in[8], temp_out[8];
+    int i, j;
+
+    // Load only the rows that are actually transformed.
+    for (i = 0; i < 4; ++i) {
+        for (j = 0; j < 8; ++j) {
+            in[j + i * 8] = rsGetElementAt_short(input, j + xoff, i + yoff);
+        }
+    }
+
+    // First transform rows
+    // only the first 4 rows have non-zero coefs
+    for (i = 0; i < 4; ++i) {
+        idct8_1d(&in[i * 8], &out[i * 8]);
+    }
+
+    // Then transform columns
+    for (i = 0; i < 8; ++i) {
+        for (j = 0; j < 8; ++j)
+            temp_in[j] = out[j * 8 + i];
+        idct8_1d(temp_in, temp_out);
+        for (j = 0; j < 8; ++j) {
+            const uint8_t result = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                             + rsGetElementAt_uchar(dest, i + xoff, j + yoff));
+            rsSetElementAt_uchar(dest, result, i + xoff, j + yoff);
+        }
+    }
+}
+
+// Full 8x8 inverse DCT: row pass then column pass through idct8_1d over the
+// 8x8 tile of shorts at (xoff, yoff) in `input`; the rounded residual is added,
+// with clipping, to the matching 8x8 tile of uchars in `dest`.
+void idct8x8_64(const rs_allocation input, rs_allocation dest, int xoff, int yoff) {
+    int16_t coeffs[8 * 8];      // input tile, row-major
+    int16_t row_pass[8 * 8];    // result of the row transforms
+    int16_t column[8], column_out[8];
+    int r, c;
+
+    // Load the coefficient tile.
+    for (r = 0; r < 8; ++r) {
+        for (c = 0; c < 8; ++c) {
+            coeffs[c + r * 8] = rsGetElementAt_short(input, c + xoff, r + yoff);
+        }
+    }
+
+    // First transform rows
+    for (r = 0; r < 8; ++r) {
+        idct8_1d(&coeffs[r * 8], &row_pass[r * 8]);
+    }
+
+    // Then transform columns
+    for (c = 0; c < 8; ++c) {
+        for (r = 0; r < 8; ++r) {
+            column[r] = row_pass[r * 8 + c];
+        }
+        idct8_1d(column, column_out);
+        for (r = 0; r < 8; ++r) {
+            const uint8_t px = clip_pixel(ROUND_POWER_OF_TWO(column_out[r], 5)
+                             + rsGetElementAt_uchar(dest, c + xoff, r + yoff));
+            rsSetElementAt_uchar(dest, px, c + xoff, r + yoff);
+        }
+    }
+}
+
+static void idct16_1d(const int16_t *input, int16_t *output) {
+    int16_t step1[16], step2[16];  // each stage reads one array and writes the other
+    int temp1, temp2;              // int products, narrowed again by dct_const_round_shift
+    // 16-point 1-D inverse DCT (per the function name): reads input[0..15], writes output[0..15].
+    // stage 1: copy the 16 inputs into working order (indices are written as n/2)
+    step1[0] = input[0/2];
+    step1[1] = input[16/2];
+    step1[2] = input[8/2];
+    step1[3] = input[24/2];
+    step1[4] = input[4/2];
+    step1[5] = input[20/2];
+    step1[6] = input[12/2];
+    step1[7] = input[28/2];
+    step1[8] = input[2/2];
+    step1[9] = input[18/2];
+    step1[10] = input[10/2];
+    step1[11] = input[26/2];
+    step1[12] = input[6/2];
+    step1[13] = input[22/2];
+    step1[14] = input[14/2];
+    step1[15] = input[30/2];
+
+    // stage 2: [0..7] are copied; pairs (8,15) (9,14) (10,13) (11,12) are combined with cosine weights
+    step2[0] = step1[0];
+    step2[1] = step1[1];
+    step2[2] = step1[2];
+    step2[3] = step1[3];
+    step2[4] = step1[4];
+    step2[5] = step1[5];
+    step2[6] = step1[6];
+    step2[7] = step1[7];
+
+    temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+    temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+    step2[8] = dct_const_round_shift(temp1);
+    step2[15] = dct_const_round_shift(temp2);
+
+    temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+    temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+    step2[9] = dct_const_round_shift(temp1);
+    step2[14] = dct_const_round_shift(temp2);
+
+    temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+    temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+    step2[10] = dct_const_round_shift(temp1);
+    step2[13] = dct_const_round_shift(temp2);
+
+    temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+    temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+    step2[11] = dct_const_round_shift(temp1);
+    step2[12] = dct_const_round_shift(temp2);
+
+    // stage 3: [0..3] are copied; pairs (4,7) (5,6) are weighted; [8..15] are neighbour sums/differences
+    step1[0] = step2[0];
+    step1[1] = step2[1];
+    step1[2] = step2[2];
+    step1[3] = step2[3];
+
+    temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+    temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+    step1[4] = dct_const_round_shift(temp1);
+    step1[7] = dct_const_round_shift(temp2);
+    temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+    temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+    step1[5] = dct_const_round_shift(temp1);
+    step1[6] = dct_const_round_shift(temp2);
+
+    step1[8] = step2[8] + step2[9];
+    step1[9] = step2[8] - step2[9];
+    step1[10] = -step2[10] + step2[11];
+    step1[11] = step2[10] + step2[11];
+    step1[12] = step2[12] + step2[13];
+    step1[13] = step2[12] - step2[13];
+    step1[14] = -step2[14] + step2[15];
+    step1[15] = step2[14] + step2[15];
+
+    // stage 4: pairs (0,1) (2,3) are weighted; [4..7] are sums/differences; (9,14) (10,13) are weighted
+    temp1 = (step1[0] + step1[1]) * cospi_16_64;
+    temp2 = (step1[0] - step1[1]) * cospi_16_64;
+    step2[0] = dct_const_round_shift(temp1);
+    step2[1] = dct_const_round_shift(temp2);
+    temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+    temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+    step2[2] = dct_const_round_shift(temp1);
+    step2[3] = dct_const_round_shift(temp2);
+    step2[4] = step1[4] + step1[5];
+    step2[5] = step1[4] - step1[5];
+    step2[6] = -step1[6] + step1[7];
+    step2[7] = step1[6] + step1[7];
+
+    step2[8] = step1[8];
+    step2[15] = step1[15];
+    temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+    temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+    step2[9] = dct_const_round_shift(temp1);
+    step2[14] = dct_const_round_shift(temp2);
+    temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+    temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+    step2[10] = dct_const_round_shift(temp1);
+    step2[13] = dct_const_round_shift(temp2);
+    step2[11] = step1[11];
+    step2[12] = step1[12];
+
+    // stage 5: [0..3] and [8..15] are sums/differences; pair (5,6) is scaled by cospi_16_64; 4 and 7 are copied
+    step1[0] = step2[0] + step2[3];
+    step1[1] = step2[1] + step2[2];
+    step1[2] = step2[1] - step2[2];
+    step1[3] = step2[0] - step2[3];
+    step1[4] = step2[4];
+    temp1 = (step2[6] - step2[5]) * cospi_16_64;
+    temp2 = (step2[5] + step2[6]) * cospi_16_64;
+    step1[5] = dct_const_round_shift(temp1);
+    step1[6] = dct_const_round_shift(temp2);
+    step1[7] = step2[7];
+
+    step1[8] = step2[8] + step2[11];
+    step1[9] = step2[9] + step2[10];
+    step1[10] = step2[9] - step2[10];
+    step1[11] = step2[8] - step2[11];
+    step1[12] = -step2[12] + step2[15];
+    step1[13] = -step2[13] + step2[14];
+    step1[14] = step2[13] + step2[14];
+    step1[15] = step2[12] + step2[15];
+
+    // stage 6: [0..7] are sums/differences; pairs (10,13) (11,12) are scaled by cospi_16_64; 8, 9, 14, 15 are copied
+    step2[0] = step1[0] + step1[7];
+    step2[1] = step1[1] + step1[6];
+    step2[2] = step1[2] + step1[5];
+    step2[3] = step1[3] + step1[4];
+    step2[4] = step1[3] - step1[4];
+    step2[5] = step1[2] - step1[5];
+    step2[6] = step1[1] - step1[6];
+    step2[7] = step1[0] - step1[7];
+    step2[8] = step1[8];
+    step2[9] = step1[9];
+    temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+    temp2 = (step1[10] + step1[13]) * cospi_16_64;
+    step2[10] = dct_const_round_shift(temp1);
+    step2[13] = dct_const_round_shift(temp2);
+    temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+    temp2 = (step1[11] + step1[12]) * cospi_16_64;
+    step2[11] = dct_const_round_shift(temp1);
+    step2[12] = dct_const_round_shift(temp2);
+    step2[14] = step1[14];
+    step2[15] = step1[15];
+
+    // stage 7: output[k] = step2[k] + step2[15 - k] and output[15 - k] = step2[k] - step2[15 - k], k = 0..7
+    output[0] = step2[0] + step2[15];
+    output[1] = step2[1] + step2[14];
+    output[2] = step2[2] + step2[13];
+    output[3] = step2[3] + step2[12];
+    output[4] = step2[4] + step2[11];
+    output[5] = step2[5] + step2[10];
+    output[6] = step2[6] + step2[9];
+    output[7] = step2[7] + step2[8];
+    output[8] = step2[7] - step2[8];
+    output[9] = step2[6] - step2[9];
+    output[10] = step2[5] - step2[10];
+    output[11] = step2[4] - step2[11];
+    output[12] = step2[3] - step2[12];
+    output[13] = step2[2] - step2[13];
+    output[14] = step2[1] - step2[14];
+    output[15] = step2[0] - step2[15];
+}
+
+// DC-only 16x16 inverse DCT: one offset, derived from the coefficient at (xoff, yoff)
+// of `input`, is added (with clamping) to every uchar of the 16x16 tile in `dest`.
+void idct16x16_1(const rs_allocation input, rs_allocation dest, int xoff, int yoff) {
+    // Scale by cospi_16_64 twice, rounding after each multiply, then drop 6 more bits.
+    const int16_t dc = rsGetElementAt_short(input, xoff, yoff);
+    const int16_t once = dct_const_round_shift(dc * cospi_16_64);
+    const int16_t twice = dct_const_round_shift(once * cospi_16_64);
+    const int offset = ROUND_POWER_OF_TWO(twice, 6);
+    int row, col;
+    for (row = 0; row < 16; ++row) {
+        for (col = 0; col < 16; ++col) {
+            const int pixel = rsGetElementAt_uchar(dest, col + xoff, row + yoff);
+            rsSetElementAt_uchar(dest, clip_pixel(pixel + offset), col + xoff, row + yoff);
+        }
+    }
+}
+
+// 16x16 inverse DCT for blocks whose non-zero coefficients all sit in the upper-left
+// 4x4 corner. Only the first 4 rows go through the row pass; the remaining rows of the
+// intermediate buffer stay zero. The column pass then adds the result (rounded by 6 bits)
+// into the 16x16 uchar tile of `dest` at (xoff, yoff), clamped by clip_pixel.
+void idct16x16_10(const rs_allocation input, rs_allocation dest, int xoff, int yoff) {
+    int16_t coeffs[4 * 16];
+    int16_t rows[16 * 16] = { 0 };
+    int16_t col_in[16], col_out[16];
+    int r, c;
+
+    // Only rows 0..3 are consumed by the row pass, so only those are fetched.
+    for (r = 0; r < 4; ++r) {
+        for (c = 0; c < 16; ++c) {
+            coeffs[c + r * 16] = rsGetElementAt_short(input, c + xoff, r + yoff);
+        }
+    }
+
+    // Row pass over the 4 loaded rows.
+    for (r = 0; r < 4; ++r) {
+        idct16_1d(&coeffs[r * 16], &rows[r * 16]);
+    }
+
+    // Column pass over all 16 columns.
+    for (c = 0; c < 16; ++c) {
+        for (r = 0; r < 16; ++r) {
+            col_in[r] = rows[r * 16 + c];
+        }
+        idct16_1d(col_in, col_out);
+        for (r = 0; r < 16; ++r) {
+            const int residual = ROUND_POWER_OF_TWO(col_out[r], 6);
+            const int pixel = rsGetElementAt_uchar(dest, c + xoff, r + yoff);
+            rsSetElementAt_uchar(dest, clip_pixel(residual + pixel), c + xoff, r + yoff);
+        }
+    }
+}
+
+// Full 16x16 inverse DCT: loads the 16x16 tile of shorts at (xoff, yoff) from `input`,
+// applies idct16_1d to every row and then every column, and accumulates the rounded
+// result into the matching uchar cells of `dest`, clamped by clip_pixel.
+void idct16x16_256(const rs_allocation input, rs_allocation dest, int xoff, int yoff) {
+    int16_t coeffs[16 * 16];
+    int16_t rows[16 * 16];
+    int16_t col_in[16], col_out[16];
+    int r, c;
+
+    // Gather the tile into a row-major scratch buffer.
+    for (r = 0; r < 16; ++r) {
+        for (c = 0; c < 16; ++c) {
+            coeffs[c + r * 16] = rsGetElementAt_short(input, c + xoff, r + yoff);
+        }
+    }
+
+    // Row pass.
+    for (r = 0; r < 16; ++r) {
+        idct16_1d(&coeffs[r * 16], &rows[r * 16]);
+    }
+
+    // Column pass; the 1-D output is rounded by 6 bits before being added to dest.
+    for (c = 0; c < 16; ++c) {
+        for (r = 0; r < 16; ++r) {
+            col_in[r] = rows[r * 16 + c];
+        }
+        idct16_1d(col_in, col_out);
+        for (r = 0; r < 16; ++r) {
+            const int residual = ROUND_POWER_OF_TWO(col_out[r], 6);
+            const int pixel = rsGetElementAt_uchar(dest, c + xoff, r + yoff);
+            rsSetElementAt_uchar(dest, clip_pixel(residual + pixel), c + xoff, r + yoff);
+        }
+    }
+}
+
+static void idct32_1d(const int16_t *input, int16_t *output) {
+    int16_t step1[32], step2[32];  // each stage reads one array and writes the other
+    int temp1, temp2;              // int products, narrowed again by dct_const_round_shift
+    // 32-point 1-D inverse DCT (per the function name): reads input[0..31], writes output[0..31].
+    // stage 1: even-indexed inputs are copied into step1[0..15]; odd-indexed inputs are weighted in pairs into step1[16..31]
+    step1[0] = input[0];
+    step1[1] = input[16];
+    step1[2] = input[8];
+    step1[3] = input[24];
+    step1[4] = input[4];
+    step1[5] = input[20];
+    step1[6] = input[12];
+    step1[7] = input[28];
+    step1[8] = input[2];
+    step1[9] = input[18];
+    step1[10] = input[10];
+    step1[11] = input[26];
+    step1[12] = input[6];
+    step1[13] = input[22];
+    step1[14] = input[14];
+    step1[15] = input[30];
+
+    temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+    temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+    step1[16] = dct_const_round_shift(temp1);
+    step1[31] = dct_const_round_shift(temp2);
+
+    temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+    temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+    step1[17] = dct_const_round_shift(temp1);
+    step1[30] = dct_const_round_shift(temp2);
+
+    temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+    temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+    step1[18] = dct_const_round_shift(temp1);
+    step1[29] = dct_const_round_shift(temp2);
+
+    temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+    temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+    step1[19] = dct_const_round_shift(temp1);
+    step1[28] = dct_const_round_shift(temp2);
+
+    temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+    temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+    step1[20] = dct_const_round_shift(temp1);
+    step1[27] = dct_const_round_shift(temp2);
+
+    temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+    temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+    step1[21] = dct_const_round_shift(temp1);
+    step1[26] = dct_const_round_shift(temp2);
+
+    temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+    temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+    step1[22] = dct_const_round_shift(temp1);
+    step1[25] = dct_const_round_shift(temp2);
+
+    temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+    temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+    step1[23] = dct_const_round_shift(temp1);
+    step1[24] = dct_const_round_shift(temp2);
+
+    // stage 2: [0..7] are copied; [8..15] are weighted in pairs; [16..31] are neighbour sums/differences
+    step2[0] = step1[0];
+    step2[1] = step1[1];
+    step2[2] = step1[2];
+    step2[3] = step1[3];
+    step2[4] = step1[4];
+    step2[5] = step1[5];
+    step2[6] = step1[6];
+    step2[7] = step1[7];
+
+    temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+    temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+    step2[8] = dct_const_round_shift(temp1);
+    step2[15] = dct_const_round_shift(temp2);
+
+    temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+    temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+    step2[9] = dct_const_round_shift(temp1);
+    step2[14] = dct_const_round_shift(temp2);
+
+    temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+    temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+    step2[10] = dct_const_round_shift(temp1);
+    step2[13] = dct_const_round_shift(temp2);
+
+    temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+    temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+    step2[11] = dct_const_round_shift(temp1);
+    step2[12] = dct_const_round_shift(temp2);
+
+    step2[16] = step1[16] + step1[17];
+    step2[17] = step1[16] - step1[17];
+    step2[18] = -step1[18] + step1[19];
+    step2[19] = step1[18] + step1[19];
+    step2[20] = step1[20] + step1[21];
+    step2[21] = step1[20] - step1[21];
+    step2[22] = -step1[22] + step1[23];
+    step2[23] = step1[22] + step1[23];
+    step2[24] = step1[24] + step1[25];
+    step2[25] = step1[24] - step1[25];
+    step2[26] = -step1[26] + step1[27];
+    step2[27] = step1[26] + step1[27];
+    step2[28] = step1[28] + step1[29];
+    step2[29] = step1[28] - step1[29];
+    step2[30] = -step1[30] + step1[31];
+    step2[31] = step1[30] + step1[31];
+
+    // stage 3: [0..3] are copied; (4,7) (5,6) are weighted; [8..15] are sums/differences; (17,30) (18,29) (21,26) (22,25) are weighted
+    step1[0] = step2[0];
+    step1[1] = step2[1];
+    step1[2] = step2[2];
+    step1[3] = step2[3];
+
+    temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+    temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+    step1[4] = dct_const_round_shift(temp1);
+    step1[7] = dct_const_round_shift(temp2);
+    temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+    temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+    step1[5] = dct_const_round_shift(temp1);
+    step1[6] = dct_const_round_shift(temp2);
+
+    step1[8] = step2[8] + step2[9];
+    step1[9] = step2[8] - step2[9];
+    step1[10] = -step2[10] + step2[11];
+    step1[11] = step2[10] + step2[11];
+    step1[12] = step2[12] + step2[13];
+    step1[13] = step2[12] - step2[13];
+    step1[14] = -step2[14] + step2[15];
+    step1[15] = step2[14] + step2[15];
+
+    step1[16] = step2[16];
+    step1[31] = step2[31];
+    temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+    temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+    step1[17] = dct_const_round_shift(temp1);
+    step1[30] = dct_const_round_shift(temp2);
+    temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+    temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+    step1[18] = dct_const_round_shift(temp1);
+    step1[29] = dct_const_round_shift(temp2);
+    step1[19] = step2[19];
+    step1[20] = step2[20];
+    temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+    temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+    step1[21] = dct_const_round_shift(temp1);
+    step1[26] = dct_const_round_shift(temp2);
+    temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+    temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+    step1[22] = dct_const_round_shift(temp1);
+    step1[25] = dct_const_round_shift(temp2);
+    step1[23] = step2[23];
+    step1[24] = step2[24];
+    step1[27] = step2[27];
+    step1[28] = step2[28];
+
+    // stage 4: (0,1) (2,3) are weighted; [4..7] are sums/differences; (9,14) (10,13) are weighted; [16..31] are sums/differences
+    temp1 = (step1[0] + step1[1]) * cospi_16_64;
+    temp2 = (step1[0] - step1[1]) * cospi_16_64;
+    step2[0] = dct_const_round_shift(temp1);
+    step2[1] = dct_const_round_shift(temp2);
+    temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+    temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+    step2[2] = dct_const_round_shift(temp1);
+    step2[3] = dct_const_round_shift(temp2);
+    step2[4] = step1[4] + step1[5];
+    step2[5] = step1[4] - step1[5];
+    step2[6] = -step1[6] + step1[7];
+    step2[7] = step1[6] + step1[7];
+
+    step2[8] = step1[8];
+    step2[15] = step1[15];
+    temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+    temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+    step2[9] = dct_const_round_shift(temp1);
+    step2[14] = dct_const_round_shift(temp2);
+    temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+    temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+    step2[10] = dct_const_round_shift(temp1);
+    step2[13] = dct_const_round_shift(temp2);
+    step2[11] = step1[11];
+    step2[12] = step1[12];
+
+    step2[16] = step1[16] + step1[19];
+    step2[17] = step1[17] + step1[18];
+    step2[18] = step1[17] - step1[18];
+    step2[19] = step1[16] - step1[19];
+    step2[20] = -step1[20] + step1[23];
+    step2[21] = -step1[21] + step1[22];
+    step2[22] = step1[21] + step1[22];
+    step2[23] = step1[20] + step1[23];
+
+    step2[24] = step1[24] + step1[27];
+    step2[25] = step1[25] + step1[26];
+    step2[26] = step1[25] - step1[26];
+    step2[27] = step1[24] - step1[27];
+    step2[28] = -step1[28] + step1[31];
+    step2[29] = -step1[29] + step1[30];
+    step2[30] = step1[29] + step1[30];
+    step2[31] = step1[28] + step1[31];
+
+    // stage 5: [0..3] and [8..15] are sums/differences; (5,6) is scaled by cospi_16_64; (18,29) (19,28) (20,27) (21,26) are weighted
+    step1[0] = step2[0] + step2[3];
+    step1[1] = step2[1] + step2[2];
+    step1[2] = step2[1] - step2[2];
+    step1[3] = step2[0] - step2[3];
+    step1[4] = step2[4];
+    temp1 = (step2[6] - step2[5]) * cospi_16_64;
+    temp2 = (step2[5] + step2[6]) * cospi_16_64;
+    step1[5] = dct_const_round_shift(temp1);
+    step1[6] = dct_const_round_shift(temp2);
+    step1[7] = step2[7];
+
+    step1[8] = step2[8] + step2[11];
+    step1[9] = step2[9] + step2[10];
+    step1[10] = step2[9] - step2[10];
+    step1[11] = step2[8] - step2[11];
+    step1[12] = -step2[12] + step2[15];
+    step1[13] = -step2[13] + step2[14];
+    step1[14] = step2[13] + step2[14];
+    step1[15] = step2[12] + step2[15];
+
+    step1[16] = step2[16];
+    step1[17] = step2[17];
+    temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+    temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+    step1[18] = dct_const_round_shift(temp1);
+    step1[29] = dct_const_round_shift(temp2);
+    temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+    temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+    step1[19] = dct_const_round_shift(temp1);
+    step1[28] = dct_const_round_shift(temp2);
+    temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+    temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+    step1[20] = dct_const_round_shift(temp1);
+    step1[27] = dct_const_round_shift(temp2);
+    temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+    temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+    step1[21] = dct_const_round_shift(temp1);
+    step1[26] = dct_const_round_shift(temp2);
+    step1[22] = step2[22];
+    step1[23] = step2[23];
+    step1[24] = step2[24];
+    step1[25] = step2[25];
+    step1[30] = step2[30];
+    step1[31] = step2[31];
+
+    // stage 6: [0..7] and [16..31] are sums/differences; (10,13) (11,12) are scaled by cospi_16_64; 8, 9, 14, 15 are copied
+    step2[0] = step1[0] + step1[7];
+    step2[1] = step1[1] + step1[6];
+    step2[2] = step1[2] + step1[5];
+    step2[3] = step1[3] + step1[4];
+    step2[4] = step1[3] - step1[4];
+    step2[5] = step1[2] - step1[5];
+    step2[6] = step1[1] - step1[6];
+    step2[7] = step1[0] - step1[7];
+    step2[8] = step1[8];
+    step2[9] = step1[9];
+    temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+    temp2 = (step1[10] + step1[13]) * cospi_16_64;
+    step2[10] = dct_const_round_shift(temp1);
+    step2[13] = dct_const_round_shift(temp2);
+    temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+    temp2 = (step1[11] + step1[12]) * cospi_16_64;
+    step2[11] = dct_const_round_shift(temp1);
+    step2[12] = dct_const_round_shift(temp2);
+    step2[14] = step1[14];
+    step2[15] = step1[15];
+
+    step2[16] = step1[16] + step1[23];
+    step2[17] = step1[17] + step1[22];
+    step2[18] = step1[18] + step1[21];
+    step2[19] = step1[19] + step1[20];
+    step2[20] = step1[19] - step1[20];
+    step2[21] = step1[18] - step1[21];
+    step2[22] = step1[17] - step1[22];
+    step2[23] = step1[16] - step1[23];
+
+    step2[24] = -step1[24] + step1[31];
+    step2[25] = -step1[25] + step1[30];
+    step2[26] = -step1[26] + step1[29];
+    step2[27] = -step1[27] + step1[28];
+    step2[28] = step1[27] + step1[28];
+    step2[29] = step1[26] + step1[29];
+    step2[30] = step1[25] + step1[30];
+    step2[31] = step1[24] + step1[31];
+
+    // stage 7: [0..15] pair step2[k] with step2[15 - k]; (20,27) (21,26) (22,25) (23,24) are scaled by cospi_16_64; 16..19 and 28..31 are copied
+    step1[0] = step2[0] + step2[15];
+    step1[1] = step2[1] + step2[14];
+    step1[2] = step2[2] + step2[13];
+    step1[3] = step2[3] + step2[12];
+    step1[4] = step2[4] + step2[11];
+    step1[5] = step2[5] + step2[10];
+    step1[6] = step2[6] + step2[9];
+    step1[7] = step2[7] + step2[8];
+    step1[8] = step2[7] - step2[8];
+    step1[9] = step2[6] - step2[9];
+    step1[10] = step2[5] - step2[10];
+    step1[11] = step2[4] - step2[11];
+    step1[12] = step2[3] - step2[12];
+    step1[13] = step2[2] - step2[13];
+    step1[14] = step2[1] - step2[14];
+    step1[15] = step2[0] - step2[15];
+
+    step1[16] = step2[16];
+    step1[17] = step2[17];
+    step1[18] = step2[18];
+    step1[19] = step2[19];
+    temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+    temp2 = (step2[20] + step2[27]) * cospi_16_64;
+    step1[20] = dct_const_round_shift(temp1);
+    step1[27] = dct_const_round_shift(temp2);
+    temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+    temp2 = (step2[21] + step2[26]) * cospi_16_64;
+    step1[21] = dct_const_round_shift(temp1);
+    step1[26] = dct_const_round_shift(temp2);
+    temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+    temp2 = (step2[22] + step2[25]) * cospi_16_64;
+    step1[22] = dct_const_round_shift(temp1);
+    step1[25] = dct_const_round_shift(temp2);
+    temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+    temp2 = (step2[23] + step2[24]) * cospi_16_64;
+    step1[23] = dct_const_round_shift(temp1);
+    step1[24] = dct_const_round_shift(temp2);
+    step1[28] = step2[28];
+    step1[29] = step2[29];
+    step1[30] = step2[30];
+    step1[31] = step2[31];
+
+    // final stage: output[k] = step1[k] + step1[31 - k] and output[31 - k] = step1[k] - step1[31 - k], k = 0..15
+    output[0] = step1[0] + step1[31];
+    output[1] = step1[1] + step1[30];
+    output[2] = step1[2] + step1[29];
+    output[3] = step1[3] + step1[28];
+    output[4] = step1[4] + step1[27];
+    output[5] = step1[5] + step1[26];
+    output[6] = step1[6] + step1[25];
+    output[7] = step1[7] + step1[24];
+    output[8] = step1[8] + step1[23];
+    output[9] = step1[9] + step1[22];
+    output[10] = step1[10] + step1[21];
+    output[11] = step1[11] + step1[20];
+    output[12] = step1[12] + step1[19];
+    output[13] = step1[13] + step1[18];
+    output[14] = step1[14] + step1[17];
+    output[15] = step1[15] + step1[16];
+    output[16] = step1[15] - step1[16];
+    output[17] = step1[14] - step1[17];
+    output[18] = step1[13] - step1[18];
+    output[19] = step1[12] - step1[19];
+    output[20] = step1[11] - step1[20];
+    output[21] = step1[10] - step1[21];
+    output[22] = step1[9] - step1[22];
+    output[23] = step1[8] - step1[23];
+    output[24] = step1[7] - step1[24];
+    output[25] = step1[6] - step1[25];
+    output[26] = step1[5] - step1[26];
+    output[27] = step1[4] - step1[27];
+    output[28] = step1[3] - step1[28];
+    output[29] = step1[2] - step1[29];
+    output[30] = step1[1] - step1[30];
+    output[31] = step1[0] - step1[31];
+}
+
+// DC-only 32x32 inverse DCT: adds one clamped offset to each uchar of the 32x32 tile in `dest`.
+void idct32x32_1(const rs_allocation input, rs_allocation dest, int xoff, int yoff) {
+    // The coefficient at (xoff, yoff) is scaled by cospi_16_64 twice, then rounded by 6 bits.
+    const int16_t dc = rsGetElementAt_short(input, xoff, yoff);
+    const int16_t once = dct_const_round_shift(dc * cospi_16_64);
+    const int16_t twice = dct_const_round_shift(once * cospi_16_64);
+    const int offset = ROUND_POWER_OF_TWO(twice, 6);
+    int row, col;
+    for (row = 0; row < 32; ++row) {
+        for (col = 0; col < 32; ++col) {
+            const int pixel = rsGetElementAt_uchar(dest, col + xoff, row + yoff);
+            rsSetElementAt_uchar(dest, clip_pixel(pixel + offset), col + xoff, row + yoff);
+        }
+    }
+}
+
+// 32x32 inverse DCT for blocks whose non-zero coefficients all sit in the upper-left
+// 8x8 corner. Only the first 8 rows go through the row pass; the remaining rows of the
+// intermediate buffer stay zero. The column pass then adds the result (rounded by 6 bits)
+// into the 32x32 uchar tile of `dest` at (xoff, yoff), clamped by clip_pixel.
+void idct32x32_34(const rs_allocation input, rs_allocation dest, int xoff, int yoff) {
+    int16_t coeffs[8 * 32];
+    int16_t rows[32 * 32] = { 0 };
+    int16_t col_in[32], col_out[32];
+    int r, c;
+
+    // Only rows 0..7 are consumed by the row pass, so only those are fetched.
+    for (r = 0; r < 8; ++r) {
+        for (c = 0; c < 32; ++c) {
+            coeffs[c + r * 32] = rsGetElementAt_short(input, c + xoff, r + yoff);
+        }
+    }
+
+    // Row pass over the 8 loaded rows.
+    for (r = 0; r < 8; ++r) {
+        idct32_1d(&coeffs[r * 32], &rows[r * 32]);
+    }
+
+    // Column pass over all 32 columns.
+    for (c = 0; c < 32; ++c) {
+        for (r = 0; r < 32; ++r) {
+            col_in[r] = rows[r * 32 + c];
+        }
+        idct32_1d(col_in, col_out);
+        for (r = 0; r < 32; ++r) {
+            const int residual = ROUND_POWER_OF_TWO(col_out[r], 6);
+            const int pixel = rsGetElementAt_uchar(dest, c + xoff, r + yoff);
+            rsSetElementAt_uchar(dest, clip_pixel(residual + pixel), c + xoff, r + yoff);
+        }
+    }
+}
+
+// Full 32x32 inverse DCT: loads the 32x32 tile of shorts at (xoff, yoff) from `input`,
+// transforms rows then columns with idct32_1d, and accumulates the result (rounded by
+// 6 bits) into the matching uchar cells of `dest`, clamped by clip_pixel.
+void idct32x32_1024(const rs_allocation input, rs_allocation dest, int xoff, int yoff) {
+    int16_t coeffs[32 * 32];
+    int16_t rows[32 * 32];
+    int16_t col_in[32], col_out[32];
+    int r, c;
+
+    // Gather the tile into a row-major scratch buffer.
+    for (r = 0; r < 32; ++r) {
+        for (c = 0; c < 32; ++c) {
+            coeffs[c + r * 32] = rsGetElementAt_short(input, c + xoff, r + yoff);
+        }
+    }
+
+    // Row pass. A row whose coefficients are all zero skips the transform and is
+    // written out as zeros directly.
+    for (r = 0; r < 32; ++r) {
+        const int16_t *row_in = &coeffs[r * 32];
+        int16_t *row_out = &rows[r * 32];
+        int any_bits = 0;
+
+        // OR every coefficient together: the result is non-zero iff some entry is.
+        for (c = 0; c < 32; ++c) {
+            any_bits |= row_in[c];
+        }
+        if (any_bits == 0) {
+            memset(row_out, 0, sizeof(int16_t) * 32);
+        } else {
+            idct32_1d(row_in, row_out);
+        }
+    }
+
+    // Column pass.
+    for (c = 0; c < 32; ++c) {
+        for (r = 0; r < 32; ++r) {
+            col_in[r] = rows[r * 32 + c];
+        }
+        idct32_1d(col_in, col_out);
+        for (r = 0; r < 32; ++r) {
+            const int residual = ROUND_POWER_OF_TWO(col_out[r], 6);
+            const int pixel = rsGetElementAt_uchar(dest, c + xoff, r + yoff);
+            rsSetElementAt_uchar(dest, clip_pixel(residual + pixel), c + xoff, r + yoff);
+        }
+    }
+}
+
+// Inverse 4x4 DCT entry point: eob <= 1 goes to idct4x4_1, anything larger to idct4x4_16.
+extern void idct4x4(const rs_allocation input, rs_allocation dest, int eob, int xoff, int yoff) {
+    if (eob <= 1) {
+        idct4x4_1(input, dest, xoff, yoff);
+        return;
+    }
+    idct4x4_16(input, dest, xoff, yoff);
+}
+
+// Inverse 8x8 DCT entry point; eob selects among the three kernels.
+extern void idct8x8(const rs_allocation input, rs_allocation dest, int eob, int xoff, int yoff) {
+    if (eob > 10) {
+        idct8x8_64(input, dest, xoff, yoff);
+    } else if (eob != 1) {
+        idct8x8_10(input, dest, xoff, yoff);
+    } else {
+        idct8x8_1(input, dest, xoff, yoff);  // DC only DCT coefficient
+    }
+}
+
+// Inverse 16x16 DCT entry point; eob selects among the three kernels.
+extern void idct16x16(const rs_allocation input, rs_allocation dest, int eob, int xoff, int yoff) {
+    if (eob > 10) {
+        idct16x16_256(input, dest, xoff, yoff);
+    } else if (eob != 1) {
+        idct16x16_10(input, dest, xoff, yoff);
+    } else {
+        idct16x16_1(input, dest, xoff, yoff);  /* DC only DCT coefficient. */
+    }
+}
+
+// Inverse 32x32 DCT entry point; eob selects among the three kernels.
+extern void idct32x32(const rs_allocation input, rs_allocation dest, int eob, int xoff, int yoff) {
+    if (eob > 34) {
+        idct32x32_1024(input, dest, xoff, yoff);
+    } else if (eob != 1) {
+        idct32x32_34(input, dest, xoff, yoff);  // non-zero coeff only in upper-left 8x8
+    } else {
+        idct32x32_1(input, dest, xoff, yoff);
+    }
+}
diff --git a/driver/runtime/rs_idct.h b/driver/runtime/rs_idct.h
new file mode 100644
index 0000000..49cc3aa
--- /dev/null
+++ b/driver/runtime/rs_idct.h
@@ -0,0 +1,58 @@
+#ifndef _RS_IDCT_H_
+#define _RS_IDCT_H_
+
+#include "rs_types.rsh"
+
+#define DCT_CONST_BITS 14  // shift used by dct_const_round_shift; 1 << 14 == 16384, the scale of the table below
+
+// Constants: cospi_N_64 = round(16384 * cos(N * pi / 64)), generated by
+//  for (int i = 1; i< 32; ++i)
+//      printf("static const int cospi_%d_64 = %.0f;\n", i,
+//           round(16384 * cos(i*M_PI/64)));
+// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
+static const int cospi_1_64  = 16364;
+static const int cospi_2_64  = 16305;
+static const int cospi_3_64  = 16207;
+static const int cospi_4_64  = 16069;
+static const int cospi_5_64  = 15893;
+static const int cospi_6_64  = 15679;
+static const int cospi_7_64  = 15426;
+static const int cospi_8_64  = 15137;
+static const int cospi_9_64  = 14811;
+static const int cospi_10_64 = 14449;
+static const int cospi_11_64 = 14053;
+static const int cospi_12_64 = 13623;
+static const int cospi_13_64 = 13160;
+static const int cospi_14_64 = 12665;
+static const int cospi_15_64 = 12140;
+static const int cospi_16_64 = 11585;
+static const int cospi_17_64 = 11003;
+static const int cospi_18_64 = 10394;
+static const int cospi_19_64 = 9760;
+static const int cospi_20_64 = 9102;
+static const int cospi_21_64 = 8423;
+static const int cospi_22_64 = 7723;
+static const int cospi_23_64 = 7005;
+static const int cospi_24_64 = 6270;
+static const int cospi_25_64 = 5520;
+static const int cospi_26_64 = 4756;
+static const int cospi_27_64 = 3981;
+static const int cospi_28_64 = 3196;
+static const int cospi_29_64 = 2404;
+static const int cospi_30_64 = 1606;
+static const int cospi_31_64 = 804;
+
+#define ROUND_POWER_OF_TWO(value, n) \
+    (((value) + (1 << ((n) - 1))) >> (n))  // adds 2^(n-1) to value, then shifts right by n
+
+// Clamp val to the 8-bit range: below 0 gives 0, above 255 gives 255.
+static uint8_t clip_pixel(int val) {
+    const int floored = (val < 0) ? 0 : val;
+    return (uint8_t)((floored > 255) ? 255 : floored);
+}
+
+// Applies ROUND_POWER_OF_TWO with DCT_CONST_BITS to input, then truncates the result to int16_t.
+static int dct_const_round_shift(int input) {
+    return (int16_t)ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+}
+#endif
diff --git a/driver/runtime/rs_walsh.c b/driver/runtime/rs_walsh.c
new file mode 100644
index 0000000..9d7630e
--- /dev/null
+++ b/driver/runtime/rs_walsh.c
@@ -0,0 +1,71 @@
+#include "rs_types.rsh"
+#include "rs_allocation.rsh"
+
+// 4x4 Walsh transform (per the function name) over 16 shorts.
+// Reads 16 shorts from `input` at (xoff .. xoff + 15, yoff), treats them as a row-major 4x4
+// block, runs an add/subtract butterfly down the columns and then along the rows, and stores
+// (value + 3) >> 3 of each result into `dest` at (xoff, yoff .. yoff + 15).
+extern void walsh4x4(rs_allocation input, rs_allocation dest, int xoff, int yoff) {
+    int16_t src[16];
+    short work[16];
+    int k;
+
+    // Load the 16 inputs, which lie along the x axis.
+    for (k = 0; k < 16; k++) {
+        src[k] = rsGetElementAt_short(input, xoff + k, yoff);
+    }
+
+    // Vertical pass: combine the four entries of column k (stride 4) and
+    // store the four results back into the same column of the work buffer.
+    for (k = 0; k < 4; k++) {
+        const int sum_outer = src[k] + src[k + 12];
+        const int sum_inner = src[k + 4] + src[k + 8];
+        const int diff_inner = src[k + 4] - src[k + 8];
+        const int diff_outer = src[k] - src[k + 12];
+
+        work[k] = sum_outer + sum_inner;
+        work[k + 4] = diff_inner + diff_outer;
+        work[k + 8] = sum_outer - sum_inner;
+        work[k + 12] = diff_outer - diff_inner;
+    }
+
+    // Horizontal pass, in place: all four inputs of a row are read into
+    // locals before any entry of that row is overwritten.
+    for (k = 0; k < 16; k += 4) {
+        short *row = &work[k];
+        const int sum_outer = row[0] + row[3];
+        const int sum_inner = row[1] + row[2];
+        const int diff_inner = row[1] - row[2];
+        const int diff_outer = row[0] - row[3];
+
+        // Second butterfly level.
+        const int a2 = sum_outer + sum_inner;
+        const int b2 = diff_inner + diff_outer;
+        const int c2 = sum_outer - sum_inner;
+        const int d2 = diff_outer - diff_inner;
+
+        row[0] = (a2 + 3) >> 3;
+        row[1] = (b2 + 3) >> 3;
+        row[2] = (c2 + 3) >> 3;
+        row[3] = (d2 + 3) >> 3;
+    }
+
+    // Store the 16 results down the y axis of dest.
+    for (k = 0; k < 16; k++) {
+        rsSetElementAt_short(dest, work[k], xoff, yoff + k);
+    }
+}
+
+// Single-coefficient variant of walsh4x4: only the element at (xoff, yoff) of `input`
+// contributes. (element + 3) >> 3 is written to the 16 cells (xoff, yoff) .. (xoff, yoff + 15)
+// of `dest`.
+extern void walsh4x4_1(rs_allocation input, rs_allocation dest, int xoff, int yoff) {
+    // Only the first input element is used, so fetch just that one instead of sixteen.
+    const int16_t dc = rsGetElementAt_short(input, xoff, yoff);
+    const int a1 = (dc + 3) >> 3;
+    int i;
+
+    for (i = 0; i < 16; i++) {
+        rsSetElementAt_short(dest, a1, xoff, yoff + i);
+    }
+}
diff --git a/scriptc/rs_core.rsh b/scriptc/rs_core.rsh
index 9caf355..6b38df4 100644
--- a/scriptc/rs_core.rsh
+++ b/scriptc/rs_core.rsh
@@ -60,6 +60,11 @@
 #include "rs_quaternion.rsh"
 #include "rs_sampler.rsh"
 #include "rs_time.rsh"
+#include "rs_idct.rsh"
+#include "rs_dct.rsh"
+#include "rs_iadst.rsh"
+#include "rs_fadst.rsh"
+#include "rs_walsh.rsh"
 
 /**
  * Send a message back to the client.  Will not block and returns true
diff --git a/scriptc/rs_dct.rsh b/scriptc/rs_dct.rsh
new file mode 100644
index 0000000..51e789d
--- /dev/null
+++ b/scriptc/rs_dct.rsh
@@ -0,0 +1,14 @@
+#ifndef __RS_DCT_RSH__
+#define __RS_DCT_RSH__
+
+#if RS_VERSION > 19
+// DCT built-ins: each takes an input allocation, an output allocation and an (xoff, yoff) offset.
+extern void dct4x4(rs_allocation input, rs_allocation output, int xoff, int yoff);
+extern void dct8x8(rs_allocation input, rs_allocation output, int xoff, int yoff);
+extern void dct16x16(rs_allocation input, rs_allocation output, int xoff, int yoff);
+extern void dct32x32(rs_allocation input, rs_allocation out, int xoff, int yoff);
+extern void dct32x32_rd(rs_allocation input, rs_allocation out,  int xoff, int yoff);
+
+#endif
+
+#endif
diff --git a/scriptc/rs_fadst.rsh b/scriptc/rs_fadst.rsh
new file mode 100644
index 0000000..6dc0170
--- /dev/null
+++ b/scriptc/rs_fadst.rsh
@@ -0,0 +1,14 @@
+#ifndef __RS_FADST_RSH__
+#define __RS_FADST_RSH__
+// Prototypes for the fadst built-ins (presumably forward ADST, judging by the names -- confirm in rs_fadst.c).
+#if RS_VERSION > 19  // declared only when RS_VERSION is greater than 19
+// Each function returns void and takes a const input allocation, an output allocation and one int32_t xoff (no yoff).
+extern void fadst4(const rs_allocation input, rs_allocation output, int32_t xoff);
+// NOTE(review): the 4/8/16 suffix is presumably the transform length -- confirm.
+extern void fadst8(const rs_allocation input, rs_allocation output, int32_t xoff);
+
+extern void fadst16(const rs_allocation input, rs_allocation output, int32_t xoff);
+
+#endif  // RS_VERSION > 19
+
+#endif  // __RS_FADST_RSH__
\ No newline at end of file
diff --git a/scriptc/rs_iadst.rsh b/scriptc/rs_iadst.rsh
new file mode 100644
index 0000000..dad396c
--- /dev/null
+++ b/scriptc/rs_iadst.rsh
@@ -0,0 +1,14 @@
+#ifndef __RS_IADST_RSH__
+#define __RS_IADST_RSH__
+// Prototypes for the iadst built-ins (presumably inverse ADST, judging by the names -- confirm in rs_iadst.c).
+#if RS_VERSION > 19  // declared only when RS_VERSION is greater than 19
+// Each function returns void and takes a const input allocation, an output allocation and one int32_t xoff (no yoff).
+extern void iadst4(const rs_allocation input, rs_allocation output, int32_t xoff);
+// NOTE(review): the 4/8/16 suffix is presumably the transform length -- confirm.
+extern void iadst8(const rs_allocation input, rs_allocation output, int32_t xoff);
+
+extern void iadst16(const rs_allocation input, rs_allocation output, int32_t xoff);
+
+#endif  // RS_VERSION > 19
+
+#endif  // __RS_IADST_RSH__
\ No newline at end of file
diff --git a/scriptc/rs_idct.rsh b/scriptc/rs_idct.rsh
new file mode 100644
index 0000000..e379736
--- /dev/null
+++ b/scriptc/rs_idct.rsh
@@ -0,0 +1,13 @@
+#ifndef __RS_IDCT_RSH__
+#define __RS_IDCT_RSH__
+// Prototypes for the idct built-ins (presumably inverse DCT for 4x4 through 32x32 blocks -- confirm in rs_idct.c).
+#if RS_VERSION > 19  // declared only when RS_VERSION is greater than 19
+// Each function returns void and takes a const input allocation, a dest allocation, int eob and int xoff/yoff.
+extern void idct4x4(const rs_allocation input, rs_allocation dest, int eob, int xoff, int yoff);
+extern void idct8x8(const rs_allocation input, rs_allocation dest, int eob, int xoff, int yoff);
+extern void idct16x16(const rs_allocation input, rs_allocation dest, int eob, int xoff, int yoff);
+extern void idct32x32(const rs_allocation input, rs_allocation dest, int eob, int xoff, int yoff);
+// NOTE(review): eob is presumably an end-of-block coefficient count and xoff/yoff block coordinates -- confirm.
+#endif  // RS_VERSION > 19
+
+#endif  // __RS_IDCT_RSH__
diff --git a/scriptc/rs_walsh.rsh b/scriptc/rs_walsh.rsh
new file mode 100644
index 0000000..de058ce
--- /dev/null
+++ b/scriptc/rs_walsh.rsh
@@ -0,0 +1,11 @@
+#ifndef __RS_WALSH_RSH__
+#define __RS_WALSH_RSH__
+// Prototypes for the 4x4 Walsh transform built-ins.
+#if RS_VERSION > 19  // declared only when RS_VERSION is greater than 19
+// Both return void and take an input allocation, a dest allocation and int xoff/yoff element coordinates.
+extern void walsh4x4(rs_allocation input, rs_allocation dest, int xoff, int yoff);
+extern void walsh4x4_1(rs_allocation input, rs_allocation dest, int xoff, int yoff);
+// walsh4x4_1 derives every value it stores from the single input element at (xoff, yoff).
+#endif  // RS_VERSION > 19
+
+#endif  // __RS_WALSH_RSH__